import pandas as pd
import re
import nltk
from nltk import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Fetch the NLTK data used by this script: the stop-word lists, the Punkt
# tokenizer models needed by word_tokenize, and the VADER sentiment lexicon.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9 and later) make word_tokenize look up the
# 'punkt_tab' resource instead of the pickled 'punkt' models; fetch both so
# tokenization works regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

# Load spaCy's small English pipeline (used later for topic-model tokenization).
nlp = spacy.load("en_core_web_sm")

# Read the input spreadsheet; the path is relative to the working directory.
# Later steps read its 'content' column.
df = pd.read_excel('nltk分词处理结果第二次.xlsx')

# Text-cleaning helper
def clean_text(text):
    """Normalize one raw text value for the analysis steps.

    Removes ``<...>`` tags, collapses every run of whitespace (spaces, tabs,
    newlines) into a single space, and lowercases the result.

    Args:
        text: The raw cell value. ``None`` and float NaN (e.g. an empty
            spreadsheet cell) are treated as empty text; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned, lowercased string ('' for missing values).
    """
    # re.sub raises TypeError on non-strings, so missing cells are mapped to
    # empty text up front. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Strip HTML tags (non-greedy, so each <...> is matched on its own).
    cleaned_text = re.sub(r'<.*?>', '', text)
    # Collapse repeated spaces and line breaks into one space.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Lowercase.
    cleaned_text = cleaned_text.lower()
    return cleaned_text

# Run every entry of the 'content' column through clean_text and keep the
# result in a new 'cleaned_content' column.
df['cleaned_content'] = df['content'].map(clean_text)

# Word-frequency analysis: tokenize each cleaned text, pool all tokens into
# one list, and report the ten most common ones.
# No filtering is applied here, so punctuation and stop words are counted too.
words = []
for text in df['cleaned_content']:
    words.extend(word_tokenize(text))
freq_dist = FreqDist(words)
print("词频分析结果:", freq_dist.most_common(10))

# Sentiment analysis: score each cleaned text with NLTK's VADER analyzer.
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for *text*."""
    return sia.polarity_scores(text)['compound']


df['sentiment_score'] = df['cleaned_content'].apply(_compound_score)
print("情感分析结果:", df['sentiment_score'])

# Score cut-offs: strictly above the positive threshold counts as positive,
# strictly below the negative threshold counts as negative.
positive_threshold = 0.5
negative_threshold = -0.5


# Map a sentiment score to a category label.
def classify_sentiment(score):
    """Classify a sentiment score into one of three labels.

    Args:
        score: Numeric sentiment score to classify.

    Returns:
        '积极' (positive) if ``score > positive_threshold``,
        '消极' (negative) if ``score < negative_threshold``,
        otherwise '中性' (neutral). Scores exactly on a threshold are neutral.
    """
    if score > positive_threshold:
        return '积极'
    if score < negative_threshold:
        return '消极'
    return '中性'

# Label every row by passing its score through classify_sentiment; the labels
# go into a new 'sentiment_category' column.
df['sentiment_category'] = df['sentiment_score'].map(classify_sentiment)

# Show the cleaned text next to its score and category.
report_columns = ['cleaned_content', 'sentiment_score', 'sentiment_category']
print(df[report_columns])


# Topic modeling (LDA) over the cleaned texts.
# Each document is reduced to its lowercased alphabetic tokens, minus spaCy's
# English stop words. nlp.pipe streams the texts through the pipeline in
# batches instead of invoking it once per row.
tokens = [
    [
        token.text.lower()
        for token in doc
        if token.is_alpha and token.text.lower() not in STOP_WORDS
    ]
    for doc in nlp.pipe(df['cleaned_content'])
]
# Map each distinct token to an integer id, then encode every document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(text) for text in tokens]
# A fixed random_state makes the trained topics reproducible between runs;
# without it LDA initializes randomly and the output changes every time.
lda_model = LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# Print the five highest-weighted words of each topic.
topics = lda_model.print_topics(num_words=5)
print("主题建模结果:")
for topic in topics:
    print(topic)
 

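# NOTE(review): stray "10-27 22:06" timestamp left over from the original paste; kept as a comment so the file parses.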