JoJo的数据分析历险记

JoJo的数据分析历险记

【R语言文本挖掘】:情感分析与词云图绘制


【R语言文本挖掘】:情感分析与词云图绘制-LMLPHP

引言

【R语言文本挖掘】:情感分析与词云图绘制-LMLPHP

1.情感数据集

  • AFINN
  • bing
  • nrc
library(`tidytext`)
library(dplyr)
get_sentiments("nrc") %>% head()
get_sentiments("bing") %>% head()
get_sentiments("afinn") %>% head()

2.使用内连接进行情感分析

library(janeaustenr)
library(dplyr)
library(stringr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text,
                                regex("^chapter [\\divxlc]",
                                      ignore_case = TrUE)))) %>%#使用正则表达式来定义章节
  ungroup() %>%
  unnest_tokens(word, text)#分词
tidy_books %>% head()
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
    filter(book=='Emma') %>%
    inner_join(nrc_joy) %>%
    count(word,sort=TrUE)%>%
    head()
library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%#使用bing情绪词典进行内连接
  count(book, index = linenumber %/% 80, sentiment) %>%#按八十行为一个小段进行记数
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%#将数据转换成宽数据 
  mutate(sentiment = positive - negative)#计算净情绪,如果大于0说明是积极情绪,小于0说明是消极的
jane_austen_sentiment %>% head()
library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col()+#绘制柱形图
  facet_wrap(~book, ncol = 2, scales = "free_x")#根据不同书进行分面绘图,两行


【R语言文本挖掘】:情感分析与词云图绘制-LMLPHP

3.对比三种情感字典

pride_prejudice <- tidy_books %>%
    filter(book == 'Pride & Prejudice')
pride_prejudice %>% head()
# 使用AFINN词典
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>% #内连接,得到带有情感的文本
  group_by(index = linenumber %/% 80) %>% #每隔80行作为一小段
  summarise(sentiment = sum(value)) %>% #这里我们进行一个求和处理,因为这里是以数字表示情感的
  mutate(method = "AFINN")

# bing词典
bing <- pride_prejudice %>%
        inner_join(get_sentiments("bing")) %>%#使用bing词典进行内连接
        mutate(method = "Bing")%>%
        count(index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# 使用nrc词典
nrc <- pride_prejudice %>%
        inner_join(get_sentiments("nrc") %>%
        filter(sentiment %in% c('positive','negative'))) %>%
        mutate(method = "NRC")%>%
        count(index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)

bind_rows(afinn,
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")


【R语言文本挖掘】:情感分析与词云图绘制-LMLPHP

get_sentiments('nrc') %>%
  filter(sentiment %in% c('positive','negative')) %>%
  count(sentiment)
get_sentiments('bing') %>%
  count(sentiment)

4.最常见的积极和消极的单词

bing_word_count <- tidy_books %>%
 inner_join(get_sentiments('bing')) %>%
 count(word, sentiment, sort = TrUE) %>%
 ungroup()
[1m[22mJoining, by = "word"
bing_word_count %>% head()
bing_word_count %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% #删选出前10的
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%#重新排序
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)


【R语言文本挖掘】:情感分析与词云图绘制-LMLPHP

custom_stop_words <- bind_rows(tibble(word = c("miss"),
                                      lexicon = c("custom")),
                               stop_words)

custom_stop_words %>% head()

5.词云绘制

library(wordcloud)
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))#出现次数最多的前100个

【R语言文本挖掘】:情感分析与词云图绘制-LMLPHP

library(reshape2)
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TrUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
 comparison.cloud(colors = c("blue", "red"),

                  max.words = 100)

【R语言文本挖掘】:情感分析与词云图绘制-LMLPHP

6.总结

参考资料:Text Mining with R

07-18 11:31