A data visualization project using data retrieved from Dr. Ko Wen-Je (柯文哲)’s Facebook page

#Load required packages
library(jsonlite)
library(tm)
library(tmcn)
library(wordcloud)

Extraction

# Extract only the data of interest, i.e. the comment messages of the post.
# `[[` is used instead of `$` because `$` partially matches names: if the
# exact "message" field were absent, `$message` could silently return another
# field whose name merely starts with "message". `[[` matches names exactly
# and yields NULL in that case.
fb_comment <- fb_post[["data"]][["message"]]

Data Cleansing and Transformation

# Store the comments in a tm corpus and perform the necessary data cleansing.
# Each step below replaces comment_db with the transformed corpus, so the
# order of the statements matters.
# NOTE(review): the custom functions further down are handed to tm_map()
# directly; newer tm releases may expect them to be wrapped in
# content_transformer() -- confirm against the installed tm version.
comment_db <- Corpus(VectorSource(fb_comment), list(language = NA))
# Apply tm's stock transformations, in this order.
comment_db <- tm_map(comment_db, stripWhitespace)
comment_db <- tm_map(comment_db, removePunctuation)
comment_db <- tm_map(comment_db, removeNumbers)
# Delete every ASCII letter and digit from the text.
comment_db <- tm_map(comment_db, function(word) {gsub("[A-Za-z0-9]", "", word)})
# Segment each document with segmentCN(). Presumably `nature = TRUE` labels
# every token with its part-of-speech tag through names() -- confirm in the
# tmcn documentation.
comment_db <- tm_map(comment_db, segmentCN, nature = TRUE)
# Keep only the tokens whose name is "n" (presumably the noun tag -- verify)
# and flatten what is left into a single vector per document.
comment_db <- tm_map(comment_db, function(sentence) {
  noun <- lapply(sentence, function(w) {
    w[names(w) == "n"]
  })
  unlist(noun)
})

Load and Aggregate

# Rebuild a corpus from the filtered documents and aggregate the extracted
# words into a frequency table.
comment_db <- Corpus(VectorSource(comment_db))

# wordLengths = c(2, Inf): only terms of at least two characters are counted.
tdm <- TermDocumentMatrix(comment_db, control = list(wordLengths = c(2, Inf)))

# NOTE(review): as.matrix() materialises the whole term-document matrix as a
# dense matrix; this may become memory-hungry for a large number of comments.
m1 <- as.matrix(tdm)

# Total count of every term over all documents, most frequent first.
v <- sort(rowSums(m1), decreasing = TRUE)

# stringsAsFactors = FALSE keeps `word` a character column on every R version
# (before R 4.0.0 the default would have turned it into a factor).
d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)

Visualization

# Visualize the word frequencies as a word cloud.
# par() changes the font family of the current graphics device and is not
# restored afterwards, so later plots on the same device inherit it.
# NOTE(review): both font family names look macOS-specific -- confirm they
# are available on the platform this script runs on.
par(family = 'STHeiti')
wordcloud(
  d$word, d$freq,
  min.freq = 5,
  # TRUE/FALSE are spelled out: T and F are ordinary variables that can be
  # reassigned, which would silently change these settings.
  random.order = FALSE,
  ordered.colors = FALSE,
  # One rainbow colour per term (row) of the term-document matrix.
  colors = rainbow(length(row.names(m1))),
  family = "Heiti TC Light",
  scale = c(12, .4)
)