##################
#MATH 5366 NOTES #
#TEXT MINING IN R#
##################

#################################################################################################
#This script explores text mining using data from the Kaggle data mining competition,          #
#"Bag of Words Meets Bag of Popcorn". Only about 10% of the training and test data are used    #
#in this script to reduce computation time. The full competition and data can be found here:   #
#                                                                                              #
#https://www.kaggle.com/c/word2vec-nlp-tutorial                                                #
#                                                                                              #
#Thanks to John Koo for writing a tutorial related to this project, which provided most of the #
#material for this script.                                                                     #
#################################################################################################

#Obtaining Movie Review Data
movie.data = read.delim("http://faculty.tarleton.edu/crawford/documents/Math5364/MovieReviews.txt",
                        header = T, quote = '', stringsAsFactors = F)

########################################
#Text Cleaning with Regular Expressions#
########################################

#gsub Examples
gsub("ABC", "123", "ABCDEFGHIJKLMNOP")

#A dot represents any single character.
gsub("D.F", "456", "ABCDEFGHIJKLMNOP")
gsub("a.c", "DOG", "11111111abc22222222azc3333333ahc")

#An * indicates that the previous character can appear zero or more times.
gsub("ab*c", "DOG", "1111111abbbbbbbbbc2222222abc333333333333ac")
gsub("A.*C", "DOG", "wwwwwwwwwwwAqwertyuiopCwwwwww")

#Removing HTML tags from a small example string.
gsub("<.*>", "", "<html>
<body>
<h1>This is the Heading of a Webpage</h1>
</body>
</html>")
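#Note that * is greedy: the pattern above matches from the first < to the last >,
#so gsub removes the entire string, heading and all.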
") #A ? after an * causes the replacement to be as small as possible. gsub("<.*?>","","

This is the Heading of a Wepage

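#The cleaning steps below also rely on the POSIX character class [[:punct:]],
#which matches any single punctuation character. A toy example (the sentence is
#illustrative, not from the data):
gsub('[[:punct:]]', '', "Wow!!! This movie's plot -- what a twist -- was great.")
#Returns "Wow This movies plot  what a twist  was great"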
") #Removing HTML Tags from our Movie Review Data movie.data$review.clean = gsub('<.*?>', '', movie.data$review) #Removing Punctuation movie.data$review.clean = gsub('[[:punct:]]','',movie.data$review.clean) #Converting to Lower Case movie.data$review.clean = tolower(movie.data$review.clean) #More info on regular expressions: #https://en.wikipedia.org/wiki/Regular_expression ################ #The AFINN List# ################ # Read in the AFINN list afinn <- read.delim('http://faculty.tarleton.edu/crawford/documents/Math5364/AFINN.txt', header = F, quote = '', stringsAsFactors = F) names(afinn) <- c('word','score') #Replacing Hyphens with Spaces afinn$word.clean <- gsub('-',' ' , afinn$word) #Removing punctuation afinn$word.clean <- gsub("[[:punct:]]", '', afinn$word.clean) #Histogram of Word Scores hist(afinn$score) ############################################## #Term Frequencies using the str_count Command# ############################################## library(stringr) #Examples fruit.names=c("apple","banana","cantaloupe","date") str_count("i want to eat an apple or a banana, preferably an apple",fruit.names) #AFINN Frequency Function afinn.frequencies=function(x){ str_count(x,afinn$word.clean) } #Term Frequencies (Takes about two minutes to run) term.freq <- t(apply(t(movie.data$review.clean), 2, afinn.frequencies)) dim(term.freq) #Example movie.data$review.clean[50] afinn$word[1438] term.freq[50,1438] ############### #AFINN Ratings# ############### movie.data$afinn.rating = as.vector(term.freq %*% afinn$score) #Probability Densities for AFINN Ratings library(ggplot2) ggplot(movie.data, aes(afinn.rating, fill = as.factor(sentiment))) + geom_density(alpha = .2) ######################### #Selecting Training Data# ######################### source("http://faculty.tarleton.edu/crawford/documents/Math5364/MiscRFunctions.txt") set.seed(256) train=trainsample(movie.data,0.9) ################### #Naive Bayes Model# ################### install.packages("e1071") library(e1071) movie.data$sentiment=as.factor(movie.data$sentiment) nb.model=naiveBayes(sentiment~afinn.rating, data=movie.data[train,]) #Classification Accuracy source("http://faculty.tarleton.edu/crawford/documents/Math5364/MiscRFunctions.txt") pred.sentiment=predict(nb.model, newdata=movie.data[-train,]) confmatrix(pred.sentiment,movie.data$sentiment[-train]) #ROC Curve phat=predict(nb.model, newdata=movie.data[-train,], type="raw") library(pROC) plot(roc(movie.data$sentiment[-train],phat[,2])) #Memory Cleanup remove(nb.model,phat) #List Objects in Order of Memory Usage in Bytes sort( sapply(ls(),function(x){object.size(get(x))})) #Total Memory Usage in Bytes sum(sapply(ls(),function(x){object.size(get(x))})) ################################## #Bag of Words Random Forest Model# ################################## install.packages("randomForest") library(randomForest) rf.movie.data=data.frame(sentiment=movie.data$sentiment,term.freq) #Random Forest (Takes about 15 min to run) rf.model=randomForest(sentiment~.,data=rf.movie.data[train,]) #Classification Accuracy source("http://faculty.tarleton.edu/crawford/documents/Math5364/MiscRFunctions.txt") pred.sentiment=predict(rf.model, newdata=rf.movie.data[-train,]) confmatrix(pred.sentiment,movie.data$sentiment[-train]) #ROC Curve phat=predict(rf.model, newdata=rf.movie.data[-train,], type="prob") library(pROC) plot(roc(movie.data$sentiment[-train],phat[,2])) ############################## #Inverse Document Frequencies# ############################## View(term.freq) 
inv.doc.freq = log(nrow(movie.data) / colSums(sign(term.freq)))
range(inv.doc.freq)

#Words that appear in no reviews produce infinite idf values; set these to zero.
inv.doc.freq[is.infinite(inv.doc.freq)] = 0
range(inv.doc.freq)

#Scale each column (word) of term.freq by its idf weight.
tf.idf = term.freq %*% diag(inv.doc.freq)

#Example
movie.data$review.clean[50]
afinn$word[1438]
term.freq[50,1438]
inv.doc.freq[1438]
term.freq[50,1438]*inv.doc.freq[1438]
tf.idf[50,1438]

############################
#Random Forest Using tf.idf#
############################

rf.movie.data = data.frame(sentiment = movie.data$sentiment, tf.idf)

#Random Forest (Takes about 15 min to run)
rf.model = randomForest(sentiment ~ ., data = rf.movie.data[train,])

#Classification Accuracy
source("http://faculty.tarleton.edu/crawford/documents/Math5364/MiscRFunctions.txt")
pred.sentiment = predict(rf.model, newdata = rf.movie.data[-train,])
confmatrix(pred.sentiment, movie.data$sentiment[-train])

#ROC Curve
phat = predict(rf.model, newdata = rf.movie.data[-train,], type = "prob")
library(pROC)
plot(roc(movie.data$sentiment[-train], phat[,2]))

##################################################
#Building a Term Frequency Matrix from the Corpus#
##################################################

library(tm)
corpus = Corpus(VectorSource(movie.data$review.clean))
tf = DocumentTermMatrix(corpus,
                        control = list(stopwords = stopwords('english'),
                                       removeNumbers = T))

#Only include words that occur in at least 0.1% of reviews.
tf = removeSparseTerms(tf, .999)

#Convert to a matrix
tf = as.matrix(tf)
View(tf)
dim(tf)

#Total Word Frequencies
word.freq = colSums(tf)
head(word.freq)
head(names(word.freq))

#2 Column Format
word.freq = data.frame(word = names(word.freq), freq = word.freq)
head(word.freq)

#Remove extra row names
rownames(word.freq) = NULL
head(word.freq)

#Sorting by Frequency in Descending Order
head(word.freq[order(-word.freq$freq),])

#Writing a Word Frequency Function
word.freq <- function(document.vector, sparsity = .999){
  #Construct corpus
  temp.corpus <- Corpus(VectorSource(document.vector))
  #Construct tf matrix and remove sparse terms
  temp.tf <- DocumentTermMatrix(temp.corpus,
                                control = list(stopwords = stopwords('english'),
                                               removeNumbers = T))
  temp.tf <- removeSparseTerms(temp.tf, sparsity)
  temp.tf <- as.matrix(temp.tf)
  #Construct word frequency data frame
  freq.df <- colSums(temp.tf)
  freq.df <- data.frame(word = names(freq.df), freq = freq.df)
  rownames(freq.df) <- NULL
  return(freq.df)
}

head(word.freq(movie.data$review.clean))

##############################################
#Normalized Difference Sentiment Index (NDSI)#
##############################################

#Word Frequencies for Positive and Negative Reviews in Training Data
train.data = movie.data[train,]
word.freq.pos = word.freq(train.data$review.clean[train.data$sentiment == 1],
                          sparsity = 0.99)
word.freq.neg = word.freq(train.data$review.clean[train.data$sentiment == 0],
                          sparsity = 0.99)
word.freq.pos[1:20,]
word.freq.neg[1:20,]

#Merge by word (freq.x holds negative counts, freq.y positive counts)
freq.all = merge(word.freq.neg, word.freq.pos, by = 'word', all = T)
dim(word.freq.pos)
dim(word.freq.neg)
dim(freq.all)
word.freq.pos[1:20,]
word.freq.neg[1:20,]
freq.all[1:20,]

#Remove NA's
freq.all$freq.x[is.na(freq.all$freq.x)] = 0
freq.all$freq.y[is.na(freq.all$freq.y)] = 0

#Differences between Positive and Negative Frequencies
freq.all$diff = abs(freq.all$freq.x - freq.all$freq.y)
head(freq.all[order(-freq.all$diff),])

#Smoothing term
alpha <- 2^7

#NDSI
freq.all$ndsi = abs(freq.all$freq.x - freq.all$freq.y) /
  (freq.all$freq.x + freq.all$freq.y + 2*alpha)

#Sorting by NDSI
freq.all = freq.all[order(-freq.all$ndsi),]
head(freq.all)

#Convert word from a factor to a string
head(freq.all$word)
freq.all$word = as.character(freq.all$word)
head(freq.all$word)
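#A worked example with illustrative counts: with alpha = 2^7 = 128, a common,
#polarized word with freq.x = 100 and freq.y = 10 gets
#NDSI = |100 - 10|/(100 + 10 + 2*128) = 90/366 = 0.246, while a word seen once in
#only one class gets 1/257 = 0.004. The smoothing term 2*alpha keeps rare words
#from dominating the ranking.
abs(100 - 10)/(100 + 10 + 2*128)   #0.2459016
abs(1 - 0)/(1 + 0 + 2*128)         #0.003891051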
######################################
#Term Frequencies and tfidf with NDSI#
######################################

#NDSI Frequency Function (uses the 1024 words with the highest NDSI scores)
library(stringr)
ndsi.frequencies = function(x){
  str_count(x, freq.all$word[1:1024])
}

#Term Frequencies (Takes about two minutes to run)
term.freq <- t(apply(t(movie.data$review.clean), 2, ndsi.frequencies))

inv.doc.freq = log(nrow(movie.data) / colSums(sign(term.freq)))
range(inv.doc.freq)
inv.doc.freq[is.infinite(inv.doc.freq)] = 0
range(inv.doc.freq)
tf.idf = term.freq %*% diag(inv.doc.freq)

#################################
#Random Forest Using NDSI tf.idf#
#################################

movie.data$sentiment = as.factor(movie.data$sentiment)
rf.movie.data = data.frame(sentiment = movie.data$sentiment, tf.idf)

#Random Forest (Takes about 15 min to run)
rf.model = randomForest(sentiment ~ ., data = rf.movie.data[train,])

#Classification Accuracy
source("http://faculty.tarleton.edu/crawford/documents/Math5364/MiscRFunctions.txt")
pred.sentiment = predict(rf.model, newdata = rf.movie.data[-train,])
confmatrix(pred.sentiment, movie.data$sentiment[-train])

#ROC Curve
phat = predict(rf.model, newdata = rf.movie.data[-train,], type = "prob")
library(pROC)
plot(roc(movie.data$sentiment[-train], phat[,2]))

#########
#TwitteR#
#########

install.packages("twitteR")
library(twitteR)

#Trying to search for Verizon tweets (this fails until we authenticate below).
verizon = searchTwitteR("Verizon", n = 100, lang = "en")

#We need to set up a twitteR app:
#https://apps.twitter.com/

#WARNING: Create your own Twitter app at https://apps.twitter.com/, and substitute
#the keys and secrets provided for your account below.
setup_twitter_oauth(consumer_key = "YOUR_CONSUMER_KEY",
                    consumer_secret = "YOUR_CONSUMER_SECRET",
                    access_token = "YOUR_ACCESS_TOKEN",
                    access_secret = "YOUR_ACCESS_SECRET")

#Searching Again
verizon = searchTwitteR("Verizon", n = 100, lang = "en")

#Converting Search Results into a Data Frame
verizon = twListToDF(verizon)

#Cleaning the tweets
verizon$text.clean = gsub('<.*?>', '', verizon$text)
verizon$text.clean = gsub('[[:punct:]]', '', verizon$text.clean)
verizon$text.clean = tolower(verizon$text.clean)

#Reading locally saved "sentiment labelled sentences" data (IMDb, Amazon, and Yelp)
imdb = read.delim(file = "C:\\Users\\jcrawford\\Downloads\\sentiment labelled sentences\\sentiment labelled sentences\\imdb_labelled.txt",
                  header = F, sep = "\n", quote = '', stringsAsFactors = F)
amazon = read.table(file = "C:\\Users\\jcrawford\\Downloads\\sentiment labelled sentences\\sentiment labelled sentences\\amazon_cells_labelled.txt",
                    header = F, sep = "\n", quote = '', stringsAsFactors = F)
yelp = read.delim(file = "C:\\Users\\jcrawford\\Downloads\\sentiment labelled sentences\\sentiment labelled sentences\\yelp_labelled.txt",
                  header = F, sep = "\n", quote = '', stringsAsFactors = F)
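#The AFINN machinery from earlier in the script can score the cleaned tweets the
#same way it scored reviews. A sketch, assuming the afinn data frame and the
#afinn.frequencies function are still in memory from above:
verizon.term.freq = t(apply(t(verizon$text.clean), 2, afinn.frequencies))
verizon$afinn.rating = as.vector(verizon.term.freq %*% afinn$score)

#Most positive and most negative tweets by AFINN rating
head(verizon$text.clean[order(-verizon$afinn.rating)])
head(verizon$text.clean[order(verizon$afinn.rating)])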