Skip to content

Instantly share code, notes, and snippets.

@josefslerka
Created December 3, 2012 14:23
Show Gist options
  • Save josefslerka/4195326 to your computer and use it in GitHub Desktop.
Save josefslerka/4195326 to your computer and use it in GitHub Desktop.
Digital Humanities 8 - Text mining
#
#
# https://sites.google.com/site/genbiovis/code/network
#
# natazeni knihovny
library(tm)
# nacteni korpusu
corpusHavel <- Corpus(DirSource("havel"), readerControl = list(language = "cz"))
corpusKlaus <- Corpus(DirSource("klaus"), readerControl = list(language = "cz"))
# pristup k jednotlivym dokuemntu
corpusKlaus[[1]]
# transformace
mydata.corpus <- corpusHavel
mydata.corpus <- tm_map(mydata.corpus, tolower)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation)
# spinave transformace
mydata.corpus <- iconv(mydata.corpus, from="utf8", to="ASCII//TRANSLIT")
mydata.corpus <- gsub("\'", "", mydata.corpus)
mydata.corpus <- gsub("\"", "", mydata.corpus)
mydata.corpus <- gsub("\\n", "", mydata.corpus)
mydata.corpus <- Corpus(VectorSource(mydata.corpus))
# odstraneni stopwords
my_stopwords <- c(stopwords('english'), 'se', 'na', 'v', 'co', 'ze',
'bych','dekuji','kdy','tomu','totiz',
'panove', 'vazeni', 'svou', 'velmi',
'abych','toto',
'o', 'je', 'k', 'z', 'proti', 'neni','byly', 'si', 'dnes', 'cz', 'timto', 'budes', 'budem', 'byli', 'jses', 'muj', 'svym', 'ta', 'tomto', 'tohle', 'tuto', 'tyto', 'jej', 'zda', 'proc', 'mate', 'tato', 'kam', 'tohoto', 'kdo', 'kteri', 'mi', 'nam', 'tom', 'tomuto', 'mit', 'nic', 'proto', 'kterou', 'byla', 'toho', 'protoze', 'asi', 'ho', 'nasi', 'napiste', 're', 'rt', 'coz', 'tim', 'takze', 'svych', 'jeji', 'svymi', 'jste', 'aj', 'tu', 'tedy', 'teto', 'bylo', 'kde', 'ke', 'prave', 'ji', 'nad', 'nejsou', 'ci', 'pod', 'tema', 'mezi', 'pres', 'ty', 'pak', 'vam', 'ani', 'kdyz', 'vsak', 'ne', 'jsem', 'tento', 'aby', 'jsme', 'pred', 'pta', 'jejich', 'byl', 'jeste', 'az', 'bez', 'take', 'pouze', 'prvni', 'vase', 'ktera', 'nas', 'novy', 'pokud', 'muze', 'jeho', 'sve', 'jine', 'zpravy', 'nove', 'neni', 'vas', 'jen', 'podle', 'zde', 'clanek', 'uz', 'byt', 'vice', 'bude', 'jiz', 'nez', 'ktery', 'by', 'ktere', 'co', 'nebo', 'ten', 'tak', 'ma', 'pri', 'od', 'po', 'jsou', 'jak', 'dalsi', 'ale', 'si', 've', 'to', 'jako', 'za', 'zpet', 'ze', 'do', 'pro', 'je', 'na')
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords)
# vytvoreni Term Document Matrixu
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
# nalezni prekventovanych slov
findFreqTerms(mydata.dtm, 40, Inf)
# nalezeni slov korelujicich se slovem (umi byt zatracene pomale)
findAssocs(mydata.dtm, "svet", 0.85)
# nalezeni slov korelujicich se slovem (umi byt zatracene pomale)
removeSparseTerms(mydata.dtm, sparse=0.65)
# remove sparse terms from the term document matrix.
# In other words, we'll remove terms which have at least a sparse percentage of empty elements (e.g. terms occurring 0 times in a document).
mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.65)
# vytvoreni objetu matrix
m = as.matrix(mydata.dtm2)
# get the top ten words
dtm <- DocumentTermMatrix(mydata.corpus)
dtm <- removeSparseTerms(dtm, 0.65)
top10 <- as.matrix(dtm)
v <- apply(top10,2,sum)
v <- sort(v, decreasing = TRUE)
v1 <- sort(v[1:10])
barplot(v1, horiz=TRUE, cex.names = 0.7, las = 1, col=grey.colors(10), main="Frequency of Terms")
# matice korelaci
library(corrplot)
words <- names(findAssocs(dtm, "stat", .2)[2:11])
oi <- as.matrix(dtm)
find <- colnames(oi) %in% words
corr <- cor(oi[,find])
corrplot(corr)
#############
# wordcloud #
#############
library(RColorBrewer)
library(wordcloud)
tdm <- removeSparseTerms(mydata.dtm, .975)
m = as.matrix(tdm)
v = sort(rowSums(m), decreasing=TRUE)
# create data frame of words and frequencies
d = data.frame(word=names(v), freq=v)
purd = brewer.pal(9, "PuRd")[-c(1:3)]
wordcloud(d$word, d$freq, colors=purd, random.order=FALSE)
#####################
# network of words #
#####################
library(igraph)
library(network)
library(sna)
tdm <- removeSparseTerms(mydata.dtm, .975)
m = as.matrix(tdm)
# adjacency matrix
adj.mat = m %*% t(m)
# zeroes in the diagonal of the matrix
diag(adj.mat) = 0
# generate graph from adjacency matrix
word.graph = graph.adjacency(adj.mat)
# position matrix
pos.matrix <- layout.fruchterman.reingold(word.graph)
plot(pos.matrix, type="n", xaxt="n", yaxt="n", xlab="", ylab="")
text(pos.matrix[,1], pos.matrix[,2], labels=row.names(m), cex=.7, col='gray20')
# another network with adj.mat
word.net = network(adj.mat)
# get the edges
edges = word.net[[1]]
x = unlist(lapply(edges, function(x) x$inl))
y = unlist(lapply(edges, function(x) x$outl))
xy.edges = cbind(x, y)
# function to plot network
mynet = function(word, m, xy.edges, pos.matrix, cex=.8, col1='gray80', col2='gray40', col3='red')
{
if (!is.character(word))
stop("Argument 'word' must be a character string")
wnum = which(rownames(m)==word)
if (length(wnum)==0)
stop(paste("word", word, "not founded"))
plot(pos.matrix, type="n", xaxt="n", yaxt="n", xlab="", ylab="")
x0 = rep(pos.matrix[wnum,1], sum(xy.edges[,1]==wnum))
y0 = rep(pos.matrix[wnum,2], sum(xy.edges[,1]==wnum))
x1 = pos.matrix[xy.edges[which(xy.edges[,1]==wnum),2],1]
y1 = pos.matrix[xy.edges[which(xy.edges[,1]==wnum),2],2]
aux = which(xy.edges[,1]==wnum)
arrows(x0, y0, x1, y1, length=0, col='gray90')
text(pos.matrix[,1], pos.matrix[,2], labels=row.names(m), cex=cex, col=col1)
text(pos.matrix[xy.edges[aux,2],1], pos.matrix[xy.edges[aux,2],2],
labels=row.names(m)[xy.edges[aux,2]], cex=cex, col=col2)
text(pos.matrix[wnum,1], pos.matrix[wnum,2], rownames(m)[wnum], cex=cex, col=col3)
}
# graph of association rules
library(arules)
library(arulesViz)
tdm <- removeSparseTerms(mydata.dtm, .65)
m = as.matrix(tdm)
rules = apriori(t(m), parameter=list(support=0.005, confidence=0.1))
# plot rules
plot(rules, method="graph", control=list(type="items"))
##########################
# knihovna texcat #
##########################
library(textcat)
textcat(c("This is an english sentence.", "Das ist ein deutscher satz."))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment