Created
December 3, 2012 14:23
-
-
Save josefslerka/4195326 to your computer and use it in GitHub Desktop.
Digital Humanities 8 - Text mining
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# | |
# https://sites.google.com/site/genbiovis/code/network | |
# | |
# natazeni knihovny | |
library(tm) | |
# nacteni korpusu | |
corpusHavel <- Corpus(DirSource("havel"), readerControl = list(language = "cz")) | |
corpusKlaus <- Corpus(DirSource("klaus"), readerControl = list(language = "cz")) | |
# pristup k jednotlivym dokuemntu | |
corpusKlaus[[1]] | |
# transformace | |
mydata.corpus <- corpusHavel | |
mydata.corpus <- tm_map(mydata.corpus, tolower) | |
mydata.corpus <- tm_map(mydata.corpus, removePunctuation) | |
# spinave transformace | |
mydata.corpus <- iconv(mydata.corpus, from="utf8", to="ASCII//TRANSLIT") | |
mydata.corpus <- gsub("\'", "", mydata.corpus) | |
mydata.corpus <- gsub("\"", "", mydata.corpus) | |
mydata.corpus <- gsub("\\n", "", mydata.corpus) | |
mydata.corpus <- Corpus(VectorSource(mydata.corpus)) | |
# odstraneni stopwords | |
my_stopwords <- c(stopwords('english'), 'se', 'na', 'v', 'co', 'ze', | |
'bych','dekuji','kdy','tomu','totiz', | |
'panove', 'vazeni', 'svou', 'velmi', | |
'abych','toto', | |
'o', 'je', 'k', 'z', 'proti', 'neni','byly', 'si', 'dnes', 'cz', 'timto', 'budes', 'budem', 'byli', 'jses', 'muj', 'svym', 'ta', 'tomto', 'tohle', 'tuto', 'tyto', 'jej', 'zda', 'proc', 'mate', 'tato', 'kam', 'tohoto', 'kdo', 'kteri', 'mi', 'nam', 'tom', 'tomuto', 'mit', 'nic', 'proto', 'kterou', 'byla', 'toho', 'protoze', 'asi', 'ho', 'nasi', 'napiste', 're', 'rt', 'coz', 'tim', 'takze', 'svych', 'jeji', 'svymi', 'jste', 'aj', 'tu', 'tedy', 'teto', 'bylo', 'kde', 'ke', 'prave', 'ji', 'nad', 'nejsou', 'ci', 'pod', 'tema', 'mezi', 'pres', 'ty', 'pak', 'vam', 'ani', 'kdyz', 'vsak', 'ne', 'jsem', 'tento', 'aby', 'jsme', 'pred', 'pta', 'jejich', 'byl', 'jeste', 'az', 'bez', 'take', 'pouze', 'prvni', 'vase', 'ktera', 'nas', 'novy', 'pokud', 'muze', 'jeho', 'sve', 'jine', 'zpravy', 'nove', 'neni', 'vas', 'jen', 'podle', 'zde', 'clanek', 'uz', 'byt', 'vice', 'bude', 'jiz', 'nez', 'ktery', 'by', 'ktere', 'co', 'nebo', 'ten', 'tak', 'ma', 'pri', 'od', 'po', 'jsou', 'jak', 'dalsi', 'ale', 'si', 've', 'to', 'jako', 'za', 'zpet', 'ze', 'do', 'pro', 'je', 'na') | |
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords) | |
# vytvoreni Term Document Matrixu | |
mydata.dtm <- TermDocumentMatrix(mydata.corpus) | |
# nalezni prekventovanych slov | |
findFreqTerms(mydata.dtm, 40, Inf) | |
# nalezeni slov korelujicich se slovem (umi byt zatracene pomale) | |
findAssocs(mydata.dtm, "svet", 0.85) | |
# nalezeni slov korelujicich se slovem (umi byt zatracene pomale) | |
removeSparseTerms(mydata.dtm, sparse=0.65) | |
# remove sparse terms from the term document matrix. | |
# In other words, we'll remove terms which have at least a sparse percentage of empty elements (e.g. terms occurring 0 times in a document). | |
mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.65) | |
# vytvoreni objetu matrix | |
m = as.matrix(mydata.dtm2) | |
# get the top ten words | |
dtm <- DocumentTermMatrix(mydata.corpus) | |
dtm <- removeSparseTerms(dtm, 0.65) | |
top10 <- as.matrix(dtm) | |
v <- apply(top10,2,sum) | |
v <- sort(v, decreasing = TRUE) | |
v1 <- sort(v[1:10]) | |
barplot(v1, horiz=TRUE, cex.names = 0.7, las = 1, col=grey.colors(10), main="Frequency of Terms") | |
# matice korelaci | |
library(corrplot) | |
words <- names(findAssocs(dtm, "stat", .2)[2:11]) | |
oi <- as.matrix(dtm) | |
find <- colnames(oi) %in% words | |
corr <- cor(oi[,find]) | |
corrplot(corr) | |
############# | |
# wordcloud # | |
############# | |
library(RColorBrewer) | |
library(wordcloud) | |
tdm <- removeSparseTerms(mydata.dtm, .975) | |
m = as.matrix(tdm) | |
v = sort(rowSums(m), decreasing=TRUE) | |
# create data frame of words and frequencies | |
d = data.frame(word=names(v), freq=v) | |
purd = brewer.pal(9, "PuRd")[-c(1:3)] | |
wordcloud(d$word, d$freq, colors=purd, random.order=FALSE) | |
##################### | |
# network of words # | |
##################### | |
library(igraph) | |
library(network) | |
library(sna) | |
tdm <- removeSparseTerms(mydata.dtm, .975) | |
m = as.matrix(tdm) | |
# adjacency matrix | |
adj.mat = m %*% t(m) | |
# zeroes in the diagonal of the matrix | |
diag(adj.mat) = 0 | |
# generate graph from adjacency matrix | |
word.graph = graph.adjacency(adj.mat) | |
# position matrix | |
pos.matrix <- layout.fruchterman.reingold(word.graph) | |
plot(pos.matrix, type="n", xaxt="n", yaxt="n", xlab="", ylab="") | |
text(pos.matrix[,1], pos.matrix[,2], labels=row.names(m), cex=.7, col='gray20') | |
# another network with adj.mat | |
word.net = network(adj.mat) | |
# get the edges | |
edges = word.net[[1]] | |
x = unlist(lapply(edges, function(x) x$inl)) | |
y = unlist(lapply(edges, function(x) x$outl)) | |
xy.edges = cbind(x, y) | |
# function to plot network | |
mynet = function(word, m, xy.edges, pos.matrix, cex=.8, col1='gray80', col2='gray40', col3='red') | |
{ | |
if (!is.character(word)) | |
stop("Argument 'word' must be a character string") | |
wnum = which(rownames(m)==word) | |
if (length(wnum)==0) | |
stop(paste("word", word, "not founded")) | |
plot(pos.matrix, type="n", xaxt="n", yaxt="n", xlab="", ylab="") | |
x0 = rep(pos.matrix[wnum,1], sum(xy.edges[,1]==wnum)) | |
y0 = rep(pos.matrix[wnum,2], sum(xy.edges[,1]==wnum)) | |
x1 = pos.matrix[xy.edges[which(xy.edges[,1]==wnum),2],1] | |
y1 = pos.matrix[xy.edges[which(xy.edges[,1]==wnum),2],2] | |
aux = which(xy.edges[,1]==wnum) | |
arrows(x0, y0, x1, y1, length=0, col='gray90') | |
text(pos.matrix[,1], pos.matrix[,2], labels=row.names(m), cex=cex, col=col1) | |
text(pos.matrix[xy.edges[aux,2],1], pos.matrix[xy.edges[aux,2],2], | |
labels=row.names(m)[xy.edges[aux,2]], cex=cex, col=col2) | |
text(pos.matrix[wnum,1], pos.matrix[wnum,2], rownames(m)[wnum], cex=cex, col=col3) | |
} | |
# graph of association rules | |
library(arules) | |
library(arulesViz) | |
tdm <- removeSparseTerms(mydata.dtm, .65) | |
m = as.matrix(tdm) | |
rules = apriori(t(m), parameter=list(support=0.005, confidence=0.1)) | |
# plot rules | |
plot(rules, method="graph", control=list(type="items")) | |
########################## | |
# knihovna texcat # | |
########################## | |
library(textcat) | |
textcat(c("This is an english sentence.", "Das ist ein deutscher satz.")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment