josefslerka · December 3, 2012 14:23
diff --git a/gistfile1.r b/gistfile1.r
 #
 #
 # https://sites.google.com/site/genbiovis/code/network
 #


 # natazeni knihovny
 library(tm) 

 # nacteni korpusu
 corpusHavel <- Corpus(DirSource("havel"), readerControl = list(language = "cz"))
 corpusKlaus <- Corpus(DirSource("klaus"), readerControl = list(language = "cz"))

 # pristup k jednotlivym dokuemntu
 corpusKlaus[[1]]


 # transformace
 mydata.corpus <- corpusHavel
 mydata.corpus <- tm_map(mydata.corpus, tolower)
 mydata.corpus <- tm_map(mydata.corpus, removePunctuation)

 # spinave transformace
 mydata.corpus <- iconv(mydata.corpus,  from="utf8", to="ASCII//TRANSLIT")
 mydata.corpus <- gsub("\'", "", mydata.corpus)
 mydata.corpus <- gsub("\"", "", mydata.corpus)
 mydata.corpus <- gsub("\\n", "", mydata.corpus)
 mydata.corpus <- Corpus(VectorSource(mydata.corpus))

 # odstraneni stopwords
 my_stopwords <- c(stopwords('english'), 'se', 'na', 'v', 'co', 'ze', 
                  'bych','dekuji','kdy','tomu','totiz',
                  'panove', 'vazeni', 'svou', 'velmi',
                  'abych','toto',
                  'o', 'je', 'k', 'z', 'proti', 'neni','byly', 'si', 'dnes', 'cz', 'timto', 'budes', 'budem', 'byli', 'jses', 'muj', 'svym', 'ta', 'tomto', 'tohle', 'tuto', 'tyto', 'jej', 'zda', 'proc', 'mate', 'tato', 'kam', 'tohoto', 'kdo', 'kteri', 'mi', 'nam', 'tom', 'tomuto', 'mit', 'nic', 'proto', 'kterou', 'byla', 'toho', 'protoze', 'asi', 'ho', 'nasi', 'napiste', 're', 'rt', 'coz', 'tim', 'takze', 'svych', 'jeji', 'svymi', 'jste', 'aj', 'tu', 'tedy', 'teto', 'bylo', 'kde', 'ke', 'prave', 'ji', 'nad', 'nejsou', 'ci', 'pod', 'tema', 'mezi', 'pres', 'ty', 'pak', 'vam', 'ani', 'kdyz', 'vsak', 'ne', 'jsem', 'tento', 'aby', 'jsme', 'pred', 'pta', 'jejich', 'byl', 'jeste', 'az', 'bez', 'take', 'pouze', 'prvni', 'vase', 'ktera', 'nas', 'novy', 'pokud', 'muze', 'jeho', 'sve', 'jine', 'zpravy', 'nove', 'neni', 'vas', 'jen', 'podle', 'zde', 'clanek', 'uz', 'byt', 'vice', 'bude', 'jiz', 'nez', 'ktery', 'by', 'ktere', 'co', 'nebo', 'ten', 'tak', 'ma', 'pri', 'od', 'po', 'jsou', 'jak', 'dalsi', 'ale', 'si', 've', 'to', 'jako', 'za', 'zpet', 'ze', 'do', 'pro', 'je', 'na')
 mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords)

 # vytvoreni Term Document Matrixu
 mydata.dtm <- TermDocumentMatrix(mydata.corpus)

 # nalezni prekventovanych slov
 findFreqTerms(mydata.dtm, 40, Inf)

 # nalezeni slov korelujicich se slovem (umi byt zatracene pomale)
 findAssocs(mydata.dtm, "svet", 0.85)

 # nalezeni slov korelujicich se slovem (umi byt zatracene pomale)
 removeSparseTerms(mydata.dtm, sparse=0.65)

 # remove sparse terms from the term document matrix. 
 # In other words, we'll remove terms which have at least a sparse percentage of empty  elements (e.g. terms occurring 0 times in a document).
 mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.65)

 # vytvoreni objetu matrix
 m = as.matrix(mydata.dtm2)

 # get the top ten words
 dtm <- DocumentTermMatrix(mydata.corpus)
 dtm <- removeSparseTerms(dtm, 0.65)
 top10 <- as.matrix(dtm)
 v <- apply(top10,2,sum)
 v <- sort(v, decreasing = TRUE)
 v1 <- sort(v[1:10])
 barplot(v1, horiz=TRUE, cex.names = 0.7, las = 1, col=grey.colors(10), main="Frequency of Terms")

 # matice korelaci
 library(corrplot)
 words <- names(findAssocs(dtm, "stat", .2)[2:11])
 oi <- as.matrix(dtm)
 find <- colnames(oi) %in% words
 corr <- cor(oi[,find]) 
 corrplot(corr)

 #############
 # wordcloud #
 #############
 library(RColorBrewer)
 library(wordcloud)
 tdm <- removeSparseTerms(mydata.dtm, .975)
 m = as.matrix(tdm)
 v = sort(rowSums(m), decreasing=TRUE)
 # create data frame of words and frequencies
 d = data.frame(word=names(v), freq=v)
 purd = brewer.pal(9, "PuRd")[-c(1:3)]
 wordcloud(d$word, d$freq, colors=purd, random.order=FALSE)

 #####################
 # network of words  #
 #####################
 library(igraph)
 library(network)
 library(sna)

 tdm <- removeSparseTerms(mydata.dtm, .975)
 m = as.matrix(tdm)

 # adjacency matrix
 adj.mat = m %*% t(m)

 # zeroes in the diagonal of the matrix
 diag(adj.mat) = 0

 # generate graph from adjacency matrix
 word.graph = graph.adjacency(adj.mat)

 # position matrix
 pos.matrix <- layout.fruchterman.reingold(word.graph)

 plot(pos.matrix, type="n", xaxt="n", yaxt="n", xlab="", ylab="")
 text(pos.matrix[,1], pos.matrix[,2], labels=row.names(m), cex=.7, col='gray20')

 # another network with adj.mat
 word.net = network(adj.mat)

 # get the edges
 edges = word.net[[1]]
 x = unlist(lapply(edges, function(x) x$inl))
 y = unlist(lapply(edges, function(x) x$outl))
 xy.edges = cbind(x, y)

 # function to plot network
 mynet = function(word, m, xy.edges, pos.matrix, cex=.8, col1='gray80', col2='gray40', col3='red')
 {
  if (!is.character(word)) 
    stop("Argument 'word' must be a character string")
  wnum = which(rownames(m)==word)
  if (length(wnum)==0) 
    stop(paste("word", word, "not founded"))
  plot(pos.matrix, type="n", xaxt="n", yaxt="n", xlab="", ylab="")
  x0 = rep(pos.matrix[wnum,1], sum(xy.edges[,1]==wnum))
  y0 = rep(pos.matrix[wnum,2], sum(xy.edges[,1]==wnum))
  x1 = pos.matrix[xy.edges[which(xy.edges[,1]==wnum),2],1]
  y1 = pos.matrix[xy.edges[which(xy.edges[,1]==wnum),2],2]
  aux = which(xy.edges[,1]==wnum)
  arrows(x0, y0, x1, y1, length=0, col='gray90')
  text(pos.matrix[,1], pos.matrix[,2], labels=row.names(m), cex=cex, col=col1)
  text(pos.matrix[xy.edges[aux,2],1], pos.matrix[xy.edges[aux,2],2], 
       labels=row.names(m)[xy.edges[aux,2]], cex=cex, col=col2)
  text(pos.matrix[wnum,1], pos.matrix[wnum,2], rownames(m)[wnum],  cex=cex, col=col3)
 }

 # graph of association rules
 library(arules)
 library(arulesViz)
 tdm <- removeSparseTerms(mydata.dtm, .65)
 m = as.matrix(tdm)
 rules = apriori(t(m), parameter=list(support=0.005, confidence=0.1))
 # plot rules
 plot(rules, method="graph", control=list(type="items"))

 ##########################
 #     knihovna texcat    #
 ##########################
 library(textcat)
 textcat(c("This is an english sentence.", "Das ist ein deutscher satz."))
	#
	#
	# https://sites.google.com/site/genbiovis/code/network
	#


	# natazeni knihovny
	library(tm)

	# nacteni korpusu
	corpusHavel <- Corpus(DirSource("havel"), readerControl = list(language = "cz"))
	corpusKlaus <- Corpus(DirSource("klaus"), readerControl = list(language = "cz"))

	# pristup k jednotlivym dokuemntu
	corpusKlaus[[1]]


	# transformace
	mydata.corpus <- corpusHavel
	mydata.corpus <- tm_map(mydata.corpus, tolower)
	mydata.corpus <- tm_map(mydata.corpus, removePunctuation)

	# spinave transformace
	mydata.corpus <- iconv(mydata.corpus, from="utf8", to="ASCII//TRANSLIT")
	mydata.corpus <- gsub("\'", "", mydata.corpus)
	mydata.corpus <- gsub("\"", "", mydata.corpus)
	mydata.corpus <- gsub("\\n", "", mydata.corpus)
	mydata.corpus <- Corpus(VectorSource(mydata.corpus))

	# odstraneni stopwords
	my_stopwords <- c(stopwords('english'), 'se', 'na', 'v', 'co', 'ze',
	'bych','dekuji','kdy','tomu','totiz',
	'panove', 'vazeni', 'svou', 'velmi',
	'abych','toto',
	'o', 'je', 'k', 'z', 'proti', 'neni','byly', 'si', 'dnes', 'cz', 'timto', 'budes', 'budem', 'byli', 'jses', 'muj', 'svym', 'ta', 'tomto', 'tohle', 'tuto', 'tyto', 'jej', 'zda', 'proc', 'mate', 'tato', 'kam', 'tohoto', 'kdo', 'kteri', 'mi', 'nam', 'tom', 'tomuto', 'mit', 'nic', 'proto', 'kterou', 'byla', 'toho', 'protoze', 'asi', 'ho', 'nasi', 'napiste', 're', 'rt', 'coz', 'tim', 'takze', 'svych', 'jeji', 'svymi', 'jste', 'aj', 'tu', 'tedy', 'teto', 'bylo', 'kde', 'ke', 'prave', 'ji', 'nad', 'nejsou', 'ci', 'pod', 'tema', 'mezi', 'pres', 'ty', 'pak', 'vam', 'ani', 'kdyz', 'vsak', 'ne', 'jsem', 'tento', 'aby', 'jsme', 'pred', 'pta', 'jejich', 'byl', 'jeste', 'az', 'bez', 'take', 'pouze', 'prvni', 'vase', 'ktera', 'nas', 'novy', 'pokud', 'muze', 'jeho', 'sve', 'jine', 'zpravy', 'nove', 'neni', 'vas', 'jen', 'podle', 'zde', 'clanek', 'uz', 'byt', 'vice', 'bude', 'jiz', 'nez', 'ktery', 'by', 'ktere', 'co', 'nebo', 'ten', 'tak', 'ma', 'pri', 'od', 'po', 'jsou', 'jak', 'dalsi', 'ale', 'si', 've', 'to', 'jako', 'za', 'zpet', 'ze', 'do', 'pro', 'je', 'na')
	mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords)

	# vytvoreni Term Document Matrixu
	mydata.dtm <- TermDocumentMatrix(mydata.corpus)

	# nalezni prekventovanych slov
	findFreqTerms(mydata.dtm, 40, Inf)

	# nalezeni slov korelujicich se slovem (umi byt zatracene pomale)
	findAssocs(mydata.dtm, "svet", 0.85)

	# nalezeni slov korelujicich se slovem (umi byt zatracene pomale)
	removeSparseTerms(mydata.dtm, sparse=0.65)

	# remove sparse terms from the term document matrix.
	# In other words, we'll remove terms which have at least a sparse percentage of empty elements (e.g. terms occurring 0 times in a document).
	mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.65)

	# vytvoreni objetu matrix
	m = as.matrix(mydata.dtm2)

	# get the top ten words
	dtm <- DocumentTermMatrix(mydata.corpus)
	dtm <- removeSparseTerms(dtm, 0.65)
	top10 <- as.matrix(dtm)
	v <- apply(top10,2,sum)
	v <- sort(v, decreasing = TRUE)
	v1 <- sort(v[1:10])
	barplot(v1, horiz=TRUE, cex.names = 0.7, las = 1, col=grey.colors(10), main="Frequency of Terms")

	# matice korelaci
	library(corrplot)
	words <- names(findAssocs(dtm, "stat", .2)[2:11])
	oi <- as.matrix(dtm)
	find <- colnames(oi) %in% words
	corr <- cor(oi[,find])
	corrplot(corr)

	#############
	# wordcloud #
	#############
	library(RColorBrewer)
	library(wordcloud)
	tdm <- removeSparseTerms(mydata.dtm, .975)
	m = as.matrix(tdm)
	v = sort(rowSums(m), decreasing=TRUE)
	# create data frame of words and frequencies
	d = data.frame(word=names(v), freq=v)
	purd = brewer.pal(9, "PuRd")[-c(1:3)]
	wordcloud(d$word, d$freq, colors=purd, random.order=FALSE)

	#####################
	# network of words #
	#####################
	library(igraph)
	library(network)
	library(sna)

	tdm <- removeSparseTerms(mydata.dtm, .975)
	m = as.matrix(tdm)

	# adjacency matrix
	adj.mat = m %*% t(m)

	# zeroes in the diagonal of the matrix
	diag(adj.mat) = 0

	# generate graph from adjacency matrix
	word.graph = graph.adjacency(adj.mat)

	# position matrix
	pos.matrix <- layout.fruchterman.reingold(word.graph)

	plot(pos.matrix, type="n", xaxt="n", yaxt="n", xlab="", ylab="")
	text(pos.matrix[,1], pos.matrix[,2], labels=row.names(m), cex=.7, col='gray20')

	# another network with adj.mat
	word.net = network(adj.mat)

	# get the edges
	edges = word.net[[1]]
	x = unlist(lapply(edges, function(x) x$inl))
	y = unlist(lapply(edges, function(x) x$outl))
	xy.edges = cbind(x, y)

	# function to plot network
	mynet = function(word, m, xy.edges, pos.matrix, cex=.8, col1='gray80', col2='gray40', col3='red')
	{
	if (!is.character(word))
	stop("Argument 'word' must be a character string")
	wnum = which(rownames(m)==word)
	if (length(wnum)==0)
	stop(paste("word", word, "not founded"))
	plot(pos.matrix, type="n", xaxt="n", yaxt="n", xlab="", ylab="")
	x0 = rep(pos.matrix[wnum,1], sum(xy.edges[,1]==wnum))
	y0 = rep(pos.matrix[wnum,2], sum(xy.edges[,1]==wnum))
	x1 = pos.matrix[xy.edges[which(xy.edges[,1]==wnum),2],1]
	y1 = pos.matrix[xy.edges[which(xy.edges[,1]==wnum),2],2]
	aux = which(xy.edges[,1]==wnum)
	arrows(x0, y0, x1, y1, length=0, col='gray90')
	text(pos.matrix[,1], pos.matrix[,2], labels=row.names(m), cex=cex, col=col1)
	text(pos.matrix[xy.edges[aux,2],1], pos.matrix[xy.edges[aux,2],2],
	labels=row.names(m)[xy.edges[aux,2]], cex=cex, col=col2)
	text(pos.matrix[wnum,1], pos.matrix[wnum,2], rownames(m)[wnum], cex=cex, col=col3)
	}

	# graph of association rules
	library(arules)
	library(arulesViz)
	tdm <- removeSparseTerms(mydata.dtm, .65)
	m = as.matrix(tdm)
	rules = apriori(t(m), parameter=list(support=0.005, confidence=0.1))
	# plot rules
	plot(rules, method="graph", control=list(type="items"))

	##########################
	# knihovna texcat #
	##########################
	library(textcat)
	textcat(c("This is an english sentence.", "Das ist ein deutscher satz."))