joelkr · September 5, 2014 22:36
diff --git a/BayesProbability.R b/BayesProbability.R
 # BayesProbability.R
 #
 # Toy Bernoulli naive Bayes spam filter. The script
 #   1. builds a dummy vocabulary of two-letter tokens,
 #   2. simulates spam and ham messages by sampling tokens from it,
 #   3. estimates per-token presence probabilities with add-one smoothing,
 #   4. scores five token-presence vectors and prints log p(x|spam),
 #      log p(x|ham), log p(spam|x) and p(spam|x) for each.

 #set.seed(49)

 # Build a dummy dictionary: every two-letter string, first letter varying
 # fastest ("aa", "ba", "ca", ...), 676 tokens in total.
 a <- letters
 b <- letters
 tokens <- as.vector(outer(a, b, paste0))
 # Number of tokens to use 1-676
 tl <- 100
 stopifnot(tl >= 1, tl <= length(tokens))
 vocab <- tokens[seq_len(tl)]

 # Sampling weights per token (sample() does not need them to sum to one).
 # Odd-position tokens are the "spam words" (0.19), even-position tokens the
 # "ham words" (0.01). rep_len() keeps the length equal to tl even if tl is odd.
 p_spam_tokens <- rep_len(c(0.19, 0.01), tl)
 #p_ham_tokens <- rep(c(0.01, 0.19), (tl/2))
 # Complement weights are 0.81 / 0.99, so ham messages draw tokens almost
 # uniformly, with only a slight tilt toward the even-position tokens.
 p_ham_tokens <- 1 - p_spam_tokens

 # Per-token counts of messages containing the token, indexed by token name.
 hamcount <- rep(0, tl)
 names(hamcount) <- vocab
 spamcount <- rep(0, tl)
 names(spamcount) <- vocab
 totalcount <- rep(0, tl)
 names(totalcount) <- vocab
 spam_messages <- 100
 ham_messages <- 100
 # Class priors taken from the number of simulated messages per class.
 Pspam <- spam_messages / (spam_messages + ham_messages)
 Pham <- 1 - Pspam

 # Simulate spam: each message draws 10-20 tokens with replacement; unique()
 # reduces the draw to presence/absence so a token counts once per message.
 for (i in seq_len(spam_messages)) {
   n_draws <- sample(10:20, 1)
   s1 <- unique(sample(vocab, n_draws, replace = TRUE, prob = p_spam_tokens))
   spamcount[s1] <- spamcount[s1] + 1
   totalcount[s1] <- totalcount[s1] + 1
 }

 # Simulate ham the same way, using the ham weights.
 for (i in seq_len(ham_messages)) {
   n_draws <- sample(10:20, 1)
   h1 <- unique(sample(vocab, n_draws, replace = TRUE, prob = p_ham_tokens))
   hamcount[h1] <- hamcount[h1] + 1
   totalcount[h1] <- totalcount[h1] + 1
 }

 # Calculate theta: smoothed fraction of messages of each class containing the
 # token. The +1 / +2 keeps every theta strictly inside (0, 1), so the logs
 # below are always finite.
 thetaS <- (1 + spamcount) / (2 + spam_messages)
 # from book
 #alpha <- 0.2
 #beta <- 0.201
 #thetaS <- (alpha + spamcount) / (beta + totalcount)

 thetaH <- (1 + hamcount) / (2 + ham_messages)
 # from book
 #thetaH <- (alpha + hamcount) / (beta + totalcount)

 # Bernoulli log-likelihood in linear form:
 #   log p(x|c) = sum_j [x_j log(theta_j) + (1 - x_j) log(1 - theta_j)]
 #              = sum_j x_j * w_j + w0
 # with w_j = log(theta_j / (1 - theta_j)) and w0 = sum_j log(1 - theta_j).
 # Calculate logXGivenSpam
 wS <- log(thetaS / (1 - thetaS))
 w0S <- sum(log(1 - thetaS))
 # logXGivenHam
 wH <- log(thetaH / (1 - thetaH))
 w0H <- sum(log(1 - thetaH))

 # Token-presence vectors to classify (1 = token present, 0 = absent).
 x <- list()
 # All spam words
 x[[1]] <- rep_len(c(1, 0), tl)
 # All ham words
 x[[2]] <- rep_len(c(0, 1), tl)
 # Every word
 x[[3]] <- rep(1, tl)
 # Random favoring 0
 x[[4]] <- sample(c(1, 0), tl, replace = TRUE, prob = c(0.29, 0.71))
 # Random favoring 1
 x[[5]] <- sample(c(1, 0), tl, replace = TRUE, prob = c(0.71, 0.29))
 logXGivenSpam <- numeric(length(x))
 logXGivenHam <- numeric(length(x))
 logSpamGivenX <- numeric(length(x))

 logPspam <- log(Pspam)
 logPham <- log(Pham)

 for (i in seq_along(x)) {
   print(x[[i]])
   logXGivenSpam[i] <- sum(x[[i]] * wS) + w0S
   logXGivenHam[i] <- sum(x[[i]] * wH) + w0H
   cat("logXGivenSpam: ", logXGivenSpam[i], "\n")
   cat("logXGivenHam: ", logXGivenHam[i], "\n")
   # log identity: log(a + b) = log(a) + log(1 + exp(log(b) - log(a)))
   # formula being solved:
   # log(p(spam|word)) = log(p(word|spam)p(spam)/
   #        (p(word|spam)p(spam) + p(word|ham)p(ham)))
   # which reduces to -log(1 + exp(d)), d = log-odds of ham over spam.
   d <- (logXGivenHam[i] + logPham) - (logXGivenSpam[i] + logPspam)
   # Evaluate log(1 + exp(d)) as max(d, 0) + log1p(exp(-|d|)): exp() never sees
   # a positive argument, so a large d cannot overflow to Inf, and log1p()
   # keeps precision when exp(-|d|) is tiny.
   logSpamGivenX[i] <- -(max(d, 0) + log1p(exp(-abs(d))))
   cat("**logSpamGivenX: ", logSpamGivenX[i], "\n")
   cat("** p(S|x): ", exp(logSpamGivenX[i]), "\n")
 }
	# BayesProbability.R
	#
	# Toy Bernoulli naive Bayes spam filter. The script
	#   1. builds a dummy vocabulary of two-letter tokens,
	#   2. simulates spam and ham messages by sampling tokens from it,
	#   3. estimates per-token presence probabilities with add-one smoothing,
	#   4. scores five token-presence vectors and prints log p(x|spam),
	#      log p(x|ham), log p(spam|x) and p(spam|x) for each.

	#set.seed(49)

	# Build a dummy dictionary: every two-letter string, first letter varying
	# fastest ("aa", "ba", "ca", ...), 676 tokens in total.
	a <- letters
	b <- letters
	tokens <- as.vector(outer(a, b, paste0))
	# Number of tokens to use 1-676
	tl <- 100
	stopifnot(tl >= 1, tl <= length(tokens))
	vocab <- tokens[seq_len(tl)]

	# Sampling weights per token (sample() does not need them to sum to one).
	# Odd-position tokens are the "spam words" (0.19), even-position tokens the
	# "ham words" (0.01). rep_len() keeps the length equal to tl even if tl is
	# odd.
	p_spam_tokens <- rep_len(c(0.19, 0.01), tl)
	#p_ham_tokens <- rep(c(0.01, 0.19), (tl/2))
	# Complement weights are 0.81 / 0.99, so ham messages draw tokens almost
	# uniformly, with only a slight tilt toward the even-position tokens.
	p_ham_tokens <- 1 - p_spam_tokens

	# Per-token counts of messages containing the token, indexed by token name.
	hamcount <- rep(0, tl)
	names(hamcount) <- vocab
	spamcount <- rep(0, tl)
	names(spamcount) <- vocab
	totalcount <- rep(0, tl)
	names(totalcount) <- vocab
	spam_messages <- 100
	ham_messages <- 100
	# Class priors taken from the number of simulated messages per class.
	Pspam <- spam_messages / (spam_messages + ham_messages)
	Pham <- 1 - Pspam

	# Simulate spam: each message draws 10-20 tokens with replacement; unique()
	# reduces the draw to presence/absence so a token counts once per message.
	for (i in seq_len(spam_messages)) {
	  n_draws <- sample(10:20, 1)
	  s1 <- unique(sample(vocab, n_draws, replace = TRUE, prob = p_spam_tokens))
	  spamcount[s1] <- spamcount[s1] + 1
	  totalcount[s1] <- totalcount[s1] + 1
	}

	# Simulate ham the same way, using the ham weights.
	for (i in seq_len(ham_messages)) {
	  n_draws <- sample(10:20, 1)
	  h1 <- unique(sample(vocab, n_draws, replace = TRUE, prob = p_ham_tokens))
	  hamcount[h1] <- hamcount[h1] + 1
	  totalcount[h1] <- totalcount[h1] + 1
	}

	# Calculate theta: smoothed fraction of messages of each class containing
	# the token. The +1 / +2 keeps every theta strictly inside (0, 1), so the
	# logs below are always finite.
	thetaS <- (1 + spamcount) / (2 + spam_messages)
	# from book
	#alpha <- 0.2
	#beta <- 0.201
	#thetaS <- (alpha + spamcount) / (beta + totalcount)

	thetaH <- (1 + hamcount) / (2 + ham_messages)
	# from book
	#thetaH <- (alpha + hamcount) / (beta + totalcount)

	# Bernoulli log-likelihood in linear form:
	#   log p(x|c) = sum_j [x_j log(theta_j) + (1 - x_j) log(1 - theta_j)]
	#              = sum_j x_j * w_j + w0
	# with w_j = log(theta_j / (1 - theta_j)) and w0 = sum_j log(1 - theta_j).
	# Calculate logXGivenSpam
	wS <- log(thetaS / (1 - thetaS))
	w0S <- sum(log(1 - thetaS))
	# logXGivenHam
	wH <- log(thetaH / (1 - thetaH))
	w0H <- sum(log(1 - thetaH))

	# Token-presence vectors to classify (1 = token present, 0 = absent).
	x <- list()
	# All spam words
	x[[1]] <- rep_len(c(1, 0), tl)
	# All ham words
	x[[2]] <- rep_len(c(0, 1), tl)
	# Every word
	x[[3]] <- rep(1, tl)
	# Random favoring 0
	x[[4]] <- sample(c(1, 0), tl, replace = TRUE, prob = c(0.29, 0.71))
	# Random favoring 1
	x[[5]] <- sample(c(1, 0), tl, replace = TRUE, prob = c(0.71, 0.29))
	logXGivenSpam <- numeric(length(x))
	logXGivenHam <- numeric(length(x))
	logSpamGivenX <- numeric(length(x))

	logPspam <- log(Pspam)
	logPham <- log(Pham)

	for (i in seq_along(x)) {
	  print(x[[i]])
	  logXGivenSpam[i] <- sum(x[[i]] * wS) + w0S
	  logXGivenHam[i] <- sum(x[[i]] * wH) + w0H
	  cat("logXGivenSpam: ", logXGivenSpam[i], "\n")
	  cat("logXGivenHam: ", logXGivenHam[i], "\n")
	  # log identity: log(a + b) = log(a) + log(1 + exp(log(b) - log(a)))
	  # formula being solved:
	  # log(p(spam|word)) = log(p(word|spam)p(spam)/
	  #        (p(word|spam)p(spam) + p(word|ham)p(ham)))
	  # which reduces to -log(1 + exp(d)), d = log-odds of ham over spam.
	  d <- (logXGivenHam[i] + logPham) - (logXGivenSpam[i] + logPspam)
	  # Evaluate log(1 + exp(d)) as max(d, 0) + log1p(exp(-|d|)): exp() never
	  # sees a positive argument, so a large d cannot overflow to Inf, and
	  # log1p() keeps precision when exp(-|d|) is tiny.
	  logSpamGivenX[i] <- -(max(d, 0) + log1p(exp(-abs(d))))
	  cat("**logSpamGivenX: ", logSpamGivenX[i], "\n")
	  # The label must be a plain "|": "\|" is not a valid escape in an R
	  # string literal and stops the script from parsing.
	  cat("** p(S|x): ", exp(logSpamGivenX[i]), "\n")
	}