BrianWeinstein · October 19, 2016 16:43 · BrianWeinstein · Oct 19, 2016
diff --git a/RemoveSparseTermsLarge.R b/RemoveSparseTermsLarge.R
 # tm::removeSparseTerms attempts to remove sparse terms via slicing a sparse matrix.
 # The slicing operation tries to convert the sparse matrix to a dense matrix, but this
 # fails if the dense matrix has more than ((2^31) - 1) entries [i.e., if (nrow * ncol) > ((2^31) - 1)]
 # 
 # The error message is 
 # In nr * nc : NAs produced by integer overflow
 # 
 # Instead of using tm::removeSparseTerms, the following function subsets the sparse matrix directly
 # and avoids converting the sparse matrix to a dense one.

 library(tm)
 library(slam)

 # Remove sparse terms from a document-term or term-document matrix by
 # subsetting its triplet representation directly (no dense conversion).
 #
 # Args:
 #   x:      A DocumentTermMatrix or TermDocumentMatrix.
 #   sparse: A single number strictly between 0 and 1. A term is kept when it
 #           has stored entries in more than nDocs * (1 - sparse) documents.
 #
 # Returns:
 #   A matrix of the same class and orientation as x that contains only the
 #   retained terms. The "weighting" attribute of x is carried over.
 RemoveSparseTermsLarge <- function(x, sparse){
   
   stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
             is.numeric(sparse), length(sparse) == 1L,
             sparse > 0, sparse < 1)
   
   isTdm <- inherits(x, "TermDocumentMatrix")
   
   # remember the weighting label of the input; the rebuild below would
   # otherwise always relabel the result as plain term frequency
   inputWeighting <- attr(x, "weighting")
   
   # work on a documents-by-terms layout so that $j always indexes terms
   mm <- if(isTdm) t(x) else x
   
   # count the stored entries per term column and keep the terms whose count
   # exceeds the threshold; which() returns the kept columns in ascending order
   termCounts <- tabulate(mm$j, nbins = mm$ncol)
   termIndex <- which(termCounts > mm$nrow * (1 - sparse))
   keepEntry <- mm$j %in% termIndex
   
   # subset the simple triplet matrix; match() renumbers the surviving
   # columns to 1..length(termIndex) in the same order as termIndex
   dtm.ns <- simple_triplet_matrix(i = mm$i[keepEntry],
                                   j = match(mm$j[keepEntry], termIndex),
                                   v = mm$v[keepEntry],
                                   nrow = mm$nrow,
                                   ncol = length(termIndex),
                                   dimnames = list(mm$dimnames$Docs,
                                                   mm$dimnames$Terms[termIndex]))
   
   # convert back to a DTM/TDM
   result <- if(isTdm){
     as.TermDocumentMatrix(t(dtm.ns), weighting = weightTf)
   } else {
     as.DocumentTermMatrix(dtm.ns, weighting = weightTf)
   }
   
   # the cell values were never re-weighted, so restore the original label
   if(!is.null(inputWeighting)){
     attr(result, "weighting") <- inputWeighting
   }
   
   result
 }
	# tm::removeSparseTerms attempts to remove sparse terms via slicing a sparse matrix.
	# The slicing operation tries to convert the sparse matrix to a dense matrix, but this
	# fails if the dense matrix has more than ((2^31) - 1) entries [i.e., if (nrow * ncol) > ((2^31) - 1)]
	#
	# The error message is
	# In nr * nc : NAs produced by integer overflow
	#
	# Instead of using tm::removeSparseTerms, the following function subsets the sparse matrix directly
	# and avoids converting the sparse matrix to a dense one.

	library(tm)
	library(slam)

	# Remove sparse terms from a document-term or term-document matrix by
	# subsetting its triplet representation directly (no dense conversion).
	#
	# Args:
	#   x:      A DocumentTermMatrix or TermDocumentMatrix.
	#   sparse: A single number strictly between 0 and 1. A term is kept when it
	#           has stored entries in more than nDocs * (1 - sparse) documents.
	#
	# Returns:
	#   A matrix of the same class and orientation as x that contains only the
	#   retained terms. The "weighting" attribute of x is carried over.
	RemoveSparseTermsLarge <- function(x, sparse){

	  stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
	            is.numeric(sparse), length(sparse) == 1L,
	            sparse > 0, sparse < 1)

	  isTdm <- inherits(x, "TermDocumentMatrix")

	  # remember the weighting label of the input; the rebuild below would
	  # otherwise always relabel the result as plain term frequency
	  inputWeighting <- attr(x, "weighting")

	  # work on a documents-by-terms layout so that $j always indexes terms
	  mm <- if(isTdm) t(x) else x

	  # count the stored entries per term column and keep the terms whose count
	  # exceeds the threshold; which() returns the kept columns in ascending order
	  termCounts <- tabulate(mm$j, nbins = mm$ncol)
	  termIndex <- which(termCounts > mm$nrow * (1 - sparse))
	  keepEntry <- mm$j %in% termIndex

	  # subset the simple triplet matrix; match() renumbers the surviving
	  # columns to 1..length(termIndex) in the same order as termIndex
	  dtm.ns <- simple_triplet_matrix(i = mm$i[keepEntry],
	                                  j = match(mm$j[keepEntry], termIndex),
	                                  v = mm$v[keepEntry],
	                                  nrow = mm$nrow,
	                                  ncol = length(termIndex),
	                                  dimnames = list(mm$dimnames$Docs,
	                                                  mm$dimnames$Terms[termIndex]))

	  # convert back to a DTM/TDM
	  result <- if(isTdm){
	    as.TermDocumentMatrix(t(dtm.ns), weighting = weightTf)
	  } else {
	    as.DocumentTermMatrix(dtm.ns, weighting = weightTf)
	  }

	  # the cell values were never re-weighted, so restore the original label
	  if(!is.null(inputWeighting)){
	    attr(result, "weighting") <- inputWeighting
	  }

	  result
	}