# The get_sentiws function will download the zip-file with the SentiWS
# dictionary, unzip it into a subdirectory of the session's temporary
# directory and return the parsed dictionary as a data.table.
#
# Arguments: none.
#
# Returns a data.table with one row per word form and these columns:
#   id     - running number of the dictionary entry within its source file
#   word   - the word form (the lemma itself or one of its listed forms)
#   base   - TRUE for the lemma row, FALSE for the additional forms
#   lemma  - the lemma the word form belongs to
#   pos    - the upper-case tag that follows the "|" separator
#   weight - the numeric weight that follows the tag
# Rows from the positive file come first, followed by the negative file.
#
# Raises an error if the download reports a non-zero status.

library(data.table)

get_sentiws <- function() {
  sentiws_tmp_dir <- file.path(tempdir(), "sentiws")
  if (!dir.exists(sentiws_tmp_dir)) {
    dir.create(sentiws_tmp_dir)
  }

  sentiws_zipfile <- file.path(sentiws_tmp_dir, "SentiWS_v2.0c.zip")
  sentiws_url <- "http://pcai056.informatik.uni-leipzig.de/downloads/etc/SentiWS/SentiWS_v2.0.zip"

  # A zip archive is binary data, so request a binary transfer explicitly and
  # fail loudly instead of trying to unzip an incomplete file.
  status <- download.file(
    url = sentiws_url,
    destfile = sentiws_zipfile,
    mode = "wb"
  )
  if (!identical(as.integer(status), 0L)) {
    stop(
      "Download of SentiWS failed with status ", status, ": ", sentiws_url,
      call. = FALSE
    )
  }
  unzip(zipfile = sentiws_zipfile, exdir = sentiws_tmp_dir)

  # Expand one dictionary entry (a one-row subset with the columns "word" and
  # "data") into one row per word form. The "data" field is expected to hold
  # "<TAG> <weight> <form1>,<form2>,..." separated by whitespace.
  .unfold <- function(.SD) {
    lemma <- .SD[["word"]][1]
    entry <- .SD[["data"]][1]

    pos <- gsub("^([A-Z]+)\\s+.*$", "\\1", entry)
    weight <- as.numeric(
      gsub("^[A-Z]+\\s+(-?\\d\\.\\d+).*$", "\\1", entry)
    )
    forms <- gsub("^[A-Z]+\\s+-?\\d\\.\\d+\\s*(.*?)\\s*$", "\\1", entry)
    # Entries without additional forms contribute the lemma row only.
    forms <- if (grepl("^\\s*$", forms)) {
      character(0)
    } else {
      strsplit(x = forms, split = ",")[[1]]
    }

    list(
      word = c(lemma, forms),
      base = c(TRUE, rep(FALSE, times = length(forms))),
      lemma = lemma,
      pos = pos,
      weight = weight
    )
  }

  dts <- lapply(
    c(
      positive = "SentiWS_v2.0_Positive.txt",
      negative = "SentiWS_v2.0_Negative.txt"
    ),
    function(filename) {
      # header = FALSE: both fields are plain text, so fread's automatic
      # header detection would otherwise consume the first dictionary entry
      # as column names and drop it from the result.
      dt <- fread(
        file.path(sentiws_tmp_dir, filename),
        sep = "|",
        encoding = "UTF-8",
        header = FALSE,
        col.names = c("word", "data")
      )
      # seq_len() stays correct for an empty table, unlike 1L:nrow(dt).
      dt[, "id" := seq_len(nrow(dt))]
      dt[, .unfold(.SD), by = c("id")]
    }
  )

  rbindlist(dts)
}