Created
October 11, 2020 18:53
-
-
Save tallguyjenks/7fb36226217cdeadd47512e8d754f2ab to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
title: "Obsidian Sentiment Analysis"
author: "Bryan Jenks"
date: "2020-10-10"
output: html_document
---
```{r}
# Attach dependencies with library() rather than require(): library() stops
# the knit immediately when a package is missing, whereas require() only
# returns FALSE and lets the failure surface later in an unrelated chunk.
library(here)
library(jsonlite)
library(tidyverse)
library(tidytext)
library(tokenizers)
library(wordcloud)
```
```{bash}
# Download the forum threads to analyse as JSON, one file per thread, into ./Data.
# rm -r ./Data # volatile, but good for a hard reset; uncomment to use
mkdir -p ./Data
curl https://forum.obsidian.md/t/obsidian-zettelkasten/1999/247.json > Data/post1.json
curl https://forum.obsidian.md/t/block-reference/674/183.json > Data/post2.json
curl https://forum.obsidian.md/t/meta-post-common-css-hacks/1978/171.json > Data/post3.json
curl https://forum.obsidian.md/t/how-do-i-work-with-obsidian-on-mobile/471.json > Data/post4.json
curl https://forum.obsidian.md/t/a-wysiwym-typora-like-editing-mode/1953.json > Data/post5.json
curl https://forum.obsidian.md/t/theme-obsidian-ia-another-ia-writer-inspired-theme/2562/110.json > Data/post6.json
curl https://forum.obsidian.md/t/to-filter-the-graph-view/88.json > Data/post7.json
curl https://forum.obsidian.md/t/folding-in-preview/1328/101.json > Data/post8.json
curl https://forum.obsidian.md/t/lyt-kit-live-on-obsidian-publish-downloadable-on-oct-13th/390/98.json > Data/post9.json
curl https://forum.obsidian.md/t/obsidian-to-anki-v2-9-a-feature-rich-script-that-allows-you-to-add-notes-from-obsidian-to-anki/5030/72.json > Data/post10.json
curl https://forum.obsidian.md/t/on-the-process-of-making-mocs/1060.json > Data/post11.json
curl https://forum.obsidian.md/t/andy-matuschak-mode-v2-7-updated-for-0-7-new-panes/170/75.json > Data/post12.json
curl https://forum.obsidian.md/t/zotero-best-practices/164/69.json > Data/post13.json
curl https://forum.obsidian.md/t/theme-obsdn-dark-rmx-now-with-light-dark-updated-2020-09-11/2225/46.json > Data/post14.json
curl https://forum.obsidian.md/t/zotero-integrations/91/56.json > Data/post15.json
curl https://forum.obsidian.md/t/research-phd-academics/1446/57.json > Data/post16.json
curl https://forum.obsidian.md/t/url-scheme/653.json > Data/post17.json
curl https://forum.obsidian.md/t/obsidian-for-web/2049.json > Data/post18.json
curl https://forum.obsidian.md/t/some-thoughts-on-using-roam-as-an-obsidian-person-questions-for-roam-refugees/4868.json > Data/post19.json
curl https://forum.obsidian.md/t/web-clipper-highlighter-and-kindle-highlights-notes-extraction-extension/852.json > Data/post20.json
curl https://forum.obsidian.md/t/how-is-obsidian-more-than-a-wiki/2914.json > Data/post21.json
#curl .json > Data/post22.json
#curl .json > Data/post23.json
# Start every run from an empty Data/output.txt so text left over from an
# earlier run cannot accumulate. (The previous `echo "" > output.txt` reset a
# stray ./output.txt instead of the file inside Data/.)
: > Data/output.txt
```
```{r}
# Paths of every downloaded thread: all *.json files directly under Data/,
# resolved relative to the current working directory.
fileNames <- Sys.glob(file.path("Data", "*.json"))
# Extract the text of every post stored in one downloaded thread file.
#
# file: path to a JSON file; post bodies are read from the
#       `post_stream$posts$cooked` field of the parsed document.
#
# Returns a character vector with one element per post, with markup tags
# removed and each post collapsed onto a single line. Returns character(0)
# when the document has no `cooked` field.
cleanFun <- function(file) {
  JSONdata <- fromJSON(file)
  htmlString <- JSONdata$post_stream$posts$cooked

  # Drop anything that looks like a tag; `.*?` is non-greedy so each tag is
  # removed on its own instead of everything between the first "<" and last ">".
  noTags <- gsub("<.*?>", "", htmlString)

  # Line breaks separate paragraphs/blocks. Replace them with a space rather
  # than deleting them, otherwise the last word of one paragraph is glued to
  # the first word of the next and tokenised as a single bogus word.
  oneLine <- gsub("[\r\n]+", " ", noTags)

  # Tidy up the runs of spaces and the leading/trailing blanks this leaves.
  trimws(gsub(" {2,}", " ", oneLine))
}
# Clean every downloaded thread and store the combined text, one post per
# line, in Data/output.txt, then read it back as a single string.
#
# The file is written in one call that overwrites any previous content.
# Appending per input file meant that re-knitting the document kept adding
# the same posts again, inflating every word count downstream.
outputPath <- here("Data", "output.txt")

# as.character() keeps this a character vector even when no input files were
# found (unlist() of an empty list is NULL, which writeLines() rejects).
postText <- as.character(
  unlist(lapply(fileNames, cleanFun), use.names = FALSE)
)

writeLines(postText, outputPath)
cleanedJSON <- read_file(outputPath)
```
```{r parseWordTokens}
# Split the combined post text into word tokens (punctuation stripped) and
# hold them in a one-column tibble; the column is named `value`.
parsed_words <- as_tibble(
  unlist(
    tokenize_words(cleanedJSON, strip_punct = TRUE)
  )
)
```
```{r removeStopWords}
# Words to discard: the `stop_words` table, matched on its `word` column.
removedWords <- stop_words

# Keep only the tokens that do not appear in the stop-word table.
cleaned_words <- anti_join(
  parsed_words,
  removedWords,
  by = c("value" = "word")
)
```
# Text Analysis
## Frequency Distribution
Frequency count of each word's occurrence
```{r wordFrequency}
# One row per distinct word with its number of occurrences `n`,
# most frequent first.
freq <- count(cleaned_words, value, sort = TRUE)
```
## Sentiment Analysis
```{r getSentimentLexicon}
# Sentiment lexicon used for the analysis.
# "nrc" is chosen because it produces nicer visuals;
# "bing" is the better choice for binary sentiment.
sentiments <- get_sentiments(lexicon = "nrc")
```
```{r sentimentAnalysis}
# Attach lexicon sentiments to the per-word counts. A word gets one row per
# sentiment it is tagged with; words absent from the lexicon come out of the
# left join with a missing `sentiment` and are dropped.
word_count_sentiments <- cleaned_words %>%
  count(value, sort = TRUE) %>%
  left_join(sentiments, by = c("value" = "word")) %>%
  drop_na(sentiment)
```
# Visualizations
```{r visualizeSentiments}
# Horizontal bar chart of how often each sentiment occurs in the text.
#
# Each row of `word_count_sentiments` is one (word, sentiment) pair with `n`
# occurrences of that word, so the bars are weighted by `n`; an unweighted
# geom_bar() would only count distinct lexicon words and ignore how often
# they were used. Bars are ordered by the same total (sum of `n`) that they
# display. The former `size = n` aesthetic is removed: it has no meaning for
# bars and only produced a stray legend.
word_count_sentiments %>%
  ggplot() +
  aes(x = reorder(sentiment, n, FUN = sum), weight = n, fill = sentiment) +
  geom_bar() +
  scale_fill_viridis_d(option = "magma") +
  labs(x = "Sentiments",
       y = "frequency",
       title = "Frequency of Sentiment",
       subtitle = "using NRC sentiment lexicon") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "top")
```
```{r visualizeTopWords}
# Horizontal bar chart of the words that occur more than 50 times,
# ordered by count.
freq %>%
  filter(n > 50) %>%
  mutate(value = reorder(value, n)) %>%
  ggplot(aes(value, n)) +
  geom_col() +
  # The word axis needs no title (the words label themselves). The earlier
  # `x = "Count"` was both mislabelled and immediately overridden by
  # xlab(NULL), so the label is simply switched off here.
  labs(x = NULL,
       y = "frequency",
       title = "Frequency of Word Occurrence",
       subtitle = "Subset of words occurring > 50 times") +
  coord_flip()
```
```{r visualizeWordCloud}
# Word cloud of the words that occur more than 25 times.
cloudWords <- filter(freq, n > 25)
wordcloud::wordcloud(words = cloudWords$value, freq = cloudWords$n)
```
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment