vipl0ve · March 6, 2017 19:11
diff --git a/linkedin.R b/linkedin.R
 # Get a person's name, location, summary, # of connections, and skills & endorsements from LinkedIn

 # URL of the LinkedIn page 
 user_url <- "https://www.linkedin.com/in/daattali"

 # since the information isn't available without being logged in, the web
 # scraper needs to log in. Provide your LinkedIn user/pw here (this isn't stored
 # anywhere as you can see, it's just used to log in during the scrape session)
 username <- "yourusername"
 password <- "yourpassword"

 # takes a couple seconds and might throw a warning, but ignore the warning
 # (linkedin_info <- scrape_linkedin(user_url))

 ############################

 library(rvest)

 scrape_linkedin <- function(user_url) {
  linkedin_url <- "http://linkedin.com/"
  pgsession <- html_session(linkedin_url) 
  pgform <- html_form(pgsession)[[1]]
  filled_form <- set_values(pgform,
                            session_key = username, 
                            session_password = password)
  
  submit_form(pgsession, filled_form)
  
  pgsession <- jump_to(pgsession, user_url)
  page_html <- read_html(pgsession)
  
  name <-
    page_html %>% html_nodes("#name") %>% html_text()
  
  location <-
    page_html %>% html_nodes("#location .locality") %>% html_text()
  
  num_connections <-
    page_html %>% html_nodes(".member-connections strong") %>% html_text()
  
  summary <- 
    page_html %>% html_nodes("#summary-item-view") %>% html_text()

  skills_nodes <-
    page_html %>% html_nodes("#profile-skills .skill-pill")
  
  skills <-
    lapply(skills_nodes, function(node) {
      num <- node %>% html_nodes(".num-endorsements") %>% html_text()
      name <- node %>% html_nodes(".endorse-item-name-text") %>% html_text()
      data.frame(name = name, num = num)
    })
  
  skills <- do.call(rbind, skills)
  
  
  list(
    name = name,
    location = location,
    num_connections = num_connections,
    summary = summary,
    skills = skills
  ) 
 }
diff --git a/twitter.R b/twitter.R
 # Make a wordcloud of the most common words in a person's tweets

 # Need to create a Twitter App and get credentials
 setup_twitter_oauth(USE_YOUR_CREDENTIALS_HERE)

 # Username of the Twitter user
 name <- "daattali"

 #####################

 library(twitteR)
 library(SnowballC)
 library(wordcloud)
 library(tm)
 library(stringr)
 library(dplyr)

 user <- userTimeline(user = name, n = 3200, includeRts = FALSE, excludeReplies = TRUE)
 tweets <- sapply(user, function(x) { strsplit(gsub("[^[:alnum:] ]", "", x$text), " +")[[1]] })

 topwords <- 
  tweets %>%
  paste(collapse = " ") %>%
  str_split("\\s") %>%
  unlist %>%
  tolower %>%
  removePunctuation %>%
  removeWords(stopwords("english")) %>%
  #wordStem %>%
  .[. != ""] %>% 
  table %>%
  sort(decreasing = TRUE) %>%
  head(100)

 wordcloud(names(topwordscloud), topwords, min.freq = 3)
	# Get a person's name, location, summary, # of connections, and skills & endorsements from LinkedIn

	# URL of the LinkedIn page
	user_url <- "https://www.linkedin.com/in/daattali"

	# since the information isn't available without being logged in, the web
	# scraper needs to log in. Provide your LinkedIn user/pw here (this isn't stored
	# anywhere as you can see, it's just used to log in during the scrape session)
	username <- "yourusername"
	password <- "yourpassword"

	# takes a couple seconds and might throw a warning, but ignore the warning
	# (linkedin_info <- scrape_linkedin(user_url))

	############################

	library(rvest)

	scrape_linkedin <- function(user_url) {
	linkedin_url <- "http://linkedin.com/"
	pgsession <- html_session(linkedin_url)
	pgform <- html_form(pgsession)[[1]]
	filled_form <- set_values(pgform,
	session_key = username,
	session_password = password)

	submit_form(pgsession, filled_form)

	pgsession <- jump_to(pgsession, user_url)
	page_html <- read_html(pgsession)

	name <-
	page_html %>% html_nodes("#name") %>% html_text()

	location <-
	page_html %>% html_nodes("#location .locality") %>% html_text()

	num_connections <-
	page_html %>% html_nodes(".member-connections strong") %>% html_text()

	summary <-
	page_html %>% html_nodes("#summary-item-view") %>% html_text()

	skills_nodes <-
	page_html %>% html_nodes("#profile-skills .skill-pill")

	skills <-
	lapply(skills_nodes, function(node) {
	num <- node %>% html_nodes(".num-endorsements") %>% html_text()
	name <- node %>% html_nodes(".endorse-item-name-text") %>% html_text()
	data.frame(name = name, num = num)
	})

	skills <- do.call(rbind, skills)


	list(
	name = name,
	location = location,
	num_connections = num_connections,
	summary = summary,
	skills = skills
	)
	}
	# Make a wordcloud of the most common words in a person's tweets

	# Need to create a Twitter App and get credentials
	setup_twitter_oauth(USE_YOUR_CREDENTIALS_HERE)

	# Username of the Twitter user
	name <- "daattali"

	#####################

	library(twitteR)
	library(SnowballC)
	library(wordcloud)
	library(tm)
	library(stringr)
	library(dplyr)

	user <- userTimeline(user = name, n = 3200, includeRts = FALSE, excludeReplies = TRUE)
	tweets <- sapply(user, function(x) { strsplit(gsub("[^[:alnum:] ]", "", x$text), " +")[[1]] })

	topwords <-
	tweets %>%
	paste(collapse = " ") %>%
	str_split("\\s") %>%
	unlist %>%
	tolower %>%
	removePunctuation %>%
	removeWords(stopwords("english")) %>%
	#wordStem %>%
	.[. != ""] %>%
	table %>%
	sort(decreasing = TRUE) %>%
	head(100)

	wordcloud(names(topwordscloud), topwords, min.freq = 3)