Skip to content

Instantly share code, notes, and snippets.

@ojessen
Created April 14, 2017 21:33
Show Gist options
  • Save ojessen/a0719086fce8081547f20b024c7a4fdc to your computer and use it in GitHub Desktop.
Save ojessen/a0719086fce8081547f20b024c7a4fdc to your computer and use it in GitHub Desktop.
Scraping NBA results and plotting them
library(rvest)
library(dplyr)
library(stringr)
# Build the URLs of all result pages of the basketball-reference play-index
# query. The first page is requested without an offset; every further page is
# addressed through the trailing `offset=` parameter, in steps of 100.
base_url <- "http://www.basketball-reference.com/play-index/tscore.cgi?request=1&match=combined&"
post_fix_1 <- "is_playoffs=N&order_by=pts"
post_fix_rest <- "year_min=1955&year_max=2017&team_id=&opp_id=&quarter_is_1=&quarter_is_2=&quarter_is_3=&quarter_is_4=&quarter_is_5=&is_playoffs=N&round_id=&game_num_type=&game_num_min=&game_num_max=&game_month=&game_location=&game_result=&is_overtime=&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&order_by=pts&order_by_asc=&offset="
# Render the offsets as plain digits: pasting the double 100000 directly would
# yield "1e+05" and therefore a malformed query string for that page.
page_offsets <- format(
  seq(from = 100, to = 110000, by = 100),
  scientific = FALSE,
  trim = TRUE
)
vec_url <- c(
  paste0(base_url, post_fix_1),
  paste0(base_url, post_fix_rest, page_offsets)
)
head(vec_url)
# Read the first table of the Wikipedia list of NBA seasons and sum up its
# number-of-games column.
season_overview <- read_html("https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_seasons")
df_overview <- season_overview %>%
  html_node("table") %>%
  html_table(fill = TRUE)
names(df_overview)
# Columns are picked by position and then renamed.
# NOTE(review): positions 7 and 10 depend on the layout of the Wikipedia
# table at scrape time -- confirm them against names(df_overview).
sel_overview <- df_overview %>% select(7, 10)
names(sel_overview) <- c("year", "num_games")
# Keep the last two characters of each trimmed cell and convert them to a
# number. Negative positions in str_sub() count from the end of the string,
# so the slice is taken on the trimmed text itself instead of using offsets
# derived from the untrimmed length (which are wrong for padded cells).
num_all_regular <- sel_overview %>%
  filter(year != "Finals") %>%
  select(num_games) %>%
  mutate(num_games = as.numeric(str_sub(trimws(num_games), -2, -1)))
sum(num_all_regular)
# Download every result page, extract its first table and stack all tables
# into one data frame that is then saved to disk.
res_page <- read_html(vec_url[1])
str(res_page)
# Collect the per-page tables in a preallocated list and bind them once at the
# end; growing a data frame with rbind() inside the loop copies all rows
# gathered so far on every iteration.
page_tables <- vector("list", length(vec_url))
page_tables[[1]] <- res_page %>%
  html_node("table") %>%
  html_table()
# seq_along(...)[-1] is empty when there is only one URL, unlike 2:length(...).
for (iter in seq_along(vec_url)[-1]) {
  res_page <- read_html(vec_url[iter])
  page_tables[[iter]] <- res_page %>%
    html_node("table") %>%
    html_table()
  # Progress indicator: index of the page just fetched.
  cat(paste0(iter, "\n"))
}
# If a download fails part-way, the pages fetched so far remain available in
# `page_tables`. NULL entries are ignored by rbind().
df_results <- do.call(rbind, page_tables)
save(df_results, file = "NBA_results.RData")
# Remove rows whose Rk cell is the literal "Rk" (presumably header rows that
# are repeated inside the scraped table) and make both score columns numeric.
final_df <- filter(df_results, Rk != "Rk")
final_df <- mutate(final_df, PS = as.numeric(PS), PA = as.numeric(PA))
# Give the fourth column an explicit name.
names(final_df)[4] <- "adj"

# Exploratory checks, printed to the console.
summary(final_df)
head(final_df)
filter(final_df, Date == "Fri, Mar 2, 1962")

# Compare the frequency tables of the two score columns.
tab_PA <- table(final_df[["PA"]])
tab_PS <- table(final_df[["PS"]])
tab_PA == tab_PS

head(final_df[["Team"]])
head(final_df[["Opponent"]])

# Sample values for trying out alpha_pairing() interactively.
Opponent <- "Philadelphia Warriors"
Team <- "New York Knicks"
# Build an order-independent label for a matchup: both names are sorted and
# joined with " - ", so swapping the two arguments yields the same string.
alpha_pairing <- function(Team, Opponent) {
  ordered_names <- sort(c(Team, Opponent))
  paste(ordered_names, collapse = " - ")
}
# Label every row with its order-independent matchup string. seq_len() keeps
# this well-defined for an empty data frame, where 1:nrow() would iterate over
# c(1, 0).
final_df$alpha_pair <- vapply(
  seq_len(nrow(final_df)),
  function(i) alpha_pairing(final_df$Team[i], final_df$Opponent[i]),
  character(1)
)
# A game is identified by its date plus the sorted pair of team names.
final_df <- final_df %>% mutate(key = paste(Date, alpha_pair))
head(final_df)
length(unique(final_df$key))
# One row per game: keep the first row seen for each key and take both scores
# (PA and PS) from that row. !duplicated() selects the same rows, in the same
# order, as looking up the first match of every unique key, but in one pass.
first_rows <- final_df[!duplicated(final_df$key), ]
unique_games <- data.frame(
  key = first_rows$key,
  res1 = first_rows$PA,
  res2 = first_rows$PS,
  # Keep the key as character regardless of the R version's default.
  stringsAsFactors = FALSE
)
# Sample key for trying out extract_year() interactively.
x <- "Tue, Dec 13, 1983 Denver Nuggets - Detroit Pistons"
# Extract the year from game keys such as
# "Tue, Dec 13, 1983 Denver Nuggets - Detroit Pistons".
#
# x: character vector (or factor) of keys; the year is the fourth
#    space-separated token of each key.
# Returns a numeric vector of the same length as x; keys with fewer than four
# tokens yield NA.
extract_year <- function(x) {
  # Split all keys in one vectorised call; as.character() makes factor input
  # (e.g. a data.frame column created with stringsAsFactors = TRUE) work too.
  tokens <- strsplit(as.character(x), " ", fixed = TRUE)
  year_token <- vapply(tokens, function(parts) parts[4], character(1))
  as.numeric(year_token)
}
# Attach the year of each game, parsed from its key.
unique_games <- mutate(unique_games, year = extract_year(key))
library(ggplot2)
library(tidyr)

# Stack both scores of every game into a single `res` column: one copy of the
# table without the third column, one without the second, identical names.
tmp_1 <- unique_games[-3]
tmp_2 <- unique_games[-2]
names(tmp_2) <- c("key", "res", "year")
names(tmp_1) <- names(tmp_2)
all_res <- rbind(tmp_1, tmp_2)
# Split the years into two intervals: (1954, 1980] and (1980, 2018].
all_res <- mutate(all_res, epoche = cut(year, breaks = c(1954, 1980, 2018)))

# Per year: mean score plus a band of one standard deviation on either side,
# reshaped to long format and drawn as one line per statistic.
all_res %>%
  group_by(year) %>%
  summarise(mean_points = mean(res), sd_points = sd(res)) %>%
  mutate(
    lower_sd = mean_points - sd_points,
    upper_sd = mean_points + sd_points
  ) %>%
  select(-sd_points) %>%
  gather(key = "key", value = "value", -year) %>%
  ggplot(aes(x = year, y = value, color = key)) +
  geom_line(size = 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment