Last active
April 25, 2017 01:10
-
-
Save benmarwick/c6f3226a21e4f44c81a643b81e3ca426 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the WikiLeaks file listing and collect the link text of every
# file/directory entry on the page.
library(rvest)
wl <- read_html("https://file.wikileaks.org/file/")

# Clean the names a little: underscores, hyphens and literal dots become
# spaces, then everything is lower-cased so later matching against
# lower-cased country names is case-insensitive.
wllt <-
  wl %>%
  # text of every <a> element
  html_nodes("a") %>%
  html_text() %>%
  # A bracket expression matches the three separators literally. The earlier
  # pattern "_|-|//." matched "//" followed by any character, so dots were
  # never replaced.
  gsub("[_.-]", " ", .) %>%
  tolower()
# Build the list of country names to look for.
library(countrycode)
# NOTE(review): `countrycode_data` is the data set this script was written
# against; confirm it is still exported by the installed countrycode version.
countries <- tolower(countrycode_data$country.name.en)
# add US and UK in the way that it's used in wikileaks file/folder names
countries <- c(countries, "us", "uk")
# NA or duplicated names cannot be used as row names below, so drop them here
countries <- unique(countries[!is.na(countries)])

# Turn a string into space-delimited tokens with a leading and trailing
# space, so that a fixed-string search for " us " only hits the whole word
# "us" and never the inside of "russia" or "status".
as_padded_tokens <- function(x) {
  paste0(" ", trimws(gsub("[^[:alnum:]]+", " ", x)), " ")
}

# Compute the presence of each country in each file/directory name.
# `output` is a logical matrix where each row is a country and each column is
# a wikileaks file/folder name.
name_tokens <- as_padded_tokens(wllt)
country_tokens <- as_padded_tokens(countries)
output <- matrix(FALSE,
                 nrow = length(countries),
                 ncol = length(wllt))
for (i in seq_along(countries)) {
  # fixed = TRUE: the country name is matched literally, so characters such
  # as parentheses in a name are not interpreted as regex syntax
  output[i, ] <- grepl(country_tokens[i], name_tokens, fixed = TRUE)
}

# make the output a bit easier to read
output_df <- as.data.frame(output)
names(output_df) <- wllt
row.names(output_df) <- countries
# Tally up the number of file/folder names that mention each country.
library(dplyr)
# rowSums() counts the TRUE cells per row; the result is named by country
country_totals <- rowSums(output_df)
country_freqs <-
  # build the table explicitly instead of relying on the deprecated
  # as_data_frame() and its implicit `value` column
  tibble(Freq = unname(country_totals),
         country = names(country_totals)) %>%
  arrange(desc(Freq)) %>%
  # only countries mentioned at least twice
  filter(Freq >= 2)
# Horizontal bar chart of how many file/folder names mention each country,
# ordered by frequency.
library(ggplot2)
# shared font size, reused by the later plots in this script
base_size <- 12
wl_freq_plot <-
  ggplot(country_freqs,
         aes(reorder(country, Freq),
             Freq)) +
  geom_col() +
  # countries on the vertical axis so the labels stay readable
  coord_flip() +
  theme_bw(base_size = base_size) +
  xlab("Countries mentioned\nat least twice") +
  ylab("Number of Wikileaks file/folder-names \ncontaining country name") +
  ggtitle("The US, UK, and Iraq are most frequently\nmentioned in Wikileaks")
#------------------------------------------------------------------------------
# Get World Justice Project (WJP) Open Government Index data
library(readxl)
library(httr)
the_url <- "http://www.worldjusticeproject.org/sites/default/files/wjp-open-gov-2015_data.xlsx"
the_excel_file <- tempfile(fileext = ".xlsx")
wjp_response <- GET(the_url, write_disk(the_excel_file))
# fail here with the HTTP status rather than later, when read_excel() is
# handed an error page instead of a workbook
stop_for_status(wjp_response)
og <- read_excel(the_excel_file)
names(og) <- make.names(names(og), unique = TRUE)
og_clean <-
  og %>%
  # NOTE(review): `NA.` is presumably what make.names() produced for a column
  # without a header -- confirm against the readxl version in use
  rename(countries = NA.,
         openness = Scores) %>%
  select(countries, openness) %>%
  mutate(countries = tolower(countries),
         openness = as.numeric(openness)) %>%
  # use the same short names that were added to the country list for the
  # wikileaks file/folder names
  mutate(countries = case_when(
    countries == "united states" ~ "us",
    countries == "united kingdom" ~ "uk",
    TRUE ~ countries
  ))
# Attach the WJP openness score to every counted country; rows with any
# missing value (e.g. no openness score) are dropped.
wl_and_wjp <-
  na.omit(
    left_join(country_freqs,
              og_clean,
              by = c('country' = 'countries'))
  )

# Scatter plot of Wikileaks frequency against the Open Government Index,
# with countries scoring below 0.5 drawn again in red.
library(ggrepel)
library(ggalt)
wl_and_wjp_all_plot <-
  ggplot(data = wl_and_wjp,
         mapping = aes(x = Freq, y = openness)) +
  geom_point(size = 3) +
  geom_point(data = filter(wl_and_wjp, openness < 0.5),
             colour = "red",
             size = 3) +
  geom_text_repel(mapping = aes(label = country)) +
  theme_bw(base_size = base_size) +
  xlab("Wikileaks items") +
  ylab("World Justice Project\nOpen Government Index")
# US is right out there! And UK is out there a bit too.
# Let's remove them and see what we have...
# The filtered table is stored once so that every layer below (including the
# red highlight layers) draws from the same rows; subsetting the unfiltered
# `wl_and_wjp` for the highlight could redraw the excluded countries.
wl_and_wjp_others <-
  wl_and_wjp %>%
  filter(!country %in% c("uk", "us"))

ggplot(wl_and_wjp_others,
       aes(Freq,
           openness)) +
  geom_point(size = 3) +
  geom_point(size = 3,
             data = subset(wl_and_wjp_others,
                           openness < 0.5),
             colour = "red") +
  geom_text_repel(aes(label = country)) +
  theme_bw(base_size = base_size) +
  labs(x = "Wikileaks items",
       y = "World Justice Project\nOpen Government Index")

# is there a linear relationship between the OGI and the WL count?
library(ggpmisc)
formula <- y ~ x
wl_and_wjp_regression <-
  ggplot(wl_and_wjp_others,
         aes(Freq,
             openness)) +
  # confidence band of the linear fit
  geom_ribbon(stat = 'smooth',
              method = "lm",
              formula = formula,
              se = TRUE,
              alpha = 0.05,
              aes(color = NULL)) +
  # fitted line
  geom_line(stat = 'smooth',
            method = "lm",
            alpha = 0.3,
            size = 1) +
  # fitted equation and adjusted R-squared as a plot annotation
  stat_poly_eq(aes(label = paste(..eq.label..,
                                 ..adj.rr.label..,
                                 sep = "~~~~")),
               formula = formula,
               rr.digits = 3,
               coef.digits = 2,
               parse = TRUE, hjust = -0.25) +
  geom_point(size = 3) +
  geom_point(size = 3,
             data = subset(wl_and_wjp_others,
                           openness < 0.5),
             colour = "red") +
  geom_text_repel(aes(label = country)) +
  theme_bw(base_size = base_size) +
  labs(x = "Wikileaks items (excluding US & UK)",
       y = "World Justice Project\nOpen Government Index") +
  ggtitle("A very weak relationship between the number of\nWikileaks items and the Open Government Index")
# not really... what about a difference between countries with OGI < 0.5 and
# countries with OGI > 0.5?
library(broom)
# one row per OGI group: how many countries fall in it, and how many
# Wikileaks items mention them in total
ogi_hi_lo <-
  summarise(
    group_by(
      mutate(wl_and_wjp, ogi = if_else(openness > 0.5, "high", "low")),
      ogi
    ),
    count_countries = n(),
    total_frequency = sum(Freq)
  )

# chi-square test on the two numeric columns, tidied into a one-row table
ogi_hi_lo_test <- tidy(chisq.test(select(ogi_hi_lo, -ogi)))

# text summary of the test, used in a plot title
chi_sq_output <- paste0("chi-square = ",
                        round(ogi_hi_lo_test$statistic, 3), ", ",
                        "p-value = ", round(ogi_hi_lo_test$p.value, 3))
# Mosaic plot of the high/low OGI groups, weighted by the total number of
# Wikileaks items, with the chi-square result in the title.
library(ggmosaic)
library(viridis)
wl_ogi_rank_plot <-
  ggplot(data = ogi_hi_lo) +
  geom_mosaic(mapping = aes(weight = total_frequency,
                            x = product(count_countries),
                            fill = ogi)) +
  scale_fill_viridis(name = "Open\nGovernment\nIndex rank",
                     discrete = TRUE) +
  theme_bw(base_size = base_size) +
  labs(x = "Number of countries in Wikileaks files/folder-names",
       title = paste0("Significantly more countries with high\nOpen Governance Index values in Wikileaks \n(", chi_sq_output, ")"))

# there is a significant difference, WL has significantly MORE documents
# for countries with HIGH OGI values. Not what we'd expect if WL is
# focused on opening government in the most needful cases.

# here are the main visualisations for the WL and OGI data
wl_freq_plot
wl_and_wjp_regression
wl_ogi_rank_plot
# Get Transparency International's (TI) Corruption Perceptions Index (CPI) data
library(readxl)
library(httr)
the_url <- "http://files.transparency.org/content/download/2060/13252/file/CPI2016_FullDataSetWithRegionalTables.xlsx"
the_excel_file <- tempfile(fileext = ".xlsx")
ti_response <- GET(the_url, write_disk(the_excel_file))
# fail here with the HTTP status rather than later, when read_excel() is
# handed an error page instead of a workbook
stop_for_status(ti_response)
ti <- read_excel(the_excel_file)
names(ti) <- make.names(names(ti), unique = TRUE)
ti_clean <-
  ti %>%
  select(Country, CPI2016) %>%
  mutate(Country = tolower(Country)) %>%
  # Recode to the short names used for the wikileaks counts.
  # NOTE(review): the spelling used in the spreadsheet is not visible here, so
  # the common variants of the US name are all accepted -- confirm against
  # the data.
  mutate(Country = case_when(
    Country %in% c("the united states of america",
                   "united states of america",
                   "united states") ~ "us",
    Country == "united kingdom" ~ "uk",
    TRUE ~ Country
  ))
# Attach the CPI score to every counted country; rows with any missing value
# (e.g. no CPI score) are dropped.
ti_and_wjp <-
  na.omit(
    left_join(country_freqs,
              ti_clean,
              by = c('country' = 'Country'))
  )

# Scatter plot of Wikileaks frequency against the CPI, with countries scoring
# below 50 drawn again in red.
library(ggrepel)
library(ggalt)
ti_and_wjp_all_plot <-
  ggplot(data = ti_and_wjp,
         mapping = aes(x = Freq, y = CPI2016)) +
  geom_point(size = 3) +
  geom_point(data = filter(ti_and_wjp, CPI2016 < 50),
             colour = "red",
             size = 3) +
  geom_text_repel(mapping = aes(label = country)) +
  theme_bw(base_size = base_size) +
  xlab("Wikileaks items") +
  ylab("Transparency International\nCorruption Perceptions Index")
# is there a linear relationship between the CPI and the WL count?
library(ggpmisc)
formula <- y ~ x
# the three most frequently mentioned countries are left out of the fit
ti_and_wjp_others <- filter(ti_and_wjp,
                            !country %in% c("uk", "us", "iraq"))
ti_and_wjp_regression <-
  ggplot(data = ti_and_wjp_others,
         mapping = aes(x = Freq, y = CPI2016)) +
  # confidence band of the linear fit
  geom_ribbon(mapping = aes(color = NULL),
              stat = 'smooth',
              method = "lm",
              formula = formula,
              se = TRUE,
              alpha = 0.05) +
  # fitted line
  geom_line(stat = 'smooth',
            method = "lm",
            size = 1,
            alpha = 0.3) +
  # fitted equation and adjusted R-squared as a plot annotation
  stat_poly_eq(mapping = aes(label = paste(..eq.label..,
                                           ..adj.rr.label..,
                                           sep = "~~~~")),
               formula = formula,
               rr.digits = 3,
               coef.digits = 2,
               parse = TRUE,
               hjust = -0.25) +
  geom_point(size = 3) +
  # countries scoring below 50 drawn again in red
  geom_point(data = filter(ti_and_wjp_others, CPI2016 < 50),
             colour = "red",
             size = 3) +
  geom_text_repel(mapping = aes(label = country)) +
  theme_bw(base_size = base_size) +
  xlab("Wikileaks items (excluding US, UK & Iraq)") +
  ylab("Transparency International\nCorruption Perceptions Index") +
  labs(title = "A very weak relationship between the number of\nWikileaks items and the Corruption Perceptions Index")
# not really... what about a difference between countries with CPI < 50 and
# countries with CPI > 50?
library(broom)
# one row per CPI group: how many countries fall in it, and how many
# Wikileaks items mention them in total
cpi_hi_lo <-
  summarise(
    group_by(
      mutate(ti_and_wjp, cpi = if_else(CPI2016 > 50, "high", "low")),
      cpi
    ),
    count_countries = n(),
    total_frequency = sum(Freq)
  )

# chi-square test on the two numeric columns, tidied into a one-row table
cpi_hi_lo_test <- tidy(chisq.test(select(cpi_hi_lo, -cpi)))

# text summary of the test, used in a plot title
chi_sq_output <- paste0("chi-square = ",
                        round(cpi_hi_lo_test$statistic, 3), ", ",
                        "p-value = ", round(cpi_hi_lo_test$p.value, 10))
# Mosaic plot of the high/low CPI groups, weighted by the total number of
# Wikileaks items, with the chi-square result in the title.
library(ggmosaic)
library(viridis)
wl_cpi_rank_plot <-
  ggplot(data = cpi_hi_lo) +
  geom_mosaic(mapping = aes(weight = total_frequency,
                            x = product(count_countries),
                            fill = cpi)) +
  scale_fill_viridis(name = "Corruption\nPerceptions\nIndex rank",
                     discrete = TRUE) +
  theme_bw(base_size = base_size) +
  labs(x = "Number of countries in Wikileaks files/folder-names",
       title = paste0("Significantly more countries with\nlow corruption values in Wikileaks \n(", chi_sq_output, ")"))
# ---------------------------------------------------------------------------
# Get the fractionalization data set and keep the `Ethnic` column per country
library(readxl)
library(httr)
the_url <- "http://www.anderson.ucla.edu/faculty_pages/romain.wacziarg/downloads/fractionalization.xls"
the_excel_file <- tempfile(fileext = ".xls")
ef_response <- GET(the_url, write_disk(the_excel_file))
# fail here with the HTTP status rather than later, when read_excel() is
# handed an error page instead of a workbook
stop_for_status(ef_response)
# the first row of the sheet is skipped before the header is read
ef <- read_excel(the_excel_file, skip = 1)
ef_clean <-
  ef %>%
  select(Country, Ethnic) %>%
  mutate(Country = tolower(Country),
         Ethnic = as.numeric(Ethnic)) %>%
  # Recode to the short names used by the other tables so the joins line up.
  # NOTE(review): the spelling used in the spreadsheet is not visible here, so
  # the common variants of the US name are all accepted -- confirm against
  # the data.
  mutate(Country = case_when(
    Country %in% c("the united states of america",
                   "united states of america",
                   "united states") ~ "us",
    Country == "united kingdom" ~ "uk",
    TRUE ~ Country
  )) %>%
  # drop rows where either column is missing
  na.omit()
# see how this correlates with other things...
# first: the `Ethnic` score against the WJP openness score
ef_and_osi <-
  ef_clean %>%
  left_join(og_clean,
            by = c("Country" = "countries")) %>%
  na.omit()

# countries drawn as text labels, with a linear fit and its equation
ggplot(data = ef_and_osi,
       mapping = aes(x = Ethnic, y = openness)) +
  geom_text(mapping = aes(label = Country)) +
  geom_smooth(method = "lm") +
  stat_poly_eq(mapping = aes(label = paste(..eq.label..,
                                           ..adj.rr.label..,
                                           sep = "~~~~")),
               formula = formula,
               rr.digits = 3,
               coef.digits = 2,
               parse = TRUE,
               hjust = -0.25) +
  theme_bw()
# The `Ethnic` score against the TI Corruption Perceptions Index.
ef_and_ti <-
  left_join(ef_clean,
            ti_clean,
            # name the key explicitly instead of relying on the implicit
            # natural join over whatever columns the two tables share
            by = "Country") %>%
  na.omit()

# countries drawn as text labels, with a linear fit and its equation
ggplot(ef_and_ti,
       aes(Ethnic,
           CPI2016)) +
  geom_text(aes(label = Country)) +
  geom_smooth(method = "lm") +
  stat_poly_eq(aes(label = paste(..eq.label..,
                                 ..adj.rr.label..,
                                 sep = "~~~~")),
               formula = formula,
               rr.digits = 3,
               coef.digits = 2,
               parse = TRUE, hjust = -0.25) +
  theme_bw()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment