Last active
August 29, 2015 14:22
-
-
Save jhofman/9b0a8bb88fb2c99b0f75 to your computer and use it in GitHub Desktop.
You Draw It
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Scrape income distribution data from whatsmypercent.com
#
# Output is in incomes.csv (percentile,income)
#
# Each results page reports where the next percentile begins; that income is
# fed back in as the next query, so the loop walks up the distribution one
# percentile at a time.
#

base_url='http://whatsmypercent.com/incomeRank.php'

# download the results page for the income given as $1 (status "All Filers")
fetch_page() {
  # --silent spelled out: the single-dash "-silent" is read by curl as the
  # bundled short options -s -i -l -e "nt", not as the long option
  curl --silent "${base_url}?income=${1}&status=All+Filers"
}

# read a page on stdin and print the 9th '<' / '>'-delimited field of every
# line containing the label given as $1
extract_field() {
  grep "$1" | awk -F'[<>]' '{print $9}'
}

# start at $100 / year
income=100
page=$(fetch_page "$income")

# loop over all 100 percentiles
for _ in {1..100}
do
  # grab the bottom of the next percentile from the page already in hand,
  # dropping the dollar sign and every thousands separator
  income=$(extract_field 'The next percentile begins at:' <<< "$page")
  income=${income/\$/}
  income=${income//,/}

  # stop instead of querying with an empty income when the page could not be
  # fetched or parsed; the message goes to stderr so the csv stays clean
  if [ -z "$income" ]; then
    echo "could not find the start of the next percentile; stopping" >&2
    break
  fi

  # grab the percentile for that income; the same page is reused at the top
  # of the next iteration, so each income is requested only once
  page=$(fetch_page "$income")
  percentile=$(extract_field 'Your percentile is:' <<< "$page")
  percentile=${percentile/\%/}

  echo "$percentile,$income"

# write output to csv file, dropping rows whose percentile is 0
done | grep -v '^0,' > incomes.csv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Compare various plots of child college attendance by parent income
#
# Inspired by the interactive NYT piece "You Draw It" at http://www.nytimes.com/interactive/2015/05/28/upshot/you-draw-it-how-family-income-affects-childrens-college-chances.html
#
library(dplyr)
library(ggplot2)
library(scales)
# income distribution data (2010) produced by scrape_income_dist.sh:
# one row per percentile from 2 through 99, with the scraped income in dollars
incomes <- data.frame(
  percentile = 2:99,
  dollars = c(
    2451L, 4134L, 5184L, 6028L, 6922L, 7626L, 8226L, 8764L,
    9235L, 9832L, 10482L, 11366L, 12207L, 12999L, 13732L, 14447L,
    15064L, 15736L, 16358L, 16992L, 17659L, 18204L, 18768L, 19375L,
    19964L, 20860L, 22013L, 23034L, 23873L, 24675L, 25505L, 26311L,
    27033L, 27811L, 28560L, 29306L, 29999L, 30999L, 32188L, 33281L,
    34272L, 35295L, 36253L, 37194L, 38051L, 39064L, 39953L, 41113L,
    42327L, 43564L, 44769L, 45871L, 46956L, 48095L, 49225L, 50353L,
    51922L, 54282L, 57213L, 59670L, 61654L, 63469L, 65192L, 66639L,
    68140L, 69658L, 71150L, 72539L, 73866L, 75296L, 77160L, 79838L,
    83011L, 85811L, 88317L, 90794L, 93165L, 95174L, 97298L, 99424L,
    102060L, 106770L, 117025L, 125260L, 131032L, 136231L, 141453L, 147725L,
    154131L, 160864L, 168227L, 177123L, 187412L, 200026L, 235687L, 290860L,
    360435L, 506553L
  )
)

# add the percent of children who attend college at each percentile, as a
# straight line in the percentile
# (slope and intercept guesstimated from Chetty et al.)
incomes$college <- 2 / 3 * incomes$percentile + 27
# Scatter plots of college attendance against parent income, one point per
# percentile row of `incomes`, each saved as a 4x4 PNG.
#
# qplot() is deprecated as of ggplot2 3.4.0, so each plot is built with
# ggplot() + geom_point(), the layer qplot() draws for numeric x and y.
# Every plot is passed to ggsave() explicitly rather than relying on
# whichever plot happened to be created last.

# y axis shared by every plot: percent scale fixed to 0-100
college_axis <- list(
  ylab("Percent of children who attend college"),
  ylim(c(0, 100))
)

# plot college attendance vs parent income percentile
percentile_plot <- ggplot(incomes, aes(x = percentile, y = college)) +
  geom_point() +
  xlab("Parent income percentile") +
  college_axis
ggsave("percentile_college.png", plot = percentile_plot, width = 4, height = 4)

# plot college attendance vs parent income
dollars_plot <- ggplot(incomes, aes(x = dollars, y = college)) +
  geom_point() +
  xlab("Parent income") +
  college_axis +
  scale_x_continuous(labels = comma)
ggsave("dollars_college.png", plot = dollars_plot, width = 4, height = 4)

# plot college attendance vs parent income, with log scale
dollars_log_plot <- ggplot(incomes, aes(x = dollars, y = college)) +
  geom_point() +
  xlab("Parent income") +
  college_axis +
  scale_x_log10(labels = comma)
ggsave("dollars_college_log10.png", plot = dollars_log_plot, width = 4, height = 4)
# Plots of college attendance against parent income where each point is an
# income bin and the point size shows how many percentiles fall in that bin.
#
# The binning is computed once and shared by both plots (it was previously
# repeated verbatim). The pipeline needs dplyr (%>%, mutate, group_by,
# summarize, n), which must be loaded alongside ggplot2 and scales.
# qplot() is deprecated as of ggplot2 3.4.0, so the plots use
# ggplot() + geom_point(), and each plot is handed to ggsave() explicitly.

# round each income to the nearest $10,000, then count the percentiles in each
# bin and average their college attendance
binned_incomes <- incomes %>%
  mutate(dollars_bin = round(dollars / 10000) * 10000) %>%
  group_by(dollars_bin) %>%
  summarize(size = n(), college = mean(college))

# layers shared by both plots: sized points, labels, 0-100 y axis, no legend
binned_base <- ggplot(binned_incomes,
                      aes(x = dollars_bin, y = college, size = size)) +
  geom_point() +
  xlab("Parent income") +
  ylab("Percent of children who attend college") +
  ylim(c(0, 100)) +
  theme(legend.position = "none")

# plot college attendance vs parent income, showing population at each income
sized_plot <- binned_base +
  scale_x_continuous(labels = comma)
ggsave("dollars_college_sized.png", plot = sized_plot, width = 4, height = 4)

# plot college attendance vs parent income, with log scale showing population
# at each income
# NOTE(review): incomes below $5,000 round to a bin of 0, which has no finite
# position on a log10 axis -- confirm how that bin is meant to be shown.
sized_log_plot <- binned_base +
  scale_x_log10(labels = comma)
ggsave("dollars_college_log10_sized.png", plot = sized_log_plot,
       width = 4, height = 4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment