Created
August 22, 2016 00:27
-
-
Save shuozhang1985/da03c78e983b56f305650c4ac628b719 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setup: select the data directory, attach the packages used by the script,
# and load the event table.
# NOTE(review): setwd() ties the script to one machine. It is kept because
# the read.csv() calls in this script use paths relative to this directory.
setwd("~/Desktop/web scraping")
library(dplyr)
library(ggplot2)
library(dygraphs)
library(plotly)

# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned. The Event and Sport columns are read further down.
event <- read.csv('gender2.txt', header = TRUE, stringsAsFactors = FALSE,
                  sep = ',')
nrow(event)

# View() opens a data viewer, so only call it in an interactive session;
# batch runs (Rscript) skip it instead of failing.
if (interactive()) {
  View(event)
}
# Separate mixed events from single-gender events, split each single-gender
# event into a gender prefix and an event name, and find event names that
# occur only once.
#length(event$Event)
lst2 <- filter(event, grepl('Mixed', Event))
#length(lst2$Event)
lst1 <- filter(event, !grepl('Mixed', Event))

# Split on the FIRST apostrophe only: `gender` is the text before it and
# `event` the text after it. Building the columns vector-wise keeps one row
# per input event; reshaping unlist(strsplit(...)) into a two-column matrix
# silently shifts rows when a name has no apostrophe or more than one.
# A name without any apostrophe is kept whole in both columns.
sport <- data.frame(
  gender = sub("'.*$", "", lst1$Event),
  event = sub("^[^']*'", "", lst1$Event),
  sport = lst1$Sport,
  stringsAsFactors = FALSE
)
#View(sport)

# Number of events per gender prefix (printed).
sport %>%
  dplyr::group_by(gender) %>%
  dplyr::summarise(n = n())

# Number of rows sharing each event name.
# NOTE(review): names are matched across the whole table, not within a
# sport, so identical names in different sports are counted together.
sport2 <- sport %>%
  dplyr::group_by(event) %>%
  dplyr::summarise(n = n())
common <- dplyr::filter(sport2, n >= 2)
uncommon <- dplyr::filter(sport2, n == 1)

# Attach gender and sport back onto the event names that occur once.
uncommon1 <- merge(x = uncommon, y = sport, by = 'event', all.x = TRUE)
#View(uncommon1)

# Number of once-only events per gender prefix (printed).
uncommon1 %>%
  dplyr::group_by(gender) %>%
  dplyr::summarise(n())
# Summary table of event counts by gender and category, plotted as stacked
# bars of each count's share of all rows in `event`.
Event <- c('men', 'men', 'women', 'women', 'mixed')
Category <- c('only to men', 'equal to men and women', 'only to women',
              'equal to men and women', 'equal to men and women')
# NOTE(review): these counts are hard-coded, presumably transcribed from the
# summaries printed earlier in the script; re-check them if the data changes.
number <- c(42, 120, 15, 117, 8)

# Build the frame in one call with explicit column names. as.data.frame()
# has no `colnames` argument, so the value passed to it was ignored.
df <- data.frame(
  Event = Event,
  Category = Category,
  Number = number,
  stringsAsFactors = FALSE
)
df$Percent <- df$Number / nrow(event)

ggplot(data = df, aes(x = reorder(Event, Percent), y = Percent,
                      fill = Category)) +
  geom_bar(stat = 'identity') +
  xlab('Event') +
  ylab('Percent') +
  ggtitle('Gender distribution of events in 2012 Olympics') +
  theme_bw() +
  theme(legend.position = 'bottom')
# Count the once-only events for every gender/sport pair.
uncommonbysport <- dplyr::summarise(
  group_by(uncommon1, gender, sport),
  n = n()
)

# Stacked bars per sport, filled by gender. reorder() orders the sports by
# its default summary (the mean) of n. Labels are tilted so sport names fit.
ggplot(uncommonbysport, aes(reorder(sport, n), n, fill = gender)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Sport",
    y = "Number of inequal events",
    title = "Events of inequal gender in 2012 Olympics"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#competitor
# Load athlete counts per year and plot men and women over time.
total <- read.csv('event.txt', header = TRUE, stringsAsFactors = FALSE,
                  sep = ',')

# Strip commas (presumably thousands separators) before converting to numbers.
total$Women <- as.numeric(gsub(',', '', total$Women))
total$Men <- as.numeric(gsub(',', '', total$Men))
total$Year <- as.numeric(total$Year)
summary(total)

# Share of each gender within the same row (year). Dividing by
# sum(Men + Women) used the grand total over all rows as the denominator, so
# the two shares of one year did not add up to 1.
total <- mutate(
  total,
  womenpercent = Women / (Men + Women),
  menpercent = Men / (Men + Women)
)

# NOTE(review): 'athelets' in the axis label looks like a typo for
# 'athletes'; the string is left unchanged here.
plot(total$Year, total$Men, type = 'l', col = 'red', xlab = 'Year',
     ylab = 'Number of athelets', main = 'Athletes vs Gender')
lines(total$Year, total$Women, col = 'blue')

# Name the `legend` argument explicitly instead of relying on legend()
# re-interpreting a character `y` as the legend text.
legend("topleft", inset = .05,
       legend = c("Women", "Men"),
       lty = c(1, 1),
       lwd = c(2.5, 2.5),
       col = c('blue', 'red'), horiz = FALSE)
# Load the athlete table and plot, per gender, how many athletes hold each
# kind of medal and how many hold none.
people <- read.csv('gender.txt', header = TRUE, stringsAsFactors = FALSE,
                   sep = ',')
summary(people)

# Athlete counts per gender, and per gender x number of medals of each kind.
people1 <- people %>%
  group_by(Gender) %>%
  dplyr::summarise(n = n())
df1 <- people %>%
  group_by(Gender, Gold) %>%
  dplyr::summarise(n = n())
df2 <- people %>%
  group_by(Gender, Silver) %>%
  dplyr::summarise(n = n())
df3 <- people %>%
  group_by(Gender, Bronze) %>%
  dplyr::summarise(n = n())

# Sum the counts in `counts` for rows of `gender` with at least one medal in
# column `medal`. which() drops NA comparisons, as dplyr::filter() does.
count_medalists <- function(counts, gender, medal) {
  keep <- which(counts$Gender == gender & counts[[medal]] >= 1)
  sum(counts$n[keep])
}

# Number of athletes of `gender` with no gold, silver or bronze medal.
# A missing medal count is treated as "no medal of that kind".
count_without_medal <- function(athletes, gender) {
  has_medal <- function(x) !is.na(x) & x >= 1
  medalled <- has_medal(athletes$Gold) |
    has_medal(athletes$Silver) |
    has_medal(athletes$Bronze)
  length(which(athletes$Gender == gender & !medalled))
}

Gender <- rep(c('Female', 'Male'), each = 4)
Metal <- rep(c('Gold', 'Silver', 'Bronze', 'No metal'), times = 2)

# The 'No metal' entries count athletes without any medal directly.
# Computing them as total - gold - silver - bronze winners subtracts an
# athlete holding several kinds of medal more than once.
Number <- c(
  count_medalists(df1, 'Female', 'Gold'),
  count_medalists(df2, 'Female', 'Silver'),
  count_medalists(df3, 'Female', 'Bronze'),
  count_without_medal(people, 'Female'),
  count_medalists(df1, 'Male', 'Gold'),
  count_medalists(df2, 'Male', 'Silver'),
  count_medalists(df3, 'Male', 'Bronze'),
  count_without_medal(people, 'Male')
)

# as.data.frame() has no `colnames` argument; build the frame with explicit
# column names instead.
df4 <- data.frame(
  Gender = Gender,
  Metal = Metal,
  Number = Number,
  stringsAsFactors = FALSE
)
df4 <- mutate(df4, Percent = Number / sum(Number))

# NOTE(review): 'metal' in the labels below presumably means 'medal'; the
# strings are left unchanged here.
ggplot(data = df4, aes(x = Gender, y = Number, fill = Metal)) +
  geom_bar(stat = 'identity') +
  xlab('Gender') +
  ylab('Number of metal') +
  ggtitle('Metal distribution by gender in 2012 Olympics') +
  theme_bw()
# age
# Stack the ages of all athletes with the ages of gold, silver and bronze
# winners, tagging each row with its group in `Type`.
totalage <- mutate(select(people, Age), Type = "All")

age1 <- people %>%
  filter(Gold >= 1) %>%
  select(Age, Metal = Gold) %>%
  mutate(Type = "Gold")
summary(age1)

# The silver summary is printed before the Type tag is added.
age2 <- people %>%
  filter(Silver >= 1) %>%
  select(Age, Metal = Silver)
summary(age2)
age2 <- mutate(age2, Type = "Silver")

age3 <- people %>%
  filter(Bronze >= 1) %>%
  select(Age, Metal = Bronze) %>%
  mutate(Type = "Bronze")

# Only Age and Type are carried into the combined table.
keep_cols <- c("Age", "Type")
age <- rbind(totalage, age1[keep_cols], age2[keep_cols], age3[keep_cols])

# One density curve per group.
ggplot(age, aes(Age, color = Type)) +
  geom_density(alpha = 0.2) +
  labs(
    x = "Age",
    y = "Density",
    title = "Athletes age distribution in 2012 Olympics"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
# Total medals per athlete, then medal winners counted by age and sport.
# Zero-fill only the medal columns. Replacing every NA in `people` would
# also turn an unknown Age into age 0 and NA text values into "0".
medal_cols <- c('Gold', 'Silver', 'Bronze')
people[medal_cols] <- lapply(people[medal_cols], function(x) {
  x[is.na(x)] <- 0
  x
})
people <- mutate(people, Total = Gold + Silver + Bronze)
summary(people)

# n is the number of athletes with at least one medal per age/sport pair.
# NOTE(review): the y label says 'Number of Medals', but n counts athletes.
peoplebysport <- filter(people, Total >= 1) %>%
  group_by(Age, Sport) %>%
  dplyr::summarise(n = n())

ggplot(peoplebysport, aes(x = Age, y = n, fill = Sport)) +
  geom_bar(stat = 'identity') +
  xlab('Age') +
  ylab('Number of Medals') +
  ggtitle('Medal winners age distribution by sport in 2012 Olympics') +
  theme_bw() +
  theme(legend.position = 'bottom')

# Same chart restricted to two sports.
ggplot(filter(peoplebysport, Sport %in% c('Equestrianism', 'Gymnastics')),
       aes(x = Age, y = n, fill = Sport)) +
  geom_bar(stat = 'identity') +
  xlab('Age') +
  ylab('Number of Medals') +
  ggtitle('Medal winners age distribution by sport in 2012 Olympics') +
  theme_bw() +
  theme(legend.position = 'bottom')
# sport
# Per sport, count athletes holding gold, silver and bronze medals, then
# stack the three tables for a single chart.
# Each pipeline ends at select(). A trailing %>% after it would pipe the
# table into the following `$Type` assignment, which is an error.
# ungroup() drops the grouping left by summarise() so the combined table
# carries no grouping metadata.
peoplebysport1 <- filter(people, Gold >= 1) %>%
  group_by(Sport, Gold) %>%
  dplyr::summarise(n = n()) %>%
  ungroup() %>%
  select(Sport, n)
peoplebysport1$Type <- rep('Gold', nrow(peoplebysport1))

peoplebysport2 <- filter(people, Silver >= 1) %>%
  group_by(Sport, Silver) %>%
  dplyr::summarise(n = n()) %>%
  ungroup() %>%
  select(Sport, n)
peoplebysport2$Type <- rep('Silver', nrow(peoplebysport2))

peoplebysport3 <- filter(people, Bronze >= 1) %>%
  group_by(Sport, Bronze) %>%
  dplyr::summarise(n = n()) %>%
  ungroup() %>%
  select(Sport, n)
peoplebysport3$Type <- rep('Bronze', nrow(peoplebysport3))

peoplebysport <- rbind(peoplebysport1, peoplebysport2, peoplebysport3)

# NOTE(review): n counts athletes, one row per sport and medal count, while
# the y label reads 'Number of medal'. reorder() orders the sports by its
# default summary (the mean) of n.
ggplot(peoplebysport, aes(x = reorder(Sport, n), y = n, fill = Type)) +
  geom_bar(stat = 'identity') +
  xlab('Sport') +
  ylab('Number of medal') +
  ggtitle('Medal distribution by sport in 2012 Olympics') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'bottom')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment