Last active
June 15, 2016 21:41
-
-
Save sonamgupta1105/f9f8005093d0e0282c2a63182184ea5e to your computer and use it in GitHub Desktop.
Problem_set2_ANLY512-50
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# All the visualizations are restricted to rows where Title.type is "Feature Film". I have used the same packages as mentioned in the example you had given.
library(ggplot2) | |
library(dplyr) | |
# Read the raw movie listing; the CSV is looked up in the working directory.
movie_data <- read.csv(file = "movie_data.csv")
# Build the movies data frame: keep only rows whose title type is
# "Feature Film" (rows where the comparison is NA are dropped by subset()).
dfMoviesFeatureFilm <- subset(movie_data, Title.type == "Feature Film")
# Vis 1: bar chart of IMDb ratings for feature films, excluding missing values.
# Rows with an NA or empty rating are removed first, and the plot is built
# from that filtered data frame so the exclusion actually takes effect.
dfMoviesFeatureFilmRating <- subset(
  dfMoviesFeatureFilm,
  !is.na(IMDb.Rating) & IMDb.Rating != ""
)
# Map the column by name with aes(); aes_() is deprecated and `df$col`
# inside an aesthetic bypasses the `data` argument.
plotRatingVis1 <- ggplot(dfMoviesFeatureFilmRating, aes(x = IMDb.Rating))
plotRatingVis1 + geom_bar(stat = "count", fill = "coral3")
#In the above plot, the notable information about the feature films is that most of the movies are rated between 6.0 and 8.0, with a few below 6.0
#and a few above 8.0, almost forming the bell curve of a normal distribution. So we can conclude that most of the feature films present in the
#dataset fall between 6.0 and 8.0 in IMDb rating.
#Vis 2
# Relation between domestic earnings and IMDb ratings, shown as a heat map of
# rating against quantile-binned domestic earnings.
# Strip "$" and "," from column 4 and convert it to numeric; values that are
# not numbers after stripping become NA.
# NOTE(review): column 4 is assumed to be the `Domestic` column read on the
# next line -- confirm against the CSV header.
dfMoviesFeatureFilm[, 4] <- as.numeric(
  gsub("[$,]", "", as.character(dfMoviesFeatureFilm[, 4]))
)
earningsDom <- dfMoviesFeatureFilm$Domestic
binsDomEarning <- 20
# Cut points are the 0%, 5%, ..., 100% quantiles of the non-missing earnings.
# unique() drops tied quantiles, which would otherwise make cut() stop with
# "'breaks' are not unique".
cutpointsDomEarnings <- unique(
  quantile(earningsDom, (0:binsDomEarning) / binsDomEarning, na.rm = TRUE)
)
binnedDomEarnings <- cut(
  earningsDom,
  cutpointsDomEarnings,
  include.lowest = TRUE
)
# The rating column is `IMDb.Rating`, as in the other visualizations.
# `binnedDomEarnings` is not a column of the data frame; it is picked up from
# the calling environment and has one entry per row.
imdbRatingDomEarning <- ggplot(
  dfMoviesFeatureFilm,
  aes(x = binnedDomEarnings, y = IMDb.Rating)
) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  scale_fill_gradient(
    name = "Relation of IMDb Ratings and Domestic Earnings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(
    # Rotate the bin labels so the interval text does not overlap.
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title.y = element_blank()
  )
imdbRatingDomEarning
#Vis 3
# International earnings and IMDb rating, shown as a heat map of rating
# against quantile-binned international earnings.
# Strip "$" and "," from column 5 and convert it to numeric; values that are
# not numbers after stripping become NA.
# NOTE(review): column 5 is assumed to be the `International` column read on
# the next line -- confirm against the CSV header.
dfMoviesFeatureFilm[, 5] <- as.numeric(
  gsub("[$,]", "", as.character(dfMoviesFeatureFilm[, 5]))
)
earnings <- dfMoviesFeatureFilm$International
binsIntEarning <- 20
# Cut points are the 0%, 5%, ..., 100% quantiles of the non-missing earnings.
# unique() drops tied quantiles, which would otherwise make cut() stop with
# "'breaks' are not unique".
cutpointsIntEarnings <- unique(
  quantile(earnings, (0:binsIntEarning) / binsIntEarning, na.rm = TRUE)
)
binnedIntEarnings <- cut(earnings, cutpointsIntEarnings, include.lowest = TRUE)
# The rating column is `IMDb.Rating`, as in the other visualizations.
# `binnedIntEarnings` is not a column of the data frame; it is picked up from
# the calling environment and has one entry per row.
imdbRatingInternationalEarning <- ggplot(
  dfMoviesFeatureFilm,
  aes(x = binnedIntEarnings, y = IMDb.Rating)
) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  scale_fill_gradient(
    name = "Relation of IMDb Ratings and international Earnings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(
    # Rotate the bin labels so the interval text does not overlap.
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title.y = element_blank()
  )
imdbRatingInternationalEarning
# For Vis 2 & 3, there are a lot of NAs in the domestic and international earnings columns, but the highly rated movies are more popular
#internationally than domestically. The movies that were rated around 7.0 did not really earn much internationally.
# Vis 4 This visualization has 2-3 iterations
# file name is ratingsVotesIteration1
# Iteration 1: heat map of IMDb rating against the raw (unbinned) number of
# IMDb votes. Columns are mapped by name through `data` rather than with
# `dfMoviesFeatureFilm$...`, so the x-axis title shows the column name
# instead of the full `df$col` expression.
imdbRatingVotes <- ggplot(
  dfMoviesFeatureFilm,
  aes(x = X..of.IMDb.votes, y = IMDb.Rating)
) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  scale_fill_gradient(
    name = "Relation of IMDb votes and ratings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(axis.title.y = element_blank())
imdbRatingVotes
#Plotting a heatmap to find the relation between IMDb rating and number of IMDb votes -- iteration 1
#This is the first attempt to generate a heat map, preferred over a scatter plot since there are a lot of data points to be
#plotted for votes. The idea behind the map was to see if the relation between the IMDb ratings and votes is significant and appropriate.
#After looking at the plot, we can tell that the number of votes varies according to the ratings. This plot needed revisions since the votes
#needed to be binned, as they couldn't fit on the X-axis. Referred: http://datascienceplus.com/building-heatmaps-in-r/
#Vis 4 -- iteration 2 -- file name is messedXaxisBinnedVotesRating
#Binning the number of votes with x-axis labels messed up
votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
binsVotes <- 20
# Cut points are the 0%, 5%, ..., 100% quantiles of the non-missing vote
# counts. unique() drops tied quantiles, which would otherwise make cut()
# stop with "'breaks' are not unique".
cutpointsVotesRating <- unique(
  quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE)
)
binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
# `binned` is not a column of the data frame; it is picked up from the calling
# environment and has one entry per row. The rating is mapped by column name.
imdbRatingVotes <- ggplot(
  dfMoviesFeatureFilm,
  aes(x = binned, y = IMDb.Rating)
) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  xlab("Number of binned votes") +
  scale_fill_gradient(
    name = "Relation of IMDb votes and ratings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(axis.title.y = element_blank())
imdbRatingVotes
#This iteration then created a better-looking heatmap, which makes it visually clear that the films that have
#higher ratings have a higher number of votes.
#Vis 4 -- iteration 3 -- file name is votesBinnedImprovedXaxis
#This revision was to fix the labels of X-axis
#Binning the number of votes with 90degree labels for x-axis
votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
binsVotes <- 20
# Cut points are the 0%, 5%, ..., 100% quantiles of the non-missing vote
# counts. unique() drops tied quantiles, which would otherwise make cut()
# stop with "'breaks' are not unique".
cutpointsVotesRating <- unique(
  quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE)
)
binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
# `binned` is not a column of the data frame; it is picked up from the calling
# environment and has one entry per row. The rating is mapped by column name.
imdbRatingVotes <- ggplot(
  dfMoviesFeatureFilm,
  aes(x = binned, y = IMDb.Rating)
) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  scale_fill_gradient(
    name = "Relation of IMDb votes and ratings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(
    # Rotate the bin labels 90 degrees so the interval text stays readable.
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title.y = element_blank()
  )
imdbRatingVotes
#Vis 5
# Scatter plot of IMDb rating (y) against the Rotten.Tom. column (x); points
# are coloured by IMDb rating and the x-axis tick labels are rotated 90
# degrees.
ratingsIMDBTomato <- ggplot(
  data = dfMoviesFeatureFilm,
  mapping = aes(x = Rotten.Tom., y = IMDb.Rating)
) +
  geom_point(mapping = aes(color = IMDb.Rating)) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )
ratingsIMDBTomato
#Plotting a scatter plot between Rotten Tomatoes and IMDb ratings. The darker the points on the graph, the lower the ratings.
#The dataset is very messy and has a lot of missing values for Rotten Tomatoes ratings, as we see a straight line of points denoting that.
#The labels on the X-axis are not sorted but are in the order in which they appear in the dataset.
Author
sonamgupta1105
commented
Jun 15, 2016
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment