sonamgupta1105 · June 15, 2016 21:41 · sonamgupta1105 · Jun 15, 2016
diff --git a/gistfile1.txt b/gistfile1.txt
 # All the visualizations are for Title.type = Feature Films. I have used the same packages as mentioned in the example you had given. 
 library(ggplot2)
 library(dplyr)
 # Load the raw movie data set.
 movie_data <- read.csv("movie_data.csv")

 # Keep only the feature films; every visualization below works on this subset.
 dfMoviesFeatureFilm <- subset(movie_data, Title.type == "Feature Film")

 # Vis 1: bar chart of IMDb ratings for feature films, excluding missing values.
 # Rows whose rating is NA or an empty string are dropped.
 dfMoviesFeatureFilmRating <- subset(dfMoviesFeatureFilm, !is.na(IMDb.Rating) & IMDb.Rating != "")
 # Plot the filtered frame so the missing ratings are really excluded, and map the
 # column by name with aes() (aes_() is deprecated in ggplot2).
 plotRatingVis1 <- ggplot(dfMoviesFeatureFilmRating, aes(x = IMDb.Rating))
 plotRatingVis1 + geom_bar(stat = "count", fill = "coral3")

 # In the above plot, the notable point is that most feature films are rated between 6.0 and 8.0, with few below 6.0
 # and few above 8.0, so the distribution is roughly bell-shaped. We can conclude that most feature films in the
 # dataset have IMDb ratings between 6.0 and 8.0.

 # Vis 2: relation between domestic earnings and IMDb ratings.
 # Strip "$" and "," from column 4 and convert it to numeric.
 # NOTE(review): column 4 is presumably `Domestic` (it is read by name just below) -- confirm.
 dfMoviesFeatureFilm[, 4] <- as.numeric(sub('\\$', '', gsub(',', '', as.character(dfMoviesFeatureFilm[, 4]))))
 earningsDom <- dfMoviesFeatureFilm$Domestic
 binsDomEarning <- 20
 # Quantile-based cutpoints; unique() guards against tied quantiles, which cut() rejects
 # with "'breaks' are not unique".
 cutpointsDomEarnings <- unique(quantile(earningsDom, (0:binsDomEarning) / binsDomEarning, na.rm = TRUE))
 binnedDomEarnings <- cut(earningsDom, cutpointsDomEarnings, include.lowest = TRUE)
 # The rating column is `IMDb.Rating` (as used by the other visualizations); it is mapped
 # by bare name so ggplot2 looks it up in the data frame.
 imdbRatingDomEarning <- ggplot(dfMoviesFeatureFilm, aes(x = binnedDomEarnings, y = IMDb.Rating)) +
   geom_tile(aes(fill = IMDb.Rating), color = 'red') +
   scale_fill_gradient(name = 'Relation of IMDb Ratings and Domestic Earnings for Feature films', low = 'yellow', high = 'red') +
   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.title.y = element_blank())
 imdbRatingDomEarning

 # Vis 3: relation between international earnings and IMDb ratings.
 # Strip "$" and "," from column 5 and convert it to numeric.
 # NOTE(review): column 5 is presumably `International` (it is read by name just below) -- confirm.
 dfMoviesFeatureFilm[, 5] <- as.numeric(sub('\\$', '', gsub(',', '', as.character(dfMoviesFeatureFilm[, 5]))))
 earnings <- dfMoviesFeatureFilm$International
 binsIntEarning <- 20
 # Quantile-based cutpoints; unique() guards against tied quantiles, which cut() rejects
 # with "'breaks' are not unique".
 cutpointsIntEarnings <- unique(quantile(earnings, (0:binsIntEarning) / binsIntEarning, na.rm = TRUE))
 binnedIntEarnings <- cut(earnings, cutpointsIntEarnings, include.lowest = TRUE)
 # The rating column is `IMDb.Rating` (as used by the other visualizations); it is mapped
 # by bare name so ggplot2 looks it up in the data frame.
 imdbRatingInternationalEarning <- ggplot(dfMoviesFeatureFilm, aes(x = binnedIntEarnings, y = IMDb.Rating)) +
   geom_tile(aes(fill = IMDb.Rating), color = 'red') +
   scale_fill_gradient(name = 'Relation of IMDb Ratings and international Earnings for Feature films', low = 'yellow', high = 'red') +
   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.title.y = element_blank())
 imdbRatingInternationalEarning
 # For Vis 2 & 3, there are a lot of NAs in the domestic and international earnings columns, but the highly rated movies are more popular
 # internationally than domestically. The movies that were rated around 7.0 did not really earn much internationally.

 # Vis 4: this visualization has 2-3 iterations.
 # Iteration 1 -- file name is ratingsVotesIteration1
 # Heatmap of IMDb rating against the raw (unbinned) number of IMDb votes.
 # Columns are mapped by bare name inside aes(); ggplot2 discourages `df$col` there.
 imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = X..of.IMDb.votes, y = IMDb.Rating)) +
   geom_tile(aes(fill = IMDb.Rating), color = 'red') +
   scale_fill_gradient(name = 'Relation of IMDb votes and ratings for Feature films', low = 'yellow', high = 'red') +
   theme(axis.title.y = element_blank())
 imdbRatingVotes
 #Plotting heatmap for finding relation between IMDb Rating and number of IMDb votes-iteration 1
 #This is first attempt to generate a heat map, preferred over creating a scatter plot since there are lot of data points to be
 #plotted for votes. The idea behind the map was to see if the relation between the IMDb ratings and votes is significant and appropriate.
 #After looking at the plot, we can tell the number of votes are according to the ratings. This plot needed revisions since the votes
 #needed to be binned as they couldn't fit on the X-axis. Referred: http://datascienceplus.com/building-heatmaps-in-r/ 

 # Vis 4 -- iteration 2 -- file name is messedXaxisBinnedVotesRating
 # Bins the number of votes into quantile groups; the x-axis labels are still messed up here.
 votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
 binsVotes <- 20
 # Quantile-based cutpoints; unique() guards against tied quantiles, which cut() rejects
 # with "'breaks' are not unique".
 cutpointsVotesRating <- unique(quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE))
 binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
 # Columns are mapped by bare name inside aes(); ggplot2 discourages `df$col` there.
 imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = binned, y = IMDb.Rating)) +
   geom_tile(aes(fill = IMDb.Rating), color = 'red') +
   xlab('Number of binned votes') +
   scale_fill_gradient(name = 'Relation of IMDb votes and ratings for Feature films', low = 'yellow', high = 'red') +
   theme(axis.title.y = element_blank())
 imdbRatingVotes
 # This iteration produced a better-looking heatmap that makes it easier to see that films with
 # higher ratings have a higher number of votes.

 # Vis 4 -- iteration 3 -- file name is votesBinnedImprovedXaxis
 # This revision was to fix the labels of X-axis
 # (leftover R Markdown chunk header, kept as a comment because it is not valid R: {r, echo = TRUE})
 # Binning the number of votes with 90-degree labels for the x-axis
 votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
 binsVotes <- 20
 # Quantile-based cutpoints; unique() guards against tied quantiles, which cut() rejects
 # with "'breaks' are not unique".
 cutpointsVotesRating <- unique(quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE))
 binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
 # Columns are mapped by bare name inside aes(); ggplot2 discourages `df$col` there.
 imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = binned, y = IMDb.Rating)) +
   geom_tile(aes(fill = IMDb.Rating), color = 'red') +
   scale_fill_gradient(name = 'Relation of IMDb votes and ratings for Feature films', low = 'yellow', high = 'red') +
   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.title.y = element_blank())
 imdbRatingVotes

 # Vis 5: scatter plot of the Rotten.Tom. column against IMDb.Rating,
 # with points coloured by IMDb.Rating and the x-axis labels rotated 90 degrees.
 ratingsIMDBTomato <- ggplot(
   data = dfMoviesFeatureFilm,
   mapping = aes(x = Rotten.Tom., y = IMDb.Rating)
 ) +
   geom_point(mapping = aes(color = IMDb.Rating)) +
   theme(
     axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
   )
 ratingsIMDBTomato
 # Scatter plot of Rotten Tomatoes ratings against IMDb ratings. The darker the points on the graph, the lower the ratings.
 # The dataset is very messy and has a lot of missing values for Rotten Tomatoes ratings, which show up as a straight line of points.
 # The labels on the X-axis are not sorted; they appear in the order in which they occur in the dataset.
	# All the visualizations are for Title.type = Feature Films. I have used the same packages as mentioned in the example you had given.
	library(ggplot2)
	library(dplyr)
	# Load the raw movie data set.
	movie_data <- read.csv("movie_data.csv")

	# Keep only the feature films; every visualization below works on this subset.
	dfMoviesFeatureFilm <- subset(movie_data, Title.type == "Feature Film")

	# Vis 1: bar chart of IMDb ratings for feature films, excluding missing values.
	# Rows whose rating is NA or an empty string are dropped.
	dfMoviesFeatureFilmRating <- subset(dfMoviesFeatureFilm, !is.na(IMDb.Rating) & IMDb.Rating != "")
	# Plot the filtered frame so the missing ratings are really excluded, and map the
	# column by name with aes() (aes_() is deprecated in ggplot2).
	plotRatingVis1 <- ggplot(dfMoviesFeatureFilmRating, aes(x = IMDb.Rating))
	plotRatingVis1 + geom_bar(stat = "count", fill = "coral3")

	# In the above plot, the notable point is that most feature films are rated between 6.0 and 8.0, with few below 6.0
	# and few above 8.0, so the distribution is roughly bell-shaped. We can conclude that most feature films in the
	# dataset have IMDb ratings between 6.0 and 8.0.

	# Vis 2: relation between domestic earnings and IMDb ratings.
	# Strip "$" and "," from column 4 and convert it to numeric.
	# NOTE(review): column 4 is presumably `Domestic` (it is read by name just below) -- confirm.
	dfMoviesFeatureFilm[, 4] <- as.numeric(sub('\\$', '', gsub(',', '', as.character(dfMoviesFeatureFilm[, 4]))))
	earningsDom <- dfMoviesFeatureFilm$Domestic
	binsDomEarning <- 20
	# Quantile-based cutpoints; unique() guards against tied quantiles, which cut() rejects
	# with "'breaks' are not unique".
	cutpointsDomEarnings <- unique(quantile(earningsDom, (0:binsDomEarning) / binsDomEarning, na.rm = TRUE))
	binnedDomEarnings <- cut(earningsDom, cutpointsDomEarnings, include.lowest = TRUE)
	# The rating column is `IMDb.Rating` (as used by the other visualizations); it is mapped
	# by bare name so ggplot2 looks it up in the data frame.
	imdbRatingDomEarning <- ggplot(dfMoviesFeatureFilm, aes(x = binnedDomEarnings, y = IMDb.Rating)) +
	  geom_tile(aes(fill = IMDb.Rating), color = 'red') +
	  scale_fill_gradient(name = 'Relation of IMDb Ratings and Domestic Earnings for Feature films', low = 'yellow', high = 'red') +
	  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.title.y = element_blank())
	imdbRatingDomEarning

	# Vis 3: relation between international earnings and IMDb ratings.
	# Strip "$" and "," from column 5 and convert it to numeric.
	# NOTE(review): column 5 is presumably `International` (it is read by name just below) -- confirm.
	dfMoviesFeatureFilm[, 5] <- as.numeric(sub('\\$', '', gsub(',', '', as.character(dfMoviesFeatureFilm[, 5]))))
	earnings <- dfMoviesFeatureFilm$International
	binsIntEarning <- 20
	# Quantile-based cutpoints; unique() guards against tied quantiles, which cut() rejects
	# with "'breaks' are not unique".
	cutpointsIntEarnings <- unique(quantile(earnings, (0:binsIntEarning) / binsIntEarning, na.rm = TRUE))
	binnedIntEarnings <- cut(earnings, cutpointsIntEarnings, include.lowest = TRUE)
	# The rating column is `IMDb.Rating` (as used by the other visualizations); it is mapped
	# by bare name so ggplot2 looks it up in the data frame.
	imdbRatingInternationalEarning <- ggplot(dfMoviesFeatureFilm, aes(x = binnedIntEarnings, y = IMDb.Rating)) +
	  geom_tile(aes(fill = IMDb.Rating), color = 'red') +
	  scale_fill_gradient(name = 'Relation of IMDb Ratings and international Earnings for Feature films', low = 'yellow', high = 'red') +
	  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.title.y = element_blank())
	imdbRatingInternationalEarning
	# For Vis 2 & 3, there are a lot of NAs in the domestic and international earnings columns, but the highly rated movies are more popular
	# internationally than domestically. The movies that were rated around 7.0 did not really earn much internationally.

	# Vis 4: this visualization has 2-3 iterations.
	# Iteration 1 -- file name is ratingsVotesIteration1
	# Heatmap of IMDb rating against the raw (unbinned) number of IMDb votes.
	# Columns are mapped by bare name inside aes(); ggplot2 discourages `df$col` there.
	imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = X..of.IMDb.votes, y = IMDb.Rating)) +
	  geom_tile(aes(fill = IMDb.Rating), color = 'red') +
	  scale_fill_gradient(name = 'Relation of IMDb votes and ratings for Feature films', low = 'yellow', high = 'red') +
	  theme(axis.title.y = element_blank())
	imdbRatingVotes
	#Plotting heatmap for finding relation between IMDb Rating and number of IMDb votes-iteration 1
	#This is first attempt to generate a heat map, preferred over creating a scatter plot since there are lot of data points to be
	#plotted for votes. The idea behind the map was to see if the relation between the IMDb ratings and votes is significant and appropriate.
	#After looking at the plot, we can tell the number of votes are according to the ratings. This plot needed revisions since the votes
	#needed to be binned as they couldn't fit on the X-axis. Referred: http://datascienceplus.com/building-heatmaps-in-r/

	# Vis 4 -- iteration 2 -- file name is messedXaxisBinnedVotesRating
	# Bins the number of votes into quantile groups; the x-axis labels are still messed up here.
	votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
	binsVotes <- 20
	# Quantile-based cutpoints; unique() guards against tied quantiles, which cut() rejects
	# with "'breaks' are not unique".
	cutpointsVotesRating <- unique(quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE))
	binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
	# Columns are mapped by bare name inside aes(); ggplot2 discourages `df$col` there.
	imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = binned, y = IMDb.Rating)) +
	  geom_tile(aes(fill = IMDb.Rating), color = 'red') +
	  xlab('Number of binned votes') +
	  scale_fill_gradient(name = 'Relation of IMDb votes and ratings for Feature films', low = 'yellow', high = 'red') +
	  theme(axis.title.y = element_blank())
	imdbRatingVotes
	# This iteration produced a better-looking heatmap that makes it easier to see that films with
	# higher ratings have a higher number of votes.

	# Vis 4 -- iteration 3 -- file name is votesBinnedImprovedXaxis
	# This revision was to fix the labels of X-axis
	# (leftover R Markdown chunk header, kept as a comment because it is not valid R: {r, echo = TRUE})
	# Binning the number of votes with 90-degree labels for the x-axis
	votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
	binsVotes <- 20
	# Quantile-based cutpoints; unique() guards against tied quantiles, which cut() rejects
	# with "'breaks' are not unique".
	cutpointsVotesRating <- unique(quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE))
	binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
	# Columns are mapped by bare name inside aes(); ggplot2 discourages `df$col` there.
	imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = binned, y = IMDb.Rating)) +
	  geom_tile(aes(fill = IMDb.Rating), color = 'red') +
	  scale_fill_gradient(name = 'Relation of IMDb votes and ratings for Feature films', low = 'yellow', high = 'red') +
	  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.title.y = element_blank())
	imdbRatingVotes

	# Vis 5: scatter plot of the Rotten.Tom. column against IMDb.Rating,
	# with points coloured by IMDb.Rating and the x-axis labels rotated 90 degrees.
	ratingsIMDBTomato <- ggplot(
	  data = dfMoviesFeatureFilm,
	  mapping = aes(x = Rotten.Tom., y = IMDb.Rating)
	) +
	  geom_point(mapping = aes(color = IMDb.Rating)) +
	  theme(
	    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
	  )
	ratingsIMDBTomato
	# Scatter plot of Rotten Tomatoes ratings against IMDb ratings. The darker the points on the graph, the lower the ratings.
	# The dataset is very messy and has a lot of missing values for Rotten Tomatoes ratings, which show up as a straight line of points.
	# The labels on the X-axis are not sorted; they appear in the order in which they occur in the dataset.