# basic imports
import requests
import tarfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# Use the en_US locale so that locale.atof() (used further down to parse the
# box office revenue columns) understands comma-grouped numbers
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8');

# default size (width, height in inches) for every matplotlib figure
plt.rcParams["figure.figsize"]= (12, 9)


# Downloading the data. These are datasets provided for non-commercial use.
# NOTE: the download itself is commented out, so the two files named by
# title_filename / ratings_filename must already exist in the working directory.
# url= 'https://datasets.imdbws.com/title.basics.tsv.gz';
# url2= 'https://datasets.imdbws.com/title.ratings.tsv.gz';

# titledata= requests.get(url);
# ratingdata= requests.get(url2);

# local file names of the title and ratings dumps read below
title_filename= 'title_data.tsv.gz';
ratings_filename= 'ratings.tsv.gz';

# write the files to our directory
# open(title_filename, 'wb').write(titledata.content);
# open(ratings_filename, 'wb').write(ratingdata.content);


# Load both gzip-compressed, tab-separated dumps into pandas DataFrames.
# 'isAdult' is read as str so its raw values are kept as text.
title_df = pd.read_csv(
    title_filename,
    sep="\t",
    compression="gzip",
    dtype={'isAdult': str},
)
ratings_df = pd.read_csv(
    ratings_filename,
    sep="\t",
    compression="gzip",
)

# preview the first five rows of each frame
print(title_df.head(5), ratings_df.head(5))

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1         Documentary,Short  
1       0      1892      \N              5           Animation,Short  
2       0      1892      \N              4  Animation,Comedy,Romance  
3       0      1892      \N             12           Animation,Short  
4       0      1893      \N              1              Comedy,Short         tconst  averageRating  numVotes
0  tt0000001            5.7      1971
1  tt0000002            5.8       263
2  tt0000003            6.5      1817
3  tt0000004            5.6       178
4  tt0000005            6.2      2613


# Read the basics and ratings data into pandas DataFrames again, this time
# without an explicit `compression` argument.
title_df = pd.read_csv(
    title_filename,
    sep="\t",
    dtype={'isAdult': str},
)
ratings_df = pd.read_csv(ratings_filename, sep="\t")

# preview the first five rows of each frame
print(title_df.head(5), ratings_df.head(5))

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1         Documentary,Short  
1       0      1892      \N              5           Animation,Short  
2       0      1892      \N              4  Animation,Comedy,Romance  
3       0      1892      \N             12           Animation,Short  
4       0      1893      \N              1              Comedy,Short         tconst  averageRating  numVotes
0  tt0000001            5.7      1971
1  tt0000002            5.8       263
2  tt0000003            6.5      1817
3  tt0000004            5.6       178
4  tt0000005            6.2      2613


# Collect box office data: scrape the worldwide lifetime gross chart, which is
# paginated 200 ranks per page through the `offset` query parameter.
from io import StringIO

box_office_url= 'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/';
box_office_url= box_office_url + '?offset=';

# one DataFrame per page; concatenated once after the loop
box_office_pages= []

# iterate over each webpage 200 ranks at a time
for i in range(0, 1000, 200):
    # append the offset to the url; time out rather than hang on a dead connection
    box_req= requests.get(box_office_url + str(i), timeout=30)
    # fail loudly on an HTTP error instead of trying to parse an error page
    box_req.raise_for_status()
    soup= BeautifulSoup(box_req.content, 'html.parser')

    chart_table= soup.find('table')
    if chart_table is None:
        raise ValueError(f'no chart table found at offset {i}')

    # read_html wants a file-like object; passing literal HTML is deprecated
    box_office_pages.append(pd.read_html(io=StringIO(str(chart_table)))[0])

# a single concat with a fresh 0..n-1 index (instead of 0..199 repeated per page)
box_office_df= pd.concat(box_office_pages, ignore_index=True)

# Rename the 'Title' column to 'primaryTitle' to merge with the other dataframe
box_office_df.rename(columns={'Title': 'primaryTitle'},inplace=True)

box_office_df.head(10)


# Persist the scraped chart as a tab-separated file so the results are reproducible
box_office_df.to_csv(path_or_buf="./box_office.tsv", sep="\t")


# Join every title with its rating row on the shared 'tconst' id; the inner
# join keeps only titles present in both frames.
df = title_df.merge(ratings_df, how='inner', on='tconst')

# 'endYear' is used in shows; not relevant in this analysis
df = df.drop(columns=['endYear'])

df.head(5)


# show the dtype pandas assigned to each column of the merged frame
df.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear          object
runtimeMinutes     object
genres             object
averageRating     float64
numVotes            int64
dtype: object


# keep only movies that have a release year ('\N' marks a missing value)
is_movie = df['titleType'] == 'movie'
has_year = df['startYear'] != '\\N'

# .copy() makes df an independent frame, so the column assignments that follow
# do not write into a slice of the merged data (avoids SettingWithCopyWarning)
df = df[is_movie & has_year].copy()

# convert release year from strings to integers
df['startYear'] = df['startYear'].astype('int64')


def to_number(x):
    """Return *x* as a float if it is a string of decimal digits, else NaN.

    Non-string values and any text that is not purely decimal digits (such as
    the '\\N' missing-value marker) map to NaN. ``str.isdecimal`` is used
    instead of ``str.isdigit`` because the latter also accepts characters such
    as superscript digits, which ``float()`` cannot parse.
    """
    if isinstance(x, str) and x.isdecimal():
        return float(x)
    return np.nan


# convert the runtime column to floats; unparseable entries become NaN
df['runtimeMinutes'] = df['runtimeMinutes'].apply(to_number)


def function(x):
    """Return True only when *x* is the string '1'.

    Any other value ('0', a missing value, malformed text) yields False.
    """
    return x == '1'


# turn the textual isAdult flag into a real boolean column
df['isAdult'] = df['isAdult'].apply(function)


df.head(5)


# count of movies for each 5-year period
years = df['startYear']
first_year, last_year = years.min(), years.max()

# Bin edges every 5 years starting at the earliest release year. np.arange
# excludes `stop`, so it is pushed one step past the latest year; otherwise the
# last edge can fall below last_year and plt.hist silently drops those films.
bins = np.arange(start=first_year, stop=last_year + 5, step=5)

plt.xlabel('Period (5 years)')
plt.ylabel('Amount of Movies')
plt.title(label='Number of Movies per 5 Years')
plt.hist(x=years, bins=bins);


# Group the films that have a runtime by release year. runtimeMinutes was
# already converted to float (missing = NaN), so dropna alone is enough; the
# former comparison against the '\N' string could never filter anything.
by_year = df.dropna(axis=0, subset=['runtimeMinutes']).groupby(by='startYear')


# first 10 runtimes within each release year
by_year['runtimeMinutes'].head(10)

8          45.0
144       100.0
326       100.0
358        70.0
366        90.0
          ...  
619272     90.0
649109     52.0
734933    120.0
780545     45.0
867340     61.0
Name: runtimeMinutes, Length: 1159, dtype: float64


# Mean runtime for each release year, drawn as a line
avgs = by_year['runtimeMinutes'].mean()
plt.plot(avgs)
plt.title('Average runtime by year')
plt.xlabel('year')
plt.ylabel('runtime (minutes)');


# Median runtime for each release year
plt.plot(by_year['runtimeMinutes'].median())
plt.title('Median runtime by year')
plt.xlabel('year')
plt.ylabel('runtime (minutes)');


# This block looks at the frequency of each primary genre over time.
# First split the comma-separated genre list of a movie into separate columns
genre = df['genres'].str.split(",", n=2, expand=True)
# Then add the release year to the new dataframe
genre["year"] = df['startYear']
# Remove movies without a genre ('\N') and keep only the primary genre and year.
# (The earlier genre.drop([1, 2], axis=1) discarded its result, so the secondary
# and tertiary genre columns were never actually removed.)
genre = genre.loc[genre[0] != '\\N', [0, 'year']]
# Count the movies per (year, primary genre) and put it in a data frame
genre_tot = genre.groupby('year')[0].value_counts().reset_index(name='count')
# Plot the counts as a stacked bar graph, one bar per year
genre.groupby(['year', 0]).size().unstack().plot.bar(stacked=True, figsize=(12, 9));
# Limit the x axis to 12 ticks
plt.locator_params(axis='x', nbins=12);
# Label the graph
plt.xlabel("Year");
plt.ylabel("Frequency");
plt.title("Frequency of Genres by Year");


# the ten movies with the most votes
df.sort_values(by=['numVotes'], ascending=False).head(10)


# mean rating of the movies released in each year
df_avg_rating = df.groupby(by='startYear')['averageRating'].mean()

# df_avg_rating is a Series indexed by startYear, so it is plotted against its
# index. The x=/y= selectors passed previously were ignored for a Series (and
# 'startingYear' is not a column name), so they are dropped.
df_avg_rating.plot.line(xlabel='Year', ylabel='Average Rating',
                        title='Average Ratings by Year', color='red',
                        xticks=range(1894, 2023, 7), figsize=(12, 9));


# mean rating per release year, restricted to movies with at least 50 votes
df_avg_rating2 = df[df['numVotes'] >= 50]
df_avg_rating2 = df_avg_rating2.groupby(by='startYear')['averageRating'].mean()

# Plotted against the Series index (startYear); the x=/y= selectors passed
# previously were ignored for a Series, so they are dropped.
df_avg_rating2.plot.line(figsize=(12, 9), xlabel='Year', ylabel='Average Rating',
                         color='red',
                         title='Average Ratings by Year (Rotten Tomatoes Critereon)',
                         xticks=range(1885, 2023, 10));


# mean rating per release year: all movies vs. adult ("Rated X") movies only
df_avg_rating = df.groupby(by='startYear')['averageRating'].mean()
# isAdult is already a boolean column, so it can be used directly as a mask
df_rated_x = df[df['isAdult']].groupby(by='startYear')['averageRating'].mean()

# Options shared by both lines. Each Series is plotted against its index
# (startYear); the x=/y= selectors passed previously were ignored for a Series.
plot_opts = dict(figsize=(12, 9),
                 xlabel='Year', ylabel='Average Rating',
                 title='Average Ratings by Year',
                 xticks=range(1894, 2023, 7))

df_rated_x.plot.line(color='blue', label='Rated X', **plot_opts);
df_avg_rating.plot.line(color='red', label='Overall', **plot_opts);

plt.legend();


# count of movies for each 5-year period
years = df['startYear']
first_year, last_year = years.min(), years.max()

# Bin edges every 5 years starting at the earliest release year. np.arange
# excludes `stop`, so it is pushed one step past the latest year; otherwise the
# last edge can fall below last_year and plt.hist silently drops those films.
bins = np.arange(start=first_year, stop=last_year + 5, step=5)

plt.xlabel('period (5 years)')
plt.ylabel('Amount of Movies')
plt.title(label='Number of Movies per 5 Years')
plt.hist(x=years, bins=bins);


# group the movies that have a runtime by their release year
with_runtime = df.dropna(axis=0, subset='runtimeMinutes')
by_year = with_runtime.groupby(by='startYear')


# mean runtime for each release year
avgs = by_year['runtimeMinutes'].mean()

plt.plot(avgs)
plt.title('Average runtime by year')
plt.xlabel('year')
plt.ylabel('runtime (minutes)');


# median runtime for each release year
plt.plot(by_year['runtimeMinutes'].median())
plt.title('Median runtime by year')
plt.xlabel('year')
plt.ylabel('runtime (minutes)');


# TODO move to step 2 

# add the averageRating, numVotes, genres and runtime to the box office dataframe
# df is the 'left' set and box_office_df is the 'right' set, so every chart
# entry is kept even when no movie in df shares its title
interested_columns= ['primaryTitle', 'genres',
                     'averageRating', 'numVotes', 'runtimeMinutes'];
# startYear is carried along only to choose between several title matches below
boxdf= df[interested_columns + ['startYear']].merge(box_office_df,
                                                    how='right',
                                                    on='primaryTitle')

# A title can match several movies in df (remakes, namesakes). Keep exactly one
# match per chart entry: prefer the movie released in the chart's 'Year', then
# the one with the most votes. De-duplicating on 'Rank' instead of the title
# keeps chart entries that share a title as separate rows.
boxdf['sameYear']= boxdf['startYear'] == boxdf['Year']
boxdf= boxdf.sort_values(['sameYear', 'numVotes'], ascending=False)
boxdf= boxdf.drop_duplicates('Rank')
# remove the helper columns so the result keeps its original set of columns
boxdf= boxdf.drop(columns=['sameYear', 'startYear'])
boxdf= boxdf.sort_values(by='Rank')

boxdf.head(5)


def convert(x):
    """Parse a dollar amount such as '$123,456,789' into a float.

    The currency symbol is stripped and the comma-grouped number is parsed
    with ``locale.atof``. The '-' placeholder and missing values become NaN.
    Values that are already numeric are returned as floats, so applying the
    conversion a second time is harmless.
    """
    if isinstance(x, str):
        return locale.atof(x.strip('$')) if x != '-' else np.nan
    return np.nan if pd.isna(x) else float(x)


# apply this function to each of the revenue columns to convert them to floats
for gross_column in ('Worldwide Lifetime Gross',
                     'Domestic Lifetime Gross',
                     'Foreign Lifetime Gross'):
    boxdf[gross_column]= boxdf[gross_column].apply(convert)

boxdf.head(5)


figure, (axis1, axis2)= plt.subplots(1,2)

# left panel: average rating vs. worldwide gross of the box office movies
axis1.set_xlabel('average rating')
axis1.set_ylabel('Worldwide Lifetime Gross')
axis1.set_title('Top 1000 Box Office Movies')
axis1.scatter(x=boxdf['averageRating'], y=boxdf['Worldwide Lifetime Gross'])

# right panel: number of votes vs. worldwide gross of the same movies.
# The y data is the gross, so the axis is labelled accordingly (it used to be
# labelled as the average rating).
axis2.set_xlabel('number of votes')
axis2.set_ylabel('Worldwide Lifetime Gross')
axis2.set_title('Top 1000 Box Office Movies')
axis2.scatter(x=boxdf['numVotes'], y=boxdf['Worldwide Lifetime Gross'])

figure.set_size_inches(12,6)
plt.show()


figure, (axis1, axis2) = plt.subplots(nrows=1, ncols=2)
figure.set_size_inches(12, 6)
x = 'runtime (minutes)'

# left panel: runtime vs. worldwide gross of the box office movies
axis1.scatter(boxdf['runtimeMinutes'], boxdf['Worldwide Lifetime Gross'])
axis1.set_title('Top 1000 Box Office Movies')
axis1.set_ylabel('Worldwide Lifetime Gross')
axis1.set_xlabel(x)

# right panel: how the runtimes of those same movies are distributed (15 bins)
axis2.hist(boxdf['runtimeMinutes'], bins=15)
axis2.set_title('Runtime Distribution')
axis2.set_ylabel('amount of movies')
axis2.set_xlabel(x)

plt.show()


# discard box office entries that have no vote count
boxdf = boxdf.dropna(subset=['numVotes'], how='all')
# rename to remove spaces
boxdf = boxdf.rename(columns={'Worldwide Lifetime Gross': 'boxOffice'})
# independent copy of the movies with more than 15,000 votes
has_many_votes = df['numVotes'] > 15000
thousand = df[has_many_votes].copy()


figure, (axis1, axis2)= plt.subplots(1,2)

# left panel: votes vs. rating for the box office movies
axis1.set_xlabel('number of votes')
axis1.set_ylabel('average rating (out of 10)')
axis1.set_title('Top 1000 Box Office Movies')
axis1.scatter(boxdf['numVotes'], boxdf['averageRating'])

# right panel: votes vs. rating for every movie in `thousand`, which holds the
# movies with more than 15000 votes (the title used to say 1000)
axis2.set_xlabel('number of votes')
axis2.set_ylabel('average rating (out of 10)')
axis2.set_title('All Movies (> 15000 Votes)')
axis2.scatter(thousand['numVotes'], thousand['averageRating'])

figure.set_size_inches(12,6)
plt.show()


from sklearn.linear_model import LinearRegression
from statsmodels.formula import api as stats


# ordinary least squares fit of average rating on number of votes
lin_model= stats.ols(formula='averageRating ~ numVotes', data=thousand).fit()


votes= thousand['numVotes']
# fitted ratings; computed once and reused by both panels
preds= lin_model.predict(votes)

figure, (axis1, axis2)= plt.subplots(1,2)

# left panel: number of votes vs rating with the linear regression line on top
axis1.set_xlabel('number of votes')
axis1.set_ylabel('average rating (out of 10)')
axis1.set_title('Predictions')
axis1.scatter(x=votes, y=thousand['averageRating'])
axis1.plot(votes, preds)

# right panel: residuals against the fitted values, for the same movies
# (`thousand` holds movies with more than 15000 votes; the title used to say 1000)
axis2.set_xlabel('predictions')
axis2.set_ylabel('residuals')
axis2.set_title('All Movies (> 15000 Votes)')
axis2.scatter(x=preds, y=lin_model.resid)

figure.set_size_inches(12,6)
plt.show()


# NOTE: despite its name, 'logRating' holds exp(averageRating). The column name
# is kept because the model formula below refers to it.
thousand['logRating']= np.exp(thousand['averageRating'])
# linear fit of the exponentiated rating on the number of votes
exp_model= stats.ols(formula='logRating ~ numVotes',data=thousand).fit();
plt.scatter(votes,thousand['logRating']);
plt.xlabel('Votes');
# the y values are exp(rating), not the raw rating, so label them as such
plt.ylabel('exp(Rating)');
plt.title('Votes vs. Ratings for films with >15000 votes');


# fitted values of the exponential-rating model
log_preds= exp_model.predict(votes);
plt.xlabel('predictions');
plt.ylabel('residuals');
# Plot this model's residuals against its OWN predictions; the scatter used to
# take `preds`, which are the fitted values of the earlier linear model.
plt.scatter(log_preds, exp_model.resid);


# distribution of the exponential-rating model's residuals
plt.hist(exp_model.resid);

	Rank	primaryTitle	Worldwide Lifetime Gross	Domestic Lifetime Gross	Domestic %	Foreign Lifetime Gross	Foreign %	Year
0	1	Avatar	$2,923,706,026	$785,221,649	26.9%	$2,138,484,377	73.1%	2009
1	2	Avengers: Endgame	$2,799,439,100	$858,373,000	30.7%	$1,941,066,100	69.3%	2019
2	3	Avatar: The Way of Water	$2,320,250,281	$684,075,767	29.5%	$1,636,174,514	70.5%	2022
3	4	Titanic	$2,264,750,694	$674,292,608	29.8%	$1,590,458,086	70.2%	1997
4	5	Star Wars: Episode VII - The Force Awakens	$2,071,310,218	$936,662,225	45.2%	$1,134,647,993	54.8%	2015
5	6	Avengers: Infinity War	$2,052,415,039	$678,815,482	33.1%	$1,373,599,557	66.9%	2018
6	7	Spider-Man: No Way Home	$1,922,598,800	$814,866,759	42.4%	$1,107,732,041	57.6%	2021
7	8	Jurassic World	$1,671,537,444	$653,406,625	39.1%	$1,018,130,819	60.9%	2015
8	9	The Lion King	$1,663,079,059	$543,638,043	32.7%	$1,119,441,016	67.3%	2019
9	10	The Avengers	$1,520,538,536	$623,357,910	41%	$897,180,626	59%	2012

	tconst	titleType	primaryTitle	originalTitle	isAdult	startYear	runtimeMinutes	genres	averageRating	numVotes
82557	tt0111161	movie	The Shawshank Redemption	The Shawshank Redemption	False	1994	142.0	Drama	9.3	2737560
250364	tt0468569	movie	The Dark Knight	The Dark Knight	False	2008	152.0	Action,Crime,Drama	9.0	2710629
637745	tt1375666	movie	Inception	Inception	False	2010	148.0	Action,Adventure,Sci-Fi	8.8	2406192
99043	tt0137523	movie	Fight Club	Fight Club	False	1999	139.0	Drama	8.8	2179773
81462	tt0109830	movie	Forrest Gump	Forrest Gump	False	1994	142.0	Drama,Romance	8.8	2130268
82340	tt0110912	movie	Pulp Fiction	Pulp Fiction	False	1994	154.0	Crime,Drama	8.9	2103714
96895	tt0133093	movie	The Matrix	The Matrix	False	1999	136.0	Action,Sci-Fi	8.7	1952760
90341	tt0120737	movie	The Lord of the Rings: The Fellowship of the Ring	The Lord of the Rings: The Fellowship of the Ring	False	2001	178.0	Action,Adventure,Drama	8.8	1911387
46210	tt0068646	movie	The Godfather	The Godfather	False	1972	175.0	Crime,Drama	9.2	1903690
395446	tt0816692	movie	Interstellar	Interstellar	False	2014	169.0	Adventure,Drama,Sci-Fi	8.6	1901627

	primaryTitle	genres	averageRating	numVotes	runtimeMinutes	Rank	Worldwide Lifetime Gross	Domestic Lifetime Gross	Domestic %	Foreign Lifetime Gross	Foreign %	Year
2	Avatar	Action,Adventure,Fantasy	7.9	1339903.0	162.0	1	$2,923,706,026	$785,221,649	26.9%	$2,138,484,377	73.1%	2009
4	Avengers: Endgame	Action,Adventure,Drama	8.4	1174213.0	181.0	2	$2,799,439,100	$858,373,000	30.7%	$1,941,066,100	69.3%	2019
5	Avatar: The Way of Water	Action,Adventure,Fantasy	7.7	375367.0	192.0	3	$2,320,250,281	$684,075,767	29.5%	$1,636,174,514	70.5%	2022
8	Titanic	Drama,Romance	7.9	1216748.0	194.0	4	$2,264,750,694	$674,292,608	29.8%	$1,590,458,086	70.2%	1997
12	Star Wars: Episode VII - The Force Awakens	Action,Adventure,Sci-Fi	7.8	945729.0	138.0	5	$2,071,310,218	$936,662,225	45.2%	$1,134,647,993	54.8%	2015

	primaryTitle	genres	averageRating	numVotes	runtimeMinutes	Rank	Worldwide Lifetime Gross	Domestic Lifetime Gross	Domestic %	Foreign Lifetime Gross	Foreign %	Year
2	Avatar	Action,Adventure,Fantasy	7.9	1339903.0	162.0	1	2.923706e+09	785221649.0	26.9%	2.138484e+09	73.1%	2009
4	Avengers: Endgame	Action,Adventure,Drama	8.4	1174213.0	181.0	2	2.799439e+09	858373000.0	30.7%	1.941066e+09	69.3%	2019
5	Avatar: The Way of Water	Action,Adventure,Fantasy	7.7	375367.0	192.0	3	2.320250e+09	684075767.0	29.5%	1.636175e+09	70.5%	2022
8	Titanic	Drama,Romance	7.9	1216748.0	194.0	4	2.264751e+09	674292608.0	29.8%	1.590458e+09	70.2%	1997
12	Star Wars: Episode VII - The Force Awakens	Action,Adventure,Sci-Fi	7.8	945729.0	138.0	5	2.071310e+09	936662225.0	45.2%	1.134648e+09	54.8%	2015

Movie Trends Over Time: A Short History

by: James Trauger, Pushkar Bhargiri, Sid Su

Originally Published: May 12, 2023 | Last Edited Jun. 12, 2024

Step 1: Data Collection¶

Step 2: Data Processing¶

Step 3: Exploratory Analysis and Data Visualization¶

Amount of movies¶

Runtime¶

Genres¶

Amount of movies¶

Runtime¶

Box office¶

Analysis, Hypothesis Testing, and Machine Learning¶

Insight and Policy Decision¶

	tconst	titleType	primaryTitle	originalTitle	startYear	runtimeMinutes	genres	averageRating	numVotes
0	tt0000001	short	Carmencita	Carmencita	1894	1	Documentary,Short	5.7	1971
1	tt0000002	short	Le clown et ses chiens	Le clown et ses chiens	1892	5	Animation,Short	5.8	263
2	tt0000003	short	Pauvre Pierrot	Pauvre Pierrot	1892	4	Animation,Comedy,Romance	6.5	1817
3	tt0000004	short	Un bon bock	Un bon bock	1892	12	Animation,Short	5.6	178
4	tt0000005	short	Blacksmith Scene	Blacksmith Scene	1893	1	Comedy,Short	6.2	2613

	tconst	titleType	primaryTitle	originalTitle	isAdult	startYear	runtimeMinutes	genres	averageRating	numVotes
8	tt0000009	movie	Miss Jerry	Miss Jerry	False	1894	45.0	Romance	5.3	204
144	tt0000147	movie	The Corbett-Fitzsimmons Fight	The Corbett-Fitzsimmons Fight	False	1897	100.0	Documentary,News,Sport	5.3	469
326	tt0000502	movie	Bohemios	Bohemios	False	1905	100.0	\N	4.1	15
358	tt0000574	movie	The Story of the Kelly Gang	The Story of the Kelly Gang	False	1906	70.0	Action,Adventure,Biography	6.0	826
366	tt0000591	movie	The Prodigal Son	L'enfant prodigue	False	1907	90.0	Drama	4.4	20