Movie correlation project in Python.
The goal is to find out which factors (budget, votes, production company, and possibly others) correlate with a movie's gross revenue.
View On Github
Code
# Import the libraries used for data handling and plotting
import pandas as pd
import matplotlib
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Use the "ggplot" style sheet for the matplotlib figures
plt.style.use("ggplot")
# IPython magic: render figures inline in the notebook
%matplotlib inline
from matplotlib.pyplot import figure
matplotlib.rcParams["figure.figsize"] = (12, 8) # Set the default figure size for the plots created below
# Load the movies dataset from the CSV file into a DataFrame
df = pd.read_csv("movies.csv")
# Preview the first rows of the data
df.head()
name rating genre year released score votes director writer star country budget gross company runtime
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0
# Report, for every column, the percentage of rows whose value is missing
missing_fraction = df.isnull().mean()
for column_name, fraction in missing_fraction.items():
    print("{} - {}%".format(column_name, round(fraction * 100, 2)))
name - 2.03%
rating - 1.0%
genre - 0.0%
year - 0.0%
released - 0.03%
score - 0.04%
votes - 0.0%
director - 0.0%
writer - 0.04%
star - 0.01%
country - 0.04%
budget - 0.0%
gross - 0.0%
company - 0.22%
runtime - 0.0%
# Show the data type of each column
df.dtypes
name object
rating object
genre object
year int64
released object
score float64
votes float64
director object
writer object
star object
country object
budget float64
gross float64
company object
runtime float64
dtype: object
# Replace missing values with 0 in the numeric columns and then store them as
# 64-bit integers (the fill has to happen before the integer cast).
for numeric_col in ("budget", "votes", "gross", "runtime"):
    df[numeric_col] = df[numeric_col].fillna(0).astype("int64")
df
name rating genre year released score votes director writer star country budget gross company runtime
5445 Avatar PG-13 Action 2009 2009-12-18 7.8 1100000 James Cameron James Cameron Sam Worthington United States 237000000 2847246203 Twentieth Century Fox 162
7445 Avengers: Endgame PG-13 Action 2019 2019-04-26 8.4 903000 Anthony Russo Christopher Markus Robert Downey Jr. United States 356000000 2797501328 Marvel Studios 181
3045 Titanic PG-13 Drama 1997 1997-12-19 7.8 1100000 James Cameron James Cameron Leonardo DiCaprio United States 200000000 2201647264 Twentieth Century Fox 194
6663 Star Wars: Episode VII - The Force Awakens PG-13 Action 2015 2015-12-18 7.8 876000 J.J. Abrams Lawrence Kasdan Daisy Ridley United States 245000000 2069521700 Lucasfilm 138
7244 Avengers: Infinity War PG-13 Action 2018 2018-04-27 8.4 897000 Anthony Russo Christopher Markus Robert Downey Jr. United States 321000000 2048359754 Marvel Studios 149
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1617 Mortal Passions R Crime 1990 1990-01-26 4.5 274 Andrew Lane Alan Moskowitz Zach Galligan United States 0 0 Gibraltar Entertainment 92
1614 Edge of Sanity R Horror 1989 1989-04-14 5.2 1300 Gérard Kikoïne J.P. Félix Anthony Perkins United Kingdom 0 0 Allied Vision 85
1606 I, Madman R Fantasy 1989 1989-04-07 6.0 2900 Tibor Takács David Chaskin Jenny Wright Canada 0 0 Trans World Entertainment (TWE) 89
1601 My Twentieth Century NaN Comedy 1990 1990-01-13 7.1 1500 Ildikó Enyedi Ildikó Enyedi Dorota Segda Hungary 0 0 Budapest Stúdió Vállalat 104
7667 Tee em el NaN Horror 2020 2020-08-19 5.7 7 Pereko Mosia Pereko Mosia Siyabonga Mabaso South Africa 0 0 PK 65 Films 102
7668 rows × 15 columns
# Order the movies from highest to lowest gross revenue
df = df.sort_values(by="gross", ascending=False)
# Limit how many rows pandas prints when a DataFrame is displayed
pd.set_option("display.max_rows", 12)
# Keep only the date part of 'released' by cutting off the "(country)" suffix,
# and strip the whitespace that is left in front of the parenthesis
df["released"] = df["released"].str.split("(").str.get(0).str.strip()
0 1980-06-13
1 1980-07-02
2 1980-06-20
3 1980-07-02
4 1980-07-25
...
7663 2020-10-23
7664 2020-02-07
7665 2020-04-27
7666 2020-10-01
7667 2020-08-19
Name: released, Length: 7668, dtype: datetime64[ns]
# Parse the cleaned 'released' strings into datetime values
df["released"] = pd.to_datetime(df["released"])
# Correct the 'year' column so that it matches the release date
# (the first four characters of the date string, so 'year' is stored as text)
df["year"] = df["released"].astype(str).str[:4]
# Drop any duplicate rows. The rows are removed as a whole: assigning a
# de-duplicated 'name' Series back to the column would only blank out repeated
# names (index alignment fills them with NaN) without removing anything.
df = df.drop_duplicates()
-- There are no duplicates.
-- Budget: high correlation.
-- Company: high correlation.
# Box plot of the 'gross' column
df.boxplot(column=["gross"])
# Total gross revenue per production company
CompanyGrossSum = df.groupby(["company"])[["gross"]].sum()
# Keep the 15 companies with the highest total gross as an int64 Series
CompanyGrossSumSorted = (
    CompanyGrossSum
    .sort_values(["gross", "company"], ascending=False)
    .head(15)["gross"]
    .astype("int64")
)
CompanyGrossSumSorted
company
Warner Bros. 56491421806
Universal Pictures 52514188890
Columbia Pictures 43008941346
Paramount Pictures 40493607415
Twentieth Century Fox 40257053857
Walt Disney Pictures 36327887792
New Line Cinema 19883797684
Marvel Studios 15065592411
DreamWorks Animation 11873612858
Touchstone Pictures 11795832638
Dreamworks Pictures 11635441081
Metro-Goldwyn-Mayer (MGM) 9230230105
Summit Entertainment 8373718838
Pixar Animation Studios 7886344526
Fox 2000 Pictures 7443502667
Name: gross, dtype: int64
# Regression plot of score vs. gross revenue (green points, red fitted line)
sns.regplot(x="score", y="gross", data=df, scatter_kws={"color": "green"}, line_kws={"color": "red"})
# Scatter plot of budget vs. gross revenue
plt.scatter(x=df["budget"], y=df["gross"])
plt.title("Budget vs Gross Earnings")
plt.xlabel("Film Budget")
plt.ylabel("Gross Earnings")
# Regression plot of budget vs. gross revenue using seaborn
# (blue points, red fitted line)
sns.regplot(x="budget", y="gross", data=df, scatter_kws={"color": "blue"}, line_kws={"color": "red"})
# Let's start looking at correlation.
# Only the numeric columns are correlated: df still contains text columns here,
# and recent pandas versions raise an error instead of silently skipping them.
# Available methods: "pearson", "kendall", "spearman".
df.select_dtypes(include="number").corr(method="pearson")
score votes budget gross runtime
score 1.000000 0.409182 0.055665 0.186392 0.398387
votes 0.409182 1.000000 0.486931 0.632870 0.306984
budget 0.055665 0.486931 1.000000 0.750157 0.268372
gross 0.186392 0.632870 0.750157 1.000000 0.244339
runtime 0.398387 0.306984 0.268372 0.244339 1.000000
# Pearson correlation between the numeric columns, drawn as an annotated heatmap
# (text columns are excluded first so that corr() does not fail on them)
correlation_matrix = df.select_dtypes(include="number").corr(method="pearson")
sns.heatmap(correlation_matrix, annot=True)
plt.title("Correlation matrix for Movies")
plt.xlabel("Movie features")
plt.ylabel("Movie features")
# Work on an independent copy so that encoding the text columns as numbers
# does not overwrite the original values in df
df_numerized = df.copy()
df_numerized
name rating genre year released score votes director writer star country budget gross company runtime
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7663 More to Life NaN Drama 2020 October 23, 2020 (United States) 3.1 18.0 Joseph Ebanks Joseph Ebanks Shannon Bond United States 7000.0 NaN NaN 90.0
7664 Dream Round NaN Comedy 2020 February 7, 2020 (United States) 4.7 36.0 Dusty Dukatz Lisa Huston Michael Saquella United States NaN NaN Cactus Blue Entertainment 90.0
7665 Saving Mbango NaN Drama 2020 April 27, 2020 (Cameroon) 5.7 29.0 Nkanya Nkwai Lynno Lovert Onyama Laura United States 58750.0 NaN Embi Productions NaN
7666 It's Just Us NaN Drama 2020 October 1, 2020 (United States) NaN NaN James Randall James Randall Christina Roz United States 15000.0 NaN NaN 120.0
7667 Tee em el NaN Horror 2020 August 19, 2020 (United States) 5.7 7.0 Pereko Mosia Pereko Mosia Siyabonga Mabaso South Africa NaN NaN PK 65 Films 102.0
7668 rows × 15 columns
# Turn every object-dtype column into integer category codes so that it can be
# included in the correlation matrix
object_columns = [
    label for label in df_numerized.columns
    if df_numerized[label].dtype == 'object'
]
for col_name in object_columns:
    df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes
# Alternative one-liner: df.apply(lambda x: x.factorize()[0]).corr()
# Pearson correlation of the numerically encoded data, drawn as an annotated heatmap
correlation_matrix = df_numerized.corr(method="pearson")
sns.heatmap(correlation_matrix, annot=True)
plt.title("Correlation matrix for Movies")
plt.xlabel("Movie features")
plt.ylabel("Movie features")
Text(87.0, 0.5, 'Movie features')
# Pairwise Pearson correlation (the default method) of the encoded columns
correlation_mat = df_numerized.corr(method="pearson")
correlation_mat
name rating genre year released score votes director writer star country budget gross company runtime
name 1.000000 -0.008069 0.016355 0.011453 -0.011311 0.017097 0.013088 0.009079 0.009081 0.006472 -0.010737 0.023970 0.005533 0.009211 0.010392
rating -0.008069 1.000000 0.072423 0.008779 0.016613 -0.001314 0.033225 0.019483 -0.005921 0.013405 0.081244 -0.176002 -0.107339 -0.032943 0.062145
genre 0.016355 0.072423 1.000000 -0.081261 0.029822 0.027965 -0.145307 -0.015258 0.006567 -0.005477 -0.037615 -0.356564 -0.235650 -0.071067 -0.052711
year 0.011453 0.008779 -0.081261 1.000000 -0.000695 0.097995 0.222945 -0.020795 -0.008656 -0.027242 -0.070938 0.329321 0.257486 -0.010431 0.120811
released -0.011311 0.016613 0.029822 -0.000695 1.000000 0.042788 0.016097 -0.001478 -0.002404 0.015777 -0.020427 0.014683 0.001659 -0.010474 0.000868
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
country -0.010737 0.081244 -0.037615 -0.070938 -0.020427 -0.133348 0.073625 0.017490 0.015343 -0.012998 1.000000 0.054063 0.092129 0.095548 -0.078412
budget 0.023970 -0.176002 -0.356564 0.329321 0.014683 0.076254 0.442429 -0.012272 -0.039451 -0.019589 0.054063 1.000000 0.740395 0.173214 0.320447
gross 0.005533 -0.107339 -0.235650 0.257486 0.001659 0.186258 0.630757 -0.014441 -0.023519 -0.002717 0.092129 0.740395 1.000000 0.154840 0.245216
company 0.009211 -0.032943 -0.071067 -0.010431 -0.010474 0.001030 0.133204 0.004404 0.005646 0.012442 0.095548 0.173214 0.154840 1.000000 0.034402
runtime 0.010392 0.062145 -0.052711 0.120811 0.000868 0.399451 0.309212 0.017624 -0.003511 0.010174 -0.078412 0.320447 0.245216 0.034402 1.000000
15 rows × 15 columns
# Flatten the correlation matrix into a Series indexed by pairs of column names
corr_pair = correlation_mat.unstack()
corr_pair
name name 1.000000
rating -0.008069
genre 0.016355
year 0.011453
released -0.011311
...
runtime country -0.078412
budget 0.320447
gross 0.245216
company 0.034402
runtime 1.000000
Length: 225, dtype: float64
# Show up to 20 rows when displaying a Series or DataFrame
pd.set_option("display.max_rows", 20)
# Order the column pairs from the lowest to the highest correlation
sorted_pairs = corr_pair.sort_values()
sorted_pairs
budget genre -0.356564
genre budget -0.356564
gross -0.235650
gross genre -0.235650
rating budget -0.176002
...
year year 1.000000
genre genre 1.000000
rating rating 1.000000
company company 1.000000
runtime runtime 1.000000
Length: 225, dtype: float64
# Keep only the pairs whose correlation is above 0.5
# (this includes each column paired with itself, which is always 1.0)
is_high = sorted_pairs > 0.5
high_corr = sorted_pairs[is_high]
high_corr
gross votes 0.630757
votes gross 0.630757
budget gross 0.740395
gross budget 0.740395
name name 1.000000
director director 1.000000
gross gross 1.000000
budget budget 1.000000
country country 1.000000
star star 1.000000
writer writer 1.000000
votes votes 1.000000
score score 1.000000
released released 1.000000
year year 1.000000
genre genre 1.000000
rating rating 1.000000
company company 1.000000
runtime runtime 1.000000
dtype: float64