Data Exploration

Data Understanding

Code
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the cleaned movie-ratings table from disk and preview its first rows
movie_ratings_df = pd.read_csv(
    '../data-cleaning/ml-latest-small/movie_ratings_cleaned.csv'
)
print(movie_ratings_df.head())

# Table dimensions: DataFrame.shape is (number of rows, number of columns)
num_rows, num_cols = movie_ratings_df.shape

print("Nrows = ", num_rows, "\nNcol=", num_cols, "\nMatrix entries = ", num_rows * num_cols)

# Column labels and the dtype pandas assigned to each column
keys_dtype = movie_ratings_df.dtypes
keys = movie_ratings_df.columns

# Banner, then the same facts again in labelled form
print('--------------------', 'GENERAL INFORMATION:', '--------------------', sep='\n')
print(f'number of rows: {num_rows}')
print(f'number of col: {num_cols}')
print(f'keys: {keys} {keys_dtype}')
   movieId             title     genres  userId  rating_x  timestamp  \
0        1  Toy Story (1995)  Adventure       1       4.0  964982703   
1        1  Toy Story (1995)  Animation       1       4.0  964982703   
2        1  Toy Story (1995)   Children       1       4.0  964982703   
3        1  Toy Story (1995)     Comedy       1       4.0  964982703   
4        1  Toy Story (1995)    Fantasy       1       4.0  964982703   

   avg_rating  
0     3.92093  
1     3.92093  
2     3.92093  
3     3.92093  
4     3.92093  
Nrows =  274480 
Ncol= 7 
Matrix entries =  1921360
--------------------
GENERAL INFORMATION:
--------------------
number of rows: 274480
number of col: 7
keys: Index(['movieId', 'title', 'genres', 'userId', 'rating_x', 'timestamp',
       'avg_rating'],
      dtype='object') movieId         int64
title          object
genres         object
userId          int64
rating_x      float64
timestamp       int64
avg_rating    float64
dtype: object
Code
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
movie_ratings_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274480 entries, 0 to 274479
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   movieId     274480 non-null  int64  
 1   title       274480 non-null  object 
 2   genres      274480 non-null  object 
 3   userId      274480 non-null  int64  
 4   rating_x    274480 non-null  float64
 5   timestamp   274480 non-null  int64  
 6   avg_rating  274480 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 14.7+ MB
None

Descriptive Statistics

Code
# Banner, then describe(): count, mean, std, min, quartiles and max for each
# numeric column of the table.
# NOTE(review): the head() preview shows the same rating repeated once per
# genre, so these statistics are presumably weighted by the number of genres
# a movie has -- confirm whether that is intended.
print('----------------------', 'BASIC STATISTICS:', '----------------------', sep='\n')
print(movie_ratings_df.describe())
----------------------
BASIC STATISTICS:
----------------------
             movieId         userId       rating_x     timestamp  \
count  274480.000000  274480.000000  274480.000000  2.744800e+05   
mean    20302.543398     324.707607       3.517797  1.213584e+09   
std     35369.990843     182.550241       1.034705  2.167614e+08   
min         1.000000       1.000000       0.500000  8.281246e+08   
25%      1198.000000     177.000000       3.000000  1.030475e+09   
50%      3037.000000     323.000000       3.500000  1.196131e+09   
75%      8961.000000     477.000000       4.000000  1.442154e+09   
max    193609.000000     610.000000       5.000000  1.537799e+09   

          avg_rating  
count  274480.000000  
mean        3.517797  
std         0.540471  
min         0.500000  
25%         3.218750  
50%         3.590909  
75%         3.910714  
max         5.000000  
Code
# Exploring the relationship between users and their movie ratings.
# The table repeats each (userId, movieId) rating once per genre of the movie,
# so tabulating it directly counts every rating several times. Keep a single
# row per (userId, movieId) pair so each cell is the number of movies the user
# gave that rating to.
user_movie_ratings = movie_ratings_df.drop_duplicates(subset=['userId', 'movieId'])
cross_tab1 = pd.crosstab(user_movie_ratings['userId'], user_movie_ratings['rating_x'])
print(cross_tab1)
rating_x  0.5  1.0  1.5  2.0  2.5  3.0  3.5   4.0  4.5  5.0
userId                                                     
1           0    3    0   16    0   82    0   231    0  365
2           0    0    0    3    2    9   14    24   12   11
3          55    0    0    3    0    2    3     4   12   28
4           0   61    0   54    0   96    0   154    0  150
5           0    4    0   10    0   46    0    36    0   32
...       ...  ...  ...  ...  ...  ...  ...   ...  ...  ...
606        14   14   31   67  219  327  556  1019  331   91
607         0   10    0   32    0  183    0   145    0  149
608        80   68   68  167  249  473  362   467  268   74
609         0    0    0    0    0   67    0    24    0    0
610         8   33   32  115  217  677  892   802  417  521

[610 rows x 10 columns]
Code
# Exploring the relationship between movies and their ratings.
# The table repeats each (userId, movieId) rating once per genre of the movie,
# so tabulating it directly multiplies a movie's counts by its number of
# genres. Keep a single row per (userId, movieId) pair so each cell is the
# number of users who gave the movie that rating.
ratings_per_movie = movie_ratings_df.drop_duplicates(subset=['userId', 'movieId'])
cross_tab2 = pd.crosstab(ratings_per_movie['movieId'], ratings_per_movie['rating_x'])
print(cross_tab2)
rating_x  0.5  1.0  1.5  2.0  2.5  3.0  3.5  4.0  4.5  5.0
movieId                                                   
1           5    0    5   30   40  170   90  410   90  235
2           3    3    6   15   36   84   33  108   21   21
3           2    6    2    4    4   40   10   24    0   12
4           0    3    3    3    0   12    0    0    0    0
5           1    0    2    6    2   25    1    8    1    3
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
193581      0    0    0    0    0    0    0    4    0    0
193583      0    0    0    0    0    0    3    0    0    0
193585      0    0    0    0    0    0    1    0    0    0
193587      0    0    0    0    0    0    2    0    0    0
193609      0    0    0    0    0    0    0    1    0    0

[9724 rows x 10 columns]

Data Visualization

Code
# Histogram of individual ratings.
# Ratings only take half-star values from 0.5 to 5.0, so use one bin per value
# (edges at 0.25, 0.75, ..., 5.25); 20 equal-width bins over that range leave
# unevenly spaced empty bins between the bars.
rating_bin_edges = np.arange(0.25, 5.5, 0.5)

# Each (userId, movieId) rating is repeated once per genre in the table; keep
# one row per pair so multi-genre movies are not over-counted.
distinct_ratings = movie_ratings_df.drop_duplicates(subset=['userId', 'movieId'])

sns.histplot(distinct_ratings['rating_x'], bins=rating_bin_edges, kde=False,
             color=sns.xkcd_rgb['flat blue'])
plt.title('Movie Rating Distribution Histogram')
# Render the figure explicitly (as the heatmap cell does) instead of echoing
# the Text object returned by plt.title().
plt.show()
Text(0.5, 1.0, 'Movie Rating Distribution Histogram')

Code
# Box plot of the rating distribution (x axis) for each genre (y axis);
# columns are selected by name from the ratings table.
sns.boxplot(data=movie_ratings_df, x='rating_x', y='genres')
plt.title('Genre Rating Distribution')
Text(0.5, 1.0, 'Genre Rating Distribution')

Correlation Analysis

Code
print("----------------------")
print("PEARSON CORRELATION MATRIX:")
print("----------------------")
# Pairwise Pearson correlation between the numeric columns only
corr = movie_ratings_df.corr(numeric_only=True)
print(corr)

# Hide the upper triangle (diagonal included): the matrix is symmetric, so the
# lower triangle already shows every pair once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(7, 5))  # initialize figure

cmap = sns.diverging_palette(230, 20, as_cmap=True)  # custom diverging colormap

# Draw the heatmap on the axes created above. No fixed vmax is passed: a cap
# of 0.3 would saturate any stronger correlation, so the colour range is left
# to be derived from the data, kept symmetric around 0 by center=0.
sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.show()
----------------------
PEARSON CORRELATION MATRIX:
----------------------
             movieId    userId  rating_x  timestamp  avg_rating
movieId     1.000000  0.014596 -0.002585   0.507722   -0.004949
userId      0.014596  1.000000 -0.044138   0.101638   -0.057814
rating_x   -0.002585 -0.044138  1.000000  -0.002031    0.522343
timestamp   0.507722  0.101638 -0.002031   1.000000    0.064190
avg_rating -0.004949 -0.057814  0.522343   0.064190    1.000000

Hypothesis Generation

Data Grouping and Segmentation

Identifying Outliers

Report and discuss your methods and findings

Tools and Software