Data Exploration

Data Understanding

Code

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the cleaned movie-ratings CSV and preview its first rows
movie_ratings_df = pd.read_csv('../data-cleaning/ml-latest-small/movie_ratings_cleaned.csv')
print(movie_ratings_df.head())

# Table dimensions: .shape is (number of rows, number of columns)
num_rows, num_cols = movie_ratings_df.shape

print("Nrows = ", num_rows, "\nNcol=", num_cols, "\nMatrix entries = ", num_rows * num_cols)

# Column labels and the dtype pandas inferred for each column
keys = movie_ratings_df.columns
keys_dtype = movie_ratings_df.dtypes

print('--------------------', 'GENERAL INFORMATION:', '--------------------', sep='\n')
print(
    f'number of rows: {num_rows}',
    f'number of col: {num_cols}',
    f'keys: {keys} {keys_dtype}',
    sep='\n',
)

   movieId             title     genres  userId  rating_x  timestamp  \
0        1  Toy Story (1995)  Adventure       1       4.0  964982703   
1        1  Toy Story (1995)  Animation       1       4.0  964982703   
2        1  Toy Story (1995)   Children       1       4.0  964982703   
3        1  Toy Story (1995)     Comedy       1       4.0  964982703   
4        1  Toy Story (1995)    Fantasy       1       4.0  964982703   

   avg_rating  
0     3.92093  
1     3.92093  
2     3.92093  
3     3.92093  
4     3.92093  
Nrows =  274480 
Ncol= 7 
Matrix entries =  1921360
--------------------
GENERAL INFORMATION:
--------------------
number of rows: 274480
number of col: 7
keys: Index(['movieId', 'title', 'genres', 'userId', 'rating_x', 'timestamp',
       'avg_rating'],
      dtype='object') movieId         int64
title          object
genres         object
userId          int64
rating_x      float64
timestamp       int64
avg_rating    float64
dtype: object

Code

# DataFrame.info() writes its report (index range, columns, non-null counts,
# dtypes, memory usage) straight to stdout and returns None, so it is called
# directly: wrapping it in print() only appends a stray "None" line.
movie_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274480 entries, 0 to 274479
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   movieId     274480 non-null  int64  
 1   title       274480 non-null  object 
 2   genres      274480 non-null  object 
 3   userId      274480 non-null  int64  
 4   rating_x    274480 non-null  float64
 5   timestamp   274480 non-null  int64  
 6   avg_rating  274480 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 14.7+ MB
None

Descriptive Statistics

Code

# Section banner followed by the summary statistics of the numeric columns
print('----------------------', 'BASIC STATISTICS:', '----------------------', sep='\n')

# describe() reports count, mean, std, min, quartiles and max per numeric column
summary_stats = movie_ratings_df.describe()
print(summary_stats)

----------------------
BASIC STATISTICS:
----------------------
             movieId         userId       rating_x     timestamp  \
count  274480.000000  274480.000000  274480.000000  2.744800e+05   
mean    20302.543398     324.707607       3.517797  1.213584e+09   
std     35369.990843     182.550241       1.034705  2.167614e+08   
min         1.000000       1.000000       0.500000  8.281246e+08   
25%      1198.000000     177.000000       3.000000  1.030475e+09   
50%      3037.000000     323.000000       3.500000  1.196131e+09   
75%      8961.000000     477.000000       4.000000  1.442154e+09   
max    193609.000000     610.000000       5.000000  1.537799e+09   

          avg_rating  
count  274480.000000  
mean        3.517797  
std         0.540471  
min         0.500000  
25%         3.218750  
50%         3.590909  
75%         3.910714  
max         5.000000

Code

#Exploring the relationship between users and their movie ratings
# The cleaned table repeats a rating once for every genre of the rated movie,
# so counting rows directly inflates the counts of multi-genre movies.
# Keep a single row per (userId, movieId) so every rating is counted once.
# NOTE(review): assumes a user rates a given movie at most once - confirm.
user_rating_rows = movie_ratings_df.drop_duplicates(subset=['userId', 'movieId'])

# Rows: users, columns: rating values, cells: number of ratings given
cross_tab1 = pd.crosstab(user_rating_rows['userId'], user_rating_rows['rating_x'])
print(cross_tab1)

rating_x  0.5  1.0  1.5  2.0  2.5  3.0  3.5   4.0  4.5  5.0
userId                                                     
1           0    3    0   16    0   82    0   231    0  365
2           0    0    0    3    2    9   14    24   12   11
3          55    0    0    3    0    2    3     4   12   28
4           0   61    0   54    0   96    0   154    0  150
5           0    4    0   10    0   46    0    36    0   32
...       ...  ...  ...  ...  ...  ...  ...   ...  ...  ...
606        14   14   31   67  219  327  556  1019  331   91
607         0   10    0   32    0  183    0   145    0  149
608        80   68   68  167  249  473  362   467  268   74
609         0    0    0    0    0   67    0    24    0    0
610         8   33   32  115  217  677  892   802  417  521

[610 rows x 10 columns]

Code

#Exploring the relationship between movies and their ratings
# The cleaned table repeats a rating once for every genre of the rated movie,
# so counting rows directly multiplies each movie's counts by its genre count.
# Keep a single row per (userId, movieId) so every rating is counted once.
# NOTE(review): assumes a user rates a given movie at most once - confirm.
movie_rating_rows = movie_ratings_df.drop_duplicates(subset=['userId', 'movieId'])

# Rows: movies, columns: rating values, cells: number of ratings received
cross_tab2 = pd.crosstab(movie_rating_rows['movieId'], movie_rating_rows['rating_x'])
print(cross_tab2)

rating_x  0.5  1.0  1.5  2.0  2.5  3.0  3.5  4.0  4.5  5.0
movieId                                                   
1           5    0    5   30   40  170   90  410   90  235
2           3    3    6   15   36   84   33  108   21   21
3           2    6    2    4    4   40   10   24    0   12
4           0    3    3    3    0   12    0    0    0    0
5           1    0    2    6    2   25    1    8    1    3
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
193581      0    0    0    0    0    0    0    4    0    0
193583      0    0    0    0    0    0    3    0    0    0
193585      0    0    0    0    0    0    1    0    0    0
193587      0    0    0    0    0    0    2    0    0    0
193609      0    0    0    0    0    0    0    1    0    0

[9724 rows x 10 columns]

Data Visualization

Code

# The cleaned table repeats a rating once for every genre of the rated movie;
# keep one row per (userId, movieId) so multi-genre movies are not over-counted.
# NOTE(review): assumes a user rates a given movie at most once - confirm.
unique_ratings = movie_ratings_df.drop_duplicates(subset=['userId', 'movieId'])

# Ratings take the values 0.5, 1.0, ..., 5.0, so use one 0.5-wide bin centred
# on each value instead of 20 equal-width bins, which leaves empty gap bins.
sns.histplot(unique_ratings['rating_x'], binwidth=0.5, binrange=(0.25, 5.25),
             kde=False, color=sns.xkcd_rgb['flat blue'])
plt.title('Movie Rating Distribution Histogram')
# Render the figure explicitly rather than echoing the Text object of the title
plt.show()

Text(0.5, 1.0, 'Movie Rating Distribution Histogram')

Code

# Box plot of the rating spread per genre: ratings on the x-axis, one box per genre
genre_ax = sns.boxplot(data=movie_ratings_df, x='rating_x', y='genres')
genre_ax.set_title('Genre Rating Distribution')

Text(0.5, 1.0, 'Genre Rating Distribution')

Correlation Analysis

Code

print("----------------------")
print("PEARSON CORRELATION MATRIX:")
print("----------------------")
# Pairwise Pearson correlation of the numeric columns (text columns are skipped)
# NOTE(review): movieId and userId look like identifiers; their correlations
# are probably not meaningful - confirm whether they should be excluded.
corr = movie_ratings_df.corr(numeric_only=True)
print(corr)

# Mask the upper triangle (diagonal included): the matrix is symmetric, so the
# lower triangle already shows every pair once
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(7, 5))  # initialize figure

cmap = sns.diverging_palette(230, 20, as_cmap=True)  # custom diverging colormap

# Draw the heatmap on the axes created above. The colour scale spans the full
# [-1, 1] correlation range: a 0.3 cap would saturate the stronger pairs
# (about 0.5) and make them indistinguishable from 0.3. Cells are annotated
# so the values can be read off the figure.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            annot=True, fmt=".2f", square=True, linewidths=.5,
            cbar_kws={"shrink": .5}, ax=ax)
plt.show()

----------------------
PEARSON CORRELATION MATRIX:
----------------------
             movieId    userId  rating_x  timestamp  avg_rating
movieId     1.000000  0.014596 -0.002585   0.507722   -0.004949
userId      0.014596  1.000000 -0.044138   0.101638   -0.057814
rating_x   -0.002585 -0.044138  1.000000  -0.002031    0.522343
timestamp   0.507722  0.101638 -0.002031   1.000000    0.064190
avg_rating -0.004949 -0.057814  0.522343   0.064190    1.000000

Data Exploration

Data Understanding

Descriptive Statistics

Data Visualization

Correlation Analysis

Hypothesis Generation

Data Grouping and Segmentation

Identifying Outliers

Report and discuss your methods and findings

Tools and Software