In [30]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
#### making results reproducible
np.random.seed(42)
# utils import
from fuzzywuzzy import fuzz
# Data source: MovieLens "latest" datasets - https://grouplens.org/datasets/movielens/latest/
In [4]:
# Where the MovieLens CSV files live and what they are called.
DATA_FOLDER = '/home/lenovo/workspace/prepare/data/ml-latest-small/'
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

# Movies: only the id and the title are needed.
df_movies = pd.read_csv(
    os.path.join(DATA_FOLDER, movies_filename),
    usecols=['movieId', 'title'],
    dtype=dict(movieId='int32', title='str'),
)

# Ratings: one row per (user, movie, rating); 32-bit dtypes keep the frame small.
df_ratings = pd.read_csv(
    os.path.join(DATA_FOLDER, ratings_filename),
    usecols=['userId', 'movieId', 'rating'],
    dtype=dict(userId='int32', movieId='int32', rating='float32'),
)
In [5]:
# data = pd.read_csv("/home/lenovo/workspace/prepare/data/ml-latest-small/")
In [6]:
# preview the first rows of the movies table
df_movies.head()
Out[6]:
In [7]:
# preview the first rows of the ratings table
df_ratings.head()
Out[7]:
In [8]:
# Nearest-neighbour searcher using cosine distance with brute-force search.
# NOTE(review): an identically configured model_knn is constructed again right before
# fitting further down, so this instance is overwritten before it is ever fitted.
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
In [11]:
# number of distinct users and distinct movies appearing in the ratings table
num_users = df_ratings['userId'].nunique()
num_items = df_ratings['movieId'].nunique()
print('There are {} unique users and {} unique movies in this data set'.format(num_users, num_items))
In [12]:
# how many times each rating score was given
df_ratings_cnt_tmp = df_ratings.groupby('rating').size().to_frame(name='count')
df_ratings_cnt_tmp
Out[12]:
In [13]:
# Every (user, movie) pair without an explicit rating is counted as a rating of zero.
total_cnt = num_users * num_items
rating_zero_cnt = total_cnt - df_ratings.shape[0]
# Add the zero-rating count as an extra row keyed by 0.0.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement and
# accepts the same verify_integrity flag (raises if 0.0 is already present in the index).
df_ratings_cnt = pd.concat(
    [df_ratings_cnt_tmp, pd.DataFrame({'count': rating_zero_cnt}, index=[0.0])],
    verify_integrity=True,
).sort_index()
df_ratings_cnt
Out[13]:
In [14]:
# add log count (natural log via np.log); the new column is written into df_ratings_cnt in place
df_ratings_cnt['log_count'] = np.log(df_ratings_cnt['count'])
df_ratings_cnt
Out[14]:
In [15]:
# Bar chart of how often each rating score occurs; logy=True draws the y axis on a log scale.
rating_score_counts = df_ratings_cnt[['count']].reset_index()
rating_score_counts = rating_score_counts.rename(columns={'index': 'rating score'})
ax = rating_score_counts.plot(
    x='rating score',
    y='count',
    kind='bar',
    figsize=(12, 8),
    title='Count for Each Rating Score (in Log Scale)',
    logy=True,
    fontsize=12,
)
ax.set_xlabel("movie rating score")
ax.set_ylabel("number of ratings")
Out[15]:
In [16]:
# preview the ratings table again before aggregating per movie
df_ratings.head()
Out[16]:
In [17]:
# number of ratings each movie received, indexed by movieId
df_movies_cnt = df_ratings.groupby('movieId').size().to_frame(name='count')
df_movies_cnt.head()
Out[17]:
In [18]:
# plot rating frequency of all movies, ordered from most-rated to least-rated
ax = (
    df_movies_cnt
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
    .plot(
        figsize=(12, 8),
        title='Rating Frequency of All Movies',
        fontsize=12,
    )
)
# reset_index(drop=True) discards movieId, so the x axis is the movie's position in the
# sorted order (its rank), not its id; label it accordingly.
ax.set_xlabel("movie rank (by number of ratings)")
ax.set_ylabel("number of ratings")
Out[18]:
In [19]:
# ratings-per-movie at quantiles starting at 1.0 and stepping down by 0.05 towards 0.6
df_movies_cnt['count'].quantile(np.arange(1, 0.6, -0.05))
Out[19]:
In [20]:
# keep only the ratings of movies that received at least `popularity_thres` ratings
popularity_thres = 50
is_popular = df_movies_cnt['count'] >= popularity_thres
popular_movies = list(set(df_movies_cnt[is_popular].index))
df_ratings_drop_movies = df_ratings.loc[df_ratings['movieId'].isin(popular_movies)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular movies: ', df_ratings_drop_movies.shape)
In [21]:
# number of ratings each user gave (counted after unpopular movies were dropped)
df_users_cnt = df_ratings_drop_movies.groupby('userId').size().to_frame(name='count')
df_users_cnt.head()
Out[21]:
In [22]:
# plot rating frequency of all users, ordered from most-active to least-active
ax = (
    df_users_cnt
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
    .plot(
        figsize=(12, 8),
        title='Rating Frequency of All Users',
        fontsize=12,
    )
)
# reset_index(drop=True) discards userId, so the x axis is the user's position in the
# sorted order (their rank), not their id; label it accordingly.
ax.set_xlabel("user rank (by number of ratings)")
ax.set_ylabel("number of ratings")
Out[22]:
In [23]:
# ratings-per-user at quantiles starting at 1.0 and stepping down by 0.05 towards 0.5
df_users_cnt['count'].quantile(np.arange(1, 0.5, -0.05))
Out[23]:
In [24]:
# keep only the ratings of users who gave at least `ratings_thres` ratings
ratings_thres = 50
is_active = df_users_cnt['count'] >= ratings_thres
active_users = list(set(df_users_cnt[is_active].index))
df_ratings_drop_users = df_ratings_drop_movies.loc[df_ratings_drop_movies['userId'].isin(active_users)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping both unpopular movies and inactive users: ', df_ratings_drop_users.shape)
In [25]:
# csr_matrix is not imported anywhere else in the notebook; without this import the
# last statement of the cell raises NameError on a fresh kernel.
from scipy.sparse import csr_matrix

# pivot and create movie-user matrix: one row per movie, one column per user,
# missing ratings filled with 0
movie_user_mat = df_ratings_drop_users.pivot(index='movieId', columns='userId', values='rating').fillna(0)
# create mapper from movie title to row position in movie_user_mat
# NOTE(review): if two movieIds share the same title, the later row overwrites the
# earlier one in this dict - confirm titles are unique after filtering.
movie_to_idx = {
    movie: i for i, movie in
    enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title))
}
# transform matrix to scipy sparse matrix
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)
In [35]:
# display the sparse movie-user matrix's summary repr
movie_user_mat_sparse
Out[35]:
In [26]:
# Set the JOBLIB_TEMP_FOLDER environment variable to /tmp for this kernel
# (presumably so joblib's parallel workers use /tmp for temporary files - confirm).
%env JOBLIB_TEMP_FOLDER=/tmp
# define model: cosine distance, brute-force search, 20 neighbours by default
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
# fit on the sparse movie-user matrix (each row is one movie)
model_knn.fit(movie_user_mat_sparse)
Out[26]:
In [36]:
def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    Find the movie whose title is the closest fuzzy match to the user's input.

    Parameters
    ----------
    mapper: dict, movie title -> index of the movie in data
    fav_movie: str, movie name typed by the user
    verbose: bool, when True print the titles of all candidate matches

    Return
    ------
    index of the best-scoring title, or None when no title reaches a ratio of 60
    """
    candidates = []
    # collect every title whose fuzz.ratio against the input is at least 60
    for title, idx in mapper.items():
        score = fuzz.ratio(title.lower(), fav_movie.lower())
        if score < 60:
            continue
        candidates.append((title, idx, score))
    # stable ascending sort followed by a reversal puts the best score first
    candidates.sort(key=lambda entry: entry[2])
    candidates.reverse()
    if len(candidates) == 0:
        print('Oops! No match is found')
        return None
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([entry[0] for entry in candidates]))
    best = candidates[0]
    return best[1]
def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    Print the top n movies most similar to the user's input movie.

    Parameters
    ----------
    model_knn: sklearn model, knn model (it is re-fitted on `data` on every call)
    data: movie-user matrix
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    n_recommendations: int, top n recommendations

    Return
    ------
    None. The recommendations are printed; nothing is recommended when
    `fav_movie` has no fuzzy match in `mapper`.
    """
    # fit
    model_knn.fit(data)
    # get input movie index
    print('You have input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    print('idx is', idx)
    if idx is None:
        # fuzzy_matching already reported that nothing matched; data[None] would not
        # select a movie row, so stop here instead of failing inside kneighbors
        return
    # inference
    print('Recommendation system start to make inference')
    print('......\n')
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    # ravel() (unlike squeeze()) still yields a list when only one neighbour is returned
    neighbours = sorted(
        zip(indices.ravel().tolist(), distances.ravel().tolist()),
        key=lambda pair: pair[1],
    )
    # drop the closest hit (expected to be the query movie itself) and reverse the rest,
    # so the remaining neighbours are listed from farthest to nearest
    raw_recommends = neighbours[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (movie_idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[movie_idx], dist))
In [37]:
# ask for the ten titles closest to the chosen favourite
my_favorite = 'Iron Man'
make_recommendation(
    model_knn,
    movie_user_mat_sparse,
    movie_to_idx,
    my_favorite,
    10,
)
In [ ]: