Anime Recommendation System
imports
We've got some basic system imports, 🐼 and num🐍, 📊lib, and the fancy SimpleNamespace, which gives dictionaries dot access, because typing '"[] everywhere is annoying.
import os,re,zipfile
import pandas as pd
import numpy as np
from types import SimpleNamespace
from matplotlib import pyplot as plt
import itertools
plt.style.use('seaborn')  # renamed to 'seaborn-v0_8' in newer matplotlib versions
base_size = 10
sizes = SimpleNamespace(small=(base_size, base_size), medium=(2*base_size, 2*base_size), large=(3*base_size, 3*base_size))
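For example, a figure size can now be grabbed with dot access instead of a key lookup (a tiny illustration, not in the original):
sizes.medium  # -> (20, 20), no quoting needed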
Download the data
%%time
zip_name = 'anime-recommendations-database.zip'
if not os.path.exists(zip_name):
    os.environ['KAGGLE_USERNAME'] = ""  # username from the json file
    os.environ['KAGGLE_KEY'] = ""  # key from the json file
    !kaggle datasets download CooperUnion/anime-recommendations-database
Unzip the data
with zipfile.ZipFile(zip_name, 'r') as zip_ref:
    zip_ref.extractall(zip_name.split('.')[0])
Let's find the files
os.listdir(zip_name.split('.')[0])
Have a look through the dataset
anime = pd.read_csv(zip_name.split('.')[0] + '/anime.csv')
anime.genre = anime.genre.fillna('None')
anime
Let's get a list of unique genres
unique_genres = list(set([x.strip() for x in list(itertools.chain(*anime.genre.fillna('None').str.split(',')))]))
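How many did we find? This count is reused below as the number of clusters.
len(unique_genres)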
# add one indicator column per genre, initialized to 0
for gen in unique_genres:
    anime[gen] = 0
For each row we create something similar to the output of pd.get_dummies: we check the row's genres against all the unique ones, resulting in a vector of 1s and 0s (given genres vs unique_genres).
def binary_match(x, y):
    # flag each unique genre (y) with 1 if it appears in the row's genres (x)
    x = [t.strip() for t in x]
    y = [t.strip() for t in y]
    return {j: 1 if j in x else 0 for j in y}
test = anime.genre.apply(lambda x: binary_match(x.split(','),unique_genres))
binary_df = pd.DataFrame(test.values.tolist())
anime[unique_genres] = binary_df[unique_genres]
anime
Looks good, right?
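As an extra sanity check (a small aside, not in the original), the column sums give each genre's frequency:
anime[unique_genres].sum().sort_values(ascending=False)[:10]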
from sklearn.cluster import KMeans
We use the number of unique genres as the number of clusters for quick testing
kmeans = KMeans(n_clusters=len(unique_genres), random_state=0).fit(anime[unique_genres])
anime['clusters'] = kmeans.labels_
anime.clusters.value_counts().plot.barh(figsize=(10,10))
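That k was just a convenient default; a quick inertia sweep (a hedged sketch, the k range is arbitrary) can help pick a better one:
inertias = []
ks = list(range(10, 60, 10))
for k in ks:
    inertias.append(KMeans(n_clusters=k, random_state=0).fit(anime[unique_genres]).inertia_)
plt.plot(ks, inertias, marker='o')  # look for the elbow where inertia stops dropping quickly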
Then we define a search-by-term function: it finds the best-rated title matching the term, then returns all the anime from that title's cluster, sorted by rating.
def awesome_find(df, search_term):
    tmpdf = df.copy()
    for x in search_term.split():
        tmpdf = tmpdf[tmpdf.name.str.upper().str.contains(x.strip().upper())]
    # take the cluster of the best-rated match and return that whole cluster, sorted by rating
    best_cluster = tmpdf.sort_values(by='rating', ascending=False)[:1].clusters.values[0]
    return df[df.clusters == best_cluster].sort_values(by='rating', ascending=False)[['anime_id','name','genre','type','episodes','members']]
awesome_find(anime,'Akame ga Kill')
from wordcloud import WordCloud
def create_wc(list_of_lists):
    words = list()
    for x in list_of_lists:
        for y in x.split(','):
            words.append(y.strip())
    # Create and generate a word cloud image:
    wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(' '.join(words))
    return wordcloud
def plot_figures(figures, nrows=1, ncols=1):
    fig, axeslist = plt.subplots(ncols=ncols, nrows=nrows, figsize=(20,20))
    for ind, title in zip(range(len(figures)), figures):
        axeslist.ravel()[ind].imshow(figures[title])  # word clouds are RGB arrays, so no cmap is needed
        axeslist.ravel()[ind].set_title(f'Most Frequent words for the group {title+1}')
        axeslist.ravel()[ind].set_axis_off()
    plt.tight_layout()  # optional
clouds = dict()
for x in anime.clusters.unique():
    clouds[x] = create_wc(anime[anime.clusters == x].genre.values.tolist())
Let's see our clusters
plot_figures(clouds,9,5)
from sklearn.metrics.pairwise import cosine_similarity
Our function computes the cosine similarity between the genre vector of the given title and the genre vectors of all the others.
def search_cosine(df, search):
    tmpdf = df.copy()
    search_term = tmpdf[tmpdf.name == search][unique_genres].values
    tmpdf['similarity'] = cosine_similarity(search_term, tmpdf[unique_genres].values)[0].tolist()
    return tmpdf.sort_values(by='similarity', ascending=False)[['anime_id','name','genre','type','episodes','members','similarity']]
The only downside of this approach is that the search term has to be an exact title match; however, this can be solved with an Elasticsearch index (see the fuzzy-search sketch at the end).
src = search_cosine(anime,'Sword Art Online')
src
rating = pd.read_csv(zip_name.split('.')[0]+'/rating.csv')
rating
# attach each user's mean rating, then keep only the titles they rated at or above their own average
mrating = rating.merge(rating.groupby('user_id').agg({'rating':'mean'}).reset_index(),on='user_id')
mrating = mrating.drop(mrating[mrating.rating_x < mrating.rating_y].index)  # rating_x = individual rating, rating_y = user mean
mrating
from sklearn.decomposition import PCA
n_comps = 3
comp = pd.DataFrame(PCA(n_components=n_comps).fit_transform(mrating[['user_id','anime_id','rating_x']]), columns=[f'comp_{x}' for x in range(1, n_comps+1)])
comp['anime_id'] = mrating['anime_id'].values  # .values avoids index misalignment, since mrating's index has gaps after the drop above
comp = comp.merge(anime[['anime_id','name']], on='anime_id', how='inner')
comp
comp_grp = comp.groupby('name').mean().reset_index()
comp_grp.name = comp_grp.name.str.replace('"',"`")
comp_grp
import plotly.express as px
fig = px.scatter_3d(comp_grp, x='comp_1', y='comp_2', z='comp_3',color='name',size_max=7, opacity=0.7)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.write_html('pca.html')
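The HTML file can be opened in any browser; inside a notebook the figure can also be displayed inline:
fig.show()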
anime['genre_list'] = anime.genre.str.split(',').apply(lambda x: [y.strip() for y in x])
anime[:3]
rating[:3]
try:
    import elasticsearch
except ImportError:
    !pip install elasticsearch
    import elasticsearch
from elasticsearch.helpers import bulk
import requests as req
es_host = ''
es_port = ''
es_user = ''
es_pass = ''
print(req.get(f'{es_host}:{es_port}/_cat/indices',auth=(es_user, es_pass)).text)
es = elasticsearch.Elasticsearch(
[
f'{es_host}:{es_port}'
],
http_auth=(es_user, es_pass),
)
print(es.indices.get('*'))
def index_template(index, id, doc):
    # shape a document into the action format expected by elasticsearch.helpers.bulk
    return {
        '_index': index,
        '_id': id,
        '_source': doc
    }
anime[:2]
anime.anime_id.nunique() / len(anime)  # should be 1.0, i.e. anime_id is unique and safe to use as the document _id
anime_actions = [
    index_template('anime', x.anime_id, {
        'anime_id': x.anime_id,
        'name': x.name,
        'genres': x.genre_list,
        'type': x.type if str(x.type) != 'nan' else 'Unknown',
        'episodes': int(x.episodes) if x.episodes not in ['Unknown'] else 0,
        'rating': x.rating if str(x.rating) != 'nan' else 0
    }) for x in anime.itertuples()
]
anime_actions[:2]
%%time
bulk(es,anime_actions)
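A quick sanity check that the documents landed (assuming the bulk call above finished without errors):
res = es.search(index='anime', body={'query': {'match': {'name': 'Sword Art Online'}}})
for hit in res['hits']['hits'][:3]:
    print(hit['_score'], hit['_source']['name'])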
def positive(x):
    # the dataset uses -1 for "watched but not rated", clip it to 0
    return 0 if x < 0 else x
frating = rating.groupby('user_id').agg({
'anime_id': list,
'rating': list
}).reset_index()
frating['mixed'] = frating[['anime_id','rating']].apply(lambda x: [ {'anime_id': i, 'rating': positive(r) if str(r) != 'nan' else 0} for i,r in zip(x.anime_id,x.rating)] ,axis=1)
frating
user_actions = [
    index_template('anime-users', x.user_id, {
        'user_id': x.user_id,
        'rated': x.mixed
    }) for x in frating.itertuples()
]
user_actions[:2]
%%time
bulk(es,user_actions)
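And here is the fuzzy matching promised earlier: with the anime index in place, a match query with fuzziness tolerates typos in the title (a sketch, the misspelled query string is just an illustration):
res = es.search(index='anime', body={
    'query': {'match': {'name': {'query': 'Sord Art Onlin', 'fuzziness': 'AUTO'}}}
})
for hit in res['hits']['hits'][:5]:
    print(hit['_score'], hit['_source']['name'])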