## Movie recommendation

In [252]:
import os 
import pickle

path_data = r"data/movies"

with open(os.path.join(path_data,'movies_dict.pkl'), 'rb') as file:
    movies_data = pickle.load(file)

In [253]:
import pandas as pd
movies = pd.DataFrame(movies_data)
movies.drop_duplicates(inplace=True)

In [254]:
import re

def has_capital(string):
    for index, char in enumerate(string):
        if char.isupper() and index != 0:
            return True
    return False

def clean_tags(text):
    pattern1 = re.compile(r'[?!]')
    pattern2 = re.compile(r'\.(?!\s|$)')
    pattern3 = re.compile(r'\.[a-zA-Z]\.')
    
    text_clean = re.sub(pattern1, '. ', text)
    text_clean = re.sub(pattern2, "", text_clean)
    text_clean = re.sub(pattern3, "", text_clean)
    text_clean = text_clean.replace("RobertDowneyJr.","").replace("SamuelL.","").replace("ScienceFiction", "Sciencefiction")

    tags_words = " ".join([t for t in text_clean.split(" ") if has_capital(t)==False])
    tags_words = [t for t in tags_words.split(". ")[-1:][0].strip().split(" ")[:8] if t!=""]
    tags_words = [t for t in tags_words if t[0].isupper()==True]
    #tags_words_clean = [t for t in tags_words_clean if has_capital(t)==False]
    return " ".join(sorted(tags_words)).replace("Sciencefiction","Science Fiction")

In [255]:
movies["tags"] = movies["tags"].apply(lambda x: x.replace("…",".").replace("—","").replace("  "," "))
movies["description"] = movies["tags"].apply(lambda x: ".".join(x.split(".")[:-1] + [""]))
movies["tags_clean"] = movies["tags"].apply(clean_tags).apply(lambda x: x.replace("Science Fiction","Sciencefiction"))

In [256]:
movies

Unnamed: 0,movie_id,title,tags,description,tags_clean
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","In the 22nd century, a paraplegic Marine is di...",Action Adventure Fantasy Sciencefiction
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Captain Barbossa, long believed to be dead, ha...",Action Adventure Fantasy
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,A cryptic message from Bond’s past sends him o...,M While
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,Following the death of District Attorney Harve...,Action Crime Drama Thriller
4,49529,John Carter,"John Carter is a war-weary, former military ca...","John Carter is a war-weary, former military ca...",Action Adventure Sciencefiction
...,...,...,...,...,...
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,El Mariachi just wants to play his guitar and ...,Action Crime Thriller
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,A newlywed couple's honeymoon is upended by th...,Comedy Romance
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","""Signed, Sealed, Delivered"" introduces a dedic...",Comedy Drama Romance
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,When ambitious New York attorney Sam is sent t...,Anonymous Written


In [257]:
from collections import Counter
import numpy as np

count_genre = pd.Series([t_ for t in movies["tags_clean"].to_list() for t_ in t.split(" ")]).value_counts().to_frame()
list_genres = list(count_genre.loc[count_genre["count"]>75].index)

In [258]:
# index of movies with wrong tags
list_index = []
for index, t in enumerate(movies["tags_clean"].to_list()):
    for elem in t.split():
        if elem not in list_genres:
            list_index.append(index)
            break

In [259]:
dict_tags = dict()
for index, description in zip(list_index, movies.iloc[list_index]["tags"].to_list()):
    list_tags = []    
    for genre in list_genres:
        if genre in description: 
            list_tags.append(genre)
        dict_tags[index] = " ".join(list_tags)
            

In [260]:
movies["tags_clean"].iloc[list_index] = list(dict_tags.values())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["tags_clean"].iloc[list_index] = list(dict_tags.values())


In [261]:
movies.drop(columns="tags",inplace=True)
movies.rename({"tags_clean":"genre"},axis=1,inplace=True)

In [262]:
movies["genre"] = movies["genre"].apply(lambda x:x.replace(" ",", ").replace("Sciencefiction", "Science Fiction").replace("–"," "))

In [263]:
movies.loc[movies["title"]=="Avengers: Age of Ultron"]

Unnamed: 0,movie_id,title,description,genre
7,99861,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,"Action, Adventure, Science Fiction"


In [264]:
movies["description"].to_list()[7]

'When Tony Stark tries to jumpstart a dormant peacekeeping program, things go awry and Earth’s Mightiest Heroes are put to the ultimate test as the fate of the planet hangs in the balance. As the villainous Ultron emerges, it is up to The Avengers to stop him from enacting his terrible plans, and soon uneasy alliances and unexpected action pave the way for an epic and unique global adventure. Action Adventure ScienceFiction marvelcomic sequel superhero basedoncomicbook vision superheroteam duringcreditsstinger marvelcinematicuniverse 3d RobertDowneyJr.'

In [265]:
def clean_description_v2(text):
    new_text = text.split(". ")[-1]
    for genre in list_genres:
        if genre in new_text:
            return ". ".join(text.split(". ")[:-1] + [""]).strip()
    return text

In [266]:
movies["description"] = movies["description"].apply(clean_description_v2)

In [267]:
movies.to_pickle("data/movies/movies_dict2.pkl")

In [268]:
vote_info = pickle.load(open(os.path.join(path_data,"vote_info.pkl"),"rb"))
vote = pd.DataFrame(vote_info)

In [271]:
movies.rename({"movie_id":"id"}, axis=1, inplace=True)

In [272]:
movies.merge(vote, on="id", how="left")

Unnamed: 0,id,title,description,genre,vote_average,vote_count
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction",7.2,11800
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Action, Adventure, Fantasy",6.9,4500
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"Action, Adventure, Crime",6.3,4466
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"Action, Crime, Drama, Thriller",7.6,9106
4,49529,John Carter,"John Carter is a war-weary, former military ca...","Action, Adventure, Science Fiction",6.1,2124
...,...,...,...,...,...,...
4801,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,"Action, Crime, Thriller",6.6,238
4802,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,"Comedy, Romance",5.9,5
4803,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","Comedy, Drama, Romance",7.0,6
4804,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,,5.7,7
