kanad13's picture
Update app.py
f1aba38 verified
import gradio as gr
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Load a smaller portion of the dataset
dataset = load_dataset("MongoDB/embedded_movies", split='train[:80%]')
dataset_df = pd.DataFrame(dataset)
# Data cleaning and preprocessing
dataset_df = dataset_df.dropna(subset=["fullplot"]).reset_index(drop=True)
dataset_df = dataset_df.drop(columns=["plot_embedding"])
# Load a smaller embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
def get_embedding(text: str) -> list:
if not text.strip():
print("Attempted to get embedding for empty text.")
return []
embedding = embedding_model.encode(text)
return embedding.tolist()
# Generate embeddings for all plots
all_embeddings = []
batch_size = 32
for i in range(0, len(dataset_df), batch_size):
batch = dataset_df['fullplot'].iloc[i:i+batch_size].tolist()
batch_embeddings = embedding_model.encode(batch)
all_embeddings.extend(batch_embeddings)
# Add embeddings to the DataFrame
dataset_df['embedding'] = all_embeddings
print("Embeddings generated and added to DataFrame")
def vector_search(user_query):
query_embedding = get_embedding(user_query)
if not query_embedding:
return "Invalid query or embedding generation failed."
similarities = cosine_similarity([query_embedding], list(dataset_df['embedding']))[0]
top_indices = similarities.argsort()[-3:][::-1]
results = []
for idx in top_indices:
results.append({
"title": dataset_df.iloc[idx]["title"],
"fullplot": dataset_df.iloc[idx]["fullplot"],
"genres": dataset_df.iloc[idx]["genres"],
"score": similarities[idx]
})
return results
def get_search_result(query):
get_knowledge = vector_search(query)
search_result = ""
for result in get_knowledge:
search_result += f"Title: {result.get('title', 'N/A')}\nGenres: {', '.join(result.get('genres', ['N/A']))}\nPlot: {result.get('fullplot', 'N/A')[:150]}...\n\n"
return search_result
def generate_response(query):
source_information = get_search_result(query)
response = f"Based on your query '{query}', here are some movie recommendations:\n\n{source_information}\nThese movies match your query based on their plot summaries and genres. Let me know if you'd like more information about any of them!"
return response
def query_movie_db(user_query):
return generate_response(user_query)
description_and_article = """
Ask this bot to recommend you a movie.
Checkout [my github repo](https://github.com/kanad13/Movie-Recommendation-Bot) to look at the code that powers this bot.
Note that the bot provides concise recommendations based on a limited dataset to ensure optimal performance.
"""
iface = gr.Interface(
fn=query_movie_db,
inputs=gr.Textbox(lines=2, placeholder="Enter your movie query here..."),
outputs="text",
title="Movie Recommendation Bot",
description=description_and_article,
examples=[["Suggest me a scary movie?"], ["What action movie can I watch?"]]
)
if __name__ == "__main__":
iface.launch()