import gradio as gr import pandas as pd from datasets import load_dataset from sentence_transformers import SentenceTransformer import numpy as np from sklearn.metrics.pairwise import cosine_similarity # Load a smaller portion of the dataset dataset = load_dataset("MongoDB/embedded_movies", split='train[:80%]') dataset_df = pd.DataFrame(dataset) # Data cleaning and preprocessing dataset_df = dataset_df.dropna(subset=["fullplot"]).reset_index(drop=True) dataset_df = dataset_df.drop(columns=["plot_embedding"]) # Load a smaller embedding model embedding_model = SentenceTransformer("all-MiniLM-L6-v2") def get_embedding(text: str) -> list: if not text.strip(): print("Attempted to get embedding for empty text.") return [] embedding = embedding_model.encode(text) return embedding.tolist() # Generate embeddings for all plots all_embeddings = [] batch_size = 32 for i in range(0, len(dataset_df), batch_size): batch = dataset_df['fullplot'].iloc[i:i+batch_size].tolist() batch_embeddings = embedding_model.encode(batch) all_embeddings.extend(batch_embeddings) # Add embeddings to the DataFrame dataset_df['embedding'] = all_embeddings print("Embeddings generated and added to DataFrame") def vector_search(user_query): query_embedding = get_embedding(user_query) if not query_embedding: return "Invalid query or embedding generation failed." similarities = cosine_similarity([query_embedding], list(dataset_df['embedding']))[0] top_indices = similarities.argsort()[-3:][::-1] results = [] for idx in top_indices: results.append({ "title": dataset_df.iloc[idx]["title"], "fullplot": dataset_df.iloc[idx]["fullplot"], "genres": dataset_df.iloc[idx]["genres"], "score": similarities[idx] }) return results def get_search_result(query): get_knowledge = vector_search(query) search_result = "" for result in get_knowledge: search_result += f"Title: {result.get('title', 'N/A')}\nGenres: {', '.join(result.get('genres', ['N/A']))}\nPlot: {result.get('fullplot', 'N/A')[:150]}...\n\n" return search_result def generate_response(query): source_information = get_search_result(query) response = f"Based on your query '{query}', here are some movie recommendations:\n\n{source_information}\nThese movies match your query based on their plot summaries and genres. Let me know if you'd like more information about any of them!" return response def query_movie_db(user_query): return generate_response(user_query) description_and_article = """ Ask this bot to recommend you a movie. Checkout [my github repo](https://github.com/kanad13/Movie-Recommendation-Bot) to look at the code that powers this bot. Note that the bot provides concise recommendations based on a limited dataset to ensure optimal performance. """ iface = gr.Interface( fn=query_movie_db, inputs=gr.Textbox(lines=2, placeholder="Enter your movie query here..."), outputs="text", title="Movie Recommendation Bot", description=description_and_article, examples=[["Suggest me a scary movie?"], ["What action movie can I watch?"]] ) if __name__ == "__main__": iface.launch()