"""Index product names in Pinecone and serve a Gradio similarity-search UI.

Pipeline, executed top to bottom when the file runs:

1. Load the product CSV, truncating long text columns.
2. Embed every product name with the OpenAI embeddings API.
3. Upsert the embeddings (plus name/URL metadata) into a Pinecone index.
4. Launch a Gradio interface whose queries are spell-corrected, completed,
   translated to English, embedded and matched against the index.

Credentials are read from the ``OPENAI_API_KEY`` and ``PINECONE_API_KEY``
environment variables.
"""

# Notebook shell magics from the original cells, kept as comments so the file
# parses as plain Python. Install these packages in the environment instead.
# !pip install openai
# !pip install pinecone

import math
import os

import gradio as gr
import numpy as np
import openai
import pandas as pd
import torch
from langdetect import detect
from pinecone import Pinecone
from textblob import TextBlob


def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} must be set")
    return value


# SECURITY: API keys used to be hard-coded at this point in the file. Any key
# that was ever committed must be treated as leaked and revoked/rotated.
# Both keys are resolved up front so a missing credential fails before the
# (slow, billed) embedding loop below starts.
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")
PINECONE_API_KEY = _require_env("PINECONE_API_KEY")

CSV_PATH = "flipkart_com-ecommerce_sample.csv"
MAX_TEXT_LENGTH = 1000
NUMBER_PRODUCTS = 16000

# NOTE(review): raw, untruncated copy of the CSV. Nothing in this file reads
# it; kept only so the module-level name `data` stays available.
data = pd.read_csv(CSV_PATH)


def auto_truncate(val):
    """Truncate *val* to MAX_TEXT_LENGTH characters; non-strings pass through."""
    return val[:MAX_TEXT_LENGTH] if isinstance(val, str) else val


all_prods_df = pd.read_csv(
    CSV_PATH,
    converters={
        'description': auto_truncate,
        'product_specifications': auto_truncate,
        'product_name': auto_truncate,
        'product_category_tree': auto_truncate,
    },
)

# Drop rows whose specifications are empty or missing. An explicit mask is
# used instead of `df[col].replace('', None, inplace=True)`: that is a chained
# in-place edit on a column selection (it does not reach the frame under
# pandas Copy-on-Write), and `value=None` has been read as "forward-fill" by
# some pandas versions.
_specs = all_prods_df['product_specifications']
all_prods_df = all_prods_df[_specs.notna() & (_specs != '')]
all_prods_df.reset_index(drop=True, inplace=True)

# Row index -> {column: value} for the first NUMBER_PRODUCTS remaining rows.
product_metadata = (
    all_prods_df
    .head(NUMBER_PRODUCTS)
    .to_dict(orient='index')
)

texts = [row['product_name'] for row in product_metadata.values()]
metadatas = list(product_metadata.values())

openai.api_key = OPENAI_API_KEY


def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of *text* produced by the OpenAI model *model*."""
    return openai.embeddings.create(input=[text], model=model).data[0].embedding


# One embeddings request per product name.
embeddings = [get_embedding(text) for text in texts]

pc = Pinecone(api_key=PINECONE_API_KEY, environment="us-east-1")
index = pc.Index('zepto')

# The vector id is the product's position in `texts`/`metadatas`.
vectors = [
    {
        'id': str(i),
        'values': embedding,
        'metadata': {
            'product_name': metadata.get('product_name', 'No name available'),
            'product_url': metadata.get('product_url', 'No link available'),
        },
    }
    for i, (embedding, metadata) in enumerate(zip(embeddings, metadatas))
]


def batch_upsert(index, vectors, batch_size=100):
    """Upsert *vectors* into *index* in consecutive batches.

    Args:
        index: object exposing ``upsert(vectors=...)``.
        vectors: list of vector records to upload.
        batch_size: maximum number of records per ``upsert`` call.

    Raises:
        ValueError: if *batch_size* is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    num_batches = math.ceil(len(vectors) / batch_size)
    batch_starts = range(0, len(vectors), batch_size)
    for batch_number, batch_start in enumerate(batch_starts, start=1):
        index.upsert(vectors=vectors[batch_start:batch_start + batch_size])
        print(f"Upserted batch {batch_number}/{num_batches}")


batch_size = 50
batch_upsert(index, vectors, batch_size=batch_size)


def _complete(prompt):
    """Send *prompt* to the OpenAI completions API and return the stripped text."""
    response = openai.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=100,
        temperature=0.5,
    )
    return response.choices[0].text.strip()


def check_and_correct_spelling(query):
    """Return *query* after TextBlob spelling correction."""
    return str(TextBlob(query).correct())


def correct_and_complete_query(text):
    """Spell-correct *text*, then ask OpenAI to complete it as a product search."""
    corrected_text = check_and_correct_spelling(text)
    completion_prompt = f"Complete the following query in a way that is related to product search: '{corrected_text}'"
    return _complete(completion_prompt)


def translate_to_english(text):
    """Return *text* translated to English via OpenAI.

    The text is returned unchanged when langdetect reports it as 'en'.
    """
    if detect(text) != 'en':
        translation_prompt = f"Translate the following text to English:\n\n'{text}'"
        return _complete(translation_prompt)
    return text


def is_query_relevant(query, relevant_keywords):
    """Return True if any keyword occurs in *query* (case-insensitive substring)."""
    lowered_query = query.lower()
    return any(keyword.lower() in lowered_query for keyword in relevant_keywords)


def search_in_pinecone(query, top_k=5):
    """Embed *query* and return the raw Pinecone result for its *top_k* matches."""
    embedding = get_embedding(query)
    return index.query(vector=embedding, top_k=top_k, include_metadata=True)


def process_query(query):
    """Spell-correct, complete and translate *query*; return the final text."""
    # NOTE(review): spelling correction runs before translation, and
    # correct_and_complete_query spell-corrects a second time. Consider
    # translating first if non-English input is expected -- confirm intent.
    query = check_and_correct_spelling(query)
    query = correct_and_complete_query(query)
    query = translate_to_english(query)
    # TODO: relevance filtering is disabled. Enabling it needs a keyword list
    # to pass as the second argument of is_query_relevant().
    return query


def search_in_pinecone2(query):
    """Return a text listing of the top 5 products matching *query*.

    The query goes through process_query() and the processed text is what
    gets embedded and searched. Each match is rendered as
    "Product / Link / Score" lines. A blank query returns an empty string
    without calling any external service.
    """
    if not query or not query.strip():
        return ""

    processed_query = process_query(query)
    # Search with the processed query; embedding the raw input here would
    # discard all of the preprocessing above.
    search_results = search_in_pinecone(processed_query)

    result_strings = []
    for result in search_results['matches']:
        product_name = result['metadata'].get('product_name', 'No name available')
        product_link = result['metadata'].get('product_url', 'No link available')
        score = result['score']
        result_strings.append(
            f"Product: {product_name}\nLink: {product_link}\nScore: {score}\n"
        )
    return "\n".join(result_strings)


interface = gr.Interface(
    fn=search_in_pinecone2,
    inputs=gr.Textbox(label="Enter your query"),
    outputs=gr.Textbox(label="Top 5 Similar Products"),
    title="Product Similarity Search",
    description="Enter a query to find the top 5 similar products based on your search.",
)

# Launch the interface
interface.launch()