# Spaces: Runtime error  (page-header residue captured with the source; not part of the program)
import pandas as pd | |
import numpy as np | |
import torch | |
from textblob import TextBlob | |
# Load the raw product catalogue CSV into a DataFrame.
# NOTE(review): `data` is not referenced again in the visible code, and the
# same file is read a second time into `all_prods_df` below -- confirm whether
# this load is still needed.
data = pd.read_csv("flipkart_com-ecommerce_sample.csv")
# Maximum number of characters kept from any free-text CSV field.
MAX_TEXT_LENGTH = 1000


def auto_truncate(val):
    """Clip a string to at most ``MAX_TEXT_LENGTH`` characters.

    Values that are not strings are handed back untouched.
    """
    if not isinstance(val, str):
        return val
    return val[:MAX_TEXT_LENGTH]
# Load the catalogue, clipping the long free-text columns with auto_truncate
# as they are parsed.
all_prods_df = pd.read_csv(
    "flipkart_com-ecommerce_sample.csv",
    converters={
        'description': auto_truncate,
        'product_specifications': auto_truncate,
        'product_name': auto_truncate,
        'product_category_tree': auto_truncate,
    },
)

# Discard rows whose specifications are empty. The replacement is assigned
# back to the column rather than calling `replace(..., inplace=True)` on the
# selected Series: an in-place call on `df[col]` is not guaranteed to write
# through to the frame (it does not under pandas Copy-on-Write), and passing
# `None` as the replacement value has been interpreted as "pad-fill" by some
# pandas versions. NaN is what dropna() looks for in either case.
all_prods_df['product_specifications'] = (
    all_prods_df['product_specifications'].replace('', np.nan)
)
all_prods_df.dropna(subset=['product_specifications'], inplace=True)
all_prods_df.reset_index(drop=True, inplace=True)

# Upper bound on how many catalogue rows are embedded and indexed.
NUMBER_PRODUCTS = 16000

# Row position -> {column name: value} for the first NUMBER_PRODUCTS rows.
product_metadata = (
    all_prods_df
    .head(NUMBER_PRODUCTS)
    .to_dict(orient='index')
)

# Parallel lists: the full row for each product and the text that is embedded
# for it (the product name).
metadatas = list(product_metadata.values())
texts = [row['product_name'] for row in metadatas]
# Dependency: `openai`. Install it ahead of time (e.g. list it in
# requirements.txt). The notebook shell escape `!pip install openai` is
# IPython-only syntax and is a SyntaxError in a plain Python file.
import os

import openai

# Read the credential from the environment instead of committing it to the
# source; this fails fast with KeyError when OPENAI_API_KEY is not set.
# NOTE(review): the key that was previously hard-coded here has been exposed
# and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` produced by the OpenAI ``model``.

    ``text`` is sent as a one-element input list, and the embedding of the
    first (only) item in the response is returned.
    """
    response = openai.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding
# Embed every product name, one get_embedding() call per text, in order.
embeddings = list(map(get_embedding, texts))
# Dependency: `pinecone`. Install it ahead of time (e.g. list it in
# requirements.txt). The notebook shell escape `!pip install pinecone` is
# IPython-only syntax and is a SyntaxError in a plain Python file.
import os

from pinecone import Pinecone

# Read the credential from the environment instead of committing it to the
# source; this fails fast with KeyError when PINECONE_API_KEY is not set.
# NOTE(review): the key that was previously hard-coded here has been exposed
# and should be revoked.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"], environment="us-east-1")

# Handle to the existing index that stores the product vectors.
index = pc.Index('zepto')
# One record per product: the id is the row position as a string, the values
# are its embedding, and only the name and URL are kept as metadata.
vectors = [
    {
        'id': str(position),
        'values': vector,
        'metadata': {
            'product_name': row.get('product_name', 'No name available'),
            'product_url': row.get('product_url', 'No link available'),
        },
    }
    for position, (vector, row) in enumerate(zip(embeddings, metadatas))
]
import math | |
def batch_upsert(index, vectors, batch_size=100):
    """Send ``vectors`` to ``index.upsert`` in consecutive slices.

    Each slice holds at most ``batch_size`` items; a progress line is printed
    after every slice.
    """
    total_batches = math.ceil(len(vectors) / batch_size)
    batch_starts = range(0, len(vectors), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        chunk = vectors[start:start + batch_size]
        index.upsert(vectors=chunk)
        print(f"Upserted batch {batch_number}/{total_batches}")
# Upload all prepared vectors, 50 per upsert request.
batch_size = 50
batch_upsert(index, vectors, batch_size)
from langdetect import detect | |
def check_and_correct_spelling(query):
    """Return ``query`` after TextBlob's spelling correction, as a ``str``."""
    return str(TextBlob(query).correct())
def correct_and_complete_query(text):
    """Spell-correct ``text`` and ask OpenAI to complete it as a product query.

    Returns the text of the first completion choice with surrounding
    whitespace stripped.
    """
    # Same TextBlob correction as the standalone spelling helper.
    fixed_text = check_and_correct_spelling(text)

    prompt = (
        "Complete the following query in a way that is related to "
        f"product search: '{fixed_text}'"
    )
    completion = openai.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=100,
        temperature=0.5,
    )
    first_choice = completion.choices[0]
    return first_choice.text.strip()
def translate_to_english(text):
    """Return ``text`` in English, translating through OpenAI when needed.

    Text that langdetect reports as English is returned unchanged. Text for
    which langdetect cannot determine a language (it raises
    ``LangDetectException`` for input with no usable features, such as an
    empty or digits-only string) is also returned unchanged rather than
    letting the exception propagate to the caller.

    Otherwise the text of the first completion choice is returned with
    surrounding whitespace stripped.
    """
    from langdetect.lang_detect_exception import LangDetectException

    try:
        language = detect(text)
    except LangDetectException:
        # Nothing to detect, so there is nothing meaningful to translate.
        return text

    if language == 'en':
        return text

    translation_prompt = f"Translate the following text to English:\n\n'{text}'"
    response = openai.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=translation_prompt,
        max_tokens=100,
        temperature=0.5,
    )
    return response.choices[0].text.strip()
def is_query_relevant(query, relevant_keywords):
    """Report whether any keyword occurs in ``query``, ignoring case.

    Returns ``True`` on the first keyword found as a substring of the query
    and ``False`` when none match (including when no keywords are given).
    """
    return any(
        keyword.lower() in query.lower()
        for keyword in relevant_keywords
    )
def search_in_pinecone(query):
    """Embed ``query`` and return the raw top-5 index query result.

    Metadata is requested with the matches.
    """
    return index.query(
        vector=get_embedding(query),
        top_k=5,
        include_metadata=True,
    )
def process_query(query):
    """Normalise a raw user query and return the processed text.

    Steps, in order:

    1. translate the query to English,
    2. correct its spelling,
    3. spell-correct again and have the model complete it as a product query.

    Translation runs first because the spelling step uses TextBlob's
    English spelling corrector; applying it to text in another language
    before translating would rewrite the words the translation depends on.
    """
    query = translate_to_english(query)
    query = check_and_correct_spelling(query)
    query = correct_and_complete_query(query)
    # TODO: relevance filtering is not wired in. is_query_relevant() requires
    # a `relevant_keywords` list, and none is defined for this pipeline yet.
    return query
def search_in_pinecone2(query):
    """Search the index for ``query`` and format the top matches as text.

    The query is run through process_query() and the *processed* text is what
    gets embedded and searched. Each match is rendered as a
    "Product / Link / Score" paragraph, and the paragraphs are joined with
    newlines.

    A blank query returns a short prompt instead of being sent through the
    language-detection, completion and embedding calls.
    """
    if not query or not query.strip():
        return "Please enter a query."

    processed_query = process_query(query)
    # Same embedding + top-5 lookup as the plain search helper.
    search_results = search_in_pinecone(processed_query)

    result_strings = []
    for match in search_results['matches']:
        details = match['metadata']
        product_name = details.get('product_name', 'No name available')
        product_link = details.get('product_url', 'No link available')
        score = match['score']
        result_strings.append(
            f"Product: {product_name}\nLink: {product_link}\nScore: {score}\n"
        )
    return "\n".join(result_strings)
import gradio as gr | |
# Single text input for the query and a text output for the formatted matches.
query_box = gr.Textbox(label="Enter your query")
results_box = gr.Textbox(label="Top 5 Similar Products")

interface = gr.Interface(
    fn=search_in_pinecone2,
    inputs=query_box,
    outputs=results_box,
    title="Product Similarity Search",
    description="Enter a query to find the top 5 similar products based on your search.",
)

# Start serving the UI.
interface.launch()