import pandas as pd
import numpy as np
import torch
from textblob import TextBlob
# Load the product sample CSV as-is, with no converters applied.
# NOTE(review): `data` is not referenced anywhere else in the visible code, and
# the same file is read again right after `auto_truncate` is defined -- confirm
# whether this first load is still needed.
data = pd.read_csv("flipkart_com-ecommerce_sample.csv")
MAX_TEXT_LENGTH = 1000  # maximum number of characters kept per text value


def auto_truncate(val):
    """Cap a string at MAX_TEXT_LENGTH characters.

    Args:
        val: Any value. Only ``str`` instances are shortened.

    Returns:
        The first MAX_TEXT_LENGTH characters of ``val`` when it is a string;
        otherwise ``val`` itself, untouched.
    """
    if not isinstance(val, str):
        return val
    return val[:MAX_TEXT_LENGTH]
# Load the product catalogue with the long free-text columns passed through
# `auto_truncate`, which caps each string at MAX_TEXT_LENGTH characters.
all_prods_df = pd.read_csv(
    "flipkart_com-ecommerce_sample.csv",
    converters={
        'description': auto_truncate,
        'product_specifications': auto_truncate,
        'product_name': auto_truncate,
        'product_category_tree': auto_truncate,
    },
)

# Turn blank specifications ('') into missing values so `dropna` can remove
# those rows. The column is reassigned rather than modified with
# `df[col].replace(..., inplace=True)`: that chained in-place form may operate
# on a temporary object and leave the frame unchanged (it never updates the
# frame when pandas Copy-on-Write is enabled). `np.nan` is used as the
# replacement because passing `value=None` has meant "pad-fill" in some pandas
# versions instead of "replace with a missing value".
all_prods_df['product_specifications'] = (
    all_prods_df['product_specifications'].replace('', np.nan)
)
all_prods_df = all_prods_df.dropna(subset=['product_specifications'])

# Renumber the remaining rows 0..n-1 and discard the old index.
all_prods_df = all_prods_df.reset_index(drop=True)
# Upper bound on how many catalogue rows are taken from the top of the frame.
NUMBER_PRODUCTS = 16000

# Mapping of index label -> {column name: value} for the selected rows.
product_metadata = all_prods_df.head(NUMBER_PRODUCTS).to_dict(orient='index')

# Per-row dicts, in the same order as `product_metadata`.
metadatas = list(product_metadata.values())

# The product name of every selected row; this is the text that gets embedded.
texts = [record['product_name'] for record in metadatas]
import os
import subprocess
import sys

# `!pip install ...` is IPython shell syntax and a SyntaxError in a plain
# Python file. Running pip through the current interpreter performs the same
# install in both environments. `check=False` keeps it best-effort: if the
# install fails but the package is already present, the import below still
# succeeds.
subprocess.run([sys.executable, "-m", "pip", "install", "openai"], check=False)

import openai

# The API key is a secret and must not live in source control; it is read from
# the OPENAI_API_KEY environment variable instead.
# NOTE(review): any key that was ever committed in this file should be treated
# as leaked and revoked.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this script."
    )
openai.api_key = _openai_api_key
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding for a single piece of text.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = openai.embeddings.create(model=model, input=[text])
    first_item = response.data[0]
    return first_item.embedding

# One `get_embedding` call per entry of `texts`, results kept in the same order.
embeddings = list(map(get_embedding, texts))
import os
import subprocess
import sys

# `!pip install ...` is IPython shell syntax and a SyntaxError in a plain
# Python file. Running pip through the current interpreter performs the same
# install in both environments. `check=False` keeps it best-effort: if the
# install fails but the package is already present, the import below still
# succeeds.
subprocess.run([sys.executable, "-m", "pip", "install", "pinecone"], check=False)

from pinecone import Pinecone

# The API key is a secret and must not live in source control; it is read from
# the PINECONE_API_KEY environment variable instead.
# NOTE(review): any key that was ever committed in this file should be treated
# as leaked and revoked.
_pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not _pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this script."
    )

pc = Pinecone(api_key=_pinecone_api_key, environment="us-east-1")

# Handle to the index named 'zepto'; used for both upserts and queries.
index = pc.Index('zepto')
# Build one upsert record per (embedding, metadata) pair. The pair's position
# in the sequence, as a string, is the vector id. Only the product name and URL
# are stored as metadata, each with a placeholder when the key is absent.
vectors = [
    {
        'id': str(position),
        'values': vector,
        'metadata': {
            'product_name': record.get('product_name', 'No name available'),
            'product_url': record.get('product_url', 'No link available'),
        },
    }
    for position, (vector, record) in enumerate(zip(embeddings, metadatas))
]
import math

def batch_upsert(index, vectors, batch_size=100):
    """Upsert vectors to Pinecone in batches.

    Args:
        index: Object exposing ``upsert(vectors=...)``; it is called once per
            batch.
        vectors: Sliceable sequence of vector records to upload.
        batch_size: Maximum number of records per ``upsert`` call. Must be at
            least 1.

    Raises:
        ValueError: If ``batch_size`` is less than 1. Without this check a
            negative size silently uploads nothing and zero fails with an
            unrelated ZeroDivisionError.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    total = len(vectors)
    num_batches = math.ceil(total / batch_size)

    batch_starts = range(0, total, batch_size)
    for batch_number, batch_start in enumerate(batch_starts, start=1):
        # Slicing past the end is safe: the last batch is simply shorter.
        index.upsert(vectors=vectors[batch_start:batch_start + batch_size])
        print(f"Upserted batch {batch_number}/{num_batches}")

# Upload every record in `vectors` to the index, 50 records per request.
batch_size = 50
batch_upsert(index, vectors, batch_size)
from langdetect import detect
def check_and_correct_spelling(query):
    """Return ``query`` after TextBlob's spelling correction.

    Args:
        query: The raw query text.

    Returns:
        The corrected text as a plain ``str``.
    """
    return str(TextBlob(query).correct())
def correct_and_complete_query(text):
    """Spell-correct ``text`` and have OpenAI expand it into a product query.

    Args:
        text: The query text to correct and complete.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    fixed_text = str(TextBlob(text).correct())

    # Ask the completion model to expand the corrected text.
    request = {
        "model": "gpt-3.5-turbo-instruct",
        "prompt": f"Complete the following query in a way that is related to product search: '{fixed_text}'",
        "max_tokens": 100,
        "temperature": 0.5,
    }
    completion = openai.completions.create(**request)

    first_choice = completion.choices[0]
    return first_choice.text.strip()
def translate_to_english(text):
    """Translate ``text`` to English when it is detected as another language.

    Args:
        text: The text to check and, if needed, translate.

    Returns:
        ``text`` unchanged when it is empty or whitespace-only, when its
        language cannot be detected, or when it is detected as English.
        Otherwise the stripped text of the first OpenAI completion choice for
        a translation prompt.
    """
    # Nothing to detect or translate; also avoids calling `detect` on input
    # it cannot classify.
    if not text or not text.strip():
        return text

    # Local import so this function brings its own exception type into scope.
    from langdetect import LangDetectException

    try:
        language = detect(text)
    except LangDetectException:
        # Raised when detection fails (e.g. text with no usable letters);
        # fall back to the input instead of crashing the search.
        return text

    if language == 'en':
        return text

    translation_prompt = f"Translate the following text to English:\n\n'{text}'"
    response = openai.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=translation_prompt,
        max_tokens=100,
        temperature=0.5
    )
    return response.choices[0].text.strip()
def is_query_relevant(query, relevant_keywords):
    """Tell whether ``query`` contains any of the given keywords.

    The comparison is a case-insensitive substring test.

    Args:
        query: The query text to inspect.
        relevant_keywords: Iterable of keyword strings.

    Returns:
        True as soon as one keyword occurs in ``query``; False when none do
        (including when ``relevant_keywords`` is empty).
    """
    return any(
        keyword.lower() in query.lower()
        for keyword in relevant_keywords
    )
def search_in_pinecone(query):
    """Embed ``query`` and return the raw index query result.

    Args:
        query: The text to search for.

    Returns:
        Whatever ``index.query`` returns for the top 5 matches, with metadata
        included.
    """
    return index.query(
        vector=get_embedding(query),
        top_k=5,
        include_metadata=True,
    )
def process_query(query):
    """Run ``query`` through the preprocessing steps and return the result.

    The steps are applied in this order, each receiving the previous output:
    spelling correction, correction plus completion, translation to English.

    Args:
        query: The raw query text.

    Returns:
        The query text after all three steps.
    """
    # NOTE(review): translation runs last, so the spelling steps see the text
    # before it has been translated -- confirm this order is intended.
    # NOTE(review): no relevance check is applied here; `is_query_relevant`
    # needs a keyword list, and none is available in this function.
    steps = (
        check_and_correct_spelling,
        correct_and_complete_query,
        translate_to_english,
    )
    for step in steps:
        query = step(query)
    return query
def search_in_pinecone2(query):
    """Preprocess ``query``, search the index, and format the top matches.

    Args:
        query: The raw text entered by the user.

    Returns:
        A prompt to enter a query when ``query`` is empty or whitespace-only.
        Otherwise one "Product / Link / Score" entry per match, joined by
        newlines (an empty string when there are no matches).
    """
    # An empty query has nothing to embed; answer directly instead of sending
    # it through the preprocessing and embedding calls.
    if not query or not query.strip():
        return "Please enter a query."

    # Embed the *processed* query. Previously the processed text was computed
    # and then discarded, so the search ran on the raw input. Fall back to the
    # raw query if preprocessing yields an empty string.
    processed_query = process_query(query) or query
    embedding = get_embedding(processed_query)
    search_results = index.query(vector=embedding, top_k=5, include_metadata=True)

    result_strings = []
    for result in search_results['matches']:
        metadata = result['metadata']
        product_name = metadata.get('product_name', 'No name available')
        product_link = metadata.get('product_url', 'No link available')
        score = result['score']
        result_strings.append(
            f"Product: {product_name}\nLink: {product_link}\nScore: {score}\n"
        )

    return "\n".join(result_strings)
import gradio as gr
# Gradio front end: one text box for the query, one for the formatted results,
# wired to `search_in_pinecone2`.
query_box = gr.Textbox(label="Enter your query")
results_box = gr.Textbox(label="Top 5 Similar Products")

interface = gr.Interface(
    title="Product Similarity Search",
    description="Enter a query to find the top 5 similar products based on your search.",
    fn=search_in_pinecone2,
    inputs=query_box,
    outputs=results_box,
)

# Start the web UI.
interface.launch()