indrasn0wal commited on
Commit
e886d9c
·
verified ·
1 Parent(s): b9e34b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -61
app.py CHANGED
@@ -1,30 +1,23 @@
1
- import gradio as gr
2
- import openai
3
- import pinecone
4
- from textblob import TextBlob
5
- from langdetect import detect
6
- from pinecone import Pinecone
7
  import pandas as pd
8
- # Initialize OpenAI API
9
- openai.api_key = "REDACTED-LEAKED-OPENAI-KEY-REVOKE-IMMEDIATELY" # Replace with your OpenAI API key
10
-
11
- # Initialize Pinecone
12
- pc = Pinecone(api_key="REDACTED-LEAKED-PINECONE-KEY-REVOKE-IMMEDIATELY", environment="us-east-1")
13
-
14
- # Assume you have already created and populated an index
15
- index = pc.Index('zepto')
16
  data = pd.read_csv("flipkart_com-ecommerce_sample.csv")
17
  MAX_TEXT_LENGTH = 1000
18
  def auto_truncate(val):
19
  """Truncate the given text."""
20
  return val[:MAX_TEXT_LENGTH] if isinstance(val, str) else val
 
 
 
 
 
 
21
 
22
-
23
- all_prods_df = data.copy()
24
  all_prods_df['product_specifications'].replace('', None, inplace=True)
25
  all_prods_df.dropna(subset=['product_specifications'], inplace=True)
26
- all_prods_df.reset_index(drop=True, inplace=True)
27
 
 
28
  NUMBER_PRODUCTS = 16000
29
 
30
  product_metadata = (
@@ -32,26 +25,58 @@ product_metadata = (
32
  .head(NUMBER_PRODUCTS)
33
  .to_dict(orient='index')
34
  )
 
 
 
35
 
36
- texts = [v['product_name'] for k, v in product_metadata.items()]
37
  metadatas = list(product_metadata.values())
 
 
 
 
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
 
 
40
 
 
 
 
 
41
 
42
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for *text* using *model*."""
    response = openai.embeddings.create(input=[text], model=model)
    # openai>=1.0 returns a typed response object, not a dict:
    # response['data'][0]['embedding'] raises TypeError — use attribute access.
    return response.data[0].embedding
45
 
 
 
 
46
  def check_and_correct_spelling(query):
47
  blob = TextBlob(query)
48
  corrected_query = str(blob.correct())
49
  return corrected_query
50
-
51
  def correct_and_complete_query(text):
52
  blob = TextBlob(text)
53
  corrected_text = str(blob.correct())
54
-
 
55
  completion_prompt = f"Complete the following query in a way that is related to product search: '{corrected_text}'"
56
  response = openai.completions.create(
57
  model="gpt-3.5-turbo-instruct",
@@ -59,13 +84,12 @@ def correct_and_complete_query(text):
59
  max_tokens=100,
60
  temperature=0.5
61
  )
62
-
63
- return response.choices[0].text.strip()
64
 
 
65
  def translate_to_english(text):
66
  if detect(text) != 'en':
67
  translation_prompt = f"Translate the following text to English:\n\n'{text}'"
68
- response = openai.Completion.create(
69
  model="gpt-3.5-turbo-instruct",
70
  prompt=translation_prompt,
71
  max_tokens=100,
@@ -73,18 +97,30 @@ def translate_to_english(text):
73
  )
74
  return response.choices[0].text.strip()
75
  return text
76
-
 
 
 
 
 
 
 
 
77
  def process_query(query):
78
  query = check_and_correct_spelling(query)
 
79
  query = correct_and_complete_query(query)
 
80
  query = translate_to_english(query)
81
 
 
 
 
 
82
  return query
83
-
84
- def search_in_pinecone(query):
85
  processed_query = process_query(query)
86
-
87
- embedding = get_embedding(processed_query)
88
  search_results = index.query(vector=embedding, top_k=5, include_metadata=True)
89
 
90
  result_strings = []
@@ -96,36 +132,9 @@ def search_in_pinecone(query):
96
  result_strings.append(result_string)
97
 
98
  return "\n".join(result_strings)
99
- embeddings = [get_embedding(text) for text in texts]
100
- vectors = []
101
- for i, (embedding, metadata) in enumerate(zip(embeddings, metadatas)):
102
- vectors.append({
103
- 'id': str(i),
104
- 'values': embedding,
105
- 'metadata': {
106
- 'product_name': metadata.get('product_name', 'No name available'),
107
- 'product_url': metadata.get('product_url', 'No link available')
108
- }
109
- })
110
- import math
111
-
112
- def batch_upsert(index, vectors, batch_size=100):
113
- """Upsert vectors to Pinecone in batches."""
114
- num_batches = math.ceil(len(vectors) / batch_size)
115
-
116
- for i in range(num_batches):
117
- batch_start = i * batch_size
118
- batch_end = batch_start + batch_size
119
- batch_vectors = vectors[batch_start:batch_end]
120
-
121
- index.upsert(vectors=batch_vectors)
122
- print(f"Upserted batch {i + 1}/{num_batches}")
123
-
124
- batch_size = 50
125
- batch_upsert(index, vectors, batch_size=batch_size)
126
- # Gradio Interface
127
  interface = gr.Interface(
128
- fn=search_in_pinecone,
129
  inputs=gr.Textbox(label="Enter your query"),
130
  outputs=gr.Textbox(label="Top 5 Similar Products"),
131
  title="Product Similarity Search",
@@ -133,4 +142,4 @@ interface = gr.Interface(
133
  )
134
 
135
  # Launch the interface
136
- interface.launch()
 
 
 
 
 
 
 
import pandas as pd
import numpy as np
import torch
from textblob import TextBlob

# Raw product catalogue; re-read further down with truncating converters.
data = pd.read_csv("flipkart_com-ecommerce_sample.csv")
6
# Maximum characters kept from any free-text column.
MAX_TEXT_LENGTH = 1000

def auto_truncate(val):
    """Clip string values to MAX_TEXT_LENGTH characters; pass non-strings through."""
    if isinstance(val, str):
        return val[:MAX_TEXT_LENGTH]
    return val
10
# Load the catalogue again, truncating long free-text fields at parse time.
all_prods_df = pd.read_csv(
    "flipkart_com-ecommerce_sample.csv",
    converters={
        'description': auto_truncate,
        'product_specifications': auto_truncate,
        'product_name': auto_truncate,
        'product_category_tree': auto_truncate,
    },
)

# Normalise empty specifications to NaN, then drop those rows.
# Assign back instead of chained `replace(..., inplace=True)`: chained inplace
# on a column view is deprecated and unreliable under pandas copy-on-write.
all_prods_df['product_specifications'] = (
    all_prods_df['product_specifications'].replace('', None)
)
all_prods_df.dropna(subset=['product_specifications'], inplace=True)
all_prods_df.reset_index(drop=True, inplace=True)
21
# NOTE(review): diff view — orig line 24 (the DataFrame expression being
# sliced, presumably all_prods_df after a filter) is not visible in this
# chunk, so the product_metadata statement is incomplete here.
  NUMBER_PRODUCTS = 16000
22
 
23
  product_metadata = (
 
25
  .head(NUMBER_PRODUCTS)
26
  .to_dict(orient='index')
27
  )
28
+ texts = [
29
+ v['product_name'] for k, v in product_metadata.items()
30
+ ]
31
 
 
32
  metadatas = list(product_metadata.values())
33
# "!pip install openai" was notebook shell magic — it is a SyntaxError in a
# plain .py file; install dependencies via requirements.txt instead.
import os

import openai

# SECURITY: the previous hard-coded key was committed to source control and
# must be revoked; always read credentials from the environment.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for *text* using *model*."""
    # openai>=1.0 response is a typed object — attribute access, not dict keys.
    return openai.embeddings.create(input=[text], model=model).data[0].embedding
38
 
39
# Embed the whole catalogue up front (one API call per product name).
embeddings = [get_embedding(text) for text in texts]

# "!pip install pinecone" was notebook shell magic — invalid in a .py file;
# declare the dependency in requirements.txt instead.
import os

from pinecone import Pinecone

# SECURITY: the previous hard-coded key was leaked in source control and must
# be revoked; read it from the environment.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"), environment="us-east-1")
index = pc.Index('zepto')
44
# Pair each embedding with its product metadata in Pinecone's upsert format.
vectors = [
    {
        'id': str(position),
        'values': vec,
        'metadata': {
            'product_name': meta.get('product_name', 'No name available'),
            'product_url': meta.get('product_url', 'No link available'),
        },
    }
    for position, (vec, meta) in enumerate(zip(embeddings, metadatas))
]
54
import math

def batch_upsert(index, vectors, batch_size=100):
    """Send *vectors* to *index* in consecutive slices of *batch_size*."""
    total = math.ceil(len(vectors) / batch_size)
    # range(0, len, batch_size) yields exactly `total` slice starts.
    for batch_no, start in enumerate(range(0, len(vectors), batch_size), start=1):
        index.upsert(vectors=vectors[start:start + batch_size])
        print(f"Upserted batch {batch_no}/{total}")
 
67
 
68
# Push the full catalogue to Pinecone in batches of 50 vectors.
batch_size = 50
batch_upsert(index, vectors, batch_size=batch_size)
70
+ from langdetect import detect
71
def check_and_correct_spelling(query):
    """Return *query* with TextBlob's best-guess spelling corrections applied."""
    return str(TextBlob(query).correct())
 
75
def correct_and_complete_query(text):
    """Spell-correct *text*, then ask OpenAI to complete it as a product-search query."""
    corrected_text = str(TextBlob(text).correct())

    # Use OpenAI to complete the query.
    completion_prompt = f"Complete the following query in a way that is related to product search: '{corrected_text}'"
    response = openai.completions.create(
        model="gpt-3.5-turbo-instruct",
        # NOTE(review): this kwarg line is hidden in the diff view (orig line
        # 83); `prompt=completion_prompt` is the only plausible content since
        # completion_prompt is otherwise unused — confirm against the repo.
        prompt=completion_prompt,
        max_tokens=100,
        temperature=0.5,
    )
    return response.choices[0].text.strip()
89
def translate_to_english(text):
    """Translate *text* to English via OpenAI when langdetect says it isn't English."""
    if detect(text) != 'en':
        translation_prompt = f"Translate the following text to English:\n\n'{text}'"
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=translation_prompt,
            max_tokens=100,
            # NOTE(review): this kwarg line is hidden in the diff view (orig
            # line 96); temperature=0.5 is restored by analogy with the
            # sibling call in correct_and_complete_query — confirm.
            temperature=0.5,
        )
        return response.choices[0].text.strip()
    return text
100
def is_query_relevant(query, relevant_keywords):
    """Return True when any keyword occurs (case-insensitively) in *query*."""
    lowered = query.lower()
    return any(keyword.lower() in lowered for keyword in relevant_keywords)
105
def search_in_pinecone(query):
    """Embed *query* and return the raw top-5 Pinecone match set with metadata."""
    return index.query(vector=get_embedding(query), top_k=5, include_metadata=True)
109
def process_query(query):
    """Pipeline a raw user query through spell-check, completion, then translation."""
    for stage in (check_and_correct_spelling,
                  correct_and_complete_query,
                  translate_to_english):
        query = stage(query)

    # Step 4: Check if the query is relevant (currently disabled).
    # if not is_query_relevant(query):
    #     return "The query is not relevant. Please enter a different query."
    return query
121
+ def search_in_pinecone2(query):
# NOTE(review): diff view — the loop over the matches (orig lines 127-131,
# which build `result_string`) is not visible in this chunk, so the function
# body is incomplete here.
 
122
  processed_query = process_query(query)
123
+ embedding = get_embedding(query)
# NOTE(review): BUG — this embeds the raw `query`; `processed_query` is
# computed above but never used. It should almost certainly be
# get_embedding(processed_query).
 
124
  search_results = index.query(vector=embedding, top_k=5, include_metadata=True)
125
 
126
  result_strings = []
 
132
  result_strings.append(result_string)
133
 
134
  return "\n".join(result_strings)
135
+ import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  interface = gr.Interface(
137
# NOTE(review): wired to search_in_pinecone2, which embeds the raw query and
# ignores its processed form — see the note on that function.
+ fn=search_in_pinecone2,
138
  inputs=gr.Textbox(label="Enter your query"),
139
  outputs=gr.Textbox(label="Top 5 Similar Products"),
140
  title="Product Similarity Search",
# NOTE(review): diff view — orig line 141 (likely a description= kwarg) is
# not visible in this chunk.
 
142
  )
143
 
144
  # Launch the interface
145
+ interface.launch()