eli02 commited on
Commit
72ed4d9
·
1 Parent(s): 72d58ce

update: Add OpenAI to requirements and rename embedded parquet file

Browse files
[openai_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet → [all_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f94d381f4dfcff0bbf6bfa5c84def47794d1596e12e2204a2a4bb413fc25a05
3
- size 2257769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ced650f23166f55939fb6dfec6df2fd7d83995a9db362a1a7460d36e6f3ab510
3
+ size 3118786
app.py CHANGED
@@ -1,76 +1,86 @@
1
- import streamlit as st
2
- import pandas as pd
3
  from time import perf_counter as timer
4
- from datasets import Dataset, load_dataset
5
  from huggingface_hub import login
 
 
 
 
 
6
  import os
7
- from openai import OpenAI
8
 
9
- # Load credentials from environment variables or a secure source
 
 
 
 
 
 
 
 
 
 
10
  def load_credentials():
11
  credentials = {}
12
- for i in range(1, 51): # Assuming you have 10 credentials
13
  username = os.environ.get(f"login_{i}")
14
  password = os.environ.get(f"password_{i}")
15
  if username and password:
16
  credentials[username] = password
17
  return credentials
18
 
19
- # Authentication function
20
  def authenticate(username, password, credentials):
21
  return credentials.get(username) == password
22
 
23
- def load_data(database_file):
24
- df = pd.read_parquet(database_file)
25
- return df
26
-
27
- def save_reactions_to_dataset(user_type, query, results):
28
  data = {
29
  "user_type": [],
 
30
  "query": [],
31
  "retrieved_text": [],
 
32
  "reaction": []
33
  }
34
 
35
- for result in results:
 
36
  data["user_type"].append(user_type)
 
37
  data["query"].append(query)
38
  data["retrieved_text"].append(result["text"])
 
 
 
 
 
 
 
 
 
 
39
  data["reaction"].append(result["reaction"])
40
 
41
- # Load existing dataset from the Hub (if it exists)
42
  try:
43
  dataset = load_dataset("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation", split="train")
44
  existing_data = dataset.to_dict()
45
  except Exception:
46
- # If the dataset doesn't exist, start with an empty dataset
47
  existing_data = {
48
  "user_type": [],
 
49
  "query": [],
50
  "retrieved_text": [],
 
51
  "reaction": []
52
  }
53
 
54
- # Append new data to existing data
55
  for key in data:
56
  existing_data[key].extend(data[key])
57
 
58
- # Create a new dataset from the combined data
59
  updated_dataset = Dataset.from_dict(existing_data)
60
-
61
- # Push the updated dataset to the Hub
62
  updated_dataset.push_to_hub("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation")
63
 
64
- # Callback function to handle reaction selection
65
- def update_reaction(idx):
66
- st.session_state.reactions[f"reaction_{idx}"] = st.session_state[f"reaction_{idx}"]
67
-
68
- def generate_openai_embeddings(client, text):
69
- response = client.embeddings.create(
70
- input=text,
71
- model="text-embedding-3-small"
72
- )
73
- return response.data[0].embedding
74
 
75
  def cosine_similarity(embedding_0, embedding_1):
76
  dot_product = sum(a * b for a, b in zip(embedding_0, embedding_1))
@@ -78,22 +88,59 @@ def cosine_similarity(embedding_0, embedding_1):
78
  norm_1 = sum(b * b for b in embedding_1) ** 0.5
79
  return dot_product / (norm_0 * norm_1)
80
 
81
- def search_query(client, query, df, n=3):
82
- embedding = generate_openai_embeddings(client, query)
83
- df['similarities'] = df.openai_embedding.apply(lambda x: cosine_similarity(x, embedding))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  res = df.sort_values('similarities', ascending=False).head(n)
85
  return res
86
 
 
 
 
 
 
 
 
 
87
  def main():
88
  st.title("EnlightenQalb (Alchemy of Happiness)")
89
 
90
- # Load credentials
91
- credentials = load_credentials()
92
-
93
- # Check if user is authenticated
94
  if 'authenticated' not in st.session_state:
95
  st.session_state.authenticated = False
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
 
97
  if not st.session_state.authenticated:
98
  st.sidebar.title("Login")
99
  username = st.sidebar.text_input("Username")
@@ -102,6 +149,7 @@ def main():
102
  if st.sidebar.button("Login"):
103
  if authenticate(username, password, credentials):
104
  st.session_state.authenticated = True
 
105
  st.sidebar.success("Logged in successfully!")
106
  else:
107
  st.sidebar.error("Invalid username or password")
@@ -110,17 +158,7 @@ def main():
110
  st.warning("Please login to access the application.")
111
  return
112
 
113
- # Initialize session state variables
114
- if "search_performed" not in st.session_state:
115
- st.session_state.search_performed = False
116
- if "top_results" not in st.session_state:
117
- st.session_state.top_results = []
118
- if "reactions" not in st.session_state:
119
- st.session_state.reactions = {}
120
- if "results_saved" not in st.session_state:
121
- st.session_state.results_saved = False
122
-
123
- # Access the Hugging Face token from the environment variable
124
  huggingface_token = os.environ.get("al_ghazali_rag_retrieval_evaluation")
125
  if huggingface_token:
126
  login(token=huggingface_token)
@@ -130,14 +168,13 @@ def main():
130
  # Initialize OpenAI client
131
  client = OpenAI()
132
 
133
- # Load database from predefined path
134
- database_file = '[openai_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet'
135
 
136
  try:
137
  df = load_data(database_file)
138
  st.success("Database loaded successfully!")
139
 
140
- # Select user type
141
  user_type = st.radio(
142
  "Select your user type:",
143
  ["Layman", "Enthusiast", "Ustaz (Expert)"],
@@ -146,15 +183,27 @@ def main():
146
 
147
  query = st.text_area("Enter your query:")
148
 
 
 
 
 
 
149
  if st.button("Search") and query:
 
 
 
150
  start_time = timer()
151
- res = search_query(client, query, df, n=3)
 
 
 
 
 
 
 
 
152
  end_time = timer()
153
-
154
  st.write(f"Time taken to compute scores: {end_time - start_time:.5f} seconds")
155
-
156
- # Store the top results indices in session_state
157
- st.session_state.top_results = res.index.tolist()
158
  st.session_state.search_performed = True
159
 
160
  # Display results and collect reactions
@@ -162,42 +211,75 @@ def main():
162
  st.subheader("Query Results")
163
  st.write(f"Query: {query}")
164
 
165
- for idx in st.session_state.top_results:
 
 
166
  text = df.iloc[int(idx)]["ext"]
167
  st.write(f"**Text:** {text}")
168
 
169
- key = f"reaction_{idx}"
170
  if key not in st.session_state.reactions:
171
  st.session_state.reactions[key] = "🤷"
172
 
173
- # Use a callback to handle reaction selection
174
  reaction = st.radio(
175
- label=f"Rate this result (Result {idx}):",
176
  options=["👎", "🤷", "👍"],
177
  index=["👎", "🤷", "👍"].index(st.session_state.reactions[key]),
178
  key=key,
179
  horizontal=True,
180
  on_change=update_reaction,
181
- args=(idx,)
182
  )
183
 
184
- # Save reactions when the button is clicked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  if st.button("Save Reactions"):
186
- # Collect the results to save
187
- results = []
188
- for idx in st.session_state.top_results:
189
- key = f"reaction_{idx}"
190
- results.append({
191
  "text": df.iloc[int(idx)]["ext"],
192
  "reaction": st.session_state.reactions[key]
193
  })
194
- save_reactions_to_dataset(user_type, query, results)
195
- st.success("Reactions saved successfully!")
196
 
197
- # Reset flags
198
- st.session_state.search_performed = False
199
- st.session_state.results_saved = True
200
- st.session_state.reactions = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  except Exception as e:
203
  st.error(f"Failed to load database: {str(e)}")
 
1
+ from openai import OpenAI
2
+ from sentence_transformers import SentenceTransformer
3
  from time import perf_counter as timer
 
4
  from huggingface_hub import login
5
+ from datasets import Dataset, load_dataset
6
+ import streamlit as st
7
+ import pandas as pd
8
+ import numpy as np
9
+ import torch as t
10
  import os
 
11
 
12
# Loaded once per process; @st.cache_resource memoizes it across Streamlit reruns.
@st.cache_resource
def load_sentence_transformer():
    """Build (or fetch the cached) all-mpnet-base-v2 sentence encoder on CPU."""
    model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")
    return model
17
+
18
# Parquet read happens once; @st.cache_data memoizes the frame across reruns.
@st.cache_data
def load_data(database_file):
    """Read the embedded-passages parquet file into a pandas DataFrame."""
    frame = pd.read_parquet(database_file)
    return frame
22
+
23
def load_credentials():
    """Collect login_N / password_N pairs (N = 1..50) from environment variables.

    Returns a dict mapping username -> password. Slots where either half is
    missing or empty are skipped.
    """
    pairs = (
        (os.environ.get(f"login_{slot}"), os.environ.get(f"password_{slot}"))
        for slot in range(1, 51)
    )
    return {user: pwd for user, pwd in pairs if user and pwd}
31
 
 
32
def authenticate(username, password, credentials):
    """Return True iff *username* is present in *credentials* with exactly *password*."""
    stored = credentials.get(username)
    return stored == password
34
 
35
def save_reactions_to_dataset(user_type, username, query, results_mpnet, results_openai):
    """Append the user's reaction ratings for both models to the evaluation
    dataset on the Hugging Face Hub.

    Parameters:
        user_type: selected expertise level ("Layman", "Enthusiast", ...).
        username: authenticated user the reactions belong to.
        query: free-text query that produced the results.
        results_mpnet: list of dicts with "text" and "reaction" keys (MPNet hits).
        results_openai: list of dicts with "text" and "reaction" keys (OpenAI hits).

    Side effects: downloads the current dataset (if any) and pushes the
    combined rows back with push_to_hub.
    """
    columns = ["user_type", "username", "query", "retrieved_text", "model_type", "reaction"]
    data = {col: [] for col in columns}

    def _append_rows(results, model_type):
        # One row per retrieved passage, tagged with the model that produced it.
        for result in results:
            data["user_type"].append(user_type)
            data["username"].append(username)
            data["query"].append(query)
            data["retrieved_text"].append(result["text"])
            data["model_type"].append(model_type)
            data["reaction"].append(result["reaction"])

    _append_rows(results_mpnet, "all-mpnet-base-v2")
    _append_rows(results_openai, "openai")

    try:
        dataset = load_dataset("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation", split="train")
        existing_data = dataset.to_dict()
    except Exception:
        # Dataset does not exist yet (or could not be fetched): start fresh.
        existing_data = {col: [] for col in columns}

    # Older snapshots may predate the "username"/"model_type" columns added in
    # this revision; pad any missing column with None so every column keeps an
    # equal row count (Dataset.from_dict requires equal-length columns).
    n_existing = max((len(v) for v in existing_data.values()), default=0)
    for col in columns:
        existing_data.setdefault(col, [None] * n_existing)
        existing_data[col].extend(data[col])

    updated_dataset = Dataset.from_dict(existing_data)
    updated_dataset.push_to_hub("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation")
81
 
82
def update_reaction(model_type, idx):
    """st.radio on_change callback: mirror the widget's value into our own
    reactions dict so the rating survives Streamlit reruns."""
    widget_key = f"reaction_{model_type}_{idx}"
    st.session_state.reactions[widget_key] = st.session_state[widget_key]
 
 
 
 
 
 
 
 
84
 
85
  def cosine_similarity(embedding_0, embedding_1):
86
  dot_product = sum(a * b for a, b in zip(embedding_0, embedding_1))
 
88
  norm_1 = sum(b * b for b in embedding_1) ** 0.5
89
  return dot_product / (norm_0 * norm_1)
90
 
91
def generate_embedding(model, text, model_type="all-mpnet-base-v2"):
    """Embed *text* with the requested backend.

    Parameters:
        model: a SentenceTransformer when model_type == "all-mpnet-base-v2",
            an OpenAI client when model_type == "openai".
        text: the string to embed.
        model_type: which backend to use.

    Returns a 1-D embedding vector (numpy array for MPNet, list of floats for
    OpenAI). Raises ValueError for an unrecognized model_type instead of
    silently returning None (the original fell through to an implicit None).
    """
    if model_type == "all-mpnet-base-v2":
        chunk_embedding = model.encode(text, convert_to_tensor=True)
        # Move off any accelerator before handing the vector to numpy.
        return chunk_embedding.cpu().numpy()
    if model_type == "openai":
        response = model.embeddings.create(
            input=text,
            model="text-embedding-3-small"
        )
        return response.data[0].embedding
    raise ValueError(f"Unknown model_type: {model_type!r}")
104
+
105
def search_query(model, query, df, model_type, n=3):
    """Rank the passages in *df* by cosine similarity to *query*.

    model_type selects both the embedding backend and the dataframe column
    holding precomputed passage embeddings ("all-mpnet-base-v2" ->
    df.all_mpnet_embedding, "openai" -> df.openai_embedding).

    Writes a 'similarities' column into *df* (as the original did) and returns
    the top *n* rows, most similar first. Raises ValueError for an unsupported
    model_type instead of falling through to a confusing KeyError.
    """
    column_by_model = {
        "all-mpnet-base-v2": "all_mpnet_embedding",
        "openai": "openai_embedding",
    }
    try:
        embedding_column = column_by_model[model_type]
    except KeyError:
        raise ValueError(f"Unsupported model_type: {model_type!r}") from None

    # Embed the query once; the only branch-specific work is the column choice.
    embedding = generate_embedding(model, query, model_type=model_type)
    df['similarities'] = df[embedding_column].apply(lambda x: cosine_similarity(x, embedding))
    return df.sort_values('similarities', ascending=False).head(n)
114
 
115
def clear_search_state():
    """Reset every search-related session_state entry to its idle value."""
    idle_values = {
        "search_performed": False,
        "top_results_mpnet": [],
        "top_results_openai": [],
        "reactions": {},
        "results_saved": False,
    }
    for state_key, idle_value in idle_values.items():
        st.session_state[state_key] = idle_value
122
+
123
  def main():
124
  st.title("EnlightenQalb (Alchemy of Happiness)")
125
 
126
+ # Initialize session state variables
 
 
 
127
  if 'authenticated' not in st.session_state:
128
  st.session_state.authenticated = False
129
+ st.session_state.username = None
130
+ st.session_state.search_performed = False
131
+ st.session_state.top_results_mpnet = []
132
+ st.session_state.top_results_openai = []
133
+ st.session_state.reactions = {}
134
+ st.session_state.results_saved = False
135
+ st.session_state.current_query = ""
136
+
137
+ # Load the model at startup (will be cached)
138
+ embedding_model = load_sentence_transformer()
139
+
140
+ # Load credentials
141
+ credentials = load_credentials()
142
 
143
+ # Authentication handling
144
  if not st.session_state.authenticated:
145
  st.sidebar.title("Login")
146
  username = st.sidebar.text_input("Username")
 
149
  if st.sidebar.button("Login"):
150
  if authenticate(username, password, credentials):
151
  st.session_state.authenticated = True
152
+ st.session_state.username = username
153
  st.sidebar.success("Logged in successfully!")
154
  else:
155
  st.sidebar.error("Invalid username or password")
 
158
  st.warning("Please login to access the application.")
159
  return
160
 
161
+ # Login to Hugging Face
 
 
 
 
 
 
 
 
 
 
162
  huggingface_token = os.environ.get("al_ghazali_rag_retrieval_evaluation")
163
  if huggingface_token:
164
  login(token=huggingface_token)
 
168
  # Initialize OpenAI client
169
  client = OpenAI()
170
 
171
+ # Load database
172
+ database_file = '[all_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet'
173
 
174
  try:
175
  df = load_data(database_file)
176
  st.success("Database loaded successfully!")
177
 
 
178
  user_type = st.radio(
179
  "Select your user type:",
180
  ["Layman", "Enthusiast", "Ustaz (Expert)"],
 
183
 
184
  query = st.text_area("Enter your query:")
185
 
186
+ # Clear search state if query changes
187
+ if query != st.session_state.current_query:
188
+ clear_search_state()
189
+ st.session_state.current_query = query
190
+
191
  if st.button("Search") and query:
192
+ clear_search_state() # Clear previous search results
193
+
194
+ # Perform searches with both models
195
  start_time = timer()
196
+
197
+ # MPNet search
198
+ res_mpnet = search_query(embedding_model, query, df, "all-mpnet-base-v2", n=1)
199
+ st.session_state.top_results_mpnet = res_mpnet.index.tolist()
200
+
201
+ # OpenAI search
202
+ res_openai = search_query(client, query, df, "openai", n=1)
203
+ st.session_state.top_results_openai = res_openai.index.tolist()
204
+
205
  end_time = timer()
 
206
  st.write(f"Time taken to compute scores: {end_time - start_time:.5f} seconds")
 
 
 
207
  st.session_state.search_performed = True
208
 
209
  # Display results and collect reactions
 
211
  st.subheader("Query Results")
212
  st.write(f"Query: {query}")
213
 
214
+ # Display MPNet results
215
+ st.markdown("### Results from MPNet Model")
216
+ for idx in st.session_state.top_results_mpnet:
217
  text = df.iloc[int(idx)]["ext"]
218
  st.write(f"**Text:** {text}")
219
 
220
+ key = f"reaction_mpnet_{idx}"
221
  if key not in st.session_state.reactions:
222
  st.session_state.reactions[key] = "🤷"
223
 
 
224
  reaction = st.radio(
225
+ label=f"Rate this MPNet result (Result {idx}):",
226
  options=["👎", "🤷", "👍"],
227
  index=["👎", "🤷", "👍"].index(st.session_state.reactions[key]),
228
  key=key,
229
  horizontal=True,
230
  on_change=update_reaction,
231
+ args=("mpnet", idx)
232
  )
233
 
234
+ # Display OpenAI results
235
+ st.markdown("### Results from OpenAI Model")
236
+ for idx in st.session_state.top_results_openai:
237
+ text = df.iloc[int(idx)]["ext"]
238
+ st.write(f"**Text:** {text}")
239
+
240
+ key = f"reaction_openai_{idx}"
241
+ if key not in st.session_state.reactions:
242
+ st.session_state.reactions[key] = "🤷"
243
+
244
+ reaction = st.radio(
245
+ label=f"Rate this OpenAI result (Result {idx}):",
246
+ options=["👎", "🤷", "👍"],
247
+ index=["👎", "🤷", "👍"].index(st.session_state.reactions[key]),
248
+ key=key,
249
+ horizontal=True,
250
+ on_change=update_reaction,
251
+ args=("openai", idx)
252
+ )
253
+
254
+ # Save reactions button
255
  if st.button("Save Reactions"):
256
+ # Collect MPNet results
257
+ results_mpnet = []
258
+ for idx in st.session_state.top_results_mpnet:
259
+ key = f"reaction_mpnet_{idx}"
260
+ results_mpnet.append({
261
  "text": df.iloc[int(idx)]["ext"],
262
  "reaction": st.session_state.reactions[key]
263
  })
 
 
264
 
265
+ # Collect OpenAI results
266
+ results_openai = []
267
+ for idx in st.session_state.top_results_openai:
268
+ key = f"reaction_openai_{idx}"
269
+ results_openai.append({
270
+ "text": df.iloc[int(idx)]["ext"],
271
+ "reaction": st.session_state.reactions[key]
272
+ })
273
+
274
+ save_reactions_to_dataset(
275
+ user_type,
276
+ st.session_state.username,
277
+ query,
278
+ results_mpnet,
279
+ results_openai
280
+ )
281
+ st.success("Reactions saved successfully!")
282
+ clear_search_state()
283
 
284
  except Exception as e:
285
  st.error(f"Failed to load database: {str(e)}")
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  torch
2
  pandas
3
- sentence-transformers
 
 
1
  torch
2
  pandas
3
+ sentence-transformers
4
+ openai