update: Add OpenAI to requirements and remove unused parquet file
[openai_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet → [all_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet
RENAMED
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ced650f23166f55939fb6dfec6df2fd7d83995a9db362a1a7460d36e6f3ab510
+size 3118786
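A quick local sanity check of the renamed parquet can look like the sketch below. It assumes the file has been materialized with git lfs pull (the repository itself only stores the LFS pointer shown above) and that it carries the ext, all_mpnet_embedding, and openai_embedding columns that the updated app.py reads; adjust if the schema differs.

import pandas as pd

df = pd.read_parquet(
    "[all_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet"
)
print(df.columns.tolist())
# Column names assumed from the updated app.py
assert {"ext", "all_mpnet_embedding", "openai_embedding"} <= set(df.columns)
print(len(df), "text chunks")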
app.py
CHANGED
@@ -1,76 +1,86 @@
-
-
from time import perf_counter as timer
-from datasets import Dataset, load_dataset
from huggingface_hub import login
import os
-from openai import OpenAI

-#
def load_credentials():
    credentials = {}
-    for i in range(1, 51):
        username = os.environ.get(f"login_{i}")
        password = os.environ.get(f"password_{i}")
        if username and password:
            credentials[username] = password
    return credentials

-# Authentication function
def authenticate(username, password, credentials):
    return credentials.get(username) == password

-def load_data(database_file):
-    df = pd.read_parquet(database_file)
-    return df
-
-def save_reactions_to_dataset(user_type, query, results):
    data = {
        "user_type": [],
        "query": [],
        "retrieved_text": [],
        "reaction": []
    }

-    for result in results:
        data["user_type"].append(user_type)
        data["query"].append(query)
        data["retrieved_text"].append(result["text"])
        data["reaction"].append(result["reaction"])

-    # Load existing dataset from the Hub (if it exists)
    try:
        dataset = load_dataset("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation", split="train")
        existing_data = dataset.to_dict()
    except Exception:
-        # If the dataset doesn't exist, start with an empty dataset
        existing_data = {
            "user_type": [],
            "query": [],
            "retrieved_text": [],
            "reaction": []
        }

-    # Append new data to existing data
    for key in data:
        existing_data[key].extend(data[key])

-    # Create a new dataset from the combined data
    updated_dataset = Dataset.from_dict(existing_data)
-
-    # Push the updated dataset to the Hub
    updated_dataset.push_to_hub("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation")

-
-def update_reaction(idx):
-    st.session_state.reactions[f"reaction_{idx}"] = st.session_state[f"reaction_{idx}"]
-
-def generate_openai_embeddings(client, text):
-    response = client.embeddings.create(
-        input=text,
-        model="text-embedding-3-small"
-    )
-    return response.data[0].embedding

def cosine_similarity(embedding_0, embedding_1):
    dot_product = sum(a * b for a, b in zip(embedding_0, embedding_1))
@@ -78,22 +88,59 @@ def cosine_similarity(embedding_0, embedding_1):
    norm_1 = sum(b * b for b in embedding_1) ** 0.5
    return dot_product / (norm_0 * norm_1)

-def
-
-
    res = df.sort_values('similarities', ascending=False).head(n)
    return res

def main():
    st.title("EnlightenQalb (Alchemy of Happiness)")

-    #
-    credentials = load_credentials()
-
-    # Check if user is authenticated
    if 'authenticated' not in st.session_state:
        st.session_state.authenticated = False

    if not st.session_state.authenticated:
        st.sidebar.title("Login")
        username = st.sidebar.text_input("Username")
@@ -102,6 +149,7 @@ def main():
        if st.sidebar.button("Login"):
            if authenticate(username, password, credentials):
                st.session_state.authenticated = True
                st.sidebar.success("Logged in successfully!")
            else:
                st.sidebar.error("Invalid username or password")
@@ -110,17 +158,7 @@ def main():
        st.warning("Please login to access the application.")
        return

-    #
-    if "search_performed" not in st.session_state:
-        st.session_state.search_performed = False
-    if "top_results" not in st.session_state:
-        st.session_state.top_results = []
-    if "reactions" not in st.session_state:
-        st.session_state.reactions = {}
-    if "results_saved" not in st.session_state:
-        st.session_state.results_saved = False
-
-    # Access the Hugging Face token from the environment variable
    huggingface_token = os.environ.get("al_ghazali_rag_retrieval_evaluation")
    if huggingface_token:
        login(token=huggingface_token)
@@ -130,14 +168,13 @@ def main():
    # Initialize OpenAI client
    client = OpenAI()

-    # Load database
-    database_file = '[openai_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet'

    try:
        df = load_data(database_file)
        st.success("Database loaded successfully!")

-        # Select user type
        user_type = st.radio(
            "Select your user type:",
            ["Layman", "Enthusiast", "Ustaz (Expert)"],
@@ -146,15 +183,27 @@ def main():

        query = st.text_area("Enter your query:")

        if st.button("Search") and query:
            start_time = timer()
-
            end_time = timer()
-
            st.write(f"Time taken to compute scores: {end_time - start_time:.5f} seconds")
-
-            # Store the top results indices in session_state
-            st.session_state.top_results = res.index.tolist()
            st.session_state.search_performed = True

        # Display results and collect reactions
@@ -162,42 +211,75 @@ def main():
            st.subheader("Query Results")
            st.write(f"Query: {query}")

-            for idx in st.session_state.top_results:
                text = df.iloc[int(idx)]["ext"]
                st.write(f"**Text:** {text}")

-                key = f"reaction_{idx}"
                if key not in st.session_state.reactions:
                    st.session_state.reactions[key] = "🤷"

-                # Use a callback to handle reaction selection
                reaction = st.radio(
-                    label=f"Rate this result (Result {idx}):",
                    options=["👎", "🤷", "👍"],
                    index=["👎", "🤷", "👍"].index(st.session_state.reactions[key]),
                    key=key,
                    horizontal=True,
                    on_change=update_reaction,
-                    args=(idx,)
                )

-            #
            if st.button("Save Reactions"):
-                # Collect
-                results = []
-                for idx in st.session_state.top_results:
-                    key = f"reaction_{idx}"
-                    results.append({
                        "text": df.iloc[int(idx)]["ext"],
                        "reaction": st.session_state.reactions[key]
                    })
-                save_reactions_to_dataset(user_type, query, results)
-                st.success("Reactions saved successfully!")

-                #
-
-                st.session_state.
-

    except Exception as e:
        st.error(f"Failed to load database: {str(e)}")
@@ -1,76 +1,86 @@
+from openai import OpenAI
+from sentence_transformers import SentenceTransformer
from time import perf_counter as timer
from huggingface_hub import login
+from datasets import Dataset, load_dataset
+import streamlit as st
+import pandas as pd
+import numpy as np
+import torch as t
import os

+# Cache the model loading
+@st.cache_resource
+def load_sentence_transformer():
+    """Cache the SentenceTransformer model loading to avoid reloading on every rerun"""
+    return SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")
+
+# Cache the database loading
+@st.cache_data
+def load_data(database_file):
+    return pd.read_parquet(database_file)
+
def load_credentials():
    credentials = {}
+    for i in range(1, 51):
        username = os.environ.get(f"login_{i}")
        password = os.environ.get(f"password_{i}")
        if username and password:
            credentials[username] = password
    return credentials

def authenticate(username, password, credentials):
    return credentials.get(username) == password

+def save_reactions_to_dataset(user_type, username, query, results_mpnet, results_openai):
    data = {
        "user_type": [],
+        "username": [],
        "query": [],
        "retrieved_text": [],
+        "model_type": [],
        "reaction": []
    }

+    # Add results from MPNet
+    for result in results_mpnet:
        data["user_type"].append(user_type)
+        data["username"].append(username)
        data["query"].append(query)
        data["retrieved_text"].append(result["text"])
+        data["model_type"].append("all-mpnet-base-v2")
+        data["reaction"].append(result["reaction"])
+
+    # Add results from OpenAI
+    for result in results_openai:
+        data["user_type"].append(user_type)
+        data["username"].append(username)
+        data["query"].append(query)
+        data["retrieved_text"].append(result["text"])
+        data["model_type"].append("openai")
        data["reaction"].append(result["reaction"])

    try:
        dataset = load_dataset("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation", split="train")
        existing_data = dataset.to_dict()
    except Exception:
        existing_data = {
            "user_type": [],
+            "username": [],
            "query": [],
            "retrieved_text": [],
+            "model_type": [],
            "reaction": []
        }

    for key in data:
        existing_data[key].extend(data[key])

    updated_dataset = Dataset.from_dict(existing_data)
    updated_dataset.push_to_hub("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation")

+def update_reaction(model_type, idx):
+    st.session_state.reactions[f"reaction_{model_type}_{idx}"] = st.session_state[f"reaction_{model_type}_{idx}"]

def cosine_similarity(embedding_0, embedding_1):
    dot_product = sum(a * b for a, b in zip(embedding_0, embedding_1))
@@ -78,22 +88,59 @@ def cosine_similarity(embedding_0, embedding_1):
    norm_1 = sum(b * b for b in embedding_1) ** 0.5
    return dot_product / (norm_0 * norm_1)

+def generate_embedding(model, text, model_type="all-mpnet-base-v2"):
+    if model_type == "all-mpnet-base-v2":
+        chunk_embedding = model.encode(
+            text,
+            convert_to_tensor = True
+        )
+        return np.array(t.Tensor.cpu(chunk_embedding))
+    elif model_type == "openai":
+        response = model.embeddings.create(
+            input=text,
+            model="text-embedding-3-small"
+        )
+        return response.data[0].embedding
+
+def search_query(model, query, df, model_type, n=3):
+    if model_type == "all-mpnet-base-v2":
+        embedding = generate_embedding(model, query, model_type=model_type)
+        df['similarities'] = df.all_mpnet_embedding.apply(lambda x: cosine_similarity(x, embedding))
+    elif model_type == "openai":
+        embedding = generate_embedding(model, query, model_type=model_type)
+        df['similarities'] = df.openai_embedding.apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values('similarities', ascending=False).head(n)
    return res

+def clear_search_state():
+    """Clear search-related session state variables"""
+    st.session_state.search_performed = False
+    st.session_state.top_results_mpnet = []
+    st.session_state.top_results_openai = []
+    st.session_state.reactions = {}
+    st.session_state.results_saved = False
+
def main():
    st.title("EnlightenQalb (Alchemy of Happiness)")

+    # Initialize session state variables
    if 'authenticated' not in st.session_state:
        st.session_state.authenticated = False
+        st.session_state.username = None
+        st.session_state.search_performed = False
+        st.session_state.top_results_mpnet = []
+        st.session_state.top_results_openai = []
+        st.session_state.reactions = {}
+        st.session_state.results_saved = False
+        st.session_state.current_query = ""
+
+    # Load the model at startup (will be cached)
+    embedding_model = load_sentence_transformer()
+
+    # Load credentials
+    credentials = load_credentials()

+    # Authentication handling
    if not st.session_state.authenticated:
        st.sidebar.title("Login")
        username = st.sidebar.text_input("Username")
@@ -102,6 +149,7 @@ def main():
        if st.sidebar.button("Login"):
            if authenticate(username, password, credentials):
                st.session_state.authenticated = True
+                st.session_state.username = username
                st.sidebar.success("Logged in successfully!")
            else:
                st.sidebar.error("Invalid username or password")
@@ -110,17 +158,7 @@ def main():
        st.warning("Please login to access the application.")
        return

+    # Login to Hugging Face
    huggingface_token = os.environ.get("al_ghazali_rag_retrieval_evaluation")
    if huggingface_token:
        login(token=huggingface_token)
@@ -130,14 +168,13 @@ def main():
    # Initialize OpenAI client
    client = OpenAI()

+    # Load database
+    database_file = '[all_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet'

    try:
        df = load_data(database_file)
        st.success("Database loaded successfully!")

        user_type = st.radio(
            "Select your user type:",
            ["Layman", "Enthusiast", "Ustaz (Expert)"],
@@ -146,15 +183,27 @@ def main():

        query = st.text_area("Enter your query:")

+        # Clear search state if query changes
+        if query != st.session_state.current_query:
+            clear_search_state()
+            st.session_state.current_query = query
+
        if st.button("Search") and query:
+            clear_search_state() # Clear previous search results
+
+            # Perform searches with both models
            start_time = timer()
+
+            # MPNet search
+            res_mpnet = search_query(embedding_model, query, df, "all-mpnet-base-v2", n=1)
+            st.session_state.top_results_mpnet = res_mpnet.index.tolist()
+
+            # OpenAI search
+            res_openai = search_query(client, query, df, "openai", n=1)
+            st.session_state.top_results_openai = res_openai.index.tolist()
+
            end_time = timer()
            st.write(f"Time taken to compute scores: {end_time - start_time:.5f} seconds")
            st.session_state.search_performed = True

        # Display results and collect reactions
@@ -162,42 +211,75 @@ def main():
            st.subheader("Query Results")
            st.write(f"Query: {query}")

+            # Display MPNet results
+            st.markdown("### Results from MPNet Model")
+            for idx in st.session_state.top_results_mpnet:
                text = df.iloc[int(idx)]["ext"]
                st.write(f"**Text:** {text}")

+                key = f"reaction_mpnet_{idx}"
                if key not in st.session_state.reactions:
                    st.session_state.reactions[key] = "🤷"

                reaction = st.radio(
+                    label=f"Rate this MPNet result (Result {idx}):",
                    options=["👎", "🤷", "👍"],
                    index=["👎", "🤷", "👍"].index(st.session_state.reactions[key]),
                    key=key,
                    horizontal=True,
                    on_change=update_reaction,
+                    args=("mpnet", idx)
                )

+            # Display OpenAI results
+            st.markdown("### Results from OpenAI Model")
+            for idx in st.session_state.top_results_openai:
+                text = df.iloc[int(idx)]["ext"]
+                st.write(f"**Text:** {text}")
+
+                key = f"reaction_openai_{idx}"
+                if key not in st.session_state.reactions:
+                    st.session_state.reactions[key] = "🤷"
+
+                reaction = st.radio(
+                    label=f"Rate this OpenAI result (Result {idx}):",
+                    options=["👎", "🤷", "👍"],
+                    index=["👎", "🤷", "👍"].index(st.session_state.reactions[key]),
+                    key=key,
+                    horizontal=True,
+                    on_change=update_reaction,
+                    args=("openai", idx)
+                )
+
+            # Save reactions button
            if st.button("Save Reactions"):
+                # Collect MPNet results
+                results_mpnet = []
+                for idx in st.session_state.top_results_mpnet:
+                    key = f"reaction_mpnet_{idx}"
+                    results_mpnet.append({
                        "text": df.iloc[int(idx)]["ext"],
                        "reaction": st.session_state.reactions[key]
                    })

+                # Collect OpenAI results
+                results_openai = []
+                for idx in st.session_state.top_results_openai:
+                    key = f"reaction_openai_{idx}"
+                    results_openai.append({
+                        "text": df.iloc[int(idx)]["ext"],
+                        "reaction": st.session_state.reactions[key]
+                    })
+
+                save_reactions_to_dataset(
+                    user_type,
+                    st.session_state.username,
+                    query,
+                    results_mpnet,
+                    results_openai
+                )
+                st.success("Reactions saved successfully!")
+                clear_search_state()

    except Exception as e:
        st.error(f"Failed to load database: {str(e)}")
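For reference, a minimal sketch of the MPNet half of the new dual-model retrieval, run outside Streamlit. It mirrors generate_embedding and search_query above; the query string is only an example, and it assumes the parquet has been pulled via git lfs and exposes the all_mpnet_embedding and ext columns used in the updated app.py.

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

def cosine_similarity(a, b):
    # Same pure-Python cosine similarity formula as in app.py
    dot = sum(x * y for x, y in zip(a, b))
    return dot / ((sum(x * x for x in a) ** 0.5) * (sum(y * y for y in b) ** 0.5))

df = pd.read_parquet("[all_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet")
model = SentenceTransformer("all-mpnet-base-v2", device="cpu")

query = "What does al-Ghazali say about knowledge of the self?"  # example query
query_embedding = np.array(model.encode(query, convert_to_tensor=True).cpu())

df["similarities"] = df["all_mpnet_embedding"].apply(lambda e: cosine_similarity(e, query_embedding))
top = df.sort_values("similarities", ascending=False).head(1)
print(top["ext"].iloc[0])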
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
torch
pandas
-sentence-transformers
+sentence-transformers
+openai
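The new openai entry backs the client.embeddings.create call that app.py now routes OpenAI queries through. A minimal sketch of that call, assuming pip install -r requirements.txt has been run and OPENAI_API_KEY is set in the environment; the input string is only an example.

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
response = client.embeddings.create(
    input="What is the alchemy of happiness?",  # example query text
    model="text-embedding-3-small",
)
embedding = response.data[0].embedding
print(len(embedding))  # text-embedding-3-small returns 1536-dimensional vectors by default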