"""EnlightenQalb — Gradio app for rating RAG retrieval results.

Searches a pre-embedded parquet database of "The Alchemy of Happiness"
using OpenAI embeddings and cosine similarity, and pushes user reactions
to a Hugging Face dataset for retrieval evaluation.
"""

import gradio as gr
import pandas as pd
from time import perf_counter as timer
from datasets import Dataset, load_dataset
from huggingface_hub import login
import os
from openai import OpenAI


def load_credentials():
    """Collect username/password pairs from environment variables.

    Reads ``login_<i>`` / ``password_<i>`` for i in 1..50 and returns a
    dict mapping username -> password. Slots with either half missing
    are skipped.
    """
    credentials = {}
    for i in range(1, 51):  # Up to 50 credential slots: login_1 .. login_50
        username = os.environ.get(f"login_{i}")
        password = os.environ.get(f"password_{i}")
        if username and password:
            credentials[username] = password
    return credentials


def authenticate(username, password, credentials):
    """Return True when *password* matches the stored one for *username*.

    Unknown usernames yield ``credentials.get(...) is None`` and so fail.
    """
    return credentials.get(username) == password


def load_data(database_file):
    """Load the pre-embedded document database from a parquet file."""
    return pd.read_parquet(database_file)


def save_reactions_to_dataset(user_type, query, results):
    """Append user reactions to the evaluation dataset on the HF Hub.

    Args:
        user_type: Self-declared user category (e.g. "Layman").
        query: The search query the results were retrieved for.
        results: Iterable of dicts with "text" and "reaction" keys.

    Side effect: pushes the combined dataset to
    ``HumbleBeeAI/al-ghazali-rag-retrieval-evaluation``.
    """
    # Flatten the incoming results into column-oriented lists.
    data = {
        "user_type": [],
        "query": [],
        "retrieved_text": [],
        "reaction": []
    }
    for result in results:
        data["user_type"].append(user_type)
        data["query"].append(query)
        data["retrieved_text"].append(result["text"])
        data["reaction"].append(result["reaction"])

    # Load existing dataset from the Hub (if it exists).
    try:
        dataset = load_dataset("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation", split="train")
        existing_data = dataset.to_dict()
    except Exception:
        # If the dataset doesn't exist yet, start from empty columns.
        existing_data = {
            "user_type": [],
            "query": [],
            "retrieved_text": [],
            "reaction": []
        }

    # Append new rows to the existing columns.
    for key in data:
        existing_data[key].extend(data[key])

    # Rebuild and push the combined dataset.
    updated_dataset = Dataset.from_dict(existing_data)
    updated_dataset.push_to_hub("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation")


def generate_openai_embeddings(client, text):
    """Return the embedding vector for *text* via the OpenAI API."""
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding


def cosine_similarity(embedding_0, embedding_1):
    """Cosine similarity between two equal-length numeric vectors.

    Note: raises ZeroDivisionError if either vector has zero norm.
    """
    dot_product = sum(a * b for a, b in zip(embedding_0, embedding_1))
    norm_0 = sum(a * a for a in embedding_0) ** 0.5
    norm_1 = sum(b * b for b in embedding_1) ** 0.5
    return dot_product / (norm_0 * norm_1)


def search_query(client, query, df, n=3):
    """Return the *n* rows of *df* most similar to *query*.

    Embeds the query, scores every row's ``openai_embedding`` column by
    cosine similarity (written into a ``similarities`` column as a side
    effect on *df*), and returns the top-n rows, highest first.
    """
    embedding = generate_openai_embeddings(client, query)
    df['similarities'] = df.openai_embedding.apply(lambda x: cosine_similarity(x, embedding))
    return df.sort_values('similarities', ascending=False).head(n)


def main(username, password, user_type, query, reactions=None):
    """Authenticate, run a similarity search, and optionally save reactions.

    Args:
        username / password: Checked against env-provided credentials.
        user_type: Forwarded to the saved dataset rows.
        query: Free-text search query.
        reactions: Optional dict mapping result index -> reaction emoji.

    Returns:
        (status_message, results, save_message) matching the Gradio
        outputs (Textbox, JSON, Textbox). Error branches return an empty
        string for save_message so the Textbox output stays consistent.
    """
    credentials = load_credentials()
    if not authenticate(username, password, credentials):
        # Fixed: third output is a gr.Textbox, so return "" rather than [].
        return "Invalid username or password", [], ""

    # Hugging Face token is required so reactions can be pushed later.
    huggingface_token = os.environ.get("al_ghazali_rag_retrieval_evaluation")
    if huggingface_token:
        login(token=huggingface_token)
    else:
        return "Hugging Face API token not found in environment variables.", [], ""

    # OpenAI client picks up OPENAI_API_KEY from the environment.
    client = OpenAI()

    # Pre-embedded database shipped alongside the app.
    database_file = '[openai_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet'

    try:
        df = load_data(database_file)

        start_time = timer()
        res = search_query(client, query, df, n=3)
        end_time = timer()

        results = []
        for idx in res.index.tolist():
            # Fixed: res.index holds labels, so look up with .loc, not .iloc
            # (identical for the default RangeIndex, correct otherwise).
            # NOTE(review): column is "ext" in the original — looks like a
            # typo for "text"; preserved pending confirmation of the schema.
            text = df.loc[int(idx)]["ext"]
            results.append({"text": text, "index": idx})

        # If reactions were supplied, persist them to the Hub dataset.
        if reactions:
            reaction_results = []
            for idx, reaction in reactions.items():
                reaction_results.append({
                    "text": df.loc[int(idx)]["ext"],
                    "reaction": reaction
                })
            save_reactions_to_dataset(user_type, query, reaction_results)
            return f"Time taken to compute scores: {end_time - start_time:.5f} seconds", results, "Reactions saved successfully!"

        return f"Time taken to compute scores: {end_time - start_time:.5f} seconds", results, ""
    except Exception as e:
        # NOTE(review): this also catches search/embedding failures, not
        # just database-load errors; message kept for compatibility.
        return f"Failed to load database: {str(e)}", [], ""


def collect_reactions(results, reaction_1, reaction_2, reaction_3):
    """Map each result's index to its reaction (for up to three results)."""
    reactions = {}
    for i, reaction in enumerate([reaction_1, reaction_2, reaction_3]):
        if results and i < len(results):
            reactions[results[i]["index"]] = reaction
    return reactions


def gradio_interface(username, password, user_type, query, reaction_1=None, reaction_2=None, reaction_3=None):
    """Gradio entry point: search, then save reactions if any were given.

    NOTE(review): when reactions are present this runs the search twice
    (once without and once with reactions) — inefficient but preserved,
    as main() couples saving to a fresh retrieval.
    """
    time_taken, results, save_message = main(username, password, user_type, query)

    # Only collect reactions if at least one radio was set.
    if reaction_1 is not None or reaction_2 is not None or reaction_3 is not None:
        reactions = collect_reactions(results, reaction_1, reaction_2, reaction_3)
        if any(reactions.values()):  # At least one real reaction -> save
            _, _, save_message = main(username, password, user_type, query, reactions)

    return time_taken, results, save_message


# Input and output components for Gradio.
inputs = [
    gr.Textbox(label="Username"),
    gr.Textbox(label="Password", type="password"),
    gr.Radio(["Layman", "Enthusiast", "Ustaz (Expert)"], label="Select your user type:"),
    gr.Textbox(label="Enter your query:"),
    gr.Radio(["👎", "🤷", "👍"], label="Reaction for Result 1"),
    gr.Radio(["👎", "🤷", "👍"], label="Reaction for Result 2"),
    gr.Radio(["👎", "🤷", "👍"], label="Reaction for Result 3"),
]

outputs = [
    gr.Textbox(label="Time taken"),
    gr.JSON(label="Results"),
    gr.Textbox(label="Save Status"),
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=inputs,
    outputs=outputs,
    title="EnlightenQalb (Alchemy of Happiness)",
    description="Search and rate results from The Alchemy of Happiness."
)

if __name__ == "__main__":
    iface.launch()