import gradio as gr
from sentence_transformers import SentenceTransformer
from wikipediaapi import Wikipedia
import textwrap
import numpy as np
from openai import OpenAI


# Process the query: fetch the Wikipedia page, embed its paragraphs,
# retrieve the most relevant ones, and ask GPT-4o to answer from that context.
def process_query(wiki_page, model_name, embed_dim, query, api_key):
    # Map the display names shown in the UI to their Hugging Face model IDs
    model_mapping = {
        "Arabic-mpnet-base-all-nli-triplet": "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
        "Arabic-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
        "Arabert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka",
        "Arabic-labse-Matryoshka": "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
        "Marbert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka"
    }

    model_path = model_mapping[model_name]
    # truncate_dim keeps only the first embed_dim dimensions (Matryoshka embeddings)
    model = SentenceTransformer(model_path, trust_remote_code=True, truncate_dim=embed_dim)

    # Fetch the Arabic Wikipedia page and chunk it into paragraphs
    wiki = Wikipedia('RAGBot/0.0', 'ar')
    doc = wiki.page(wiki_page).text
    paragraphs = doc.split('\n\n')

    # Embed the paragraphs and the query, then rank paragraphs by similarity
    # (dot product of normalized embeddings = cosine similarity)
    docs_embed = model.encode(paragraphs, normalize_embeddings=True)
    query_embed = model.encode(query, normalize_embeddings=True)
    similarities = np.dot(docs_embed, query_embed.T)
    top_3_idx = np.argsort(similarities, axis=0)[-3:][::-1].tolist()
    most_similar_documents = [paragraphs[idx] for idx in top_3_idx]

    # Build the context from the top-3 paragraphs, wrapped for readability
    CONTEXT = ""
    for p in most_similar_documents:
        CONTEXT += textwrap.fill(p, width=100) + "\n\n"

    prompt = f"""
    Use the following CONTEXT to answer the QUESTION at the end.
    If you don't know the answer, just say that you don't know; don't try to make up an answer.

    CONTEXT: {CONTEXT}
    QUESTION: {query}
    """

    # Ask GPT-4o to answer the question using only the retrieved context
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt},
        ]
    )
    return response.choices[0].message.content


# Define the interface
wiki_page_input = gr.Textbox(label="Wikipedia Page (in Arabic)")
query_input = gr.Textbox(label="Query (in Arabic)")
api_key_input = gr.Textbox(label="OpenAI API Key", type="password")

model_choice = gr.Dropdown(
    choices=[
        "Arabic-mpnet-base-all-nli-triplet",
        "Arabic-all-nli-triplet-Matryoshka",
        "Arabert-all-nli-triplet-Matryoshka",
        "Arabic-labse-Matryoshka",
        "Marbert-all-nli-triplet-Matryoshka"
    ],
    label="Choose Embedding Model"
)

embed_dim_choice = gr.Dropdown(
    choices=[768, 512, 256, 128, 64],
    label="Embedding Dimension"
)

output_text = gr.Textbox(label="Output")

gr.Interface(
    fn=process_query,
    inputs=[wiki_page_input, model_choice, embed_dim_choice, query_input, api_key_input],
    outputs=output_text,
    title="Arabic Wiki RAG",
    description="Choose a Wikipedia page, embedding model, and dimension to answer a query in Arabic."
).launch()