|
import gradio as gr |
|
from sentence_transformers import SentenceTransformer |
|
from wikipediaapi import Wikipedia |
|
import textwrap |
|
import numpy as np |
|
from openai import OpenAI |
|
|
|
|
|
def process_query(wiki_page, model_name, embed_dim, query, api_key):
    """Answer an Arabic-language query with RAG over one Wikipedia page.

    Pipeline: fetch the page text, split it into paragraphs, embed the
    paragraphs and the query with the chosen sentence-transformer (truncated
    to ``embed_dim`` Matryoshka dimensions), pick the 3 most similar
    paragraphs by cosine similarity (embeddings are normalized, so a dot
    product suffices), and ask gpt-4o to answer from that context.

    Args:
        wiki_page: Title of the Arabic Wikipedia page to retrieve.
        model_name: Display name of the embedding model (must be one of the
            keys in ``model_mapping`` below; unknown names raise KeyError).
        embed_dim: Matryoshka truncation dimension passed to the model.
        query: The user's question, in Arabic.
        api_key: OpenAI API key used for the chat completion call.

    Returns:
        The model's answer as a string, or a short error message when the
        Wikipedia page does not exist or is empty.
    """
    # Display name -> Hugging Face model id.
    model_mapping = {
        "Arabic-mpnet-base-all-nli-triplet": "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
        "Arabic-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
        "Arabert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka",
        "Arabic-labse-Matryoshka": "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
        "Marbert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka"
    }

    model_path = model_mapping[model_name]
    # NOTE: the model is reloaded on every request; acceptable for a demo,
    # but a production app should cache it per (model_path, embed_dim).
    model = SentenceTransformer(model_path, trust_remote_code=True, truncate_dim=embed_dim)

    wiki = Wikipedia('RAGBot/0.0', 'ar')
    doc = wiki.page(wiki_page).text
    # Guard against a nonexistent/empty page: without this, we would try to
    # embed an empty paragraph list and fail deep inside numpy instead.
    if not doc.strip():
        return "Wikipedia page not found or empty. Please check the page title."

    # Split into paragraphs, dropping blanks produced by consecutive newlines.
    paragraphs = [p for p in doc.split('\n\n') if p.strip()]

    docs_embed = model.encode(paragraphs, normalize_embeddings=True)
    query_embed = model.encode(query, normalize_embeddings=True)

    # Embeddings are L2-normalized, so the dot product is cosine similarity.
    similarities = np.dot(docs_embed, query_embed.T)
    # Indices of the (up to) 3 most similar paragraphs, best first.
    top_3_idx = np.argsort(similarities, axis=0)[-3:][::-1].tolist()
    most_similar_documents = [paragraphs[idx] for idx in top_3_idx]

    # Wrap each retrieved paragraph for readability and join into one context.
    CONTEXT = "\n\n".join(
        textwrap.fill(p, width=100) for p in most_similar_documents
    ) + "\n\n"

    prompt = f"""

    use the following CONTEXT to answer the QUESTION at the end.

    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    CONTEXT: {CONTEXT}

    QUESTION: {query}

    """

    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt},
        ]
    )

    return response.choices[0].message.content
|
|
|
|
|
# --- Gradio UI wiring -------------------------------------------------------
# Free-text inputs: page title, user query, and the (masked) OpenAI key.
wiki_page_input = gr.Textbox(label="Wikipedia Page (in Arabic)")
query_input = gr.Textbox(label="Query (in Arabic)")
api_key_input = gr.Textbox(label="OpenAI API Key", type="password")

# Selectable embedding models (display names resolved inside process_query).
EMBEDDING_MODELS = [
    "Arabic-mpnet-base-all-nli-triplet",
    "Arabic-all-nli-triplet-Matryoshka",
    "Arabert-all-nli-triplet-Matryoshka",
    "Arabic-labse-Matryoshka",
    "Marbert-all-nli-triplet-Matryoshka",
]
model_choice = gr.Dropdown(choices=EMBEDDING_MODELS, label="Choose Embedding Model")

# Matryoshka truncation dimensions supported by the models above.
embed_dim_choice = gr.Dropdown(choices=[768, 512, 256, 128, 64], label="Embedding Dimension")

output_text = gr.Textbox(label="Output")

# Assemble the interface and start the app. Input order must match the
# process_query(wiki_page, model_name, embed_dim, query, api_key) signature.
demo = gr.Interface(
    fn=process_query,
    inputs=[wiki_page_input, model_choice, embed_dim_choice, query_input, api_key_input],
    outputs=output_text,
    title="Arabic Wiki RAG",
    description="Choose a Wikipedia page, embedding model, and dimension to answer a query in Arabic.",
)
demo.launch()