File size: 5,240 Bytes
21a79fb
 
 
 
c2b54ad
 
 
 
 
 
 
 
21a79fb
c2b54ad
 
 
21a79fb
 
c2b54ad
 
954807c
ff131c7
 
 
 
 
 
 
 
 
 
 
954807c
 
 
 
 
 
 
 
21a79fb
954807c
 
21a79fb
954807c
 
 
ff131c7
954807c
 
 
ff131c7
21a79fb
c2b54ad
 
 
 
 
 
 
 
 
 
 
 
 
 
ff131c7
21a79fb
 
c2b54ad
060eb8e
bd7b1da
954807c
cce52ef
 
 
21a79fb
c2b54ad
ff131c7
21a79fb
 
c2b54ad
 
21a79fb
 
 
 
 
954807c
c2b54ad
21a79fb
 
 
 
ff131c7
a11557a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from functools import lru_cache

import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Available models: maps the short name shown in the UI to the full
# Hugging Face Hub repository id. Every model lives under the same Hub
# namespace, so the ids are derived from the short names.
_HUB_NAMESPACE = "Omartificial-Intelligence-Space"
model_dict = {
    short_name: f"{_HUB_NAMESPACE}/{short_name}"
    for short_name in (
        "Arabic-mpnet-base-all-nli-triplet",
        "Arabic-all-nli-triplet-Matryoshka",
        "Arabert-all-nli-triplet-Matryoshka",
        "Arabic-labse-Matryoshka",
        "Marbert-all-nli-triplet-Matryoshka",
    )
}

# Function to load the selected model.
# The cache is bounded so that switching between models does not keep every
# checkpoint resident in memory at once, while repeated requests for the same
# model reuse the already-constructed instance instead of rebuilding it.
@lru_cache(maxsize=2)
def load_model(model_name):
    """Return the SentenceTransformer registered under *model_name*.

    Args:
        model_name: A key of ``model_dict`` (the short name shown in the UI).

    Returns:
        The ``SentenceTransformer`` built from the matching Hub repository id.
        Instances are memoized per name, so repeated calls with the same name
        return the same object.

    Raises:
        KeyError: If *model_name* is not a key of ``model_dict``.
    """
    return SentenceTransformer(model_dict[model_name])

def _is_blank(text):
    """Return True when *text* is None, empty, or whitespace only."""
    return text is None or not text.strip()


# Function to compute sentence-level (and, for pairs, word-level) similarity
def predict(model_name, mode, sentence1, sentence2=None, sentence3=None, sentence4=None, dimension="64"):
    """Compute cosine similarities between sentences with a selected model.

    Args:
        model_name: Key of ``model_dict`` naming the model to use.
        mode: ``"Compare one to three"`` scores ``sentence1`` against
            sentences 2-4. Any other value scores ``sentence1`` against
            ``sentence2``; word-level scores are added only when the mode is
            exactly ``"Compare two sentences"``.
        sentence1: The reference sentence.
        sentence2: Second sentence (required in every mode).
        sentence3: Third sentence (required in one-to-three mode).
        sentence4: Fourth sentence (required in one-to-three mode).
        dimension: Number of leading embedding components to keep, as a
            string or int. A missing/empty value falls back to 64.

    Returns:
        On success, a dict with the keys ``"Selected Dimension"``,
        ``"Input Sentences"``, ``"Similarity Scores"`` and, in two-sentence
        mode, ``"Word-level Similarities"``.
        When a required sentence is missing or blank, a
        ``(message, {})`` tuple describing what is missing.
    """
    one_to_three = mode == "Compare one to three"

    # Validate before loading the model so an incomplete form never pays for
    # a model load. Blank strings count as missing: an empty text box arrives
    # as "" rather than None, so an `is None` test alone would let it through.
    if _is_blank(sentence1):
        return "Please provide the first sentence for comparison.", {}
    if one_to_three:
        if any(_is_blank(s) for s in (sentence2, sentence3, sentence4)):
            return "Please provide three sentences for comparison.", {}
        sentences = [sentence1, sentence2, sentence3, sentence4]
    else:
        if _is_blank(sentence2):
            return "Please provide the second sentence for comparison.", {}
        sentences = [sentence1, sentence2]

    # No dimension selected -> use the same value as the signature default.
    dimension = int(dimension) if dimension else 64

    model = load_model(model_name)
    result = {
        "Selected Dimension": dimension,
        "Input Sentences": {
            "Sentence 1": sentence1,
            "Sentence 2": sentence2,
            "Sentence 3": sentence3,
            "Sentence 4": sentence4
        },
        "Similarity Scores": {}
    }

    # Keep only the first `dimension` components of every embedding.
    # NOTE(review): if `dimension` exceeds the model's embedding width the
    # slice keeps everything, yet "Selected Dimension" still reports the
    # requested value - confirm whether that should be clamped.
    embeddings = model.encode(sentences)[..., :dimension]

    if one_to_three:
        similarities = util.cos_sim(embeddings[0], embeddings[1:])
        result["Similarity Scores"] = {
            f"Sentence {i + 2}": float(similarities[0, i]) for i in range(3)
        }
    else:
        result["Similarity Scores"] = {
            "Similarity Score": float(util.cos_sim(embeddings[0], embeddings[1]))
        }

    # Word-level similarity
    if mode == "Compare two sentences":
        words1 = sentence1.split()
        words2 = sentence2.split()
        word_embeddings1 = model.encode(words1)[..., :dimension]
        word_embeddings2 = model.encode(words2)[..., :dimension]
        # One call yields the full words1 x words2 similarity matrix instead
        # of one cos_sim call per word pair.
        pairwise = util.cos_sim(word_embeddings1, word_embeddings2)
        # Repeated words produce the same key; the last pair wins.
        result["Word-level Similarities"] = {
            f"{w1} - {w2}": float(pairwise[i, j])
            for i, w1 in enumerate(words1)
            for j, w2 in enumerate(words2)
        }

    return result

# Define inputs and outputs for Gradio interface.
# Each dropdown is given an explicit initial value so the form starts in a
# valid state; without one, submitting an untouched form can pass None for the
# model, mode, or dimension.
_model_choices = list(model_dict)
model_dropdown = gr.Dropdown(choices=_model_choices, value=_model_choices[0], label="Model")
mode_dropdown = gr.Dropdown(
    choices=["Compare two sentences", "Compare one to three"],
    value="Compare two sentences",
    label="Mode",
)
# "64" matches the default of predict()'s `dimension` parameter.
dimension_dropdown = gr.Dropdown(
    choices=["768", "512", "256", "128", "64"],
    value="64",
    label="Embedding Dimension",
)
sentence1_input = gr.Textbox(lines=2, placeholder="Enter the first sentence here...", label="Sentence 1")
sentence2_input = gr.Textbox(lines=2, placeholder="Enter the second sentence here...", label="Sentence 2 (or first of three for mode)")
sentence3_input = gr.Textbox(lines=2, placeholder="Enter the third sentence here...", label="Sentence 3")
sentence4_input = gr.Textbox(lines=2, placeholder="Enter the fourth sentence here...", label="Sentence 4")

# The order here must match the positional parameters of predict().
inputs = [model_dropdown, mode_dropdown, sentence1_input, sentence2_input, sentence3_input, sentence4_input, dimension_dropdown]
outputs = gr.JSON(label="Detailed Similarity Scores")

# One example row per mode; each row follows the same order as `inputs`.
examples = [
    ["Arabic-all-nli-triplet-Matryoshka", "Compare one to three", "يجلس شاب ذو شعر أشقر على الحائط يقرأ جريدة بينما تمر امرأة وفتاة شابة.", "ذكر شاب ينظر إلى جريدة بينما تمر إمرأتان بجانبه", "الشاب نائم بينما الأم تقود ابنتها إلى الحديقة", "رجل يقرأ الجريدة في الحديقة", "64"],
    ["Arabic-all-nli-triplet-Matryoshka", "Compare two sentences", "يجلس شاب ذو شعر أشقر على الحائط يقرأ جريدة بينما تمر امرأة وفتاة شابة.", "ذكر شاب ينظر إلى جريدة بينما تمر إمرأتان بجانبه", None, None, "64"]
]

# Create Gradio interface
demo = gr.Interface(
    fn=predict,
    title="Arabic Sentence Similarity with Matryoshka Model",
    description="Compute the semantic similarity between Arabic sentences using various SentenceTransformer models.",
    inputs=inputs,
    examples=examples,
    outputs=outputs,
    cache_examples=False,
    article="Author: OMER NACAR. Model from Hugging Face Hub: [Omartificial-Intelligence-Space/Arabic-Nli-Matryoshka](https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka)",
)
demo.launch(debug=True, share=True)