Fawaz committed on
Commit 3240876 • 1 Parent(s): 193c1e4

Add application file

Files changed (1): app.py +141 -0
app.py ADDED
@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
"""Task22.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1yBvg6i_GsMk--P2nuSG-mfqCDbuIcEpx

# Task 2
- Raghad Al-Rasheed
- Fawwaz Alsheikhi

Using the E5 model as the embedding model and a translated dataset from Hugging Face.
"""

# Shell installs are Colab-only; a Hugging Face Space installs its
# dependencies from requirements.txt instead.
# !pip install sentence_transformers

"""## Downloading the embedding model"""

from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import math
from scipy import spatial

# Load the multilingual E5 model; device="cuda" is the documented way to
# place a SentenceTransformer on the GPU.
model = SentenceTransformer("intfloat/multilingual-e5-large", device="cuda")

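# Note (an addition, not in the original code): E5-family models are trained
# with "query: " and "passage: " prefixes, and the model card recommends using
# them when encoding, e.g.
#
#   embeddings = model.encode(["passage: " + d for d in documents])
#   query_embedding = model.encode("query: " + query)
#
# The code below encodes raw text, which works but may cost some retrieval quality.
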
"""## Downloading translated data from English to Arabic"""

# !pip3 install datasets  # Colab-only; see the requirements.txt note above

from datasets import load_dataset

ds = load_dataset("Helsinki-NLP/news_commentary", "ar-en", split="train")

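# Each record in ds['translation'] is a dict keyed by language code,
# e.g. {'ar': '<Arabic sentence>', 'en': '<English sentence>'}.
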
import pandas as pd

df = pd.DataFrame(ds['translation'])

# Inspect the Arabic column and its first entry.
print(df['ar'])
print(df['ar'][0])

"""### Extracting the first 10,000 rows of the data"""

df = df.head(10000)

print(df['ar'].shape)

documents = [doc for doc in df['ar']]

print(documents[9999])

"""## Embedding the sentences by rows"""

embeddings = model.encode(documents)

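# Optional sketch (an addition, not in the original): encoding 10,000 sentences
# is slow, so the embeddings can be cached to disk and reloaded on restart:
#
#   import numpy as np
#   np.save("embeddings.npy", embeddings)      # after the first run
#   embeddings = np.load("embeddings.npy")     # on later runs
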
def semantic_search(query, embeddings, documents):
    """Rank every document by cosine distance to the query; print the closest one."""
    query_embedding = model.encode(query)

    scores = [spatial.distance.cosine(query_embedding, doc) for doc in embeddings]
    ls1 = [[documents[i], score] for i, score in enumerate(scores)]

    best = scores.index(min(scores))
    print(best)
    print("Most similar document:", documents[best])
    return ls1

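# A vectorized alternative (a sketch, not part of the original notebook):
# compute all cosine similarities in one numpy step instead of a Python loop.
import numpy as np

def semantic_search_fast(query, embeddings, documents):
    # embeddings is the precomputed (N, dim) matrix returned by model.encode.
    q = model.encode(query)
    # Cosine similarity: dot products divided by the vector norms.
    sims = embeddings @ q / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(q))
    return documents[int(np.argmax(sims))]
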
# Query (Arabic): "It has never been easy to engage in a rational conversation
# about the value of gold."
output = semantic_search("ـ لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.", embeddings, documents)

print(documents[999])

"""### Extracting the top three related sentences"""

ranked = sorted(output, key=lambda x: x[1])
print(ranked[:3])

print(df)

"""## Using English alongside Arabic to test the multilingual model's semantic search"""

print(df['ar'])
print(df['en'])

df_ar = df['ar'].tolist()[:5000]
df_en = df['en'].tolist()[:5000]

combined_list = df_ar + df_en

print(len(combined_list))

embeddings1 = model.encode(combined_list)

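# Indices 0-4999 of combined_list are Arabic and 5000-9999 are English, so the
# index of a match shows which language the model retrieved for a given query.
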
# The semantic_search function defined above is generic over its corpus and
# embedding matrix, so it is reused here with the combined bilingual corpus.

# Query (Arabic): "gold at ten thousand dollars?"
output = semantic_search("لذهب بعشرة آلاف دولار؟", embeddings1, combined_list)

ranked = sorted(output, key=lambda x: x[1])
print(ranked[:3])

import gradio as gr

def search_top3(query):
    # Adapter for Gradio: semantic_search's (query, embeddings, documents)
    # signature cannot be wired to a single text input directly.
    top3 = sorted(semantic_search(query, embeddings1, combined_list), key=lambda x: x[1])[:3]
    return top3[0][0], top3[1][0], top3[2][0]

demo = gr.Interface(fn=search_top3, inputs="text", outputs=["text", "text", "text"])

if __name__ == "__main__":
    demo.launch()
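# Run locally with `python app.py`; Gradio serves on http://127.0.0.1:7860 by default.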