Federico Galatolo committed
Commit 717aa8f
1 Parent(s): d21b6e2
first commit
- .gitignore +4 -0
- app.py +77 -0
- requirements.txt +16 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+/env
+/__pycache__/
+
+.env
app.py ADDED
@@ -0,0 +1,77 @@
+import os
+import streamlit as st
+from elasticsearch import Elasticsearch
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+from sklearn.manifold import TSNE
+import plotly.express as plx
+
+def compare():
+    if len(multiselect) == 0: return
+    target_field = f"{model}_features"
+    ids = [documents[title] for title in multiselect]
+
+    results = []
+    for id in ids:
+        results.append(es.search(
+            index="sentences",
+            query={
+                "constant_score" : {
+                    "filter" : {
+                        "term" : {
+                            "document": id
+                        }
+                    }
+                }
+            },
+            size=limit
+        ))
+
+
+    features = []
+    classes = []
+    sentences = []
+    for result, title in zip(results, multiselect):
+        features.append(np.asarray([sent["_source"][target_field] for sent in result["hits"]["hits"]]))
+        classes.extend([title]*len(result["hits"]["hits"]))
+        sentences.extend([sent["_source"]["sentence"] for sent in result["hits"]["hits"]])
+
+    features = np.concatenate(features)
+
+    scaler = StandardScaler()
+    features = scaler.fit_transform(features)
+    tsne = TSNE(n_components=2, metric="cosine", init="pca")
+    features = tsne.fit_transform(features)
+
+    classes = [c[:10]+"..." for c in classes]
+
+    df = pd.DataFrame.from_dict(dict(
+        x=features[:, 0],
+        y=features[:, 1],
+        classes=classes,
+        sentences=sentences
+    ))
+
+
+    st.plotly_chart(plx.scatter(
+        data_frame=df,
+        x="x",
+        y="y",
+        color="classes",
+        hover_name="sentences"
+    ))
+
+es = Elasticsearch(os.environ["ELASTIC_HOST"], basic_auth=os.environ["ELASTIC_AUTH"].split(":"))
+
+results = es.search(index="documents", query={"match_all":{}})
+results = [result["_source"] for result in results["hits"]["hits"]]
+
+documents = {f"{result['title']} - {result['author']}": result['id'] for result in results}
+
+st.sidebar.title("Semantic compare")
+st.sidebar.write("Select 2 or more documents from the SERICA library to semantically compare them")
+multiselect = st.sidebar.multiselect("Documents", list(documents.keys()))
+model = st.sidebar.selectbox("Model", ["LaBSE"])
+limit = st.sidebar.number_input("Sentences per document", 1000)
+st.sidebar.button("Compare", on_click=compare)
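For context, app.py reads two Elasticsearch indices: "documents" (records with id, title and author fields, used to build the sidebar picker) and "sentences" (records with a document field pointing back to the parent document's id, the raw sentence text, and a per-model embedding stored under "<model>_features", e.g. LaBSE_features). Below is a minimal sketch of indexing toy records in that shape; the index and field names come from app.py, while the sample values and the 768-dimensional LaBSE vector size are assumptions.

import os
from elasticsearch import Elasticsearch

# Same connection convention as app.py: ELASTIC_HOST plus "user:password" in ELASTIC_AUTH.
es = Elasticsearch(os.environ["ELASTIC_HOST"],
                   basic_auth=tuple(os.environ["ELASTIC_AUTH"].split(":")))

# "documents" index: one record per work; app.py shows "title - author" and keeps the custom id.
es.index(index="documents", document={
    "id": "doc-1",                 # hypothetical document id
    "title": "Example title",
    "author": "Example author",
})

# "sentences" index: one record per sentence, referencing its document and carrying the embedding.
es.index(index="sentences", document={
    "document": "doc-1",           # matches the "term" filter used in compare()
    "sentence": "An example sentence.",
    "LaBSE_features": [0.0] * 768, # assumed LaBSE embedding dimensionality
})

With at least two documents indexed this way, selecting them in the sidebar and pressing Compare should yield the t-SNE scatter plot.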
requirements.txt ADDED
@@ -0,0 +1,16 @@
+certifi==2022.6.15
+elastic-transport==8.1.2
+elasticsearch==8.3.3
+joblib==1.1.0
+numpy==1.23.1
+pandas==1.4.3
+plotly==5.9.0
+python-dateutil==2.8.2
+pytz==2022.1
+scikit-learn==1.1.1
+scipy==1.9.0
+six==1.16.0
+sklearn==0.0
+tenacity==8.0.1
+threadpoolctl==3.1.0
+urllib3==1.26.11
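The requirements pin the Elasticsearch 8.x client and the scientific stack used by app.py; streamlit itself is not listed, presumably because the Space's Streamlit SDK runtime provides it, and sklearn==0.0 is the long-deprecated PyPI stub that only exists to pull in scikit-learn (already pinned above). A quick connectivity sanity check before launching the app, assuming ELASTIC_HOST and ELASTIC_AUTH are set as app.py expects:

import os
from elasticsearch import Elasticsearch

# Reuse app.py's environment convention; raises on connection or auth errors.
es = Elasticsearch(os.environ["ELASTIC_HOST"],
                   basic_auth=tuple(os.environ["ELASTIC_AUTH"].split(":")))

print(es.info())                    # cluster metadata
print(es.count(index="documents"))  # how many documents the sidebar picker will list
print(es.count(index="sentences"))  # how many sentences are available for comparison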