Federico Galatolo committed on
Commit
717aa8f
1 Parent(s): d21b6e2

first commit

Browse files
Files changed (3) hide show
  1. .gitignore +4 -0
  2. app.py +77 -0
  3. requirements.txt +16 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ /env
2
+ /__pycache__/
3
+
4
+ .env
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from elasticsearch import Elasticsearch
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.manifold import TSNE
8
+ import plotly.express as plx
9
+
10
def compare():
    """Plot a 2-D t-SNE projection of the selected documents' sentences.

    Callback of the "Compare" sidebar button. It reads the module-level
    ``multiselect``, ``model``, ``limit``, ``documents`` and ``es`` values,
    fetches up to ``limit`` sentences per selected document from the
    ``sentences`` index, standardizes their ``{model}_features`` vectors,
    projects them to two dimensions with cosine-metric t-SNE and renders a
    Plotly scatter plot coloured by document.

    Returns ``None``; nothing is drawn when no document is selected or when
    the selection yields fewer than two sentences.
    """
    if not multiselect:
        return

    target_field = f"{model}_features"

    features = []
    classes = []
    sentences = []
    used_labels = set()
    for title in multiselect:
        response = es.search(
            index="sentences",
            query={
                "constant_score": {
                    "filter": {
                        "term": {
                            "document": documents[title]
                        }
                    }
                }
            },
            size=limit,
        )
        hits = response["hits"]["hits"]
        if not hits:
            # An empty hit list becomes a 1-D array, which np.concatenate
            # refuses to stack with the 2-D blocks of the other documents.
            continue

        # Legend label: first 10 characters of the title. Distinct documents
        # may share that prefix, so disambiguate to keep one class per document.
        base_label = title[:10] + "..."
        label = base_label
        duplicate_index = 2
        while label in used_labels:
            label = f"{base_label} ({duplicate_index})"
            duplicate_index += 1
        used_labels.add(label)

        features.append(np.asarray([hit["_source"][target_field] for hit in hits]))
        classes.extend([label] * len(hits))
        sentences.extend(hit["_source"]["sentence"] for hit in hits)

    if len(sentences) < 2:
        # t-SNE cannot embed fewer than two points.
        st.warning("Not enough sentences found for the selected documents")
        return

    features = np.concatenate(features)
    features = StandardScaler().fit_transform(features)

    # t-SNE requires perplexity < n_samples; keep the default of 30 whenever
    # there are enough sentences and shrink it only for tiny selections.
    perplexity = float(min(30, features.shape[0] - 1))
    tsne = TSNE(n_components=2, metric="cosine", init="pca", perplexity=perplexity)
    features = tsne.fit_transform(features)

    df = pd.DataFrame(
        {
            "x": features[:, 0],
            "y": features[:, 1],
            "classes": classes,
            "sentences": sentences,
        }
    )

    st.plotly_chart(plx.scatter(
        data_frame=df,
        x="x",
        y="y",
        color="classes",
        hover_name="sentences",
    ))
64
+
65
# Elasticsearch client. ELASTIC_AUTH is read as "user:password"; split only on
# the first colon so that passwords containing ":" are kept intact.
es = Elasticsearch(
    os.environ["ELASTIC_HOST"],
    basic_auth=tuple(os.environ["ELASTIC_AUTH"].split(":", 1)),
)

# Fetch the document catalogue. Without an explicit size Elasticsearch returns
# only the first 10 hits, which would silently hide the rest of the library.
results = es.search(index="documents", query={"match_all": {}}, size=10000)
results = [result["_source"] for result in results["hits"]["hits"]]

# Map the label shown in the sidebar ("title - author") to the document id.
documents = {f"{result['title']} - {result['author']}": result['id'] for result in results}

st.sidebar.title("Semantic compare")
st.sidebar.write("Select 2 or more documents from the SERICA library to semantically compare them")
multiselect = st.sidebar.multiselect("Documents", list(documents.keys()))
model = st.sidebar.selectbox("Model", ["LaBSE"])
# The second positional argument of number_input is min_value, not the default:
# pass 1000 as the initial value and allow smaller samples. The upper bound
# matches Elasticsearch's default result-window limit for a single search.
limit = st.sidebar.number_input(
    "Sentences per document",
    min_value=1,
    max_value=10000,
    value=1000,
    step=1,
)
st.sidebar.button("Compare", on_click=compare)
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ certifi==2022.6.15
2
+ elastic-transport==8.1.2
3
+ elasticsearch==8.3.3
4
+ joblib==1.1.0
5
+ numpy==1.23.1
6
+ pandas==1.4.3
7
+ plotly==5.9.0
8
+ python-dateutil==2.8.2
9
+ pytz==2022.1
10
+ scikit-learn==1.1.1
11
+ scipy==1.9.0
12
+ six==1.16.0
13
+ sklearn==0.0
14
+ tenacity==8.0.1
15
+ threadpoolctl==3.1.0
16
+ urllib3==1.26.11