|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
|
|
from sentence_transformers.util import cos_sim |
|
from sentence_transformers import SentenceTransformer |
|
from bokeh.plotting import figure, output_notebook, show, save |
|
from bokeh.io import output_file, show |
|
from bokeh.models import ColumnDataSource, HoverTool |
|
from sklearn.manifold import TSNE |
|
|
|
|
|
@st.cache |
|
def load_model(): |
|
model = SentenceTransformer('hackathon-pln-es/bertin-roberta-base-finetuning-esnli') |
|
model.eval() |
|
return model |
|
|
|
@st.cache |
|
def load_plot_data(): |
|
embs = np.load('semeval2015-embs.npy') |
|
data = pd.read_csv('semeval2015-data.csv') |
|
return embs, data |
|
|
|
st.title("Sentence Embedding for Spanish with Bertin") |
|
st.write("Sentence embedding for spanish trained on NLI. Used for Sentence Textual Similarity. Based on the model hackathon-pln-es/bertin-roberta-base-finetuning-esnli.") |
|
st.write("Introduce two sentence to see their cosine similarity and a graph showing them in the embedding space.") |
|
st.write("Authors: Anibal Pérez, Emilio Tomás Ariza, Lautaro Gesuelli y Mauricio Mazuecos.") |
|
|
|
sent1 = st.text_area('Enter sentence 1') |
|
sent2 = st.text_area('Enter sentence 2') |
|
|
|
if st.button('Compute similarity'): |
|
if sent1 and sent2: |
|
model = load_model() |
|
encodings = model.encode([sent1, sent2]) |
|
sim = cos_sim(encodings[0], encodings[1]).numpy().tolist()[0][0] |
|
st.text('Cosine Similarity: {0:.4f}'.format(sim)) |
|
|
|
print('Generating visualization...') |
|
sentembs, data = load_plot_data() |
|
X_embedded = TSNE(n_components=2, learning_rate='auto', |
|
init='random').fit_transform(np.concatenate([sentembs, encodings], axis=0)) |
|
|
|
data = data.append({'sent': sent1, 'color': '#F0E442'}, ignore_index=True) |
|
data = data.append({'sent': sent2, 'color': '#D55E00'}, ignore_index=True) |
|
data['x'] = X_embedded[:,0] |
|
data['y'] = X_embedded[:,1] |
|
|
|
source = ColumnDataSource(data) |
|
|
|
p = figure(title="Embeddings in space") |
|
p.circle( |
|
x='x', |
|
y='y', |
|
legend_label="Objects", |
|
|
|
color='color', |
|
fill_alpha=0.5, |
|
line_color="blue", |
|
size=14, |
|
source=source |
|
) |
|
p.add_tools(HoverTool( |
|
tooltips=[ |
|
('sent', '@sent') |
|
], |
|
formatters={ |
|
'@sent': 'printf' |
|
}, |
|
mode='mouse' |
|
)) |
|
st.bokeh_chart(p, use_container_width=True) |
|
else: |
|
st.write('Missing a sentences') |
|
else: |
|
pass |
|
|
|
|
|
|