File size: 2,494 Bytes
4547fcf 9c52068 4547fcf 9c52068 4547fcf abc1971 76316b2 9c52068 4547fcf a71f9d4 954178d 4547fcf 9423c98 5117017 abc1971 5117017 9c52068 2c2a191 9c52068 f336aa0 9c52068 5117017 4547fcf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer
from bokeh.plotting import figure, output_notebook, show, save
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, HoverTool
from sklearn.manifold import TSNE
# allow_output_mutation=True stops st.cache from hashing the returned model to
# detect mutation; hashing a large PyTorch module is slow and can raise
# hashing errors or spurious "cached object mutated" warnings.
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the Spanish sentence-embedding model and switch it to eval mode.

    The result is cached by Streamlit so the model is not rebuilt each time
    the script is re-executed.

    Returns:
        The ``SentenceTransformer`` instance for
        ``hackathon-pln-es/bertin-roberta-base-finetuning-esnli``.
    """
    model = SentenceTransformer('hackathon-pln-es/bertin-roberta-base-finetuning-esnli')
    model.eval()
    return model
@st.cache
def load_plot_data():
    """Read the precomputed reference embeddings and their companion table.

    Returns:
        A ``(embs, data)`` tuple: the array stored in
        ``semeval2015-embs.npy`` and the DataFrame parsed from
        ``semeval2015-data.csv``.
    """
    # Tuple elements are evaluated left to right, so the .npy file is read
    # before the CSV.
    return np.load('semeval2015-embs.npy'), pd.read_csv('semeval2015-data.csv')
# Page header: title followed by the descriptive paragraphs, in display order.
st.title("Sentence Embedding for Spanish with Bertin")
for paragraph in (
    "Sentence embedding for spanish trained on NLI. Used for Sentence Textual Similarity. Based on the model hackathon-pln-es/bertin-roberta-base-finetuning-esnli.",
    "Introduce two sentence to see their cosine similarity and a graph showing them in the embedding space.",
    "Authors: Anibal Pérez, Emilio Tomás Ariza, Lautaro Gesuelli y Mauricio Mazuecos.",
):
    st.write(paragraph)

# One free-text input per sentence to compare; created in order (1, then 2).
sent1, sent2 = [
    st.text_area(label)
    for label in ('Enter sentence 1', 'Enter sentence 2')
]
# Everything below runs only on the script execution triggered by a click on
# the button; without a click the page just shows the header and inputs.
if st.button('Compute similarity'):
    if sent1 and sent2:
        model = load_model()
        encodings = model.encode([sent1, sent2])

        # cos_sim yields a single-element tensor for one pair of vectors;
        # .item() extracts it as a plain Python float.
        sim = cos_sim(encodings[0], encodings[1]).item()
        st.text('Cosine Similarity: {0:.4f}'.format(sim))

        print('Generating visualization...')
        sentembs, data = load_plot_data()

        # Project the reference embeddings and the two new ones together so
        # that all points share the same 2-D layout. The two user sentences
        # are the last two rows of the projected matrix.
        X_embedded = TSNE(
            n_components=2,
            learning_rate='auto',
            init='random',
        ).fit_transform(np.concatenate([sentembs, encodings], axis=0))

        # DataFrame.append was removed in pandas 2.0; pd.concat works on both
        # old and new versions. It returns a new frame, so the cached table
        # from load_plot_data() is not modified. Row order must match the
        # concatenation order used for the embeddings above.
        user_rows = pd.DataFrame([
            {'sent': sent1, 'color': '#F0E442'},  # sentence 1
            {'sent': sent2, 'color': '#D55E00'},  # sentence 2
        ])
        data = pd.concat([data, user_rows], ignore_index=True)
        data['x'] = X_embedded[:, 0]
        data['y'] = X_embedded[:, 1]

        source = ColumnDataSource(data)
        p = figure(title="Embeddings in space")
        p.circle(
            x='x',
            y='y',
            legend_label="Objects",
            color='color',  # per-row colour taken from the 'color' column
            fill_alpha=0.5,
            line_color="blue",
            size=14,
            source=source,
        )
        # Show the sentence text when hovering over a point.
        p.add_tools(HoverTool(
            tooltips=[
                ('sent', '@sent'),
            ],
            formatters={
                '@sent': 'printf',
            },
            mode='mouse',
        ))
        st.bokeh_chart(p, use_container_width=True)
    else:
        st.write('Missing a sentences')
|