|
import streamlit as st |
|
import tensorflow as tf |
|
import sentencepiece as spm |
|
import numpy as np |
|
from scipy.spatial.distance import cosine |
|
import pandas as pd |
|
from openTSNE import TSNE |
|
import plotly.express as px |
|
|
|
|
|
# Lay the page out across the full browser width.
st.set_page_config(layout="wide")

# Locations of the model artefacts (relative paths).
tflite_model_path = "model.tflite"
spm_model_path = "sentencepiece.model"

# Every tokenized sentence is truncated/padded to this many ids before it is
# handed to the interpreter.
required_input_length = 64

# SentencePiece tokenizer used to turn raw text into token ids.
sp = spm.SentencePieceProcessor()
sp.load(spm_model_path)

# TFLite interpreter for the embedding model; tensors are allocated once and
# the input/output descriptors are kept for later set_tensor/get_tensor calls.
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
|
|
|
|
|
def preprocess_text(text, sp, required_length):
    """Tokenize *text* into a fixed-length batch of token ids.

    Args:
        text: The sentence to tokenize.
        sp: Tokenizer object exposing ``encode(text, out_type=int)``.
        required_length: Number of ids the result must contain.

    Returns:
        An ``int32`` NumPy array of shape ``(1, required_length)``: the token
        ids cut to ``required_length`` and right-padded with ``0`` when the
        sentence is shorter.
    """
    token_ids = sp.encode(text, out_type=int)[:required_length]

    # Only pad when the (already truncated) sequence is too short.
    missing = required_length - len(token_ids)
    if missing > 0:
        token_ids = token_ids + [0] * missing

    # Add the leading batch axis.
    return np.asarray(token_ids, dtype=np.int32)[np.newaxis, :]
|
|
|
|
|
def generate_embeddings(text):
    """Run *text* through the TFLite model and return its embedding.

    The sentence is tokenized with the module-level ``sp`` tokenizer, fed to
    the first input tensor of the module-level ``interpreter``, and the first
    output tensor is returned flattened to one dimension.

    Args:
        text: The sentence to embed.

    Returns:
        A 1-D NumPy array holding the flattened model output.
    """
    token_batch = preprocess_text(text, sp, required_input_length)

    input_index = input_details[0]['index']
    output_index = output_details[0]['index']

    interpreter.set_tensor(input_index, token_batch)
    interpreter.invoke()

    return interpreter.get_tensor(output_index).flatten()
|
|
|
|
|
# Default content for the "Set A" text area, one sentence per entry.
preset_sentences_a = [
    "Dan Petrovic predicted conversational search in 2013.",
    "Understanding user intent is key to effective SEO.",
    "Dejan SEO has been a leader in data-driven SEO.",
    "Machine learning is transforming search engines.",
    "The future of search is AI-driven and personalized.",
    "Search algorithms are evolving to better match user intent.",
    "AI technologies enhance digital marketing strategies.",
]

# Default content for the "Set B" text area, one sentence per entry.
preset_sentences_b = [
    "Advances in machine learning reshape how search engines operate.",
    "Personalized content is becoming more prevalent with AI.",
    "Customer behavior insights are crucial for marketing strategies.",
    "Dan Petrovic anticipated the rise of chat-based search interactions.",
    "Dejan SEO is recognized for innovative SEO research and analysis.",
    "Quantum computing is advancing rapidly in the tech world.",
    "Studying user behavior can improve the effectiveness of online ads.",
]
|
|
|
|
|
# Seed both fields with the preset sentences the first time a session runs.
# The same session-state keys are used as widget keys below, so the stored
# text always mirrors what the user currently sees in the text areas.
if "input_text_a" not in st.session_state:
    st.session_state["input_text_a"] = "\n".join(preset_sentences_a)
if "input_text_b" not in st.session_state:
    st.session_state["input_text_b"] = "\n".join(preset_sentences_b)


def _clear_fields():
    """Empty both sentence fields (button callback)."""
    st.session_state["input_text_a"] = ""
    st.session_state["input_text_b"] = ""


# The reset happens in a callback so it is applied before the text areas are
# rebuilt. Because the text areas are keyed, clearing works every time, even
# when the stored value was already empty and the user has typed new text.
st.button("Clear Fields", on_click=_clear_fields)

col1, col2 = st.columns(2)

with col1:
    st.subheader("Set A Sentences")
    # No `value=` here: the initial content comes from session state via `key`.
    input_text_a = st.text_area("Set A", key="input_text_a", height=200)

with col2:
    st.subheader("Set B Sentences")
    input_text_b = st.text_area("Set B", key="input_text_b", height=200)

# Iteration count forwarded to t-SNE; selectable in steps of 250.
iterations = st.slider("Number of t-SNE Iterations (Higher values = more refined clusters)", 250, 1000, step=250)
|
|
|
|
|
def _render_embeddings(set_name, sentences, embeddings):
    """Show one collapsible section per sentence listing its embedding values.

    Args:
        set_name: Short set identifier used in the headings ("A" or "B").
        sentences: The sentences of the set.
        embeddings: One embedding vector per sentence, in the same order.
    """
    st.subheader(f"Embeddings for each sentence in Set {set_name}")
    for position, (sentence, embedding) in enumerate(zip(sentences, embeddings), start=1):
        with st.expander(f"Embedding for Sentence {set_name}{position}: {sentence}"):
            st.write(", ".join(f"{value:.4f}" for value in embedding))


if st.button("Calculate Similarity"):
    # One sentence per non-blank line.
    sentences_a = [line.strip() for line in input_text_a.split("\n") if line.strip()]
    sentences_b = [line.strip() for line in input_text_b.split("\n") if line.strip()]

    if sentences_a and sentences_b:
        embeddings_a = [generate_embeddings(sentence) for sentence in sentences_a]
        embeddings_b = [generate_embeddings(sentence) for sentence in sentences_b]

        # Both sets are projected together so they share one coordinate space.
        all_sentences = sentences_a + sentences_b
        all_embeddings = np.array(embeddings_a + embeddings_b)
        labels = ["Set A"] * len(sentences_a) + ["Set B"] * len(sentences_b)

        tsne_dimensions = 3
        if len(all_sentences) < tsne_dimensions:
            # PCA initialisation cannot produce three components from fewer
            # than three samples, so skip the plot instead of raising.
            st.warning("Enter at least 3 sentences in total to draw the 3D t-SNE plot.")
        else:
            # Perplexity has to stay below the number of points.
            perplexity_value = min(5, len(all_sentences) - 1)

            tsne = TSNE(n_components=tsne_dimensions, perplexity=perplexity_value, n_iter=iterations, initialization="pca", random_state=42)
            tsne_results = tsne.fit(all_embeddings)

            df_tsne = pd.DataFrame({
                "Sentence": all_sentences,
                "Set": labels,
                "X": tsne_results[:, 0],
                "Y": tsne_results[:, 1],
                "Z": tsne_results[:, 2],
            })

            fig = px.scatter_3d(
                df_tsne, x="X", y="Y", z="Z", color="Set", hover_data={"Sentence": True},
                title="Incremental 3D t-SNE Visualization of Sentence Similarity",
                labels={"X": "t-SNE Dimension 1", "Y": "t-SNE Dimension 2", "Z": "t-SNE Dimension 3"},
                width=1200, height=800,
            )
            fig.update_traces(marker=dict(size=5, opacity=0.8))

            st.plotly_chart(fig)

        # The raw embeddings are listed whether or not the plot could be drawn.
        _render_embeddings("A", sentences_a, embeddings_a)
        _render_embeddings("B", sentences_b, embeddings_b)
    else:
        st.warning("Please enter sentences in both Set A and Set B.")
|
|