Spaces:
Runtime error
Runtime error
import gzip | |
import json | |
from collections import Counter | |
import pandas as pd | |
import numpy as np | |
import jax.numpy as jnp | |
import tqdm | |
from sentence_transformers import util | |
from typing import List, Union | |
import torch | |
from backend.utils import load_model, filter_questions, load_embeddings | |
from sklearn.manifold import TSNE | |
def cos_sim(a, b):
    """Return the cosine similarity between the vectors of ``a`` and ``b``.

    Every vector is scaled to unit length along its last axis before the
    products are taken, so each score depends only on the two vectors being
    compared. Dividing the raw products by ``jnp.linalg.norm`` of the whole
    arrays would use the Frobenius norm of the full matrix for 2-D input,
    which makes every score shrink as more rows are added.

    Args:
        a: Array-like of shape ``(d,)`` or ``(m, d)``.
        b: Array-like of shape ``(d,)`` or ``(n, d)``.

    Returns:
        A jax array of similarities: a scalar for two 1-D inputs, shape
        ``(m, n)`` for two 2-D inputs. A zero-length vector produces ``nan``
        entries because its norm is zero.
    """
    a = jnp.asarray(a)
    b = jnp.asarray(b)
    # keepdims=True keeps the norms broadcastable against the vectors they
    # belong to, for both 1-D and 2-D input.
    a_unit = a / jnp.linalg.norm(a, axis=-1, keepdims=True)
    b_unit = b / jnp.linalg.norm(b, axis=-1, keepdims=True)
    return jnp.matmul(a_unit, jnp.transpose(b_unit))
# We get similarity between embeddings. | |
def text_similarity(anchor: str, inputs: List[str], model_name: str, model_dict: dict):
    """Score every string in ``inputs`` against ``anchor`` with ``cos_sim``.

    Args:
        anchor: Reference text that the inputs are compared to.
        inputs: Texts to score against the anchor.
        model_name: Name forwarded to ``load_model`` (also printed).
        model_dict: Model registry forwarded to ``load_model``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``inputs`` and ``score``,
        one row per input, scores rounded to three decimals.

    Raises:
        AssertionError: If the loaded model has no ``encode`` attribute and
            is not a sequence of exactly two encoders.
    """
    print(model_name)
    model = load_model(model_name, model_dict)

    # Nothing to compare: return an empty frame instead of pushing an empty
    # batch through the encoder and the matrix product.
    if len(inputs) == 0:
        return pd.DataFrame({'inputs': [], 'score': []}, columns=['inputs', 'score'])

    # Creating embeddings
    if hasattr(model, 'encode'):
        anchor_emb = model.encode(anchor)[None, :]
        inputs_emb = model.encode(inputs)
    else:
        # Without ``encode`` the loaded object is treated as a pair:
        # element 0 encodes the anchor, element 1 encodes the inputs.
        assert len(model) == 2
        anchor_emb = model[0].encode(anchor)[None, :]
        inputs_emb = model[1].encode(inputs)

    # Obtaining similarity. Flatten instead of squeeze: squeezing a (1, 1)
    # result gives a 0-d array that cannot be iterated, which broke the
    # single-input case.
    scores = jnp.ravel(cos_sim(anchor_emb, inputs_emb))

    # Plain Python floats keep the ``score`` column numeric instead of
    # holding one jax scalar object per row.
    rounded_scores = [round(float(score), 3) for score in scores]

    # Returning a Pandas' dataframe
    return pd.DataFrame(
        {'inputs': inputs, 'score': rounded_scores},
        columns=['inputs', 'score'],
    )
# Search | |
def text_search(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    """Look up the posts whose stored embeddings score highest for ``anchor``.

    Args:
        anchor: Query text to embed and search with.
        n_answers: Passed to ``util.semantic_search`` as ``top_k``.
        model_name: Must be ``"distilbert_qa"``; forwarded to ``load_model``.
        model_dict: Model registry forwarded to ``load_model``.

    Returns:
        A tuple ``(titles, scores, urls)`` of three equally long lists: post
        titles, dot-product scores formatted with three decimals, and
        Stack Overflow question URLs built from each post's ``id``.

    Raises:
        AssertionError: If ``model_name`` is not ``"distilbert_qa"``.
    """
    # Proceeding with model
    print(model_name)
    assert model_name == "distilbert_qa"
    encoder = load_model(model_name, model_dict)

    # Embed the query with a leading batch axis.
    query_vector = encoder.encode(anchor, convert_to_tensor=True)[None, :]

    print("loading embeddings")
    corpus_vectors = load_embeddings()

    # A single query was submitted, so only the first result list is needed.
    ranked = util.semantic_search(
        query_vector,
        corpus_vectors,
        score_function=util.dot_score,
        top_k=n_answers,
    )
    top_hits = ranked[0]

    python_posts = filter_questions("python")
    print(f"{len(python_posts)} posts found with tag: python")

    # ``corpus_id`` is used as an index into the filtered posts
    # (presumably the embeddings were built in that same order — confirm).
    rows = []
    for hit in top_hits:
        post = python_posts[hit['corpus_id']]
        rows.append((
            post['title'],
            f"{hit['score']:.3f}",
            f"https://stackoverflow.com/q/{post['id']}",
        ))

    if not rows:
        return [], [], []

    titles, scores, links = map(list, zip(*rows))
    return titles, scores, links
def text_cluster(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    """Plot the query and its top search hits in a 3-D t-SNE projection.

    The query is embedded, the ``n_answers`` best-scoring corpus entries are
    retrieved, and the hit embeddings plus the query embedding are reduced
    to three dimensions. Each point is coloured by the first of a few
    frequent tags it carries (``'others'`` when it has none of them); the
    query itself is labelled ``'QUERY'`` and drawn larger.

    Args:
        anchor: Query text to embed and search with.
        n_answers: Passed to ``util.semantic_search`` as ``top_k``.
        model_name: Must be ``"distilbert_qa"``; forwarded to ``load_model``.
        model_dict: Model registry forwarded to ``load_model``.

    Returns:
        The figure produced by ``plotly.express.scatter_3d``.

    Raises:
        AssertionError: If ``model_name`` is not ``"distilbert_qa"``.
    """
    # Proceeding with model
    print(model_name)
    assert model_name == "distilbert_qa"
    model = load_model(model_name, model_dict)

    # Creating embeddings
    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
    print("loading embeddings")
    corpus_emb = load_embeddings()

    # Getting hits
    hits = util.semantic_search(
        query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers
    )[0]
    filtered_posts = filter_questions("python")

    # The query is appended last so it is always the final row / point.
    hits_dict = [filtered_posts[hit['corpus_id']] for hit in hits]
    hits_dict.append(dict(id='1', title=anchor, tags=['']))
    hits_emb = torch.stack([corpus_emb[hit['corpus_id']] for hit in hits])
    # Align devices before concatenating; a no-op when they already match.
    hits_emb = torch.cat((hits_emb, query_emb.to(hits_emb.device)))

    # Dimensionality reduction with t-SNE.
    # * The iteration count is left at the library default (1000): the
    #   ``n_iter`` keyword was renamed to ``max_iter`` in newer scikit-learn
    #   releases, so omitting it works on both sides of the rename.
    # * Perplexity must be smaller than the number of samples, so clamp it
    #   for small result sets.
    n_points = hits_emb.shape[0]
    perplexity = min(15, max(n_points - 1, 1))
    tsne = TSNE(n_components=3, verbose=1, perplexity=perplexity)
    tsne_results = tsne.fit_transform(hits_emb.detach().cpu().numpy())

    df = pd.DataFrame(hits_dict)

    # Count how often each tag occurs across all rows.
    tag_counter = Counter()
    for post_tags in df['tags']:
        tag_counter.update(post_tags)
    # Skip the single most frequent tag (presumably the shared "python" tag,
    # since the posts were filtered by it — confirm) and keep the next four.
    most_common_tags = [tag for tag, _ in tag_counter.most_common()][1:5]

    # First frequent tag found on the post, else 'others'. Using ``next``
    # with a default also yields a label when ``most_common_tags`` is empty,
    # which previously left ``labels`` shorter than the frame.
    labels = [
        next((tag for tag in most_common_tags if tag in post_tags), 'others')
        for post_tags in df['tags']
    ]
    sizes = [2] * len(df)
    # Making the query bigger than the rest of the observations. The lists
    # are finalised before assignment to avoid chained indexing such as
    # ``df['size'][i] = ...``, which does not update the frame under pandas
    # copy-on-write.
    labels[-1] = 'QUERY'
    sizes[-1] = 10

    df['title'] = [post['title'] for post in hits_dict]
    df['labels'] = labels
    df['tsne_x'] = tsne_results[:, 0]
    df['tsne_y'] = tsne_results[:, 1]
    df['tsne_z'] = tsne_results[:, 2]
    df['size'] = sizes

    # Imported here, as in the original, so plotly is only needed for plotting.
    import plotly.express as px
    fig = px.scatter_3d(df, x='tsne_x', y='tsne_y', z='tsne_z', color='labels', size='size',
                        color_discrete_sequence=px.colors.qualitative.D3, hover_data=[df.title])
    return fig