import gzip
import json
from collections import Counter
import pandas as pd
import numpy as np
import jax.numpy as jnp
import tqdm
from sentence_transformers import util
from typing import List, Union
import torch
from backend.utils import load_model, filter_questions, load_embeddings
from sklearn.manifold import TSNE


def cos_sim(a, b):
    """Row-wise cosine similarity between two 2-D embedding matrices.

    Args:
        a: array of shape (n, d).
        b: array of shape (m, d).

    Returns:
        (n, m) array where entry (i, j) is the cosine similarity between
        row i of `a` and row j of `b`.
    """
    # BUGFIX: the previous version divided by jnp.linalg.norm(a) *
    # jnp.linalg.norm(b) — the *global* (Frobenius) norms — which is only
    # correct when both matrices contain a single row. With several input
    # sentences every score was scaled wrong. Normalize each row instead.
    a_unit = a / jnp.linalg.norm(a, axis=-1, keepdims=True)
    b_unit = b / jnp.linalg.norm(b, axis=-1, keepdims=True)
    return jnp.matmul(a_unit, jnp.transpose(b_unit))


def text_similarity(anchor: str, inputs: List[str], model_name: str, model_dict: dict):
    """Score every sentence in `inputs` against `anchor` with a sentence encoder.

    Args:
        anchor: reference sentence.
        inputs: candidate sentences to compare against `anchor`.
        model_name: key used by `load_model` to pick the encoder.
        model_dict: model registry passed through to `load_model`.

    Returns:
        DataFrame with columns ['inputs', 'score'], one row per input,
        where 'score' is the cosine similarity rounded to 3 decimals.
    """
    print(model_name)
    model = load_model(model_name, model_dict)
    # Some registry entries hold a single bi-encoder, others a pair of
    # encoders (one for the query, one for the passages) — presumably
    # (query_encoder, passage_encoder); verify against load_model.
    if hasattr(model, 'encode'):
        anchor_emb = model.encode(anchor)[None, :]
        inputs_emb = model.encode(inputs)
    else:
        assert len(model) == 2
        anchor_emb = model[0].encode(anchor)[None, :]
        inputs_emb = model[1].encode(inputs)
    # ravel (not squeeze) so a single candidate still yields a 1-D,
    # iterable array — squeeze would produce a non-iterable 0-d scalar.
    similarity = jnp.ravel(cos_sim(anchor_emb, inputs_emb))
    scores = [round(float(s), 3) for s in similarity]
    return pd.DataFrame({'inputs': inputs, 'score': scores},
                        columns=['inputs', 'score'])


def text_search(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    """Semantic search of `anchor` against the precomputed corpus embeddings.

    Args:
        anchor: free-text query.
        n_answers: number of hits to return.
        model_name: must be "distilbert_qa" (the model the corpus
            embeddings were built with).
        model_dict: model registry passed through to `load_model`.

    Returns:
        Tuple (titles, scores, urls) — parallel lists of length
        <= n_answers, scores formatted to 3 decimals, urls pointing at
        the matching Stack Overflow questions.
    """
    print(model_name)
    # Only the QA model matches the stored corpus embeddings.
    assert model_name == "distilbert_qa"
    model = load_model(model_name, model_dict)

    # Embed the query and load the corpus side.
    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
    print("loading embeddings")
    corpus_emb = load_embeddings()

    # semantic_search returns one result list per query; we send one query.
    hits = util.semantic_search(query_emb, corpus_emb,
                                score_function=util.dot_score,
                                top_k=n_answers)[0]
    filtered_posts = filter_questions("python")
    print(f"{len(filtered_posts)} posts found with tag: python")

    hits_titles = []
    hits_scores = []
    urls = []
    for hit in hits:
        post = filtered_posts[hit['corpus_id']]
        hits_titles.append(post['title'])
        hits_scores.append("{:.3f}".format(hit['score']))
        urls.append(f"https://stackoverflow.com/q/{post['id']}")
    return hits_titles, hits_scores, urls


def text_cluster(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    """Retrieve the top hits for `anchor` and plot them in 3-D t-SNE space.

    Hits are colored by their most common Stack Overflow tags; the query
    itself is added as an extra, larger point labelled 'QUERY'.

    Args:
        anchor: free-text query.
        n_answers: number of hits to cluster.
        model_name: must be "distilbert_qa".
        model_dict: model registry passed through to `load_model`.

    Returns:
        A plotly 3-D scatter figure.
    """
    print(model_name)
    assert model_name == "distilbert_qa"
    model = load_model(model_name, model_dict)

    # Embed the query and retrieve the top-k corpus hits.
    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
    print("loading embeddings")
    corpus_emb = load_embeddings()
    hits = util.semantic_search(query_emb, corpus_emb,
                                score_function=util.dot_score,
                                top_k=n_answers)[0]
    filtered_posts = filter_questions("python")

    # Collect the hit posts plus a synthetic entry for the query itself,
    # and stack their embeddings in the same order.
    hits_dict = [filtered_posts[hit['corpus_id']] for hit in hits]
    hits_dict.append(dict(id='1', title=anchor, tags=['']))
    hits_emb = torch.stack([corpus_emb[hit['corpus_id']] for hit in hits])
    hits_emb = torch.cat((hits_emb, query_emb))

    # Dimensionality reduction with t-SNE. sklearn requires
    # perplexity < n_samples, so clamp it for small n_answers
    # (previously a hard-coded 15, which crashed on few hits).
    perplexity = min(15, max(1, len(hits_emb) - 1))
    tsne = TSNE(n_components=3, verbose=1, perplexity=perplexity, n_iter=1000)
    tsne_results = tsne.fit_transform(hits_emb.cpu())

    df = pd.DataFrame(hits_dict)

    # Count tag frequencies across all hits.
    counter = Counter()
    for post_tags in df['tags']:
        counter.update(post_tags)
    df_tags = pd.DataFrame(counter.most_common(), columns=['Tag', 'Mentions'])
    # Skip index 0 — presumably the shared search tag ('python') that
    # every hit carries; keep the next four most common tags.
    most_common_tags = list(df_tags['Tag'])[1:5]

    # Label each post with the first common tag it carries, else 'others'.
    # (for/else replaces the old if/elif-continue/else pyramid and also
    # labels correctly when most_common_tags is empty.)
    labels = []
    for tags_list in df['tags']:
        for common_tag in most_common_tags:
            if common_tag in tags_list:
                labels.append(common_tag)
                break
        else:
            labels.append('others')

    df['labels'] = labels
    df['tsne_x'] = tsne_results[:, 0]
    df['tsne_y'] = tsne_results[:, 1]
    df['tsne_z'] = tsne_results[:, 2]
    df['size'] = [2 for _ in range(len(df))]
    # Make the query point stand out. .loc avoids the chained-assignment
    # (SettingWithCopy) writes df['size'][i] = … that may silently no-op.
    df.loc[len(df) - 1, 'size'] = 10
    df.loc[len(df) - 1, 'labels'] = 'QUERY'

    import plotly.express as px
    fig = px.scatter_3d(df, x='tsne_x', y='tsne_y', z='tsne_z',
                        color='labels', size='size',
                        color_discrete_sequence=px.colors.qualitative.D3,
                        hover_data=[df.title])
    return fig