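# app.py — Gradio demo that matches a free-text query against a set of predefined
# commands using sentence-embedding cosine similarity and returns the top matches.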
import pandas as pd
import numpy as np
from tqdm import tqdm
from copy import deepcopy
import json
from numpy.linalg import norm
import gradio as gr
from sentence_transformers import SentenceTransformer
# --- helper functions ---
def cosinesimilarity(vector1, vector2):
    # Cosine similarity between two embedding vectors.
    cosine = np.dot(vector1, vector2) / (norm(vector1) * norm(vector2))
    return cosine
def encode_input_and_return_top_n(input_in, db_dff, top_k, new2oldmatching):
    # Encode the query, score it against every pre-computed command embedding,
    # and return the top_k original commands with their rounded scores.
    # Column "Câu lệnh có sẵn" holds the available commands, "Điểm" the score.
    embed1 = model.encode(input_in)
    db_df_in = deepcopy(db_dff)
    scores = []
    for i in db_df_in.index:
        embed2 = db_df_in['Embedding'].loc[i]
        scores.append(round(cosinesimilarity(embed1, embed2), 3))
    db_df_in["Điểm"] = scores
    db_df_in.sort_values(by=['Điểm'], inplace=True, ascending=False)
    ids = db_df_in[:top_k].index.tolist()
    output = {new2oldmatching[db_df_in.loc[i, 'Câu lệnh có sẵn'].strip()]: round(float(db_df_in.loc[i, 'Điểm']), 2)
              for i in ids}
    return output
def image_classifier(Input):
    # Gradio handler (despite its name, this does text retrieval, not image
    # classification): lowercase the query and return the 3 best matches.
    inputt = Input.lower()
    result = encode_input_and_return_top_n(inputt, db_df, 3, new2oldmatch)
    return result
def encode_database(db_in):
    # Build a dataframe of the known commands and pre-compute an embedding for each one.
    df = pd.DataFrame({"Câu lệnh có sẵn": db_in, "Embedding": [None] * len(db_in)})
    for i, func in tqdm(enumerate(db_in), total=len(db_in)):
        df.at[i, 'Embedding'] = model.encode(func)
    print("Encode database successfully")
    return df
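# --- app setup: load the retrieval model, the command mapping, and pre-encode the database ---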
model = SentenceTransformer("Huy1432884/function_retrieval")
model.eval()
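# new2oldmatch.json appears to map reworded ("new") command phrasings to the
# original ("old") commands; keys are normalized to lowercase for matching.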
with open('new2oldmatch.json', 'r') as openfile:
    new2oldmatch = json.load(openfile)
new2oldmatch = {u.strip().lower(): v.strip() for u, v in new2oldmatch.items()}
database = [cmd.lower() for cmd in new2oldmatch.keys()]
db_df = encode_database(database)
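# Text-in / label-out interface: the returned {command: score} dict is rendered
# by Gradio's "label" output as ranked predictions.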
demo = gr.Interface(fn=image_classifier, inputs="text", outputs="label")
demo.launch()