# NOTE(review): stray build-log text ("Spaces:", "Build error") removed here —
# it was extraction residue, not valid Python.
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch
from sentence_transformers import SentenceTransformer, util
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
import numpy as np
import pandas as pd

# Pick the GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Zero-shot classifier: BART fine-tuned on MNLI, used as an NLI model.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
# Load once and move to the already-selected device instead of duplicating
# the from_pretrained call inside a CUDA/CPU conditional.
nli_model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli"
).to(device)
def get_prob(sequence, label):
    """Return the probability that `label` applies to `sequence` (zero-shot NLI).

    Frames classification as entailment: premise = sequence,
    hypothesis = "This example is {label}.", then softmaxes over the
    entailment/contradiction logits only.

    Returns a float in [0, 1].
    """
    premise = sequence
    hypothesis = f"This example is {label}."
    # run through model pre-trained on MNLI.
    # `truncation_strategy` is the deprecated legacy kwarg; `truncation`
    # is the supported spelling with the same "only_first" semantics
    # (truncate the premise, keep the hypothesis intact).
    x = tokenizer.encode(
        premise, hypothesis, return_tensors="pt", truncation="only_first"
    )
    # Inference only — skip autograd bookkeeping.
    with torch.no_grad():
        logits = nli_model(x.to(device))[0]
    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:, 1]
    return prob_label_is_true[0].item()
def get_prob_lists(sequence, labels):
    """Score `sequence` against each candidate label.

    Returns a list of probabilities, one per entry in `labels`,
    in the same order.
    """
    return [get_prob(sequence, label) for label in labels]
# Sentence-embedding model used for semantic similarity scoring below.
compare_model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
def compare_sentence(query, docs):
    """Return the mean dot-product similarity between `query` and `docs`.

    Both are encoded with `compare_model`; the result is a single float
    (average score of the query against every document).
    """
    query_emb = compare_model.encode(query)
    doc_emb = compare_model.encode(docs)
    # The original moved the score tensor to `device` just to call
    # .tolist() — a pointless host->GPU transfer. Read the values directly.
    scores = util.dot_score(query_emb, doc_emb)[0].tolist()
    return np.mean(scores)
def query_jds(DB, keyword):
    """Search the job-description DataFrame `DB` for rows matching `keyword`.

    Normalizes the keyword with gensim's simple_preprocess, builds a
    TF-IDF table keyed on the "Title" column, and delegates to `query`.
    """
    cleaned = " ".join(gensim.utils.simple_preprocess(keyword, deacc=True))
    tf = tfidf_matrix(DB, tokenized="tokenized", name="Title")
    return query(DB, cleaned, tf)
def query(df, keywords, tf_matrix):
    """Return up to 5 rows of `df` whose TF-IDF score for `keywords` > 0.3,
    best matches first, with the index reset.

    `tf_matrix` is the (vocab x document) score table from `tfidf_matrix`,
    whose columns align with the rows of `df`.
    """
    keywords = " ".join(gensim.utils.simple_preprocess(keywords, deacc=True))
    # Keep the scores in a standalone Series instead of writing a
    # "Query_score" column into the caller's DataFrame — the original
    # mutated `df` as a side effect and left the column behind.
    scores = pd.Series(tfidf_score(tf_matrix, keywords), index=df.index)
    hits = scores[scores > 0.3].sort_values(ascending=False)
    return df.loc[hits.index[:5]].reset_index(drop=True)
def tfidf_score(tf_matrix, keyword):
    """Sum the TF-IDF rows of `tf_matrix` for each whitespace token of `keyword`.

    Tokens absent from the vocabulary are skipped; duplicates count twice.
    Returns a vector of length tf_matrix.shape[1] (one score per document).
    """
    hits = [
        tf_matrix.loc[term].values
        for term in keyword.split()
        if term in tf_matrix.index
    ]
    # Start from an int zero-vector so a no-match query yields the same
    # all-zero result (and dtype) as the original accumulator loop.
    return sum(hits, np.zeros(tf_matrix.shape[1], dtype=int))
def tfidf_matrix(data, tokenized="tokenized", name="Course_Name"):
    """Build a (vocab x document) TF-IDF score table from `data`.

    Parameters:
        data: DataFrame with a column of token lists (`tokenized`) and a
            column of document names (`name`).
        tokenized: column holding pre-tokenized word lists.
        name: column whose values become the result's column labels.

    Returns a DataFrame indexed by vocabulary term, one column per document,
    with each term's scores min-max scaled across documents.
    """
    corpus = [" ".join(tokens) for tokens in data[tokenized]]
    vectorizer = TfidfVectorizer().fit(corpus)
    scores = vectorizer.transform(corpus).toarray().T  # (vocab, docs)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older versions.
    try:
        vocab = vectorizer.get_feature_names_out()
    except AttributeError:
        vocab = vectorizer.get_feature_names()
    # minmax_scale works per column: transpose so each term's scores
    # across documents are scaled to [0, 1], then transpose back.
    scores = preprocessing.minmax_scale(scores.T).T
    return pd.DataFrame(scores, index=vocab, columns=data[name].values)