import pandas as pd
from tfidf_matcher.ngrams import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import gradio as gr


def matcher(original="", lookup="", outname="Original", ngram_length=3, cutoff=0.8):
    k_matches = 1
    # Inputs arrive as comma-separated strings; enforce list type, strip stray whitespace, set to lower
    original = [x.strip() for x in original.split(",")]
    lookup = [x.strip() for x in lookup.split(",")]
    # print(original)
    # print(lookup)
    original_lower = [x.lower() for x in original]
    lookup_lower = [x.lower() for x in lookup]
    # Set ngram length for the TfidfVectorizer callable
    def ngrams_user(string, n=ngram_length):
        return ngrams(string, n)

    # Generate a sparse TF-IDF matrix from the lookup corpus
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams_user)
    tf_idf_lookup = vectorizer.fit_transform(lookup_lower)
    # Fit a KNN model to the sparse TF-IDF matrix generated from the lookup
    nbrs = NearestNeighbors(n_neighbors=k_matches, n_jobs=-1, metric="cosine").fit(tf_idf_lookup)
    # Use the nbrs model to obtain the nearest matches in the lookup dataset. Vectorize first.
    tf_idf_original = vectorizer.transform(original_lower)
    distances, lookup_indices = nbrs.kneighbors(tf_idf_original)
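    # Both arrays returned by kneighbors have shape (len(original), k_matches):
    # distances holds cosine distances, lookup_indices holds row indices into lookup.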
    # Extract the match confidence (derived from the distance to the nearest
    # neighbour), the original item, and the lookup matches.
    original_name_list = []
    confidence_list = []
    index_list = []
    lookup_list = []
    # print(len(lookup_indices))
    # i indexes the original names; lookup_index is the list of matched lookup indices
    for i, lookup_index in enumerate(lookup_indices):
        original_name = original[i]
        # Look up the matched names in the lookup list
        lookups = [lookup[index] for index in lookup_index]
        # Transform distances to confidences and store
        confidence = [1 - round(dist, 2) for dist in distances[i]]
        original_name_list.append(original_name)
        # Store index
        index_list.append(lookup_index)
        confidence_list.append(confidence)
        lookup_list.append(lookups)
    # Convert to DataFrames
    df_orig_name = pd.DataFrame(original_name_list, columns=[outname])
    df_lookups = pd.DataFrame(lookup_list, columns=["Match"])
    df_confidence = pd.DataFrame(confidence_list, columns=["Match Confidence"])
    # Bind columns
    matches = pd.concat([df_orig_name, df_lookups, df_confidence], axis=1)
    # Reorder columns (a no-op while k_matches == 1, so this step can be skipped)
    lookup_cols = list(matches.columns.values)
    lookup_cols_reordered = [lookup_cols[0]]
    for i in range(1, k_matches + 1):
        lookup_cols_reordered.append(lookup_cols[i])
        lookup_cols_reordered.append(lookup_cols[i + k_matches])
        # lookup_cols_reordered.append(lookup_cols[i + 2 * k_matches])
    matches = matches[lookup_cols_reordered]
    # Keep only matches above the confidence cutoff, best matches first
    matches = matches.loc[matches["Match Confidence"] > cutoff]
    matches.sort_values(by=["Match Confidence"], ascending=False, inplace=True)
    matches.drop(columns=["Match Confidence"], inplace=True)
    return matches
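
# A minimal standalone sketch of how matcher could be called outside Gradio
# (illustrative only; the inputs mirror the Textbox defaults below, and the
# cutoff value here is an assumption, not part of the app):
#
#   result = matcher(
#       original="Courtney Walsh,Curtly Ambrose,Brian Lara",
#       lookup="Walsh, Ambrose, Lara",
#       cutoff=0.5,
#   )
#   print(result)  # DataFrame with columns ["Original", "Match"]
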
def combine(a, b):
    return a + " " + b


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(label="Input a list of names", value="Courtney Walsh,Curtly Ambrose,Malcolm Marshall,Brian Lara,Viv Richards,Obama", lines=2)
            txt_2 = gr.Textbox(label="Input some names to match", value="Walsh, Ambrose, Marshall, Lara", lines=2)
        # with gr.Row():
        with gr.Column():
            outty = gr.Dataframe(
                headers=["Original", "Match"],
                datatype=["str", "str"],
                label="Matched",
            )
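    # Clicking the button runs matcher on the two textbox strings and renders
    # the returned DataFrame in the "Matched" output component.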
    btn = gr.Button(value="Submit")
    btn.click(matcher, inputs=[txt, txt_2], outputs=[outty])
if __name__ == "__main__":
    demo.launch()