"""Gradio demo: fuzzy-match two comma-separated name lists with TF-IDF n-grams."""

import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from tfidf_matcher.ngrams import ngrams


def _split_names(names):
    """Return the non-empty, whitespace-stripped names found in *names*.

    *names* may be a comma-separated string (what the textboxes below send),
    any iterable of names, or ``None``.
    """
    if names is None:
        return []
    parts = names.split(",") if isinstance(names, str) else names
    stripped = (str(part).strip() for part in parts)
    return [part for part in stripped if part]


def matcher(original="", lookup="", outname="Original", ngram_length=3, cutoff=0.8):
    """Match each name in *original* to its nearest neighbour in *lookup*.

    A TF-IDF matrix over character n-grams is fitted on the lower-cased
    lookup names; every original name is then assigned the lookup name at the
    smallest cosine distance. The confidence of a match is
    ``1 - round(distance, 2)``.

    Args:
        original: Names to be matched, as a comma-separated string or an
            iterable of names.
        lookup: Candidate names, as a comma-separated string or an iterable
            of names.
        outname: Header of the output column holding the original names.
        ngram_length: Length of the n-grams passed to ``ngrams``.
        cutoff: Only matches with a confidence strictly greater than this
            value are returned.

    Returns:
        A DataFrame with the columns ``outname`` and ``"Match"``, ordered by
        descending confidence. It is empty when either input contains no
        names or when no lookup name produces an n-gram.
    """
    names = _split_names(original)
    candidates = _split_names(lookup)
    no_matches = pd.DataFrame(columns=[outname, "Match"])
    if not names or not candidates:
        return no_matches

    # TfidfVectorizer needs a one-argument analyzer, so bind the n-gram length.
    def ngrams_user(string, n=ngram_length):
        return ngrams(string, n)

    candidates_lower = [candidate.lower() for candidate in candidates]
    # fit_transform raises ValueError on an empty vocabulary (e.g. every
    # lookup name is shorter than ngram_length); report "no matches" instead.
    if not any(ngrams_user(candidate) for candidate in candidates_lower):
        return no_matches

    # The vocabulary is learned from the lookup corpus only.
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams_user)
    tf_idf_lookup = vectorizer.fit_transform(candidates_lower)

    # Single nearest neighbour under cosine distance.
    nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1, metric="cosine")
    nbrs.fit(tf_idf_lookup)

    tf_idf_original = vectorizer.transform([name.lower() for name in names])
    distances, lookup_indices = nbrs.kneighbors(tf_idf_original)

    # kneighbors returns one row per original name and one column per
    # neighbour; with n_neighbors=1 only column 0 exists.
    rows = [
        (name, candidates[index_row[0]], 1 - round(distance_row[0], 2))
        for name, index_row, distance_row in zip(names, lookup_indices, distances)
    ]
    matches = pd.DataFrame(rows, columns=[outname, "Match", "Match Confidence"])

    # Chain non-inplace operations: mutating a .loc slice in place can
    # trigger pandas' SettingWithCopyWarning.
    matches = matches.loc[matches["Match Confidence"] > cutoff]
    matches = matches.sort_values(by=["Match Confidence"], ascending=False)
    return matches.drop(columns=["Match Confidence"])


def combine(a, b):
    """Return *a* and *b* joined by a single space."""
    return a + " " + b


# NOTE(review): the source text was flattened, so the nesting below is
# reconstructed (both columns in one row, button underneath) -- confirm
# against the intended layout.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(
                label="Input a list of names",
                value='Courtney Walsh,Curtly Ambrose,Malcolm Marshall,Brian Lara,Viv Richards,Obama',
                lines=2,
            )
            txt_2 = gr.Textbox(
                label="Input some names to match",
                value="Walsh, Ambrose, Marshall, Lara",
                lines=2,
            )
        with gr.Column():
            outty = gr.Dataframe(
                headers=["Original", "Match"],
                datatype=["str", "str"],
                label="Matched",
            )
    btn = gr.Button(value="Submit")
    btn.click(matcher, inputs=[txt, txt_2], outputs=[outty])

# NOTE(review): the `if __name__ == "__main__":` guard was commented out in
# the original, so the app is launched unconditionally here as well.
demo.launch()