Spaces:
Sleeping
Sleeping
Joshnicholas
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1 +1,110 @@
|
|
1 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from tfidf_matcher.ngrams import ngrams
|
3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
4 |
+
from sklearn.neighbors import NearestNeighbors
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
+
def matcher(original=[], lookup=[], outname='Original', ngram_length=3, cutoff=0.8):
    """Fuzzy-match each name in ``original`` to its nearest name in ``lookup``.

    Both inputs are vectorised with a TF-IDF model fitted on ``lookup`` (tokens
    come from ``ngrams(string, ngram_length)``), and a cosine-metric
    nearest-neighbour search picks the single closest lookup entry for every
    original entry.

    Args:
        original: Names to be matched. Either one comma-separated string (what
            the Gradio textbox supplies) or an iterable of names.
        lookup: Candidate names to match against, in the same formats.
        outname: Header of the first output column.
        ngram_length: The ``n`` passed to ``ngrams`` for every string.
        cutoff: Only rows whose confidence is strictly greater than this value
            are returned.

    Returns:
        A ``pandas.DataFrame`` with columns ``[outname, "Match",
        "Match Confidence"]``, sorted by confidence in descending order.
        Confidence is ``1 - cosine distance`` rounded to two decimals. The
        frame is empty when either input contains no names.
    """
    # Only the single nearest lookup entry is reported per original name.
    k_matches = 1

    def _as_names(value):
        # Accept a comma-separated string or any iterable of names. The default
        # arguments are never mutated because a new list is always built here.
        parts = value.split(",") if isinstance(value, str) else list(value)
        cleaned = [str(part).strip() for part in parts]
        # Drop blanks left by stray or trailing commas.
        return [name for name in cleaned if name]

    original = _as_names(original)
    lookup = _as_names(lookup)

    columns = [outname, "Match", "Match Confidence"]
    if not original or not lookup:
        # Nothing to fit or nothing to query: skip the vectoriser, which
        # cannot be fitted on an empty corpus.
        return pd.DataFrame(columns=columns)

    original_lower = [name.lower() for name in original]
    lookup_lower = [name.lower() for name in lookup]

    # Bind the requested n-gram length into the analyzer callable.
    def ngrams_user(string, n=ngram_length):
        return ngrams(string, n)

    # Sparse TF-IDF matrix built from the lookup corpus.
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams_user)
    tf_idf_lookup = vectorizer.fit_transform(lookup_lower)

    # Nearest-neighbour index over the lookup vectors.
    nbrs = NearestNeighbors(
        n_neighbors=k_matches, n_jobs=-1, metric="cosine"
    ).fit(tf_idf_lookup)

    # Project the original names into the lookup vocabulary, then query.
    tf_idf_original = vectorizer.transform(original_lower)
    distances, lookup_indices = nbrs.kneighbors(tf_idf_original)

    # k_matches == 1, so column 0 holds the one neighbour of each row.
    matched_names = [lookup[index] for index in lookup_indices[:, 0]]
    # Round after subtracting so no float noise such as 0.9299999999999999
    # reaches the output table.
    confidences = [round(1 - float(dist), 2) for dist in distances[:, 0]]

    matches = pd.DataFrame(
        list(zip(original, matched_names, confidences)), columns=columns
    )

    # Filter, then sort a fresh frame rather than sorting a slice in place.
    above_cutoff = matches["Match Confidence"] > cutoff
    return matches.loc[above_cutoff].sort_values(
        by=["Match Confidence"], ascending=False
    )
|
82 |
+
|
83 |
+
def combine(a, b):
    """Return ``a`` and ``b`` joined by a single space."""
    return a + " " + b
|
85 |
+
|
86 |
+
|
87 |
+
# Gradio UI: two text inputs on the left, the match table on the right, and a
# button that runs `matcher` on the two textbox values.
# NOTE(review): the nesting below is reconstructed from the `with` statements;
# confirm the intended layout against the running app.
with gr.Blocks() as demo:

    with gr.Row():
        with gr.Column():
            # Names to be matched, as one comma-separated string.
            txt = gr.Textbox(label="Input a list of names", value='Courtney Walsh,Curtly Ambrose,Malcolm Marshall,Brian Lara,Viv Richards,Obama',lines=2)
            # Candidate names to match against, also comma-separated.
            txt_2 = gr.Textbox(label="Input some names to match", value="Walsh, Ambrose, Marshall, Lara",lines=2)

        with gr.Column():

            outty = gr.Dataframe(
                headers=["Original", "Match", "Confidence"],
                datatype=["str", "str", "number"],
                label="Matched",
            )

    btn = gr.Button(value="Submit")
    # Only the two textboxes are wired in, so matcher runs with its default
    # outname, ngram_length and cutoff.
    btn.click(matcher, inputs=[txt, txt_2], outputs=[outty])
|
106 |
+
|
107 |
+
|
108 |
+
|
109 |
+
if __name__ == "__main__":
    # Launch the Gradio app only when this file is executed as a script.
    demo.launch()
|