Joshnicholas committed on
Commit
61c74a7
·
verified ·
1 Parent(s): dc5eaf8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -1
app.py CHANGED
@@ -1 +1,110 @@
1
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from tfidf_matcher.ngrams import ngrams
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ from sklearn.neighbors import NearestNeighbors
5
+ import gradio as gr
6
+
7
def _split_names(names):
    """Return ``names`` as a list of stripped, non-empty strings.

    Accepts ``None``, a comma-separated string (what the Gradio textboxes
    send), or any iterable of names.
    """
    if names is None:
        return []
    if isinstance(names, str):
        names = names.split(",")
    stripped = (str(name).strip() for name in names)
    return [name for name in stripped if name]


def matcher(original=None, lookup=None, outname='Original', ngram_length=3, cutoff=0.8):
    """Fuzzy-match every name in ``original`` to its closest name in ``lookup``.

    Names are compared case-insensitively via TF-IDF over character n-grams
    and a cosine-distance nearest-neighbour search fitted on ``lookup``.

    Args:
        original: Names to find a match for; a comma-separated string or an
            iterable of strings.
        lookup: Candidate names to match against; same accepted forms as
            ``original``.
        outname: Header of the output column holding the original names.
        ngram_length: Length of the character n-grams fed to the vectorizer.
        cutoff: Only rows whose confidence is strictly greater than this
            value are returned.

    Returns:
        A ``pandas.DataFrame`` with columns ``[outname, "Match",
        "Match Confidence"]``, sorted by confidence in descending order.
        The frame is empty when either input contains no usable names.
    """
    columns = [outname, "Match", "Match Confidence"]
    k_matches = 1  # only the single nearest neighbour is reported

    # Surrounding whitespace (e.g. "Walsh, Ambrose") would otherwise leak into
    # the n-grams and into the displayed match, so normalise before anything.
    original = _split_names(original)
    lookup = _split_names(lookup)
    if not original or not lookup:
        # Fitting the vectorizer on an empty corpus raises; report no matches.
        return pd.DataFrame(columns=columns)

    original_lower = [name.lower() for name in original]
    lookup_lower = [name.lower() for name in lookup]

    # TfidfVectorizer calls its analyzer with a single argument, so bind the
    # requested n-gram length through a default parameter.
    def ngrams_user(string, n=ngram_length):
        return ngrams(string, n)

    # Sparse TF-IDF matrix built from the lookup corpus.
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams_user)
    tf_idf_lookup = vectorizer.fit_transform(lookup_lower)

    # Nearest-neighbour model over the lookup vectors.
    nbrs = NearestNeighbors(
        n_neighbors=k_matches, n_jobs=-1, metric="cosine"
    ).fit(tf_idf_lookup)

    # Project the originals into the same vector space and query the model.
    tf_idf_original = vectorizer.transform(original_lower)
    distances, lookup_indices = nbrs.kneighbors(tf_idf_original)

    # Confidence is 1 - cosine distance. Round *after* subtracting so values
    # such as 0.30000000000000004 never reach the output table.
    rows = [
        (name, lookup[neighbours[0]], round(1 - float(dists[0]), 2))
        for name, neighbours, dists in zip(original, lookup_indices, distances)
    ]
    matches = pd.DataFrame(rows, columns=columns)

    matches = matches.loc[matches["Match Confidence"] > cutoff]
    # Return a new sorted frame rather than sorting a filtered slice in place.
    return matches.sort_values(by=["Match Confidence"], ascending=False)
82
+
83
def combine(a, b):
    """Return ``a`` and ``b`` joined by a single space.

    Both arguments must support ``+`` with ``str`` (in practice: strings).
    """
    return a + " " + b
85
+
86
+
87
# Gradio UI: two comma-separated text inputs on the left, the table of
# matches on the right, and a submit button that runs ``matcher``.
# NOTE(review): the nesting of the ``with`` blocks was reconstructed from a
# flattened diff view -- confirm it against the intended layout.
with gr.Blocks() as demo:

    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(
                label="Input a list of names",
                value='Courtney Walsh,Curtly Ambrose,Malcolm Marshall,Brian Lara,Viv Richards,Obama',
                lines=2,
            )
            txt_2 = gr.Textbox(
                label="Input some names to match",
                value="Walsh, Ambrose, Marshall, Lara",
                lines=2,
            )

        with gr.Column():
            # Placeholder headers mirror the columns of the DataFrame that
            # ``matcher`` returns, so the table does not relabel on submit.
            outty = gr.Dataframe(
                headers=["Original", "Match", "Match Confidence"],
                datatype=["str", "str", "number"],
                label="Matched",
            )

    btn = gr.Button(value="Submit")
    # txt feeds ``original`` and txt_2 feeds ``lookup`` (positional order).
    btn.click(matcher, inputs=[txt, txt_2], outputs=[outty])


if __name__ == "__main__":
    demo.launch()