Spaces:
Runtime error
Runtime error
Feliks Zaslavskiy
committed on
Commit
•
55d3f7a
1
Parent(s):
ea4c492
Fixes, Download button
Browse files- app.py +98 -13
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,9 +1,11 @@
|
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import torch
|
5 |
from transformers import AlbertTokenizer, AlbertModel
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
7 |
|
8 |
# base is smaller, vs large
|
9 |
model_size='base'
|
@@ -20,7 +22,7 @@ def get_embedding(input_text):
|
|
20 |
|
21 |
list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
|
22 |
|
23 |
-
print( "Tokens : " + ' '.join(list_of_tokens))
|
24 |
with torch.no_grad():
|
25 |
|
26 |
outputs = model(**encoded_input)
|
@@ -45,11 +47,16 @@ if uploaded_file is not None:
|
|
45 |
# Data cleaning CAQH
|
46 |
data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
|
47 |
data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
|
48 |
-
+ np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)) \
|
49 |
+ data_caqh['city'].astype(str) + ', '\
|
50 |
+ data_caqh['state'].astype(str) + ', ' \
|
51 |
+ data_caqh['postalcode'].astype(str)
|
52 |
|
|
|
|
|
|
|
|
|
|
|
53 |
# Data cleaning NDB
|
54 |
data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
|
55 |
|
@@ -58,15 +65,75 @@ if uploaded_file is not None:
|
|
58 |
+ data_ndb['zip_pls_4_cd'].astype(str))
|
59 |
|
60 |
data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
|
61 |
-
+ data_ndb['cty_nm'].astype(str) + ', ' \
|
62 |
+ data_ndb['st_cd'].astype(str) + ', ' + data_ndb['zip_cd_zip_pls_4_cd']
|
63 |
|
64 |
-
#
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
-
|
|
|
|
|
|
|
|
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
for i, row in data_caqh.iterrows():
|
71 |
max_similarity = 0
|
72 |
matched_row = None
|
@@ -81,22 +148,40 @@ if uploaded_file is not None:
|
|
81 |
else:
|
82 |
data_caqh.at[i, 'matched-addr'] = 'No Matches'
|
83 |
|
|
|
|
|
|
|
|
|
|
|
84 |
# Drop columns not needed for display
|
85 |
data_caqh.drop(columns=['embedding'], inplace=True)
|
86 |
data_ndb.drop(columns=['embedding'], inplace=True)
|
87 |
|
88 |
st.header('CAQH addresses and matches')
|
89 |
st.dataframe(data_caqh, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
st.header('NDB data')
|
91 |
st.dataframe(data_ndb, use_container_width=True)
|
92 |
|
93 |
-
# calculate the embedding of each item.
|
94 |
|
95 |
-
#st.dataframe(data_caqh)
|
96 |
-
# Do some matching
|
97 |
-
#data_caqh.loc[data_caqh['full-addr'] == '1000 Vale Terrace, Vista, CA, 92084', 'matched-addr'] = '456 Main St'
|
98 |
-
#time.sleep(10)
|
99 |
-
#st.dataframe(data_caqh)
|
100 |
|
101 |
|
102 |
|
|
|
1 |
+
import math
|
2 |
import streamlit as st
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
import torch
|
6 |
from transformers import AlbertTokenizer, AlbertModel
|
7 |
from sklearn.metrics.pairwise import cosine_similarity
|
8 |
+
from io import BytesIO
|
9 |
|
10 |
# base is smaller, vs large
|
11 |
model_size='base'
|
|
|
22 |
|
23 |
list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
|
24 |
|
25 |
+
#print( "Tokens : " + ' '.join(list_of_tokens))
|
26 |
with torch.no_grad():
|
27 |
|
28 |
outputs = model(**encoded_input)
|
|
|
47 |
# Data cleaning CAQH
|
48 |
data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
|
49 |
data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
|
50 |
+
+ np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
|
51 |
+ data_caqh['city'].astype(str) + ', '\
|
52 |
+ data_caqh['state'].astype(str) + ', ' \
|
53 |
+ data_caqh['postalcode'].astype(str)
|
54 |
|
55 |
+
st.write(f"CAQH before duplicate removal {len(data_caqh)}")
|
56 |
+
data_caqh.drop_duplicates(subset='full-addr',inplace=True)
|
57 |
+
data_caqh = data_caqh.reset_index(drop=True) # reset the index.
|
58 |
+
st.write(f"CAQH after duplicate removal {len(data_caqh)}")
|
59 |
+
|
60 |
# Data cleaning NDB
|
61 |
data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
|
62 |
|
|
|
65 |
+ data_ndb['zip_pls_4_cd'].astype(str))
|
66 |
|
67 |
data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
|
68 |
+
+ data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
|
69 |
+ data_ndb['st_cd'].astype(str) + ', ' + data_ndb['zip_cd_zip_pls_4_cd']
|
70 |
|
71 |
+
# Calculate embeddings for CAQH addresses in batches (similarity matching happens later)
|
72 |
+
num_items = len(data_caqh)
|
73 |
+
progress_bar = st.progress(0)
|
74 |
+
total_steps = 100
|
75 |
+
step_size = math.ceil(num_items / total_steps)
|
76 |
+
|
77 |
+
data_caqh['embedding'] = 0
|
78 |
+
|
79 |
+
embedding_col_index = data_caqh.columns.get_loc('embedding')
|
80 |
+
full_addr_col_index = data_caqh.columns.get_loc('full-addr')
|
81 |
+
for i in range(total_steps):
|
82 |
+
# Update progress bar
|
83 |
+
progress = (i + 1) / total_steps
|
84 |
+
|
85 |
+
|
86 |
+
# Process a batch of rows
|
87 |
+
start = i * step_size
|
88 |
+
end = start + step_size
|
89 |
+
|
90 |
+
stop_iter = False
|
91 |
+
if end >= num_items:
|
92 |
+
end = num_items
|
93 |
+
stop_iter = True
|
94 |
+
|
95 |
+
data_caqh.iloc[start:end, embedding_col_index] = data_caqh.iloc[start:end, full_addr_col_index].apply(get_embedding)
|
96 |
+
|
97 |
+
progress_bar.progress(value=progress, text=f"CAQH embeddings: {(i + 1) * step_size} processed out of {num_items}")
|
98 |
+
|
99 |
+
if stop_iter:
|
100 |
+
break
|
101 |
+
|
102 |
+
st.write(f"Embeddings for CAQH calculated")
|
103 |
+
# Calculate embeddings for NDB addresses in batches (similarity matching happens later)
|
104 |
+
num_items = len(data_ndb)
|
105 |
+
progress_bar = st.progress(0)
|
106 |
+
total_steps = 100
|
107 |
+
step_size = math.ceil(num_items / total_steps)
|
108 |
+
|
109 |
+
data_ndb['embedding'] = 0
|
110 |
|
111 |
+
embedding_col_index = data_ndb.columns.get_loc('embedding')
|
112 |
+
full_addr_col_index = data_ndb.columns.get_loc('full-addr')
|
113 |
+
for i in range(total_steps):
|
114 |
+
# Update progress bar
|
115 |
+
progress = (i + 1) / total_steps
|
116 |
|
117 |
+
# Process a batch of rows
|
118 |
+
start = i * step_size
|
119 |
+
end = start + step_size
|
120 |
+
|
121 |
+
stop_iter = False
|
122 |
+
if end >= num_items:
|
123 |
+
end = num_items
|
124 |
+
stop_iter = True
|
125 |
+
|
126 |
+
data_ndb.iloc[start:end, embedding_col_index] = data_ndb.iloc[start:end, full_addr_col_index].apply(get_embedding)
|
127 |
+
|
128 |
+
progress_bar.progress(value=progress, text=f"NDB embeddings: {(i + 1) * step_size} processed out of {num_items}")
|
129 |
+
|
130 |
+
if stop_iter:
|
131 |
+
break
|
132 |
+
|
133 |
+
st.write(f"Embeddings for NDB calculated... matching")
|
134 |
+
|
135 |
+
progress_bar = st.progress(0)
|
136 |
+
num_items = len(data_caqh)
|
137 |
for i, row in data_caqh.iterrows():
|
138 |
max_similarity = 0
|
139 |
matched_row = None
|
|
|
148 |
else:
|
149 |
data_caqh.at[i, 'matched-addr'] = 'No Matches'
|
150 |
|
151 |
+
progress = i / num_items
|
152 |
+
if progress > 1.0:
|
153 |
+
progress = 1.0
|
154 |
+
progress_bar.progress(value=progress, text=f"matching similarities - {i} done out of {num_items}")
|
155 |
+
|
156 |
# Drop columns not needed for display
|
157 |
data_caqh.drop(columns=['embedding'], inplace=True)
|
158 |
data_ndb.drop(columns=['embedding'], inplace=True)
|
159 |
|
160 |
st.header('CAQH addresses and matches')
|
161 |
st.dataframe(data_caqh, use_container_width=True)
|
162 |
+
|
163 |
+
# Create an in-memory binary stream
|
164 |
+
output = BytesIO()
|
165 |
+
# Save the DataFrame to the binary stream as an Excel file
|
166 |
+
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
|
167 |
+
data_caqh.to_excel(writer, sheet_name='Sheet1', index=False)
|
168 |
+
writer.save()
|
169 |
+
|
170 |
+
# Get the binary data from the stream
|
171 |
+
data = output.getvalue()
|
172 |
+
|
173 |
+
# Add a download button for the Excel file
|
174 |
+
st.download_button(
|
175 |
+
label='Download CAQH matches as Excel file',
|
176 |
+
data=data,
|
177 |
+
file_name='data.xlsx',
|
178 |
+
mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
|
179 |
+
)
|
180 |
+
|
181 |
st.header('NDB data')
|
182 |
st.dataframe(data_ndb, use_container_width=True)
|
183 |
|
|
|
184 |
|
|
|
|
|
|
|
|
|
|
|
185 |
|
186 |
|
187 |
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
streamlit
|
2 |
pandas
|
3 |
numpy
|
4 |
-
openpyxl
|
|
|
|
1 |
streamlit
|
2 |
pandas
|
3 |
numpy
|
4 |
+
openpyxl
|
5 |
+
xlsxwriter
|