Spaces:
Runtime error
Runtime error
File size: 7,643 Bytes
55d3f7a f51e197 c90e96d 59dea20 01a5a51 59dea20 55d3f7a 59dea20 01a5a51 69cba78 01a5a51 59dea20 ea4c492 59dea20 ea4c492 59dea20 55d3f7a 59dea20 4132514 01a5a51 4132514 f51e197 b1a0d5b ea4c492 f51e197 59dea20 d38ab04 4132514 c90e96d 55d3f7a c90e96d d38ab04 55d3f7a 4132514 c90e96d d38ab04 c90e96d 4132514 c90e96d 4132514 55d3f7a c90e96d d38ab04 55d3f7a 01a5a51 55d3f7a 59dea20 55d3f7a 4132514 55d3f7a 01a5a51 55d3f7a 59dea20 01a5a51 59dea20 01a5a51 59dea20 4132514 55d3f7a ea4c492 55d3f7a 69cba78 55d3f7a ea4c492 4132514 59dea20 4132514 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
import math
import streamlit as st
import pandas as pd
import numpy as np
import torch
from transformers import AlbertTokenizer, AlbertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from io import BytesIO
# base is smaller, vs large
model_size='base'
tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
model_sbert = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')
# for regular burt 0.98
similarity_threshold = 0.8
def get_sbert_embedding(input_text):
embedding = model_sbert.encode(input_text)
return embedding.tolist()
def get_embedding(input_text):
encoded_input = tokenizer(input_text, return_tensors='pt')
input_ids = encoded_input.input_ids
#input_num_tokens = input_ids.shape[1]
#print( "Number of input tokens: " + str(input_num_tokens))
#print("Length of input: " + str(len(input_text)))
list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
#print( "Tokens : " + ' '.join(list_of_tokens))
with torch.no_grad():
outputs = model(**encoded_input)
last_hidden_states = outputs[0]
sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
#sentence_embedding = output.last_hidden_state[0][0]
return sentence_embedding.tolist()
st.set_page_config(layout="wide")
st.title('Upload the Address Dataset')
st.markdown('Upload an Excel file to view the data in a table.')
uploaded_file = st.file_uploader('Choose a file', type='xlsx')
if uploaded_file is not None:
data_caqh = pd.read_excel(uploaded_file, sheet_name='CAQH', dtype=str)
data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)
# Data cleaning CAQH
data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
+ np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
+ data_caqh['city'].astype(str) + ', '\
+ data_caqh['state'].astype(str) + ', ' \
+ data_caqh['postalcode'].astype(str)
st.write(f"CAQH before duplicate removal {len(data_caqh)}")
data_caqh.drop_duplicates(subset='full-addr',inplace=True)
data_caqh = data_caqh.reset_index(drop=True) # reset the index.
st.write(f"CAQH after duplicate removal {len(data_caqh)}")
# Data cleaning NDB
data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
np.where( data_ndb['zip_pls_4_cd'] == '', '', '-' \
+ data_ndb['zip_pls_4_cd'].astype(str))
data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
+ data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
+ data_ndb['st_cd'].astype(str) + ', ' + data_ndb['zip_cd_zip_pls_4_cd']
# Calculate similarity For CAQH
num_items = len(data_caqh)
progress_bar = st.progress(0)
total_steps = 100
step_size = math.ceil(num_items / total_steps)
data_caqh['embedding'] = 0
embedding_col_index = data_caqh.columns.get_loc('embedding')
full_addr_col_index = data_caqh.columns.get_loc('full-addr')
for i in range(total_steps):
# Update progress bar
progress = (i + 1) / total_steps
# Process a batch of rows
start = i * step_size
end = start + step_size
stop_iter = False
if end >= num_items:
end = num_items
stop_iter = True
data_caqh.iloc[start:end, embedding_col_index] = data_caqh.iloc[start:end, full_addr_col_index].apply(get_sbert_embedding)
progress_bar.progress(value=progress, text=f"CAQH embeddings: {(i + 1) * step_size} processed out of {num_items}")
if stop_iter:
break
st.write(f"Embeddings for CAQH calculated")
# Calculate similarity For NDB
num_items = len(data_ndb)
progress_bar = st.progress(0)
total_steps = 100
step_size = math.ceil(num_items / total_steps)
data_ndb['embedding'] = 0
embedding_col_index = data_ndb.columns.get_loc('embedding')
full_addr_col_index = data_ndb.columns.get_loc('full-addr')
for i in range(total_steps):
# Update progress bar
progress = (i + 1) / total_steps
# Process a batch of rows
start = i * step_size
end = start + step_size
stop_iter = False
if end >= num_items:
end = num_items
stop_iter = True
# or get_embedding
data_ndb.iloc[start:end, embedding_col_index] = data_ndb.iloc[start:end, full_addr_col_index].apply(get_sbert_embedding)
progress_bar.progress(value=progress, text=f"NDB embeddings: {(i + 1) * step_size} processed out of {num_items}")
if stop_iter:
break
st.write(f"Embeddings for NDB calculated... matching")
progress_bar = st.progress(0)
num_items = len(data_caqh)
for i, row in data_caqh.iterrows():
max_similarity = 0
matched_row = None
for j, ndb_row in data_ndb.iterrows():
sim = cosine_similarity([row['embedding']], [ndb_row['embedding']])
if sim > max_similarity:
max_similarity = sim
matched_row = ndb_row
if max_similarity >= similarity_threshold:
data_caqh.at[i, 'matched-addr'] = matched_row['full-addr']
data_caqh.at[i, 'similarity-score'] = max_similarity
else:
print(f"max similarity was {max_similarity}")
data_caqh.at[i, 'matched-addr'] = 'No Matches'
progress = i / num_items
if progress > 1.0:
progress = 1.0
progress_bar.progress(value=progress, text=f"matching similarities - {i} done out of {num_items}")
# Drop columns not needed for display
data_caqh.drop(columns=['embedding'], inplace=True)
data_ndb.drop(columns=['embedding'], inplace=True)
st.header('CAQH addresses and matches')
st.dataframe(data_caqh, use_container_width=True)
# Calculate stats.
total_items = len(data_caqh)
item_without_matches = data_caqh['matched-addr'].value_counts().get('No Matches', 0)
items_with_matches = total_items - item_without_matches;
percent_matched = (items_with_matches/total_items)*100.0
st.write(f"From total matches {total_items}, {items_with_matches} items matched, {item_without_matches} items did not match, {percent_matched:.2f}% matched")
# Create an in-memory binary stream
output = BytesIO()
# Save the DataFrame to the binary stream as an Excel file
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
data_caqh.to_excel(writer, sheet_name='Sheet1', index=False)
writer.save()
# Get the binary data from the stream
data = output.getvalue()
# Add a download button for the Excel file
st.download_button(
label='Download CAQH matches as Excel file',
data=data,
file_name='data.xlsx',
mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
)
st.header('NDB data')
st.dataframe(data_ndb, use_container_width=True)
|