import math
import streamlit as st
import pandas as pd
import numpy as np
import torch
from transformers import AlbertTokenizer, AlbertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from io import BytesIO

# Choose the ALBERT checkpoint size: 'base' is smaller and faster than 'large'.
model_size = 'base'
tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')

model_sbert = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')
# Note: with a regular BERT model, a higher threshold (around 0.98) worked better.

# Minimum cosine similarity for an NDB address to be accepted as a match.
similarity_threshold = 0.8

def get_sbert_embedding(input_text):
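    # Sentence-BERT embedding: one fixed-size vector per address string, returned as a plain list.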
    embedding = model_sbert.encode(input_text)
    return embedding.tolist()

def get_embedding(input_text):
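    # Alternative embedding: mean-pool ALBERT's last hidden states over all tokens.
    # Not used by default; the main loops below call get_sbert_embedding instead.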
    encoded_input = tokenizer(input_text, return_tensors='pt')
    input_ids = encoded_input.input_ids
    #input_num_tokens = input_ids.shape[1]

    #print( "Number of input tokens: " + str(input_num_tokens))
    #print("Length of input: " + str(len(input_text)))

    list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())

    #print( "Tokens : " + ' '.join(list_of_tokens))
    with torch.no_grad():
        outputs = model(**encoded_input)
        last_hidden_states = outputs[0]
        sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
        #sentence_embedding = output.last_hidden_state[0][0]
        return sentence_embedding.tolist()

st.set_page_config(layout="wide")
st.title('Upload the Address Dataset')

st.markdown('Upload an Excel file with "CAQH" and "NDB" sheets to match addresses between them.')

uploaded_file = st.file_uploader('Choose a file', type='xlsx')



if uploaded_file is not None:
    data_caqh = pd.read_excel(uploaded_file, sheet_name='CAQH', dtype=str)
    data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)

    # Data cleaning CAQH: format 9-digit postal codes as ZIP+4 (e.g. 123456789 -> 12345-6789).
    data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and '-' not in x else x)
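    # Build one comparable address string: address1[, address2], city, state, postalcode.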
    data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
                             + np.where(data_caqh['address2'].isnull(),  '' , data_caqh['address2'].astype(str)+ ', ')  \
                             + data_caqh['city'].astype(str) + ', '\
                             + data_caqh['state'].astype(str) + ', ' \
                             + data_caqh['postalcode'].astype(str)

    st.write(f"CAQH before duplicate removal {len(data_caqh)}")
    data_caqh.drop_duplicates(subset='full-addr',inplace=True)
    data_caqh = data_caqh.reset_index(drop=True) # reset the index.
    st.write(f"CAQH after duplicate removal {len(data_caqh)}")

    # Data cleaning NDB
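    # Blank out +4 ZIP codes whose last digit is 0 or 1, then append the rest as ZIP+4.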
    data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')

    data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
                                      np.where( data_ndb['zip_pls_4_cd'] == '', '', '-' \
                                      + data_ndb['zip_pls_4_cd'].astype(str))

    data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
                            + data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
                            + data_ndb['st_cd'].astype(str) + ', ' + data_ndb['zip_cd_zip_pls_4_cd']

    # Calculate embeddings for CAQH
    num_items = len(data_caqh)
    progress_bar = st.progress(0)
    total_steps = 100
    step_size = math.ceil(num_items / total_steps)
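    # Process the rows in batches of step_size so the progress bar can be updated ~100 times.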

    data_caqh['embedding'] = None  # use object dtype so each cell can hold a list embedding

    embedding_col_index = data_caqh.columns.get_loc('embedding')
    full_addr_col_index = data_caqh.columns.get_loc('full-addr')
    for i in range(total_steps):
        # Update progress bar
        progress = (i + 1) / total_steps

        # Process a batch of rows
        start = i * step_size
        end = start + step_size

        stop_iter = False
        if end >= num_items:
            end = num_items
            stop_iter = True

        data_caqh.iloc[start:end, embedding_col_index] = data_caqh.iloc[start:end, full_addr_col_index].apply(get_sbert_embedding)

        progress_bar.progress(value=progress, text=f"CAQH embeddings: {min((i + 1) * step_size, num_items)} processed out of {num_items}")

        if stop_iter:
            break

    st.write(f"Embeddings for CAQH calculated")
    # Calculate similarity For NDB
    num_items = len(data_ndb)
    progress_bar = st.progress(0)
    total_steps = 100
    step_size = math.ceil(num_items / total_steps)

    data_ndb['embedding'] = None  # use object dtype so each cell can hold a list embedding

    embedding_col_index = data_ndb.columns.get_loc('embedding')
    full_addr_col_index = data_ndb.columns.get_loc('full-addr')
    for i in range(total_steps):
        # Update progress bar
        progress = (i + 1) / total_steps

        # Process a batch of rows
        start = i * step_size
        end = start + step_size

        stop_iter = False
        if end >= num_items:
            end = num_items
            stop_iter = True

        # get_embedding (raw ALBERT mean pooling) could be swapped in here instead of get_sbert_embedding.
        data_ndb.iloc[start:end, embedding_col_index] = data_ndb.iloc[start:end, full_addr_col_index].apply(get_sbert_embedding)

        progress_bar.progress(value=progress, text=f"NDB embeddings: {min((i + 1) * step_size, num_items)} processed out of {num_items}")

        if stop_iter:
            break

    st.write(f"Embeddings for NDB calculated... matching")

    progress_bar = st.progress(0)
    num_items = len(data_caqh)
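    # Brute-force matching: compare every CAQH embedding against every NDB embedding (O(n*m)).
    # A vectorized sketch (assumption, not wired in): compute the whole similarity matrix at once
    # and take the row-wise best match, which avoids the Python-level double loop below:
    #   caqh_mat = np.vstack(data_caqh['embedding'].to_numpy())
    #   ndb_mat = np.vstack(data_ndb['embedding'].to_numpy())
    #   sim_matrix = cosine_similarity(caqh_mat, ndb_mat)  # shape: (num CAQH, num NDB)
    #   best_idx = sim_matrix.argmax(axis=1)
    #   best_sim = sim_matrix.max(axis=1)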
    for i, row in data_caqh.iterrows():
        max_similarity = 0
        matched_row = None
        for j, ndb_row in data_ndb.iterrows():
            sim = cosine_similarity([row['embedding']], [ndb_row['embedding']])[0][0]  # extract the scalar score
            if sim > max_similarity:
                max_similarity = sim
                matched_row = ndb_row
        if max_similarity >= similarity_threshold:
            data_caqh.at[i, 'matched-addr'] = matched_row['full-addr']
            data_caqh.at[i, 'similarity-score'] = max_similarity
        else:
            print(f"max similarity was {max_similarity}")
            data_caqh.at[i, 'matched-addr'] = 'No Matches'

        progress = min((i + 1) / num_items, 1.0)
        progress_bar.progress(value=progress, text=f"matching similarities - {i + 1} done out of {num_items}")

    # Drop columns not needed for display
    data_caqh.drop(columns=['embedding'], inplace=True)
    data_ndb.drop(columns=['embedding'], inplace=True)

    st.header('CAQH addresses and matches')
    st.dataframe(data_caqh, use_container_width=True)

    # Calculate stats.
    total_items = len(data_caqh)
    items_without_matches = data_caqh['matched-addr'].value_counts().get('No Matches', 0)
    items_with_matches = total_items - items_without_matches
    percent_matched = (items_with_matches / total_items) * 100.0

    st.write(f"Of {total_items} CAQH addresses, {items_with_matches} matched and {items_without_matches} did not match ({percent_matched:.2f}% matched)")

    # Create an in-memory binary stream
    output = BytesIO()
    # Save the DataFrame to the binary stream as an Excel file
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        data_caqh.to_excel(writer, sheet_name='Sheet1', index=False)
        # The context manager saves the workbook on exit; no explicit save() call is needed.

    # Get the binary data from the stream
    data = output.getvalue()

    # Add a download button for the Excel file
    st.download_button(
        label='Download CAQH matches as Excel file',
        data=data,
        file_name='data.xlsx',
        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )

    st.header('NDB data')
    st.dataframe(data_ndb, use_container_width=True)