felix committed on
Commit 59dea20
1 Parent(s): 78b9733

improvements

Files changed (3)
  1. Addr-Test.xlsx +0 -0
  2. app.py +50 -28
  3. data.py +9 -3
Addr-Test.xlsx ADDED
Binary file (9.73 kB).
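Note: the workbook is binary, so the diff cannot show its contents; app.py below expects it to contain 'CAQH' and 'NDB' sheets. A quick sanity check on a local copy (a minimal sketch; the local path is assumed):

    import pandas as pd

    # List the sheet names in the test workbook (local path assumed)
    workbook = pd.ExcelFile('Addr-Test.xlsx')
    print(workbook.sheet_names)  # expected to include 'CAQH' and 'NDB'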
 
app.py CHANGED
@@ -1,29 +1,33 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
-#import torch
-#from transformers import AlbertTokenizer, AlbertModel
-#from sklearn.metrics.pairwise import cosine_similarity
-
-#tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-#model = AlbertModel.from_pretrained("albert-base-v2")
-
-#def get_embedding(input_text):
-#    encoded_input = tokenizer(input_text, return_tensors='pt')
-#    input_ids = encoded_input.input_ids
-#    input_num_tokens = input_ids.shape[1]
-#
-#    #print("Number of input tokens: " + str(input_num_tokens))
-#    #print("Length of input: " + str(len(input_text)))
-#
-#    list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
-#
-#    #print("Tokens : " + ' '.join(list_of_tokens))
-#    with torch.no_grad():
-#        output = model(**encoded_input)
-#
-#    embedding = output.last_hidden_state[0][0]
-#    return embedding.tolist()
+import torch
+from transformers import AlbertTokenizer, AlbertModel
+from sklearn.metrics.pairwise import cosine_similarity
+
+# 'base' is the smaller checkpoint; 'large' is the bigger alternative
+model_size = 'base'
+tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
+model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
+
+def get_embedding(input_text):
+    encoded_input = tokenizer(input_text, return_tensors='pt')
+    input_ids = encoded_input.input_ids
+    input_num_tokens = input_ids.shape[1]
+
+    print("Number of input tokens: " + str(input_num_tokens))
+    print("Length of input: " + str(len(input_text)))
+
+    list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
+
+    print("Tokens : " + ' '.join(list_of_tokens))
+    with torch.no_grad():
+        outputs = model(**encoded_input)
+        last_hidden_states = outputs[0]
+        # mean-pool all token vectors into a single sentence embedding
+        sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
+        #sentence_embedding = output.last_hidden_state[0][0]
+    return sentence_embedding.tolist()
 
 st.title('Upload the Address Dataset')
 
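Note: this hunk activates the previously commented-out ALBERT setup and replaces the bare first-token ([CLS]) vector with a mean over all token embeddings. torch.mean(last_hidden_states[0], dim=0) averages every position, including the [CLS] and [SEP] special tokens; for single, unpadded inputs like these that is harmless. A minimal usage sketch, assuming the function above is loaded:

    vec = get_embedding('112 Mountain Blvd Ext, Warren, NJ 07059')
    print(len(vec))  # 768, the hidden size of albert-base-v2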
@@ -33,8 +37,8 @@ uploaded_file = st.file_uploader('Choose a file', type='xlsx')
 
 
 if uploaded_file is not None:
-    data_caqh = pd.read_excel(uploaded_file, sheet_name='CAQH')
-    data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB')
+    data_caqh = pd.read_excel(uploaded_file, sheet_name='CAQH', dtype=str)
+    data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)
 
     # Data cleaning CAQH
     data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
@@ -52,17 +56,35 @@ if uploaded_file is not None:
                               + data_ndb['zip_pls_4_cd'].astype(str))
 
     data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
+        + data_ndb['city'].astype(str) + ', ' \
         + data_ndb['st_cd'].astype(str) + ', ' + data_ndb['zip_cd_zip_pls_4_cd']
 
-    # Add a matched column
+    # App
+    data_caqh['embedding'] = data_caqh['full-addr'].apply(get_embedding)
+    data_ndb['embedding'] = data_ndb['full-addr'].apply(get_embedding)
+
     data_caqh['matched-addr'] = ''
 
-    # App
-    #data_caqh['embed'] = data_caqh['full-addr'].apply(get_embedding)
+    for i, row in data_caqh.iterrows():
+        max_similarity = 0
+        matched_row = None
+        for j, ndb_row in data_ndb.iterrows():
+            sim = cosine_similarity([row['embedding']], [ndb_row['embedding']])
+            if sim > max_similarity:
+                max_similarity = sim
+                matched_row = ndb_row
+        if max_similarity >= 0.98:
+            data_caqh.at[i, 'matched-addr'] = matched_row['full-addr']
+            data_caqh.at[i, 'similarity-score'] = max_similarity
+        else:
+            data_caqh.at[i, 'matched-addr'] = 'No Matches'
 
     st.dataframe(data_caqh)
     st.dataframe(data_ndb)
 
+    # calculate the embedding of each item.
+
+    #st.dataframe(data_caqh)
     # Do some matching
     #data_caqh.loc[data_caqh['full-addr'] == '1000 Vale Terrace, Vista, CA, 92084', 'matched-addr'] = '456 Main St'
     #time.sleep(10)
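Note: the nested iterrows() loop calls cosine_similarity once per (CAQH, NDB) pair, so matching is quadratic and slow for large sheets; also, cosine_similarity returns a (1, 1) array, so similarity-score ends up holding an array rather than a float (indexing sim[0][0] would avoid that). A vectorized alternative computes the whole similarity matrix in one call; a sketch under the same column names (not part of this commit):

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    # Stack the per-row embedding lists into (n, d) and (m, d) matrices.
    caqh_mat = np.vstack(data_caqh['embedding'].to_numpy())
    ndb_mat = np.vstack(data_ndb['embedding'].to_numpy())

    sims = cosine_similarity(caqh_mat, ndb_mat)     # shape (n, m)
    best = sims.argmax(axis=1)                      # best NDB row per CAQH row
    best_scores = sims[np.arange(len(best)), best]  # similarity of that best match

    data_caqh['matched-addr'] = np.where(best_scores >= 0.98,
                                         data_ndb['full-addr'].to_numpy()[best],
                                         'No Matches')
    data_caqh['similarity-score'] = best_scores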
data.py CHANGED
@@ -1,5 +1,6 @@
 from transformers import AlbertTokenizer, AlbertModel
 from sklearn.metrics.pairwise import cosine_similarity
+import torch
 
 # base
 # large
@@ -11,6 +12,7 @@ a2 = "112 Mountain Blvd Ext, Warren, NJ 07059"
 a3 = "1677 NJ-27 #2, Edison, NJ 08817"
 a4 = "5078 S Maryland Pkwy, Las Vegas, NV 89119"
 a5 = "65 Mountain Boulevard Ext, Warren, NJ 07059"
+a6 = "123 Broad St, New York, NY, 10304-2345"
 
 def get_embedding(input_text):
     encoded_input = tokenizer(input_text, return_tensors='pt')
@@ -23,16 +25,20 @@ def get_embedding(input_text):
     list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
 
     print("Tokens : " + ' '.join(list_of_tokens))
-    output = model(**encoded_input)
-
-    embedding = output.last_hidden_state[0][0]
-    return embedding.tolist()
+    with torch.no_grad():
+        outputs = model(**encoded_input)
+        last_hidden_states = outputs[0]
+        # mean-pool all token vectors into a single sentence embedding
+        sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
+        #sentence_embedding = output.last_hidden_state[0][0]
+    return sentence_embedding.tolist()
 
 e1 = get_embedding(a1)
 e2 = get_embedding(a2)
 #e3 = get_embedding(a3)
 e4 = get_embedding(a4)
 e5 = get_embedding(a5)
+e6 = get_embedding(a6)
 
 print(f"a1 \"{a1}\" to \"{a2}\" a2")
 print(cosine_similarity([e1], [e2]))
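Note: data.py mirrors the pooling change in app.py, wrapping inference in torch.no_grad() and averaging over dim 0. The plain mean treats special tokens the same as address tokens, which is fine for single unpadded strings; if batched, padded inputs were ever embedded, the usual refinement is to average only the real tokens using the attention mask. A hypothetical variant, sketched for reference (not part of this commit; reuses the tokenizer and model defined above):

    def get_masked_mean_embedding(input_text):
        # Same tokenizer/model as above; ignores padding positions.
        encoded_input = tokenizer(input_text, return_tensors='pt', padding=True)
        with torch.no_grad():
            last_hidden = model(**encoded_input)[0]            # (batch, seq_len, hidden)
        mask = encoded_input['attention_mask'].unsqueeze(-1)   # (batch, seq_len, 1)
        summed = (last_hidden * mask).sum(dim=1)               # zero out padded positions
        counts = mask.sum(dim=1)                               # number of real tokens
        return (summed / counts)[0].tolist()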