Spaces:
Runtime error
Runtime error
Feliks Zaslavskiy
committed on
Commit
•
71667b3
1
Parent(s):
0766f0d
minor updates
Browse files- .gitignore +1 -0
- app.py +30 -25
- data.py +8 -8
- data_set_training.csv +2 -1
- dev_set_training.csv +9 -1
- train.py +1 -1
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
output/
|
app.py
CHANGED
@@ -9,37 +9,42 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
9 |
from io import BytesIO
|
10 |
|
11 |
# base is smaller, vs large
|
12 |
-
model_size='base'
|
13 |
-
tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
|
14 |
-
model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
model_sbert = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')
|
17 |
# for regular BERT 0.98
|
18 |
|
19 |
-
|
|
|
20 |
|
21 |
def get_sbert_embedding(input_text):
|
22 |
embedding = model_sbert.encode(input_text)
|
23 |
return embedding.tolist()
|
24 |
|
25 |
-
def get_embedding(input_text):
|
26 |
-
encoded_input = tokenizer(input_text, return_tensors='pt')
|
27 |
-
input_ids = encoded_input.input_ids
|
28 |
-
#input_num_tokens = input_ids.shape[1]
|
29 |
-
|
30 |
-
#print( "Number of input tokens: " + str(input_num_tokens))
|
31 |
-
#print("Length of input: " + str(len(input_text)))
|
32 |
-
|
33 |
-
list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
|
34 |
-
|
35 |
-
#print( "Tokens : " + ' '.join(list_of_tokens))
|
36 |
-
with torch.no_grad():
|
37 |
-
|
38 |
-
outputs = model(**encoded_input)
|
39 |
-
last_hidden_states = outputs[0]
|
40 |
-
sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
|
41 |
-
#sentence_embedding = output.last_hidden_state[0][0]
|
42 |
-
return sentence_embedding.tolist()
|
43 |
|
44 |
st.set_page_config(layout="wide")
|
45 |
st.title('Upload the Address Dataset')
|
@@ -58,7 +63,7 @@ if uploaded_file is not None:
|
|
58 |
data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
|
59 |
data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
|
60 |
+ np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
|
61 |
-
+ data_caqh['city'].astype(str) + ' '\
|
62 |
+ data_caqh['state'].astype(str) + ' ' \
|
63 |
+ data_caqh['postalcode'].astype(str)
|
64 |
|
@@ -75,7 +80,7 @@ if uploaded_file is not None:
|
|
75 |
+ data_ndb['zip_pls_4_cd'].astype(str))
|
76 |
|
77 |
data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
|
78 |
-
+ data_ndb['cty_nm'].astype(str).str.strip() + ' ' \
|
79 |
+ data_ndb['st_cd'].astype(str) + ' ' + data_ndb['zip_cd_zip_pls_4_cd']
|
80 |
|
81 |
# Calculate similarity For CAQH
|
|
|
9 |
from io import BytesIO
|
10 |
|
11 |
# base is smaller, vs large
|
12 |
+
#model_size='base'
|
13 |
+
#tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
|
14 |
+
#model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
|
15 |
+
|
16 |
+
# For baseline 'sentence-transformers/paraphrase-albert-base-v2'
|
17 |
+
model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
|
18 |
+
|
19 |
+
similarity_threshold = 0.9
|
20 |
|
|
|
21 |
# for regular BERT 0.98
|
22 |
|
23 |
+
model_sbert = SentenceTransformer(model_name)
|
24 |
+
|
25 |
|
26 |
def get_sbert_embedding(input_text):
|
27 |
embedding = model_sbert.encode(input_text)
|
28 |
return embedding.tolist()
|
29 |
|
30 |
+
#def get_embedding(input_text):
|
31 |
+
# encoded_input = tokenizer(input_text, return_tensors='pt')
|
32 |
+
# input_ids = encoded_input.input_ids
|
33 |
+
# #input_num_tokens = input_ids.shape[1]
|
34 |
+
#
|
35 |
+
# #print( "Number of input tokens: " + str(input_num_tokens))
|
36 |
+
# #print("Length of input: " + str(len(input_text)))
|
37 |
+
#
|
38 |
+
# list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
|
39 |
+
#
|
40 |
+
# #print( "Tokens : " + ' '.join(list_of_tokens))
|
41 |
+
# with torch.no_grad():
|
42 |
+
#
|
43 |
+
# outputs = model(**encoded_input)
|
44 |
+
# last_hidden_states = outputs[0]
|
45 |
+
# sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
|
46 |
+
# #sentence_embedding = output.last_hidden_state[0][0]
|
47 |
+
# return sentence_embedding.tolist()
|
48 |
|
49 |
st.set_page_config(layout="wide")
|
50 |
st.title('Upload the Address Dataset')
|
|
|
63 |
data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
|
64 |
data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
|
65 |
+ np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
|
66 |
+
+ data_caqh['city'].astype(str) + ', '\
|
67 |
+ data_caqh['state'].astype(str) + ' ' \
|
68 |
+ data_caqh['postalcode'].astype(str)
|
69 |
|
|
|
80 |
+ data_ndb['zip_pls_4_cd'].astype(str))
|
81 |
|
82 |
data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
|
83 |
+
+ data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
|
84 |
+ data_ndb['st_cd'].astype(str) + ' ' + data_ndb['zip_cd_zip_pls_4_cd']
|
85 |
|
86 |
# Calculate similarity For CAQH
|
data.py
CHANGED
@@ -9,7 +9,7 @@ from sentence_transformers import SentenceTransformer
|
|
9 |
#tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
10 |
#model = AlbertModel.from_pretrained("albert-base-v2")
|
11 |
#'sentence-transformers/paraphrase-albert-base-v2'
|
12 |
-
model_name = 'output/training_OnlineConstrativeLoss-2023-03-
|
13 |
model_sbert = SentenceTransformer(model_name)
|
14 |
|
15 |
def get_sbert_embedding(input_text):
|
@@ -58,22 +58,22 @@ e8 = get_sbert_embedding(a8)
|
|
58 |
e8x = get_sbert_embedding(a8x)
|
59 |
e9 = get_sbert_embedding(a9)
|
60 |
e10 = get_sbert_embedding(a10)
|
61 |
-
print(f"a1 \"{a1}\" to \"{a2}\" a2")
|
62 |
print(cosine_similarity([e1], [e2]))
|
63 |
-
print(f"a1 \"{a1}\" to \"{a4}\" a4")
|
64 |
print(cosine_similarity([e1], [e4]))
|
65 |
-
print(f"a1 \"{a1}\" to \"{a5}\" a5")
|
66 |
print(cosine_similarity([e1], [e5]))
|
67 |
|
68 |
-
print(f"a7 \"{a7}\" to \"{a8}\" a8")
|
69 |
print(cosine_similarity([e7], [e8]))
|
70 |
-
print(f"a7 \"{a7}\" to \"{a8x}\" a8x")
|
71 |
print(cosine_similarity([e7], [e8x]))
|
72 |
|
73 |
-
print(f"a7 \"{a7}\" to \"{a9}\" a9")
|
74 |
print(cosine_similarity([e7], [e9]))
|
75 |
|
76 |
-
print(f"a7 \"{a7}\" to \"{a10}\" a10")
|
77 |
print(cosine_similarity([e7], [e10]))
|
78 |
# with base
|
79 |
#a1 to a2
|
|
|
9 |
#tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
10 |
#model = AlbertModel.from_pretrained("albert-base-v2")
|
11 |
#'sentence-transformers/paraphrase-albert-base-v2'
|
12 |
+
model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
|
13 |
model_sbert = SentenceTransformer(model_name)
|
14 |
|
15 |
def get_sbert_embedding(input_text):
|
|
|
58 |
e8x = get_sbert_embedding(a8x)
|
59 |
e9 = get_sbert_embedding(a9)
|
60 |
e10 = get_sbert_embedding(a10)
|
61 |
+
print(f"a1 \"{a1}\" to \"{a2}\" a2 - expected Different")
|
62 |
print(cosine_similarity([e1], [e2]))
|
63 |
+
print(f"a1 \"{a1}\" to \"{a4}\" a4 - expected Different")
|
64 |
print(cosine_similarity([e1], [e4]))
|
65 |
+
print(f"a1 \"{a1}\" to \"{a5}\" a5 - expected Same")
|
66 |
print(cosine_similarity([e1], [e5]))
|
67 |
|
68 |
+
print(f"a7 \"{a7}\" to \"{a8}\" a8 - expected Different")
|
69 |
print(cosine_similarity([e7], [e8]))
|
70 |
+
print(f"a7 \"{a7}\" to \"{a8x}\" a8x - expected Different")
|
71 |
print(cosine_similarity([e7], [e8x]))
|
72 |
|
73 |
+
print(f"a7 \"{a7}\" to \"{a9}\" a9 - expected Same")
|
74 |
print(cosine_similarity([e7], [e9]))
|
75 |
|
76 |
+
print(f"a7 \"{a7}\" to \"{a10}\" a10 - expected Same")
|
77 |
print(cosine_similarity([e7], [e10]))
|
78 |
# with base
|
79 |
#a1 to a2
|
data_set_training.csv
CHANGED
@@ -106,7 +106,8 @@ Valley Healthcare System 1600 Fort Benning Rd, Columbus, GA 31903|1600 Fort Benn
|
|
106 |
Valley Healthcare System 1600 Fort Benning Rd, Columbus, GA 31903|1600 Fort Benning Rd, Valley Healthcare System, Columbus, GA 31903|1
|
107 |
Memorial Satilla Health, 1900 Tebeau St, Waycross, GA 31501|1900 Tebeau St, Waycross, GA 31501|1
|
108 |
VA Medical Center 2002 Holcombe Blvd, Houston, TX 77030|VA Medical Center 2002 Holcombe Boulevard, Houston, TX 77030|1
|
109 |
-
|
|
|
110 |
|
111 |
|
112 |
|
|
|
106 |
Valley Healthcare System 1600 Fort Benning Rd, Columbus, GA 31903|1600 Fort Benning Rd, Valley Healthcare System, Columbus, GA 31903|1
|
107 |
Memorial Satilla Health, 1900 Tebeau St, Waycross, GA 31501|1900 Tebeau St, Waycross, GA 31501|1
|
108 |
VA Medical Center 2002 Holcombe Blvd, Houston, TX 77030|VA Medical Center 2002 Holcombe Boulevard, Houston, TX 77030|1
|
109 |
+
1839 E Capitol Ave, Bismarck, ND 58501|1839 East Capitol Avenue, Bismarck, ND 58501|1
|
110 |
+
1839 E Capitol Ave, Bismarck, ND 58501|1912 East Capitol Avenue, Bismarck, ND 58501|0
|
111 |
|
112 |
|
113 |
|
dev_set_training.csv
CHANGED
@@ -4,4 +4,12 @@ address1|address2|are_same
|
|
4 |
1061 Schmidt Ln, North Brunswick Township, NJ 08902|1061 Schmidt Lane, North Brunswick Township, NJ 08902|1
|
5 |
1061 Schmidt Ln, North Brunswick Township, NJ 08902|934 Schmidt Ln, North Brunswick Township, NJ 08902|0
|
6 |
5844 N Orange Blossom Trail, Orlando, FL 32810|5844 North Orange Blossom Trail, Orlando, FL 32810-9635|1
|
7 |
-
6701 Fannin St #1400, Houston, TX 77030|6701 Fannin Ste #1400, Houston, TX 77030|1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
1061 Schmidt Ln, North Brunswick Township, NJ 08902|1061 Schmidt Lane, North Brunswick Township, NJ 08902|1
|
5 |
1061 Schmidt Ln, North Brunswick Township, NJ 08902|934 Schmidt Ln, North Brunswick Township, NJ 08902|0
|
6 |
5844 N Orange Blossom Trail, Orlando, FL 32810|5844 North Orange Blossom Trail, Orlando, FL 32810-9635|1
|
7 |
+
6701 Fannin St #1400, Houston, TX 77030|6701 Fannin Ste #1400, Houston, TX 77030|1
|
8 |
+
14143 Winecup Ln, Houston, TX 77047|14121 Winecup Lane, Houston, TX 77047|0
|
9 |
+
440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034|440 Technology Center Dr., Boston, MA 10034|1
|
10 |
+
440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034|440 Technology Center Dr., Boston, MA 10034-0345|1
|
11 |
+
440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034|87 Technology Center Drive, Boston, MA 10034|0
|
12 |
+
440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034|200 Technology Center Drive, Boston, MA 10034|0
|
13 |
+
65 Mountain Blvd Ext, Warren, NJ 07059|65 Mountain Boulevard Ext, Warren, NJ 07059|1
|
14 |
+
65 Mountain Blvd Ext, Warren, NJ 07059|5078 S Maryland Pkwy, Las Vegas, NV 89119|0
|
15 |
+
65 Mountain Blvd Ext, Warren, NJ 07059|112 Mountain Blvd Ext, Warren, NJ 07059|0
|
train.py
CHANGED
@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
|
|
24 |
|
25 |
|
26 |
#As base model, we use DistilBERT-base that was pre-trained on NLI and STSb data
|
27 |
-
model = SentenceTransformer('albert-base-v2')
|
28 |
num_epochs = 10
|
29 |
train_batch_size = 8
|
30 |
|
|
|
24 |
|
25 |
|
26 |
#As base model, we use DistilBERT-base that was pre-trained on NLI and STSb data
|
27 |
+
model = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')
|
28 |
num_epochs = 10
|
29 |
train_batch_size = 8
|
30 |
|