Spaces:
Runtime error
Runtime error
Feliks Zaslavskiy
committed on
Commit
•
0c1e501
1
Parent(s):
ce71282
small updates
Browse files
- app.py +15 -10
- data_set_training.csv +3 -0
- quick_evaluate.py +1 -1
app.py
CHANGED
@@ -2,8 +2,8 @@ import math
|
|
2 |
import streamlit as st
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
-
import torch
|
6 |
-
from transformers import AlbertTokenizer, AlbertModel
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
from io import BytesIO
|
@@ -14,7 +14,7 @@ from io import BytesIO
|
|
14 |
#model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
|
15 |
|
16 |
# For baseline 'sentence-transformers/paraphrase-albert-base-v2'
|
17 |
-
model_name = 'output/training_OnlineConstrativeLoss-2023-03-
|
18 |
|
19 |
similarity_threshold = 0.9
|
20 |
|
@@ -60,12 +60,16 @@ if uploaded_file is not None:
|
|
60 |
data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)
|
61 |
|
62 |
# Data cleaning CAQH
|
63 |
-
|
|
|
|
|
|
|
64 |
data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
|
65 |
+ np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
|
66 |
+ data_caqh['city'].astype(str) + ', '\
|
67 |
+ data_caqh['state'].astype(str) + ' ' \
|
68 |
+ data_caqh['postalcode'].astype(str)
|
|
|
69 |
|
70 |
st.write(f"CAQH before duplicate removal {len(data_caqh)}")
|
71 |
data_caqh.drop_duplicates(subset='full-addr',inplace=True)
|
@@ -73,15 +77,16 @@ if uploaded_file is not None:
|
|
73 |
st.write(f"CAQH after duplicate removal {len(data_caqh)}")
|
74 |
|
75 |
# Data cleaning NDB
|
76 |
-
data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
|
77 |
|
78 |
-
data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
|
79 |
-
|
80 |
-
|
81 |
|
82 |
data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
|
83 |
-
+ data_ndb['cty_nm'].astype(str).str.strip() + ',
|
84 |
-
+ data_ndb['st_cd'].astype(str) + ' ' + data_ndb['
|
|
|
85 |
|
86 |
# Calculate similarity For CAQH
|
87 |
num_items = len(data_caqh)
|
|
|
2 |
import streamlit as st
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
+
#import torch
|
6 |
+
#from transformers import AlbertTokenizer, AlbertModel
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
from io import BytesIO
|
|
|
14 |
#model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
|
15 |
|
16 |
# For baseline 'sentence-transformers/paraphrase-albert-base-v2'
|
17 |
+
model_name = 'output/training_OnlineConstrativeLoss-2023-03-14_00-40-03'
|
18 |
|
19 |
similarity_threshold = 0.9
|
20 |
|
|
|
60 |
data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)
|
61 |
|
62 |
# Data cleaning CAQH
|
63 |
+
# if you need to format with 00000-0000
|
64 |
+
# lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x
|
65 |
+
data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5])
|
66 |
+
|
67 |
data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
|
68 |
+ np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
|
69 |
+ data_caqh['city'].astype(str) + ', '\
|
70 |
+ data_caqh['state'].astype(str) + ' ' \
|
71 |
+ data_caqh['postalcode'].astype(str)
|
72 |
+
data_caqh['full-addr'] = data_caqh['full-addr'].str.upper()
|
73 |
|
74 |
st.write(f"CAQH before duplicate removal {len(data_caqh)}")
|
75 |
data_caqh.drop_duplicates(subset='full-addr',inplace=True)
|
|
|
77 |
st.write(f"CAQH after duplicate removal {len(data_caqh)}")
|
78 |
|
79 |
# Data cleaning NDB
|
80 |
+
#data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
|
81 |
|
82 |
+
#data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
|
83 |
+
# np.where( data_ndb['zip_pls_4_cd'] == '', '', '-' \
|
84 |
+
# + data_ndb['zip_pls_4_cd'].astype(str))
|
85 |
|
86 |
data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
|
87 |
+
+ data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
|
88 |
+
+ data_ndb['st_cd'].astype(str) + ' ' + data_ndb['zip_cd'].astype(str)
|
89 |
+
data_ndb['full-addr'] = data_ndb['full-addr'].str.upper()
|
90 |
|
91 |
# Calculate similarity For CAQH
|
92 |
num_items = len(data_caqh)
|
data_set_training.csv
CHANGED
@@ -65,12 +65,15 @@ ADDRESS1|ADDRESS2|ARE_SAME
|
|
65 |
145 34 23TH ST, JACKSONVILLE, FL 32258|145-50 23TH ST, JACKSONVILLE, FL 32258|0
|
66 |
145-12 23TH ST, JACKSONVILLE, FL 32258|145 29 23TH ST, JACKSONVILLE, FL 32258|0
|
67 |
15 49 RT 9, HALFMOON, NY 12065|15-49 RT 9, HALFMOON, NY 12065|1
|
|
|
68 |
15 49 RT 9, HALFMOON, NY 12065|15-59 RT 9, HALFMOON, NY 12065|0
|
|
|
69 |
15 49 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
|
70 |
152 13 GOLD STAR HWY, GROTON, CT 63403|152-18 GOLD STAR HWY, GROTON, CT 63403|0
|
71 |
152 43 GOLD STAR HWY, GROTON, CT 63403|152-43 GOLD STAR HWY, GROTON, CT 63403|0
|
72 |
152 43 GOLD STAR HWY, GROTON, CT 63403|152-44 GOLD STAR HWY, GROTON, CT 63403|0
|
73 |
154-9 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
|
|
|
74 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160 10 NORTH MIDLAND AVENUE, NYACK, NY 10960|1
|
75 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160 20 NORTH MIDLAND AVE, NYACK, NY 10960|0
|
76 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160-10 N MIDLAND AVENUE, NYACK, NY 10960|1
|
|
|
65 |
145 34 23TH ST, JACKSONVILLE, FL 32258|145-50 23TH ST, JACKSONVILLE, FL 32258|0
|
66 |
145-12 23TH ST, JACKSONVILLE, FL 32258|145 29 23TH ST, JACKSONVILLE, FL 32258|0
|
67 |
15 49 RT 9, HALFMOON, NY 12065|15-49 RT 9, HALFMOON, NY 12065|1
|
68 |
+
15 49 RT 9, HALFMOON, NY 12065|15-49 ROUTE 9, HALFMOON, NY 12065|1
|
69 |
15 49 RT 9, HALFMOON, NY 12065|15-59 RT 9, HALFMOON, NY 12065|0
|
70 |
+
15 49 RT 9, HALFMOON, NY 12065|15-59 ROUTE 9, HALFMOON, NY 12065|0
|
71 |
15 49 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
|
72 |
152 13 GOLD STAR HWY, GROTON, CT 63403|152-18 GOLD STAR HWY, GROTON, CT 63403|0
|
73 |
152 43 GOLD STAR HWY, GROTON, CT 63403|152-43 GOLD STAR HWY, GROTON, CT 63403|0
|
74 |
152 43 GOLD STAR HWY, GROTON, CT 63403|152-44 GOLD STAR HWY, GROTON, CT 63403|0
|
75 |
154-9 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
|
76 |
+
154-9 RT 9, HALFMOON, NY 12065|1549 ROUTE 9, HALFMOON, NY 12065|1
|
77 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160 10 NORTH MIDLAND AVENUE, NYACK, NY 10960|1
|
78 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160 20 NORTH MIDLAND AVE, NYACK, NY 10960|0
|
79 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160-10 N MIDLAND AVENUE, NYACK, NY 10960|1
|
quick_evaluate.py
CHANGED
@@ -33,7 +33,7 @@ a10= "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
|
|
33 |
a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
34 |
a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
35 |
a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
|
36 |
-
a14="257 37 US
|
37 |
a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"
|
38 |
|
39 |
a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
|
|
33 |
a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
34 |
a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
35 |
a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
|
36 |
+
a14="257 37 US ROUTE 11, EVANS MILLS, NY 13637"
|
37 |
a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"
|
38 |
|
39 |
a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|