Spaces:
Runtime error
Runtime error
Feliks Zaslavskiy
committed on
Commit
·
0c1e501
1
Parent(s):
ce71282
small updates
Browse files
- app.py +15 -10
- data_set_training.csv +3 -0
- quick_evaluate.py +1 -1
app.py
CHANGED
|
@@ -2,8 +2,8 @@ import math
|
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
| 5 |
-
import torch
|
| 6 |
-
from transformers import AlbertTokenizer, AlbertModel
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
from io import BytesIO
|
|
@@ -14,7 +14,7 @@ from io import BytesIO
|
|
| 14 |
#model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
|
| 15 |
|
| 16 |
# For baseline 'sentence-transformers/paraphrase-albert-base-v2'
|
| 17 |
-
model_name = 'output/training_OnlineConstrativeLoss-2023-03-
|
| 18 |
|
| 19 |
similarity_threshold = 0.9
|
| 20 |
|
|
@@ -60,12 +60,16 @@ if uploaded_file is not None:
|
|
| 60 |
data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)
|
| 61 |
|
| 62 |
# Data cleaning CAQH
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
| 64 |
data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
|
| 65 |
+ np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
|
| 66 |
+ data_caqh['city'].astype(str) + ', '\
|
| 67 |
+ data_caqh['state'].astype(str) + ' ' \
|
| 68 |
+ data_caqh['postalcode'].astype(str)
|
|
|
|
| 69 |
|
| 70 |
st.write(f"CAQH before duplicate removal {len(data_caqh)}")
|
| 71 |
data_caqh.drop_duplicates(subset='full-addr',inplace=True)
|
|
@@ -73,15 +77,16 @@ if uploaded_file is not None:
|
|
| 73 |
st.write(f"CAQH after duplicate removal {len(data_caqh)}")
|
| 74 |
|
| 75 |
# Data cleaning NDB
|
| 76 |
-
data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
|
| 77 |
|
| 78 |
-
data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
|
| 79 |
-
|
| 80 |
-
|
| 81 |
|
| 82 |
data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
|
| 83 |
-
+ data_ndb['cty_nm'].astype(str).str.strip() + ',
|
| 84 |
-
+ data_ndb['st_cd'].astype(str) + ' ' + data_ndb['
|
|
|
|
| 85 |
|
| 86 |
# Calculate similarity For CAQH
|
| 87 |
num_items = len(data_caqh)
|
|
|
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
| 5 |
+
#import torch
|
| 6 |
+
#from transformers import AlbertTokenizer, AlbertModel
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
from io import BytesIO
|
|
|
|
| 14 |
#model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
|
| 15 |
|
| 16 |
# For baseline 'sentence-transformers/paraphrase-albert-base-v2'
|
| 17 |
+
model_name = 'output/training_OnlineConstrativeLoss-2023-03-14_00-40-03'
|
| 18 |
|
| 19 |
similarity_threshold = 0.9
|
| 20 |
|
|
|
|
| 60 |
data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)
|
| 61 |
|
| 62 |
# Data cleaning CAQH
|
| 63 |
+
# if you need to format with 00000-0000
|
| 64 |
+
# lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x
|
| 65 |
+
data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5])
|
| 66 |
+
|
| 67 |
data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
|
| 68 |
+ np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
|
| 69 |
+ data_caqh['city'].astype(str) + ', '\
|
| 70 |
+ data_caqh['state'].astype(str) + ' ' \
|
| 71 |
+ data_caqh['postalcode'].astype(str)
|
| 72 |
+
data_caqh['full-addr'] = data_caqh['full-addr'].str.upper()
|
| 73 |
|
| 74 |
st.write(f"CAQH before duplicate removal {len(data_caqh)}")
|
| 75 |
data_caqh.drop_duplicates(subset='full-addr',inplace=True)
|
|
|
|
| 77 |
st.write(f"CAQH after duplicate removal {len(data_caqh)}")
|
| 78 |
|
| 79 |
# Data cleaning NDB
|
| 80 |
+
#data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
|
| 81 |
|
| 82 |
+
#data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
|
| 83 |
+
# np.where( data_ndb['zip_pls_4_cd'] == '', '', '-' \
|
| 84 |
+
# + data_ndb['zip_pls_4_cd'].astype(str))
|
| 85 |
|
| 86 |
data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
|
| 87 |
+
+ data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
|
| 88 |
+
+ data_ndb['st_cd'].astype(str) + ' ' + data_ndb['zip_cd'].astype(str)
|
| 89 |
+
data_ndb['full-addr'] = data_ndb['full-addr'].str.upper()
|
| 90 |
|
| 91 |
# Calculate similarity For CAQH
|
| 92 |
num_items = len(data_caqh)
|
data_set_training.csv
CHANGED
|
@@ -65,12 +65,15 @@ ADDRESS1|ADDRESS2|ARE_SAME
|
|
| 65 |
145 34 23TH ST, JACKSONVILLE, FL 32258|145-50 23TH ST, JACKSONVILLE, FL 32258|0
|
| 66 |
145-12 23TH ST, JACKSONVILLE, FL 32258|145 29 23TH ST, JACKSONVILLE, FL 32258|0
|
| 67 |
15 49 RT 9, HALFMOON, NY 12065|15-49 RT 9, HALFMOON, NY 12065|1
|
|
|
|
| 68 |
15 49 RT 9, HALFMOON, NY 12065|15-59 RT 9, HALFMOON, NY 12065|0
|
|
|
|
| 69 |
15 49 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
|
| 70 |
152 13 GOLD STAR HWY, GROTON, CT 63403|152-18 GOLD STAR HWY, GROTON, CT 63403|0
|
| 71 |
152 43 GOLD STAR HWY, GROTON, CT 63403|152-43 GOLD STAR HWY, GROTON, CT 63403|0
|
| 72 |
152 43 GOLD STAR HWY, GROTON, CT 63403|152-44 GOLD STAR HWY, GROTON, CT 63403|0
|
| 73 |
154-9 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
|
|
|
|
| 74 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160 10 NORTH MIDLAND AVENUE, NYACK, NY 10960|1
|
| 75 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160 20 NORTH MIDLAND AVE, NYACK, NY 10960|0
|
| 76 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160-10 N MIDLAND AVENUE, NYACK, NY 10960|1
|
|
|
|
| 65 |
145 34 23TH ST, JACKSONVILLE, FL 32258|145-50 23TH ST, JACKSONVILLE, FL 32258|0
|
| 66 |
145-12 23TH ST, JACKSONVILLE, FL 32258|145 29 23TH ST, JACKSONVILLE, FL 32258|0
|
| 67 |
15 49 RT 9, HALFMOON, NY 12065|15-49 RT 9, HALFMOON, NY 12065|1
|
| 68 |
+
15 49 RT 9, HALFMOON, NY 12065|15-49 ROUTE 9, HALFMOON, NY 12065|1
|
| 69 |
15 49 RT 9, HALFMOON, NY 12065|15-59 RT 9, HALFMOON, NY 12065|0
|
| 70 |
+
15 49 RT 9, HALFMOON, NY 12065|15-59 ROUTE 9, HALFMOON, NY 12065|0
|
| 71 |
15 49 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
|
| 72 |
152 13 GOLD STAR HWY, GROTON, CT 63403|152-18 GOLD STAR HWY, GROTON, CT 63403|0
|
| 73 |
152 43 GOLD STAR HWY, GROTON, CT 63403|152-43 GOLD STAR HWY, GROTON, CT 63403|0
|
| 74 |
152 43 GOLD STAR HWY, GROTON, CT 63403|152-44 GOLD STAR HWY, GROTON, CT 63403|0
|
| 75 |
154-9 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
|
| 76 |
+
154-9 RT 9, HALFMOON, NY 12065|1549 ROUTE 9, HALFMOON, NY 12065|1
|
| 77 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160 10 NORTH MIDLAND AVENUE, NYACK, NY 10960|1
|
| 78 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160 20 NORTH MIDLAND AVE, NYACK, NY 10960|0
|
| 79 |
160-10 N MIDLAND AVE, NYACK, NY 10960|160-10 N MIDLAND AVENUE, NYACK, NY 10960|1
|
quick_evaluate.py
CHANGED
|
@@ -33,7 +33,7 @@ a10= "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
|
|
| 33 |
a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
| 34 |
a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
| 35 |
a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
|
| 36 |
-
a14="257 37 US
|
| 37 |
a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"
|
| 38 |
|
| 39 |
a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
|
|
|
| 33 |
a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
| 34 |
a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
| 35 |
a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
|
| 36 |
+
a14="257 37 US ROUTE 11, EVANS MILLS, NY 13637"
|
| 37 |
a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"
|
| 38 |
|
| 39 |
a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|