Spaces:
Runtime error
Runtime error
Feliks Zaslavskiy
committed on
Commit
·
ecdea0f
1
Parent(s):
cf9bb91
wip
Browse files
- app.py +1 -0
- data_set_training.csv +30 -1
- dev_set_training.csv +9 -1
- eval.py +6 -0
- quick_evaluate.py +15 -3
app.py
CHANGED
|
@@ -15,6 +15,7 @@ from io import BytesIO
|
|
| 15 |
|
| 16 |
# For baseline 'sentence-transformers/paraphrase-albert-base-v2'
|
| 17 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
|
|
|
|
| 18 |
|
| 19 |
similarity_threshold = 0.9
|
| 20 |
|
|
|
|
| 15 |
|
| 16 |
# For baseline 'sentence-transformers/paraphrase-albert-base-v2'
|
| 17 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
|
| 18 |
+
model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
|
| 19 |
|
| 20 |
similarity_threshold = 0.9
|
| 21 |
|
data_set_training.csv
CHANGED
|
@@ -239,4 +239,33 @@ VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENN
|
|
| 239 |
165 10 VILLAGE DR W, UPPER MARLBORO, MD 20772|165 12 VILLAGE DR W, UPPER MARLBORO, MD 20772|0
|
| 240 |
345 12 OLD WASHINGTON RD, WALDORF, MD 20602|345-12 OLD WASHINGTON RD, WALDORF, MD 20602|1
|
| 241 |
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-11 ONYX CT, FREDERICKSBURG, VA 22407|0
|
| 242 |
-
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-12 ONYX CT, FREDERICKSBURG, VA 22407|1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
165 10 VILLAGE DR W, UPPER MARLBORO, MD 20772|165 12 VILLAGE DR W, UPPER MARLBORO, MD 20772|0
|
| 240 |
345 12 OLD WASHINGTON RD, WALDORF, MD 20602|345-12 OLD WASHINGTON RD, WALDORF, MD 20602|1
|
| 241 |
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-11 ONYX CT, FREDERICKSBURG, VA 22407|0
|
| 242 |
+
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-12 ONYX CT, FREDERICKSBURG, VA 22407|1
|
| 243 |
+
14453 UNION ST, Mc Coll, SC 29570|144-53 UNION ST, Mc Coll, SC 29570|1
|
| 244 |
+
14453 UNION ST, Mc Coll, SC 29570|144 53 UNION ST, Mc Coll, SC 29570|0
|
| 245 |
+
14453 UNION ST, Mc Coll, SC 29570|14 453 UNION STREET, Mc Coll, SC 29570|1
|
| 246 |
+
14453 UNION ST APT 343, Mc Coll, SC 29570|144 53 UNION ST APT 343, Mc Coll, SC 29570|1
|
| 247 |
+
14453 UNION ST, Mc Coll, SC 29570|144-53A UNION STREET, Mc Coll, SC 29570|0
|
| 248 |
+
14453 UNION ST, Mc Coll, SC 29570|14443 UNION ST, Mc Coll, SC 29570|0
|
| 249 |
+
14453 UNION ST, Mc Coll, SC 29570|144-53 UNION ST APT 343, Mc Coll, SC 29570|0
|
| 250 |
+
20334 PARK AVE, PARK CITY, UT 84060|20234 PARK AVE, PARK CITY, UT 84060|0
|
| 251 |
+
20334 PARK AVE, PARK CITY, UT 84060|20-334 PARK AVE, PARK CITY, UT 84060|0
|
| 252 |
+
20334 PARK AVE, PARK CITY, UT 84060|202-34 PARK AVENUE, PARK CITY, UT 84060|1
|
| 253 |
+
20334 PARK AVE, PARK CITY SUITE 2, UT 84060|202 34 PARK AVENUE STE 2, PARK CITY, UT 84060|1
|
| 254 |
+
203 MAPLE AVE FL 2, ENGLEWOOD, NJ 07631|203 MAPLE AVE, ENGLEWOOD, NJ 07631|1
|
| 255 |
+
203 MAPLE AVE FL 2, ENGLEWOOD, NJ 07631|203 MAPLE AVENUE, ENGLEWOOD, NJ 07631|1
|
| 256 |
+
203 MAPLE AVE FL 2 STE 3, ENGLEWOOD, NJ 07631|203 MAPLE AVE, ENGLEWOOD, NJ 07631|0
|
| 257 |
+
203 MAPLE AVE, ENGLEWOOD, NJ 07631|205 MAPLE AVE, ENGLEWOOD, NJ 07631|0
|
| 258 |
+
2032 MAPLE AVE, ENGLEWOOD, NJ 07631|2031 MAPLE AVE, ENGLEWOOD, NJ 07631|0
|
| 259 |
+
1427 MARVIN GRIFFIN RD, AUGUSTA, GA 30906|1417 MARVIN GRIFFIN RD, AUGUSTA, GA 30906|0
|
| 260 |
+
32 GRAND ST, NEDERLAND, TX 77627|33 GRAND ST, NEDERLAND, TX 77627|0
|
| 261 |
+
32 GRAND ST, NEDERLAND, TX 77627|32 GRAND ST #4, NEDERLAND, TX 77627|0
|
| 262 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80 HOSPITAL DRIVE SUITE 6, BARBOURVILLE, KY 40906|1
|
| 263 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80 HOSPITAL DR. STE. 6, BARBOURVILLE KY, 40906|1
|
| 264 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|SUITE #6, 80 HOSPITAL DRIVE, BARBOURVILLE, KY 40906|1
|
| 265 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|STE #6 - 80 HOSPITAL DR., BARBOURVILLE, KY 40906|1
|
| 266 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|#6-80 HOSPITAL DRIVE, BARBOURVILLE, KY 40906|1
|
| 267 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80-2 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|0
|
| 268 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80 HOSPITAL DR SUITE 6A, BARBOURVILLE, KY 40906|0
|
| 269 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|81 HOSPITAL DRIVE STE 6, BARBOURVILLE, KY 40906|0
|
| 270 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|82 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|0
|
| 271 |
+
80 22 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|8022 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|1
|
dev_set_training.csv
CHANGED
|
@@ -26,4 +26,12 @@ ADDRESS1|ADDRESS2|ARE_SAME
|
|
| 26 |
8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
|
| 27 |
HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
|
| 28 |
115 34 SHOREWAY DR, QUEENSTOWN, MD 21658|115-43 SHOREWAY DR, QUEENSTOWN, MD 21658|0
|
| 29 |
-
112 24 SHOREWAY DR, QUEENSTOWN, MD 21658|112-24 SHOREWAY DR, QUEENSTOWN, MD 21658|1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
|
| 27 |
HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
|
| 28 |
115 34 SHOREWAY DR, QUEENSTOWN, MD 21658|115-43 SHOREWAY DR, QUEENSTOWN, MD 21658|0
|
| 29 |
+
112 24 SHOREWAY DR, QUEENSTOWN, MD 21658|112-24 SHOREWAY DR, QUEENSTOWN, MD 21658|1
|
| 30 |
+
3619 S 22ND DR, YUMA, AZ 85364|3636 S 22ND DR, YUMA, AZ 85364|0
|
| 31 |
+
7325 FRANKLIN BLVD, SACRAMENTO, CA 95823|73235 FRANKLIN BLVD, SACRAMENTO, CA 95823|0
|
| 32 |
+
3660 MAIN ST, TUCSON, AZ 85721|3701 MAIN ST, TUCSON, AZ 85721|0
|
| 33 |
+
3910 MAGNET RD, MALVERN, AR 72104|3910 MAGNET RD, STE 206 MALVERN, AR 72104|0
|
| 34 |
+
15702 OBERLIN RD, RALEIGH, NC 27605|15702 OBERLIN RD FL 1, RALEIGH, NC 27605|1
|
| 35 |
+
14425 ROOSOVELT AVE APT 322, LA JOLLA, CA 92092|14325 ROOSOVELT AVE, LA JOLLA, CA 92092|0
|
| 36 |
+
14425 ROOSOVELT AVE APT 322, LA JOLLA, CA 92092|144-25 ROOSOVELT AVE APT 322, LA JOLLA, CA 92092|1
|
| 37 |
+
14425 ROOSOVELT AVE, LA JOLLA, CA 92092|144-25A ROOSOVELT AVENUE, LA JOLLA, CA 92092|0
|
eval.py
CHANGED
|
@@ -13,6 +13,12 @@ logger = logging.getLogger(__name__)
|
|
| 13 |
|
| 14 |
model_name = 'sentence-transformers/paraphrase-albert-base-v2'
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
model_sbert = SentenceTransformer(model_name)
|
| 18 |
|
|
|
|
| 13 |
|
| 14 |
model_name = 'sentence-transformers/paraphrase-albert-base-v2'
|
| 15 |
|
| 16 |
+
#model_name='output/training_OnlineConstrativeLoss-2023-03-11_23-47-34'
|
| 17 |
+
#model_name= 'output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
|
| 18 |
+
|
| 19 |
+
#86% so far
|
| 20 |
+
model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
|
| 21 |
+
|
| 22 |
|
| 23 |
model_sbert = SentenceTransformer(model_name)
|
| 24 |
|
quick_evaluate.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
|
| 2 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
|
|
@@ -6,15 +6,16 @@ from sentence_transformers import SentenceTransformer
|
|
| 6 |
|
| 7 |
# base
|
| 8 |
# large
|
| 9 |
-
|
| 10 |
#model = AlbertModel.from_pretrained("albert-base-v2")
|
| 11 |
#'sentence-transformers/paraphrase-albert-base-v2'
|
| 12 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
|
| 13 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_00-24-35'
|
| 14 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_01-00-19'
|
| 15 |
-
model_name='output/training_OnlineConstrativeLoss-2023-03-
|
| 16 |
model_sbert = SentenceTransformer(model_name)
|
| 17 |
|
|
|
|
| 18 |
def get_sbert_embedding(input_text):
|
| 19 |
embedding = model_sbert.encode(input_text)
|
| 20 |
return embedding.tolist()
|
|
@@ -40,6 +41,17 @@ a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
|
| 40 |
a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
| 41 |
a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
#def get_embedding(input_text):
|
| 44 |
# encoded_input = tokenizer(input_text, return_tensors='pt')
|
| 45 |
# input_ids = encoded_input.input_ids
|
|
|
|
| 1 |
+
from transformers import AlbertTokenizer, AlbertModel
|
| 2 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
|
|
|
|
| 6 |
|
| 7 |
# base
|
| 8 |
# large
|
| 9 |
+
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
| 10 |
#model = AlbertModel.from_pretrained("albert-base-v2")
|
| 11 |
#'sentence-transformers/paraphrase-albert-base-v2'
|
| 12 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
|
| 13 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_00-24-35'
|
| 14 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_01-00-19'
|
| 15 |
+
model_name='output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
|
| 16 |
model_sbert = SentenceTransformer(model_name)
|
| 17 |
|
| 18 |
+
|
| 19 |
def get_sbert_embedding(input_text):
|
| 20 |
embedding = model_sbert.encode(input_text)
|
| 21 |
return embedding.tolist()
|
|
|
|
| 41 |
a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
| 42 |
a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
| 43 |
|
| 44 |
+
a19 = "THE PAVILION AT QUEENS FOR REHABILITAION AND NURSING 36-17 PARSONS BOULEVARD, FLUSHING, NY 11354"
|
| 45 |
+
a20 = "136-17 39TH AVENUE, 4TH FLOOR, SUITE CF-E, FLUSHING, NY 11354"
|
| 46 |
+
a21="WISDOM MEDICAL P.C., 136-20 38 TH AVE 6E, FLUSHING, NY 11354"
|
| 47 |
+
|
| 48 |
+
encoded_input = tokenizer(a21, return_tensors='pt')
|
| 49 |
+
input_ids = encoded_input.input_ids
|
| 50 |
+
input_num_tokens = input_ids.shape[1]
|
| 51 |
+
print(input_num_tokens)
|
| 52 |
+
list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
|
| 53 |
+
#
|
| 54 |
+
print( "Tokens : " + ' '.join(list_of_tokens))
|
| 55 |
#def get_embedding(input_text):
|
| 56 |
# encoded_input = tokenizer(input_text, return_tensors='pt')
|
| 57 |
# input_ids = encoded_input.input_ids
|