Updates
app.py CHANGED
@@ -14,19 +14,19 @@ default_dataset_split = "train"
 default_text_column = "sentence"
 default_threshold = 0.9
 
-def batch_iterable(iterable, batch_size):
-    """Yield successive batches from an iterable."""
-    for i in range(0, len(iterable), batch_size):
-        yield iterable[i:i + batch_size]
-
-def compute_embeddings(texts, batch_size, progress, desc):
-    """Compute embeddings for a list of texts with progress tracking."""
-    embeddings = []
-    total_batches = (len(texts) + batch_size - 1) // batch_size
-    for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
-        embeddings.append(model.encode(batch_texts, show_progressbar=False))
-        progress((i + 1) / total_batches, desc=desc)
-    return np.concatenate(embeddings, axis=0)
+# def batch_iterable(iterable, batch_size):
+#     """Yield successive batches from an iterable."""
+#     for i in range(0, len(iterable), batch_size):
+#         yield iterable[i:i + batch_size]
+
+# def compute_embeddings(texts, batch_size, progress, desc):
+#     """Compute embeddings for a list of texts with progress tracking."""
+#     embeddings = []
+#     total_batches = (len(texts) + batch_size - 1) // batch_size
+#     for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
+#         embeddings.append(model.encode(batch_texts, show_progressbar=False))
+#         progress((i + 1) / total_batches, desc=desc)
+#     return np.concatenate(embeddings, axis=0)
 
 def deduplicate_embeddings(
     embeddings_a: np.ndarray,
@@ -90,8 +90,8 @@ def perform_deduplication(
     yield "Loading Dataset 1...", ""
     texts1 = load_dataset_texts(dataset1_name, dataset1_split, dataset1_text_column)
     yield "Computing embeddings for Dataset 1...", ""
-    embeddings1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Dataset 1 embeddings")
-
+    #embeddings1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Dataset 1 embeddings")
+    embeddings1 = model.encode(texts1, show_progressbar=True)
     if deduplication_type == "Single dataset":
         # Deduplicate within Dataset 1
         yield "Deduplicating within Dataset 1...", ""
@@ -128,8 +128,8 @@ def perform_deduplication(
     yield "Loading Dataset 2...", ""
     texts2 = load_dataset_texts(dataset2_name, dataset2_split, dataset2_text_column)
     yield "Computing embeddings for Dataset 2...", ""
-    embeddings2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Dataset 2 embeddings")
-
+    #embeddings2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Dataset 2 embeddings")
+    embeddings2 = model.encode(texts2, show_progressbar=True)
     # Deduplicate Dataset 2 against Dataset 1
     yield "Deduplicating Dataset 2 against Dataset 1...", ""
     duplicate_indices, duplicate_mapping = deduplicate_embeddings(
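
Taken together, the three hunks make one change: the hand-rolled batch_iterable / compute_embeddings pair, which encoded in batches of 64 and pushed fractional progress into Gradio's progress callback, is commented out, and each dataset is now encoded with a single model.encode(..., show_progressbar=True) call. Below is a runnable sketch of the two paths side by side. The helper bodies come from the commented-out lines above; _StubModel, its random 32-dimensional vectors, and the print-based progress callback are stand-ins invented here so the sketch runs without the app's real encoder:

import numpy as np

class _StubModel:
    """Stand-in for the app's sentence encoder; returns random vectors."""
    def encode(self, texts, show_progressbar=False):
        # The stub ignores show_progressbar; the real encoder would draw a bar.
        return np.random.randn(len(texts), 32).astype(np.float32)

model = _StubModel()

def batch_iterable(iterable, batch_size):
    """Yield successive batches from an iterable."""
    for i in range(0, len(iterable), batch_size):
        yield iterable[i:i + batch_size]

def compute_embeddings(texts, batch_size, progress, desc):
    """Old path: encode batch by batch, reporting fractional progress."""
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size
    for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
        embeddings.append(model.encode(batch_texts, show_progressbar=False))
        progress((i + 1) / total_batches, desc=desc)
    return np.concatenate(embeddings, axis=0)

texts = [f"sentence {n}" for n in range(200)]
old_way = compute_embeddings(texts, batch_size=64,
                             progress=lambda frac, desc: print(f"{desc}: {frac:.0%}"),
                             desc="Dataset 1 embeddings")
new_way = model.encode(texts, show_progressbar=True)  # the commit's replacement
assert old_way.shape == new_way.shape == (200, 32)

One practical difference: the encoder's own progress bar presumably goes to stdout (the Space's logs) rather than into the Gradio UI, which after this change only sees the yielded status strings.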
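
For reference, deduplicate_embeddings appears in this diff only through its signature (embeddings_a: np.ndarray, ...) and its call sites, which unpack a (duplicate_indices, duplicate_mapping) pair and are driven by the file's default_threshold of 0.9. The sketch below is a guess at that contract, not the app's actual code: a quadratic numpy implementation that treats cosine similarity at or above the threshold as a duplicate.

import numpy as np

def deduplicate_embeddings_sketch(embeddings_a, embeddings_b=None, threshold=0.9):
    """Hypothetical reading of the app's deduplicate_embeddings.

    Single dataset (embeddings_b is None): flag rows of embeddings_a that
    duplicate an earlier row. Cross dataset: flag rows of embeddings_b that
    duplicate any row of embeddings_a.
    """
    a = embeddings_a / np.linalg.norm(embeddings_a, axis=1, keepdims=True)
    if embeddings_b is None:
        # Compare each row only against earlier rows, so the first
        # occurrence in a near-duplicate group is always kept.
        sims = np.tril(a @ a.T, k=-1)
    else:
        b = embeddings_b / np.linalg.norm(embeddings_b, axis=1, keepdims=True)
        sims = b @ a.T
    best_score = sims.max(axis=1)
    best_match = sims.argmax(axis=1)
    duplicate_indices = np.flatnonzero(best_score >= threshold)
    duplicate_mapping = {int(i): int(best_match[i]) for i in duplicate_indices}
    return duplicate_indices, duplicate_mapping

emb = np.random.randn(100, 32).astype(np.float32)
with_dupes = np.vstack([emb, emb[:5]])  # append five exact duplicates
idx, mapping = deduplicate_embeddings_sketch(with_dupes)
# Holds with overwhelming probability for random 32-dim Gaussian vectors:
assert set(idx) == {100, 101, 102, 103, 104} and mapping[100] == 0

The real implementation may well avoid materializing the full O(n^2) similarity matrix; the sketch is only meant to pin down the shapes of the returned values.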