Updates
Browse files
app.py
CHANGED
|
@@ -69,17 +69,11 @@ def perform_deduplication(
|
|
| 69 |
batch_size = 64
|
| 70 |
total_batches = (len(texts) + batch_size - 1) // batch_size
|
| 71 |
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 79 |
-
future = executor.submit(compute_embeddings)
|
| 80 |
-
while not future.done():
|
| 81 |
-
pass # Wait for embeddings to be computed
|
| 82 |
-
embedding_matrix = future.result()
|
| 83 |
|
| 84 |
# Deduplicate
|
| 85 |
status = "Deduplicating embeddings..."
|
|
@@ -125,6 +119,7 @@ def perform_deduplication(
|
|
| 125 |
yield f"An error occurred: {e}", ""
|
| 126 |
raise e
|
| 127 |
|
|
|
|
| 128 |
def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
|
| 129 |
"""
|
| 130 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|
|
|
|
| 69 |
batch_size = 64
|
| 70 |
total_batches = (len(texts) + batch_size - 1) // batch_size
|
| 71 |
|
| 72 |
+
for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches):
|
| 73 |
+
batch_embeddings = model.encode(batch_texts, show_progressbar=False)
|
| 74 |
+
embeddings.append(batch_embeddings)
|
| 75 |
+
|
| 76 |
+
embedding_matrix = np.concatenate(embeddings, axis=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
# Deduplicate
|
| 79 |
status = "Deduplicating embeddings..."
|
|
|
|
| 119 |
yield f"An error occurred: {e}", ""
|
| 120 |
raise e
|
| 121 |
|
| 122 |
+
|
| 123 |
def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
|
| 124 |
"""
|
| 125 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|