Updated app with code for deduplication
app.py
CHANGED
@@ -4,9 +4,7 @@ import numpy as np
 from model2vec import StaticModel
 from reach import Reach
 from difflib import ndiff
-import sys
 import tqdm
-from tqdm.utils import format_interval, format_num, format_sizeof

 # Load the model at startup
 model = StaticModel.from_pretrained("minishlab/M2V_base_output")
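For reference, the StaticModel loaded above is a model2vec static embedding model and can be exercised on its own. A minimal sketch (the sample sentences are illustrative):

    from model2vec import StaticModel

    model = StaticModel.from_pretrained("minishlab/M2V_base_output")
    embeddings = model.encode(["a walk in the park", "walking in the park"])  # returns a numpy array
    print(embeddings.shape)  # (2, embedding_dim)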
@@ -23,26 +21,41 @@ default_threshold = 0.9
 ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
 ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)

-def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
+def batch_iterable(iterable, batch_size):
+    """Helper function to create batches from an iterable."""
+    for i in range(0, len(iterable), batch_size):
+        yield iterable[i:i + batch_size]
+
+def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
+    embeddings = []
+    for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
+        batch_embeddings = model.encode(batch, show_progressbar=False)
+        embeddings.append(batch_embeddings)
+    return np.concatenate(embeddings, axis=0)
+
+def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
     """
     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
     """
     # Building the index
+    progress(0, desc="Building search index...")
     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])

     deduplicated_indices = set(range(len(embedding_matrix)))
     duplicate_to_original_mapping = {}

     # Finding nearest neighbors
+    progress(0, desc="Finding nearest neighbors...")
     results = reach.nearest_neighbor_threshold(
         embedding_matrix,
         threshold=threshold,
         batch_size=batch_size,
-        show_progressbar=True
+        show_progressbar=False  # Disable internal progress bar
     )

-    # Processing duplicates
-    for i, similar_items in enumerate(tqdm.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
+    # Processing duplicates with a progress bar
+    total_items = len(embedding_matrix)
+    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
         if i not in deduplicated_indices:
             continue

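The ceiling division (len(texts) + batch_size - 1) // batch_size above gives progress.tqdm the number of batches up front, since batch_iterable is a generator with no length. A standalone sketch of the same batching logic, runnable without the app (plain tqdm stands in for Gradio's progress.tqdm; the sample data is made up):

    from tqdm import tqdm

    def batch_iterable(iterable, batch_size):
        """Yield consecutive slices of at most batch_size items."""
        for i in range(0, len(iterable), batch_size):
            yield iterable[i:i + batch_size]

    texts = [f"sentence {n}" for n in range(10)]
    batch_size = 4
    n_batches = (len(texts) + batch_size - 1) // batch_size  # ceil(10 / 4) = 3
    print([len(b) for b in tqdm(batch_iterable(texts, batch_size), total=n_batches)])  # [4, 4, 2]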
@@ -55,26 +68,29 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int

     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping

-def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
+def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
     """
     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
     """
     # Building the index from Dataset 1
+    progress(0, desc="Building search index from Dataset 1...")
     reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])

     duplicate_indices_in_test = []
     duplicate_to_original_mapping = {}

     # Finding nearest neighbors between datasets
+    progress(0, desc="Finding nearest neighbors between datasets...")
     results = reach.nearest_neighbor_threshold(
         embedding_matrix_2,
         threshold=threshold,
         batch_size=batch_size,
-        show_progressbar=True
+        show_progressbar=False  # Disable internal progress bar
     )

-    # Processing duplicates
-    for i, similar_items in enumerate(tqdm.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
+    total_items = len(embedding_matrix_2)
+    # Processing duplicates with a progress bar
+    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]

         if similar_indices:
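Both deduplication functions apply the same greedy rule: the first occurrence of an item is kept, and every later item whose similarity clears the threshold is mapped back to that original. A self-contained numpy sketch of the rule, substituting a brute-force cosine-similarity matrix for reach's batched index (toy vectors, illustration only):

    import numpy as np

    def greedy_dedup(emb: np.ndarray, threshold: float):
        """Keep the first item of each near-duplicate group; map removals to originals."""
        emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
        sims = emb @ emb.T  # O(n^2) similarity matrix; reach computes this in batches
        keep, mapping = set(range(len(emb))), {}
        for i in range(len(emb)):
            if i not in keep:
                continue  # already claimed as another item's duplicate
            for j in np.flatnonzero(sims[i] >= threshold):
                if int(j) != i and int(j) in keep:
                    keep.discard(int(j))
                    mapping[int(j)] = i
        return np.array(sorted(keep)), mapping

    vecs = np.array([[1.0, 0.0], [0.99, 0.1], [0.0, 1.0]])
    print(greedy_dedup(vecs, threshold=0.9))  # (array([0, 2]), {1: 0})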
@@ -98,31 +114,6 @@ def perform_deduplication(
     threshold=default_threshold,
     progress=gr.Progress(track_tqdm=True)
 ):
-    # Custom tqdm class that wraps progress.tqdm and includes module-level attributes
-    class TqdmWrapper(tqdm.std.tqdm):
-        def __init__(self, *args, **kwargs):
-            super().__init__(*args, **kwargs)
-
-    # Copy module-level attributes from original tqdm module
-    TqdmWrapper.format_interval = staticmethod(format_interval)
-    TqdmWrapper.format_num = staticmethod(format_num)
-    TqdmWrapper.format_sizeof = staticmethod(format_sizeof)
-
-    # Monkey-patch tqdm.tqdm with our wrapper
-    original_tqdm_tqdm = tqdm.tqdm
-    tqdm.tqdm = progress.tqdm
-
-    # Monkey-patch model2vec's tqdm reference if needed
-    import model2vec.model
-    if hasattr(model2vec.model, 'tqdm'):
-        original_model2vec_tqdm = model2vec.model.tqdm
-        model2vec.model.tqdm = TqdmWrapper
-
-    # Monkey-patch reach's tqdm reference if needed
-    if hasattr(Reach, 'tqdm'):
-        original_reach_tqdm = Reach.tqdm
-        Reach.tqdm = TqdmWrapper
-
     try:
         # Convert threshold to float
         threshold = float(threshold)
@@ -147,13 +138,13 @@ def perform_deduplication(
         # Compute embeddings
         status = "Computing embeddings for Dataset 1..."
         yield status, ""
-        embedding_matrix = model.encode(texts, show_progressbar=True)
+        embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")

         # Deduplicate
         status = "Deduplicating embeddings..."
         yield status, ""
         deduplicated_indices, duplicate_to_original_mapping = deduplicate(
-            embedding_matrix, threshold
+            embedding_matrix, threshold, progress=progress
         )

         # Prepare the results
@@ -214,18 +205,18 @@ def perform_deduplication(
         # Compute embeddings for Dataset 1
         status = "Computing embeddings for Dataset 1..."
         yield status, ""
-        embedding_matrix1 = model.encode(texts1, show_progressbar=True)
+        embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")

         # Compute embeddings for Dataset 2
         status = "Computing embeddings for Dataset 2..."
         yield status, ""
-        embedding_matrix2 = model.encode(texts2, show_progressbar=True)
+        embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")

         # Deduplicate across datasets
         status = "Deduplicating embeddings across datasets..."
         yield status, ""
         duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
-            embedding_matrix1, embedding_matrix2, threshold
+            embedding_matrix1, embedding_matrix2, threshold, progress=progress
         )

         num_duplicates = len(duplicate_indices_in_ds2)
@@ -256,13 +247,9 @@ def perform_deduplication(
         status = "Deduplication completed."
         yield status, result_text

-    finally:
-        # Restore the original tqdm references
-        tqdm.tqdm = original_tqdm_tqdm
-        if hasattr(model2vec.model, 'tqdm'):
-            model2vec.model.tqdm = original_model2vec_tqdm
-        if hasattr(Reach, 'tqdm'):
-            Reach.tqdm = original_reach_tqdm
+    except Exception as e:
+        yield f"An error occurred: {e}", ""
+        raise e

 with gr.Blocks() as demo:
     gr.Markdown("# Semantic Deduplication")
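perform_deduplication is a generator: it yields interim status strings to the UI, gr.Progress(track_tqdm=True) mirrors tqdm-style iteration as a progress bar, and the except clause above surfaces failures as a final status message. A minimal sketch of that pattern, separate from the app (the function, inputs, and labels are hypothetical):

    import time
    import gradio as gr

    def long_task(n, progress=gr.Progress(track_tqdm=True)):
        try:
            yield "Working..."
            for _ in progress.tqdm(range(int(n)), desc="Processing"):
                time.sleep(0.01)
            yield "Done."
        except Exception as e:
            yield f"An error occurred: {e}"

    demo = gr.Interface(long_task, gr.Number(value=50), gr.Textbox(label="Status"))
    # demo.launch()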
@@ -330,6 +317,7 @@ demo.launch()



+
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np