Spaces:

minishlab
/

semantic-deduplication

Running

App Files Files Community

Pringled committed on Oct 12, 2024

Commit

58d8f1a

1 Parent(s): a9118ee

Updates

Browse files

Files changed (1) hide show

app.py +78 -60

app.py CHANGED Viewed

@@ -4,7 +4,8 @@ import numpy as np
 from model2vec import StaticModel
 from reach import Reach
 from difflib import ndiff
-import concurrent.futures
 # Load the model at startup
 model = StaticModel.from_pretrained("minishlab/M2V_base_output")
@@ -26,9 +27,65 @@ def batch_iterable(iterable, batch_size):
     for i in range(0, len(iterable), batch_size):
         yield iterable[i:i + batch_size]
 def display_word_differences(x: str, y: str) -> str:
     diff = ndiff(x.split(), y.split())
-    return " ".join([word for word in diff if word.startswith(('+', '-'))])
 def perform_deduplication(
     deduplication_type,
@@ -39,7 +96,7 @@ def perform_deduplication(
     dataset2_split="",
     dataset2_text_column="",
     threshold=default_threshold,
-    progress=gr.Progress(track_tqdm=True)
 ):
     try:
         # Convert threshold to float
@@ -52,7 +109,10 @@ def perform_deduplication(
             # Load Dataset 1
             status = "Loading Dataset 1..."
             yield status, ""
-            if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
                 ds = ds_default1
             else:
                 ds = load_dataset(dataset1_name, split=dataset1_split)
@@ -65,15 +125,12 @@ def perform_deduplication(
             # Compute embeddings
             status = "Computing embeddings for Dataset 1..."
             yield status, ""
-            embeddings = []
-            batch_size = 64
-            total_batches = (len(texts) + batch_size - 1) // batch_size
-            for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches):
-                batch_embeddings = model.encode(batch_texts, show_progressbar=False)
-                embeddings.append(batch_embeddings)
-            embedding_matrix = np.concatenate(embeddings, axis=0)
             # Deduplicate
             status = "Deduplicating embeddings..."
@@ -89,7 +146,9 @@ def perform_deduplication(
             result_text = f"**Total documents:** {num_total}\n"
             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
-            result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
             # Show deduplicated examples
             if num_duplicates > 0:
@@ -119,49 +178,13 @@ def perform_deduplication(
         yield f"An error occurred: {e}", ""
         raise e
-def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
-    """
-    Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
-    """
-    # Building the index
-    progress(0, desc="Building search index...")
-    reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
-    deduplicated_indices = set(range(len(embedding_matrix)))
-    duplicate_to_original_mapping = {}
-    # Finding nearest neighbors
-    progress(0, desc="Finding nearest neighbors...")
-    results = reach.nearest_neighbor_threshold(
-        embedding_matrix,
-        threshold=threshold,
-        batch_size=batch_size,
-        show_progressbar=False  # Disable internal progress bar
-    )
-    # Processing duplicates with a progress bar
-    total_items = len(embedding_matrix)
-    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
-        if i not in deduplicated_indices:
-            continue
-        similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
-        for sim_idx in similar_indices:
-            if sim_idx in deduplicated_indices:
-                deduplicated_indices.remove(sim_idx)
-                duplicate_to_original_mapping[sim_idx] = i
-    return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
 with gr.Blocks() as demo:
     gr.Markdown("# Semantic Deduplication")
     deduplication_type = gr.Radio(
         choices=["Single dataset", "Cross-dataset"],
         label="Deduplication Type",
-        value="Single dataset"
     )
     with gr.Row():
@@ -178,10 +201,7 @@ with gr.Blocks() as demo:
             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
     threshold = gr.Slider(
-        minimum=0.0,
-        maximum=1.0,
-        value=default_threshold,
-        label="Similarity Threshold"
     )
     compute_button = gr.Button("Compute")
@@ -197,9 +217,7 @@ with gr.Blocks() as demo:
             return gr.update(visible=False)
     deduplication_type.change(
-        update_visibility,
-        inputs=deduplication_type,
-        outputs=dataset2_inputs
     )
     compute_button.click(
@@ -212,9 +230,9 @@ with gr.Blocks() as demo:
             dataset2_name,
             dataset2_split,
             dataset2_text_column,
-            threshold
         ],
-        outputs=[status_output, result_output]
     )
 demo.launch()

 from model2vec import StaticModel
 from reach import Reach
 from difflib import ndiff
+import tqdm
+from contextlib import contextmanager
 # Load the model at startup
 model = StaticModel.from_pretrained("minishlab/M2V_base_output")
     for i in range(0, len(iterable), batch_size):
         yield iterable[i:i + batch_size]
+@contextmanager
+def tqdm_redirect(progress):
+    """Temporarily replace ``tqdm.tqdm`` with ``progress.tqdm``.
+
+    The original ``tqdm.tqdm`` attribute is restored when the ``with`` block
+    exits, including when the wrapped code raises.
+    """
+    # NOTE(review): rebinding the module attribute only affects code that
+    # looks up ``tqdm.tqdm`` at call time; a name already bound elsewhere via
+    # ``from tqdm import tqdm`` keeps pointing at the original class.
+    original_tqdm = tqdm.tqdm
+    try:
+        tqdm.tqdm = progress.tqdm
+        yield
+    finally:
+        tqdm.tqdm = original_tqdm
+def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
+    """Encode *texts* with the module-level ``model`` and return the result.
+
+    Args:
+        texts: The texts handed to ``model.encode``.
+        batch_size: Forwarded to ``model.encode`` as ``batch_size``.
+        progress: Progress reporter; it is called with ``desc`` and its
+            ``tqdm`` attribute stands in for ``tqdm.tqdm`` while encoding.
+        desc: Description reported through ``progress`` before encoding starts.
+
+    Returns:
+        Whatever ``model.encode`` returns for *texts*.
+    """
+    # ``desc`` used to be accepted but silently dropped; report it the same
+    # way ``deduplicate`` reports its own stages.
+    progress(0, desc=desc)
+    with tqdm_redirect(progress):
+        return model.encode(texts, show_progressbar=True, batch_size=batch_size)
+def deduplicate(
+    embedding_matrix: np.ndarray,
+    threshold: float,
+    batch_size: int = 1024,
+    progress=None,
+) -> tuple[np.ndarray, dict[int, int]]:
+    """Deduplicate embeddings and map each removed row to the row that replaced it.
+
+    Rows are visited in index order. For every row that is still kept, each
+    neighbour reported by ``Reach.nearest_neighbor_threshold`` (other than the
+    row itself) that is also still kept is removed and recorded as a
+    duplicate of the current row.
+
+    Args:
+        embedding_matrix: Embeddings to deduplicate, one row per item.
+        threshold: Forwarded to ``Reach.nearest_neighbor_threshold``.
+        batch_size: Forwarded to ``Reach.nearest_neighbor_threshold``.
+        progress: Optional progress reporter. When given, it is called with a
+            stage description and its ``tqdm`` method wraps the result loop.
+            When ``None``, no progress is reported.
+
+    Returns:
+        A tuple ``(kept_indices, duplicate_to_original)``: the kept row
+        indices as a sorted integer array, and a dict mapping every removed
+        index to the kept index it duplicated.
+    """
+
+    def report(stage: str) -> None:
+        # ``progress`` defaults to None; calling it unconditionally used to
+        # raise TypeError for callers relying on the default.
+        if progress is not None:
+            progress(0, desc=stage)
+
+    total_items = len(embedding_matrix)
+
+    # Building the index
+    report("Building search index...")
+    reach = Reach(
+        vectors=embedding_matrix, items=[str(i) for i in range(total_items)]
+    )
+    deduplicated_indices = set(range(total_items))
+    duplicate_to_original_mapping = {}
+
+    # Finding nearest neighbors
+    report("Finding nearest neighbors...")
+    results = reach.nearest_neighbor_threshold(
+        embedding_matrix,
+        threshold=threshold,
+        batch_size=batch_size,
+        show_progressbar=False,  # Disable internal progress bar
+    )
+
+    # Processing duplicates, with a progress bar when a reporter is available
+    if progress is not None:
+        results = progress.tqdm(
+            results, desc="Processing duplicates", total=total_items
+        )
+    for i, similar_items in enumerate(results):
+        if i not in deduplicated_indices:
+            # Already removed as a duplicate of an earlier row.
+            continue
+        for item in similar_items:
+            sim_idx = int(item[0])
+            if sim_idx != i and sim_idx in deduplicated_indices:
+                deduplicated_indices.remove(sim_idx)
+                duplicate_to_original_mapping[sim_idx] = i
+
+    # Sort so the result does not depend on set iteration order, and pin an
+    # integer dtype so an empty result is still usable as an index array.
+    kept_indices = np.array(sorted(deduplicated_indices), dtype=int)
+    return kept_indices, duplicate_to_original_mapping
 def display_word_differences(x: str, y: str) -> str:
+    """Return the ``ndiff`` entries for words that differ between *x* and *y*.
+
+    Both strings are split on whitespace; only the ``ndiff`` entries that
+    start with ``+`` or ``-`` are kept, joined with single spaces.
+    """
     diff = ndiff(x.split(), y.split())
+    return " ".join([word for word in diff if word.startswith(("+", "-"))])
 def perform_deduplication(
     deduplication_type,
     dataset2_split="",
     dataset2_text_column="",
     threshold=default_threshold,
+    progress=gr.Progress(track_tqdm=True),
 ):
     try:
         # Convert threshold to float
             # Load Dataset 1
             status = "Loading Dataset 1..."
             yield status, ""
+            if (
+                dataset1_name == default_dataset1_name
+                and dataset1_split == default_dataset1_split
+            ):
                 ds = ds_default1
             else:
                 ds = load_dataset(dataset1_name, split=dataset1_split)
             # Compute embeddings
             status = "Computing embeddings for Dataset 1..."
             yield status, ""
+            embedding_matrix = compute_embeddings(
+                texts,
+                batch_size=64,
+                progress=progress,
+                desc="Computing embeddings for Dataset 1",
+            )
             # Deduplicate
             status = "Deduplicating embeddings..."
             result_text = f"**Total documents:** {num_total}\n"
             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
+            result_text += (
+                f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
+            )
             # Show deduplicated examples
             if num_duplicates > 0:
         yield f"An error occurred: {e}", ""
         raise e
 with gr.Blocks() as demo:
     gr.Markdown("# Semantic Deduplication")
     deduplication_type = gr.Radio(
         choices=["Single dataset", "Cross-dataset"],
         label="Deduplication Type",
+        value="Single dataset",
     )
     with gr.Row():
             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
     threshold = gr.Slider(
+        minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold"
     )
     compute_button = gr.Button("Compute")
             return gr.update(visible=False)
     deduplication_type.change(
+        update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
     )
     compute_button.click(
             dataset2_name,
             dataset2_split,
             dataset2_text_column,
+            threshold,
         ],
+        outputs=[status_output, result_output],
     )
 demo.launch()