Spaces:

minishlab
/

semantic-deduplication

Running

App Files Community

Pringled committed on Oct 12, 2024

Commit

892ceeb

1 Parent(s): 7a1cd7a

Updated app with code for deduplication

Browse files

Files changed (1) hide show

app.py +10 -14

app.py CHANGED Viewed

@@ -3,10 +3,9 @@ from datasets import load_dataset
 import numpy as np
 from model2vec import StaticModel
 from reach import Reach
-from tqdm import tqdm
 from difflib import ndiff
-def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
     """
     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
     """
@@ -24,7 +23,7 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
     )
     # Process duplicates
-    for i, similar_items in enumerate(tqdm(results)):
         if i not in deduplicated_indices:
             continue  # Skip already marked duplicates
@@ -39,7 +38,7 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
-def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
     """
     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
     """
@@ -58,7 +57,7 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
     )
     # Process duplicates
-    for i, similar_items in enumerate(tqdm(results)):
         # Similar items are returned as (index, score), we are only interested in the index
         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]  # Keep those above the threshold
@@ -71,7 +70,7 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
 def display_word_differences(x: str, y: str) -> str:
     diff = ndiff(x.split(), y.split())
-    return " ".join([f"{word}" for word in diff if word.startswith(('+', '-'))])
 def perform_deduplication(
     deduplication_type,
@@ -81,7 +80,8 @@ def perform_deduplication(
     dataset2_name,
     dataset2_split,
     dataset2_text_column,
-    threshold
 ):
     # Convert threshold to float
     threshold = float(threshold)
@@ -98,8 +98,7 @@ def perform_deduplication(
         embedding_matrix = model.encode(texts, show_progressbar=True)
         # Deduplicate
-        with gr.Progress(track_tqdm=True):
-            deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold)
         # Prepare the results
         num_duplicates = len(duplicate_to_original_mapping)
@@ -114,9 +113,7 @@ def perform_deduplication(
         result_text += "**Examples of duplicates found:**\n\n"
         num_examples = min(5, num_duplicates)
         examples_shown = 0
-        for duplicate_idx, original_idx in duplicate_to_original_mapping.items():
-            if examples_shown >= num_examples:
-                break
             original_text = texts[original_idx]
             duplicate_text = texts[duplicate_idx]
             differences = display_word_differences(original_text, duplicate_text)
@@ -143,8 +140,7 @@ def perform_deduplication(
         embedding_matrix2 = model.encode(texts2, show_progressbar=True)
         # Deduplicate across datasets
-        with gr.Progress(track_tqdm=True):
-            duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold)
         num_duplicates = len(duplicate_indices_in_ds2)
         num_total_ds2 = len(texts2)

 import numpy as np
 from model2vec import StaticModel
 from reach import Reach
 from difflib import ndiff
+def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=gr.Progress(track_tqdm=True)) -> tuple[np.ndarray, dict[int, int]]:
     """
     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
     """
     )
     # Process duplicates
+    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
         if i not in deduplicated_indices:
             continue  # Skip already marked duplicates
     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
+def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=gr.Progress(track_tqdm=True)) -> tuple[list[int], dict[int, int]]:
     """
     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
     """
     )
     # Process duplicates
+    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
         # Similar items are returned as (index, score), we are only interested in the index
         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]  # Keep those above the threshold
 def display_word_differences(x: str, y: str) -> str:
     diff = ndiff(x.split(), y.split())
+    return " ".join([word for word in diff if word.startswith(('+', '-'))])
 def perform_deduplication(
     deduplication_type,
     dataset2_name,
     dataset2_split,
     dataset2_text_column,
+    threshold,
+    progress=gr.Progress(track_tqdm=True)
 ):
     # Convert threshold to float
     threshold = float(threshold)
         embedding_matrix = model.encode(texts, show_progressbar=True)
         # Deduplicate
+        deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
         # Prepare the results
         num_duplicates = len(duplicate_to_original_mapping)
         result_text += "**Examples of duplicates found:**\n\n"
         num_examples = min(5, num_duplicates)
         examples_shown = 0
+        for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
             original_text = texts[original_idx]
             duplicate_text = texts[duplicate_idx]
             differences = display_word_differences(original_text, duplicate_text)
         embedding_matrix2 = model.encode(texts2, show_progressbar=True)
         # Deduplicate across datasets
+        duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
         num_duplicates = len(duplicate_indices_in_ds2)
         num_total_ds2 = len(texts2)