Spaces:

minishlab
/

semantic-deduplication

Running

App Files Community

Pringled committed on Oct 12, 2024

Commit

504b6fc

1 Parent(s): 4b1ac5a

Updated app with code for deduplication

Browse files

Files changed (1) hide show

app.py +18 -8

app.py CHANGED Viewed

@@ -26,15 +26,15 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
     """
     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
     """
-    # Building the index with a progress bar
-    with progress.tqdm(total=1, desc="Building search index") as p:
-        reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
-        p.update(1)
     deduplicated_indices = set(range(len(embedding_matrix)))
     duplicate_to_original_mapping = {}
     # Finding nearest neighbors
     results = reach.nearest_neighbor_threshold(
         embedding_matrix,
         threshold=threshold,
@@ -61,15 +61,15 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
     """
     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
     """
-    # Building the index from Dataset 1
-    with progress.tqdm(total=1, desc="Building search index from Dataset 1") as p:
-        reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
-        p.update(1)
     duplicate_indices_in_test = []
     duplicate_to_original_mapping = {}
     # Finding nearest neighbors between datasets
     results = reach.nearest_neighbor_threshold(
         embedding_matrix_2,
         threshold=threshold,
@@ -117,15 +117,18 @@ def perform_deduplication(
         if deduplication_type == "Single dataset":
             # Load Dataset 1
             if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
                 ds = ds_default1
             else:
                 ds = load_dataset(dataset1_name, split=dataset1_split)
             # Extract texts
             texts = [example[dataset1_text_column] for example in ds]
             # Compute embeddings
             embedding_matrix = model.encode(texts, show_progressbar=True)  # Enable internal progress bar
             # Deduplicate
@@ -137,27 +140,33 @@ def perform_deduplication(
         elif deduplication_type == "Cross-dataset":
             # Load Dataset 1
             if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
                 ds1 = ds_default1
             else:
                 ds1 = load_dataset(dataset1_name, split=dataset1_split)
             # Load Dataset 2
             if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
                 ds2 = ds_default2
             else:
                 ds2 = load_dataset(dataset2_name, split=dataset2_split)
             # Extract texts from Dataset 1
             texts1 = [example[dataset1_text_column] for example in ds1]
             # Extract texts from Dataset 2
             texts2 = [example[dataset2_text_column] for example in ds2]
             # Compute embeddings for Dataset 1
             embedding_matrix1 = model.encode(texts1, show_progressbar=True)
             # Compute embeddings for Dataset 2
             embedding_matrix2 = model.encode(texts2, show_progressbar=True)
             # Deduplicate across datasets
@@ -308,6 +317,7 @@ demo.launch()
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np

     """
     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
     """
+    # Update progress to indicate building the index
+    progress(0, desc="Building search index...")
+    reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
     deduplicated_indices = set(range(len(embedding_matrix)))
     duplicate_to_original_mapping = {}
     # Finding nearest neighbors
+    progress(0, desc="Finding nearest neighbors...")
     results = reach.nearest_neighbor_threshold(
         embedding_matrix,
         threshold=threshold,
     """
     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
     """
+    # Update progress to indicate building the index
+    progress(0, desc="Building search index from Dataset 1...")
+    reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
     duplicate_indices_in_test = []
     duplicate_to_original_mapping = {}
     # Finding nearest neighbors between datasets
+    progress(0, desc="Finding nearest neighbors between datasets...")
     results = reach.nearest_neighbor_threshold(
         embedding_matrix_2,
         threshold=threshold,
         if deduplication_type == "Single dataset":
             # Load Dataset 1
+            progress(0, desc="Loading Dataset 1...")
             if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
                 ds = ds_default1
             else:
                 ds = load_dataset(dataset1_name, split=dataset1_split)
             # Extract texts
+            progress(0, desc="Extracting texts from Dataset 1...")
             texts = [example[dataset1_text_column] for example in ds]
             # Compute embeddings
+            progress(0, desc="Computing embeddings for Dataset 1...")
             embedding_matrix = model.encode(texts, show_progressbar=True)  # Enable internal progress bar
             # Deduplicate
         elif deduplication_type == "Cross-dataset":
             # Load Dataset 1
+            progress(0, desc="Loading Dataset 1...")
             if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
                 ds1 = ds_default1
             else:
                 ds1 = load_dataset(dataset1_name, split=dataset1_split)
             # Load Dataset 2
+            progress(0, desc="Loading Dataset 2...")
             if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
                 ds2 = ds_default2
             else:
                 ds2 = load_dataset(dataset2_name, split=dataset2_split)
             # Extract texts from Dataset 1
+            progress(0, desc="Extracting texts from Dataset 1...")
             texts1 = [example[dataset1_text_column] for example in ds1]
             # Extract texts from Dataset 2
+            progress(0, desc="Extracting texts from Dataset 2...")
             texts2 = [example[dataset2_text_column] for example in ds2]
             # Compute embeddings for Dataset 1
+            progress(0, desc="Computing embeddings for Dataset 1...")
             embedding_matrix1 = model.encode(texts1, show_progressbar=True)
             # Compute embeddings for Dataset 2
+            progress(0, desc="Computing embeddings for Dataset 2...")
             embedding_matrix2 = model.encode(texts2, show_progressbar=True)
             # Deduplicate across datasets
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np