Updated app with code for deduplication
Browse files
app.py
CHANGED
|
@@ -26,15 +26,15 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
|
|
| 26 |
"""
|
| 27 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|
| 28 |
"""
|
| 29 |
-
#
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
p.update(1)
|
| 33 |
|
| 34 |
deduplicated_indices = set(range(len(embedding_matrix)))
|
| 35 |
duplicate_to_original_mapping = {}
|
| 36 |
|
| 37 |
# Finding nearest neighbors
|
|
|
|
| 38 |
results = reach.nearest_neighbor_threshold(
|
| 39 |
embedding_matrix,
|
| 40 |
threshold=threshold,
|
|
@@ -61,15 +61,15 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
|
|
| 61 |
"""
|
| 62 |
Deduplicate embeddings across two datasets and return the indices of duplicates between them.
|
| 63 |
"""
|
| 64 |
-
#
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
p.update(1)
|
| 68 |
|
| 69 |
duplicate_indices_in_test = []
|
| 70 |
duplicate_to_original_mapping = {}
|
| 71 |
|
| 72 |
# Finding nearest neighbors between datasets
|
|
|
|
| 73 |
results = reach.nearest_neighbor_threshold(
|
| 74 |
embedding_matrix_2,
|
| 75 |
threshold=threshold,
|
|
@@ -117,15 +117,18 @@ def perform_deduplication(
|
|
| 117 |
|
| 118 |
if deduplication_type == "Single dataset":
|
| 119 |
# Load Dataset 1
|
|
|
|
| 120 |
if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
|
| 121 |
ds = ds_default1
|
| 122 |
else:
|
| 123 |
ds = load_dataset(dataset1_name, split=dataset1_split)
|
| 124 |
|
| 125 |
# Extract texts
|
|
|
|
| 126 |
texts = [example[dataset1_text_column] for example in ds]
|
| 127 |
|
| 128 |
# Compute embeddings
|
|
|
|
| 129 |
embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
|
| 130 |
|
| 131 |
# Deduplicate
|
|
@@ -137,27 +140,33 @@ def perform_deduplication(
|
|
| 137 |
|
| 138 |
elif deduplication_type == "Cross-dataset":
|
| 139 |
# Load Dataset 1
|
|
|
|
| 140 |
if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
|
| 141 |
ds1 = ds_default1
|
| 142 |
else:
|
| 143 |
ds1 = load_dataset(dataset1_name, split=dataset1_split)
|
| 144 |
|
| 145 |
# Load Dataset 2
|
|
|
|
| 146 |
if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
|
| 147 |
ds2 = ds_default2
|
| 148 |
else:
|
| 149 |
ds2 = load_dataset(dataset2_name, split=dataset2_split)
|
| 150 |
|
| 151 |
# Extract texts from Dataset 1
|
|
|
|
| 152 |
texts1 = [example[dataset1_text_column] for example in ds1]
|
| 153 |
|
| 154 |
# Extract texts from Dataset 2
|
|
|
|
| 155 |
texts2 = [example[dataset2_text_column] for example in ds2]
|
| 156 |
|
| 157 |
# Compute embeddings for Dataset 1
|
|
|
|
| 158 |
embedding_matrix1 = model.encode(texts1, show_progressbar=True)
|
| 159 |
|
| 160 |
# Compute embeddings for Dataset 2
|
|
|
|
| 161 |
embedding_matrix2 = model.encode(texts2, show_progressbar=True)
|
| 162 |
|
| 163 |
# Deduplicate across datasets
|
|
@@ -308,6 +317,7 @@ demo.launch()
|
|
| 308 |
|
| 309 |
|
| 310 |
|
|
|
|
| 311 |
# import gradio as gr
|
| 312 |
# from datasets import load_dataset
|
| 313 |
# import numpy as np
|
|
|
|
| 26 |
"""
|
| 27 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|
| 28 |
"""
|
| 29 |
+
# Update progress to indicate building the index
|
| 30 |
+
progress(0, desc="Building search index...")
|
| 31 |
+
reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
|
|
|
|
| 32 |
|
| 33 |
deduplicated_indices = set(range(len(embedding_matrix)))
|
| 34 |
duplicate_to_original_mapping = {}
|
| 35 |
|
| 36 |
# Finding nearest neighbors
|
| 37 |
+
progress(0, desc="Finding nearest neighbors...")
|
| 38 |
results = reach.nearest_neighbor_threshold(
|
| 39 |
embedding_matrix,
|
| 40 |
threshold=threshold,
|
|
|
|
| 61 |
"""
|
| 62 |
Deduplicate embeddings across two datasets and return the indices of duplicates between them.
|
| 63 |
"""
|
| 64 |
+
# Update progress to indicate building the index
|
| 65 |
+
progress(0, desc="Building search index from Dataset 1...")
|
| 66 |
+
reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
|
|
|
|
| 67 |
|
| 68 |
duplicate_indices_in_test = []
|
| 69 |
duplicate_to_original_mapping = {}
|
| 70 |
|
| 71 |
# Finding nearest neighbors between datasets
|
| 72 |
+
progress(0, desc="Finding nearest neighbors between datasets...")
|
| 73 |
results = reach.nearest_neighbor_threshold(
|
| 74 |
embedding_matrix_2,
|
| 75 |
threshold=threshold,
|
|
|
|
| 117 |
|
| 118 |
if deduplication_type == "Single dataset":
|
| 119 |
# Load Dataset 1
|
| 120 |
+
progress(0, desc="Loading Dataset 1...")
|
| 121 |
if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
|
| 122 |
ds = ds_default1
|
| 123 |
else:
|
| 124 |
ds = load_dataset(dataset1_name, split=dataset1_split)
|
| 125 |
|
| 126 |
# Extract texts
|
| 127 |
+
progress(0, desc="Extracting texts from Dataset 1...")
|
| 128 |
texts = [example[dataset1_text_column] for example in ds]
|
| 129 |
|
| 130 |
# Compute embeddings
|
| 131 |
+
progress(0, desc="Computing embeddings for Dataset 1...")
|
| 132 |
embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
|
| 133 |
|
| 134 |
# Deduplicate
|
|
|
|
| 140 |
|
| 141 |
elif deduplication_type == "Cross-dataset":
|
| 142 |
# Load Dataset 1
|
| 143 |
+
progress(0, desc="Loading Dataset 1...")
|
| 144 |
if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
|
| 145 |
ds1 = ds_default1
|
| 146 |
else:
|
| 147 |
ds1 = load_dataset(dataset1_name, split=dataset1_split)
|
| 148 |
|
| 149 |
# Load Dataset 2
|
| 150 |
+
progress(0, desc="Loading Dataset 2...")
|
| 151 |
if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
|
| 152 |
ds2 = ds_default2
|
| 153 |
else:
|
| 154 |
ds2 = load_dataset(dataset2_name, split=dataset2_split)
|
| 155 |
|
| 156 |
# Extract texts from Dataset 1
|
| 157 |
+
progress(0, desc="Extracting texts from Dataset 1...")
|
| 158 |
texts1 = [example[dataset1_text_column] for example in ds1]
|
| 159 |
|
| 160 |
# Extract texts from Dataset 2
|
| 161 |
+
progress(0, desc="Extracting texts from Dataset 2...")
|
| 162 |
texts2 = [example[dataset2_text_column] for example in ds2]
|
| 163 |
|
| 164 |
# Compute embeddings for Dataset 1
|
| 165 |
+
progress(0, desc="Computing embeddings for Dataset 1...")
|
| 166 |
embedding_matrix1 = model.encode(texts1, show_progressbar=True)
|
| 167 |
|
| 168 |
# Compute embeddings for Dataset 2
|
| 169 |
+
progress(0, desc="Computing embeddings for Dataset 2...")
|
| 170 |
embedding_matrix2 = model.encode(texts2, show_progressbar=True)
|
| 171 |
|
| 172 |
# Deduplicate across datasets
|
|
|
|
| 317 |
|
| 318 |
|
| 319 |
|
| 320 |
+
|
| 321 |
# import gradio as gr
|
| 322 |
# from datasets import load_dataset
|
| 323 |
# import numpy as np
|