Updated app with code for deduplication
Browse files
app.py
CHANGED
|
@@ -77,10 +77,10 @@ def perform_deduplication(
|
|
| 77 |
dataset1_name,
|
| 78 |
dataset1_split,
|
| 79 |
dataset1_text_column,
|
| 80 |
-
dataset2_name,
|
| 81 |
-
dataset2_split,
|
| 82 |
-
dataset2_text_column,
|
| 83 |
-
threshold,
|
| 84 |
progress=gr.Progress(track_tqdm=True)
|
| 85 |
):
|
| 86 |
# Convert threshold to float
|
|
@@ -112,7 +112,6 @@ def perform_deduplication(
|
|
| 112 |
# Show deduplicated examples
|
| 113 |
result_text += "**Examples of duplicates found:**\n\n"
|
| 114 |
num_examples = min(5, num_duplicates)
|
| 115 |
-
examples_shown = 0
|
| 116 |
for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
|
| 117 |
original_text = texts[original_idx]
|
| 118 |
duplicate_text = texts[duplicate_idx]
|
|
@@ -121,7 +120,6 @@ def perform_deduplication(
|
|
| 121 |
result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
|
| 122 |
result_text += f"**Differences:**\n{differences}\n"
|
| 123 |
result_text += "-" * 50 + "\n\n"
|
| 124 |
-
examples_shown += 1
|
| 125 |
|
| 126 |
return result_text
|
| 127 |
|
|
@@ -153,7 +151,6 @@ def perform_deduplication(
|
|
| 153 |
# Show deduplicated examples
|
| 154 |
result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
|
| 155 |
num_examples = min(5, num_duplicates)
|
| 156 |
-
examples_shown = 0
|
| 157 |
for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
|
| 158 |
original_idx = duplicate_to_original_mapping[duplicate_idx]
|
| 159 |
original_text = texts1[original_idx]
|
|
@@ -163,42 +160,54 @@ def perform_deduplication(
|
|
| 163 |
result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
|
| 164 |
result_text += f"**Differences:**\n{differences}\n"
|
| 165 |
result_text += "-" * 50 + "\n\n"
|
| 166 |
-
examples_shown += 1
|
| 167 |
|
| 168 |
return result_text
|
| 169 |
|
| 170 |
with gr.Blocks() as demo:
|
| 171 |
gr.Markdown("# Semantic Deduplication")
|
| 172 |
|
| 173 |
-
deduplication_type = gr.Radio(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
-
with gr.
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
|
| 180 |
|
| 181 |
-
|
| 182 |
-
with
|
|
|
|
| 183 |
with gr.Row():
|
| 184 |
-
dataset2_name = gr.Textbox(value="ag_news", label="Dataset Name")
|
| 185 |
-
dataset2_split = gr.Textbox(value="test", label="Split")
|
| 186 |
dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
|
| 187 |
|
| 188 |
-
threshold = gr.Slider(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
compute_button = gr.Button("Compute")
|
| 191 |
|
| 192 |
output = gr.Markdown()
|
| 193 |
|
| 194 |
-
# Function to update the visibility of
|
| 195 |
-
def update_visibility(
|
| 196 |
-
if
|
| 197 |
-
return
|
| 198 |
else:
|
| 199 |
-
return
|
| 200 |
|
| 201 |
-
deduplication_type.change(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
compute_button.click(
|
| 204 |
fn=perform_deduplication,
|
|
|
|
| 77 |
dataset1_name,
|
| 78 |
dataset1_split,
|
| 79 |
dataset1_text_column,
|
| 80 |
+
dataset2_name="",
|
| 81 |
+
dataset2_split="",
|
| 82 |
+
dataset2_text_column="",
|
| 83 |
+
threshold=0.8,
|
| 84 |
progress=gr.Progress(track_tqdm=True)
|
| 85 |
):
|
| 86 |
# Convert threshold to float
|
|
|
|
| 112 |
# Show deduplicated examples
|
| 113 |
result_text += "**Examples of duplicates found:**\n\n"
|
| 114 |
num_examples = min(5, num_duplicates)
|
|
|
|
| 115 |
for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
|
| 116 |
original_text = texts[original_idx]
|
| 117 |
duplicate_text = texts[duplicate_idx]
|
|
|
|
| 120 |
result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
|
| 121 |
result_text += f"**Differences:**\n{differences}\n"
|
| 122 |
result_text += "-" * 50 + "\n\n"
|
|
|
|
| 123 |
|
| 124 |
return result_text
|
| 125 |
|
|
|
|
| 151 |
# Show deduplicated examples
|
| 152 |
result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
|
| 153 |
num_examples = min(5, num_duplicates)
|
|
|
|
| 154 |
for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
|
| 155 |
original_idx = duplicate_to_original_mapping[duplicate_idx]
|
| 156 |
original_text = texts1[original_idx]
|
|
|
|
| 160 |
result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
|
| 161 |
result_text += f"**Differences:**\n{differences}\n"
|
| 162 |
result_text += "-" * 50 + "\n\n"
|
|
|
|
| 163 |
|
| 164 |
return result_text
|
| 165 |
|
| 166 |
with gr.Blocks() as demo:
|
| 167 |
gr.Markdown("# Semantic Deduplication")
|
| 168 |
|
| 169 |
+
deduplication_type = gr.Radio(
|
| 170 |
+
choices=["Single dataset", "Cross-dataset"],
|
| 171 |
+
label="Deduplication Type",
|
| 172 |
+
value="Single dataset"
|
| 173 |
+
)
|
| 174 |
|
| 175 |
+
with gr.Row():
|
| 176 |
+
dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
|
| 177 |
+
dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
|
| 178 |
+
dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
|
|
|
|
| 179 |
|
| 180 |
+
dataset2_inputs = gr.Column(visible=False)
|
| 181 |
+
with dataset2_inputs:
|
| 182 |
+
gr.Markdown("### Dataset 2")
|
| 183 |
with gr.Row():
|
| 184 |
+
dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
|
| 185 |
+
dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
|
| 186 |
dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
|
| 187 |
|
| 188 |
+
threshold = gr.Slider(
|
| 189 |
+
minimum=0.0,
|
| 190 |
+
maximum=1.0,
|
| 191 |
+
value=0.8,
|
| 192 |
+
label="Similarity Threshold"
|
| 193 |
+
)
|
| 194 |
|
| 195 |
compute_button = gr.Button("Compute")
|
| 196 |
|
| 197 |
output = gr.Markdown()
|
| 198 |
|
| 199 |
+
# Function to update the visibility of dataset2_inputs
|
| 200 |
+
def update_visibility(deduplication_type_value):
|
| 201 |
+
if deduplication_type_value == "Cross-dataset":
|
| 202 |
+
return gr.update(visible=True)
|
| 203 |
else:
|
| 204 |
+
return gr.update(visible=False)
|
| 205 |
|
| 206 |
+
deduplication_type.change(
|
| 207 |
+
update_visibility,
|
| 208 |
+
inputs=deduplication_type,
|
| 209 |
+
outputs=dataset2_inputs
|
| 210 |
+
)
|
| 211 |
|
| 212 |
compute_button.click(
|
| 213 |
fn=perform_deduplication,
|