Updates
app.py CHANGED
@@ -21,7 +21,16 @@ def deduplicate_embeddings(
     batch_size: int = 1024,
     progress=None
 ) -> tuple[np.ndarray, dict[int, int]]:
-    """
+    """
+    Deduplicate embeddings within one dataset or across two datasets.
+
+    :param embeddings_a: Embeddings of Dataset 1.
+    :param embeddings_b: Optional, embeddings of Dataset 2.
+    :param threshold: Similarity threshold for deduplication.
+    :param batch_size: Batch size for similarity computation.
+    :param progress: Gradio progress tracker for feedback.
+    :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
+    """
     if embeddings_b is None:
         reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
         duplicate_to_original = {}
@@ -49,13 +58,27 @@ def deduplicate_embeddings(
     return duplicate_indices_in_b, duplicate_to_original

 def display_word_differences(x: str, y: str) -> str:
-    """
+    """
+    Display the word-level differences between two texts, formatted to avoid
+    misinterpretation of Markdown syntax.
+
+    :param x: First text.
+    :param y: Second text.
+    :return: A string showing word-level differences, wrapped in a code block.
+    """
     diff = ndiff(x.split(), y.split())
     formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
     return f"```\n{formatted_diff}\n```"

 def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
-    """
+    """
+    Load texts from a specified dataset and split.
+
+    :param dataset_name: Name of the dataset.
+    :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
+    :param text_column: Name of the text column.
+    :return: A list of texts from the dataset.
+    """
     ds = load_dataset(dataset_name, split=dataset_split)
     return [example[text_column] for example in ds]

@@ -70,7 +93,20 @@ def perform_deduplication(
     threshold: float = default_threshold,
     progress: gr.Progress = gr.Progress(track_tqdm=True)
 ):
-    """
+    """
+    Perform deduplication on one or two datasets based on the deduplication type.
+
+    :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
+    :param dataset1_name: Name of the first dataset.
+    :param dataset1_split: Split of the first dataset.
+    :param dataset1_text_column: Text column of the first dataset.
+    :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
+    :param dataset2_split: Optional, split of the second dataset.
+    :param dataset2_text_column: Optional, text column of the second dataset.
+    :param threshold: Similarity threshold for deduplication.
+    :param progress: Gradio progress tracker.
+    :return: Status updates and result text for the Gradio interface.
+    """
     try:
         threshold = float(threshold)

@@ -209,6 +245,8 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:

 demo.launch()

+
+
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np
@@ -232,16 +270,7 @@ demo.launch()
 #     batch_size: int = 1024,
 #     progress=None
 # ) -> tuple[np.ndarray, dict[int, int]]:
-#     """
-#     Deduplicate embeddings within one dataset or across two datasets.
-
-#     :param embeddings_a: Embeddings of Dataset 1.
-#     :param embeddings_b: Optional, embeddings of Dataset 2.
-#     :param threshold: Similarity threshold for deduplication.
-#     :param batch_size: Batch size for similarity computation.
-#     :param progress: Gradio progress tracker for feedback.
-#     :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
-#     """
+#     """Deduplicate embeddings within one dataset or across two datasets."""
 #     if embeddings_b is None:
 #         reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
 #         duplicate_to_original = {}
@@ -269,39 +298,13 @@ demo.launch()
 #     return duplicate_indices_in_b, duplicate_to_original

 # def display_word_differences(x: str, y: str) -> str:
-#     """
-#     Display the word-level differences between two texts, formatted to avoid
-#     misinterpretation of Markdown syntax.
-
-#     :param x: First text.
-#     :param y: Second text.
-#     :return: A string showing word-level differences, wrapped in a code block.
-#     """
+#     """Display word-level differences between two texts, avoiding Markdown issues."""
 #     diff = ndiff(x.split(), y.split())
-#     # Wrap differences in a code block to prevent interpretation as Markdown
 #     formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
 #     return f"```\n{formatted_diff}\n```"

-# # def display_word_differences(x: str, y: str) -> str:
-# #     """
-# #     Display the word-level differences between two texts.
-
-# #     :param x: First text.
-# #     :param y: Second text.
-# #     :return: A string showing word-level differences.
-# #     """
-# #     diff = ndiff(x.split(), y.split())
-# #     return " ".join(word for word in diff if word.startswith(("+", "-")))
-
 # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
-#     """
-#     Load texts from a specified dataset and split.
-
-#     :param dataset_name: Name of the dataset.
-#     :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
-#     :param text_column: Name of the text column.
-#     :return: A list of texts from the dataset.
-#     """
+#     """Load texts from a specified dataset and split."""
 #     ds = load_dataset(dataset_name, split=dataset_split)
 #     return [example[text_column] for example in ds]

@@ -316,20 +319,7 @@ demo.launch()
 #     threshold: float = default_threshold,
 #     progress: gr.Progress = gr.Progress(track_tqdm=True)
 # ):
-#     """
-#     Perform deduplication on one or two datasets based on the deduplication type.
-
-#     :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
-#     :param dataset1_name: Name of the first dataset.
-#     :param dataset1_split: Split of the first dataset.
-#     :param dataset1_text_column: Text column of the first dataset.
-#     :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
-#     :param dataset2_split: Optional, split of the second dataset.
-#     :param dataset2_text_column: Optional, text column of the second dataset.
-#     :param threshold: Similarity threshold for deduplication.
-#     :param progress: Gradio progress tracker.
-#     :return: Status updates and result text for the Gradio interface.
-#     """
+#     """Perform deduplication on one or two datasets."""
 #     try:
 #         threshold = float(threshold)

@@ -411,6 +401,7 @@ demo.launch()
 #             yield f"An error occurred: {e}", ""
 #             raise e

+# # Gradio app with stop button support
 # with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
 #     gr.Markdown("# Semantic Deduplication")
 #     gr.Markdown("""
@@ -440,7 +431,7 @@ demo.launch()
 #         dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")

 #     threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
-#     compute_button = gr.Button("
+#     compute_button = gr.Button("Deduplicate")
 #     status_output = gr.Markdown(elem_id="status_output")
 #     result_output = gr.Markdown()

@@ -464,5 +455,5 @@ demo.launch()
 #         outputs=[status_output, result_output],
 #     )

-# demo.launch()

+# demo.launch()
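
For orientation, the sketch below shows how the `deduplicate_embeddings` signature documented in this commit might be exercised. It is illustrative only: the random vectors, the 0.9 threshold, and the assumption that the function is available in scope (e.g. imported from this Space's `app.py`) are placeholders, not part of the change.

```python
# Minimal usage sketch for the signature documented above (illustrative only).
# Assumes deduplicate_embeddings from app.py is in scope; the random embeddings
# and the 0.9 threshold are placeholder values, not taken from the commit.
import numpy as np

rng = np.random.default_rng(0)
embeddings_a = rng.normal(size=(1_000, 256)).astype(np.float32)

# Single-dataset mode (embeddings_b is None): per the docstring, this returns the
# deduplicated indices and a mapping from removed indices to their originals.
kept_indices, duplicate_to_original = deduplicate_embeddings(
    embeddings_a,
    threshold=0.9,
    batch_size=1024,
)
print(f"Kept {len(kept_indices)} of {len(embeddings_a)} rows; "
      f"{len(duplicate_to_original)} flagged as duplicates.")
```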