Updates
app.py CHANGED
@@ -21,7 +21,16 @@ def deduplicate_embeddings(
     batch_size: int = 1024,
     progress=None
 ) -> tuple[np.ndarray, dict[int, int]]:
-    """
+    """
+    Deduplicate embeddings within one dataset or across two datasets.
+
+    :param embeddings_a: Embeddings of Dataset 1.
+    :param embeddings_b: Optional, embeddings of Dataset 2.
+    :param threshold: Similarity threshold for deduplication.
+    :param batch_size: Batch size for similarity computation.
+    :param progress: Gradio progress tracker for feedback.
+    :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
+    """
     if embeddings_b is None:
         reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
         duplicate_to_original = {}
@@ -49,13 +58,27 @@ def deduplicate_embeddings(
     return duplicate_indices_in_b, duplicate_to_original

 def display_word_differences(x: str, y: str) -> str:
-    """
+    """
+    Display the word-level differences between two texts, formatted to avoid
+    misinterpretation of Markdown syntax.
+
+    :param x: First text.
+    :param y: Second text.
+    :return: A string showing word-level differences, wrapped in a code block.
+    """
     diff = ndiff(x.split(), y.split())
     formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
     return f"```\n{formatted_diff}\n```"

 def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
-    """
+    """
+    Load texts from a specified dataset and split.
+
+    :param dataset_name: Name of the dataset.
+    :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
+    :param text_column: Name of the text column.
+    :return: A list of texts from the dataset.
+    """
     ds = load_dataset(dataset_name, split=dataset_split)
     return [example[text_column] for example in ds]

@@ -70,7 +93,20 @@ def perform_deduplication(
     threshold: float = default_threshold,
     progress: gr.Progress = gr.Progress(track_tqdm=True)
 ):
-    """
+    """
+    Perform deduplication on one or two datasets based on the deduplication type.
+
+    :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
+    :param dataset1_name: Name of the first dataset.
+    :param dataset1_split: Split of the first dataset.
+    :param dataset1_text_column: Text column of the first dataset.
+    :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
+    :param dataset2_split: Optional, split of the second dataset.
+    :param dataset2_text_column: Optional, text column of the second dataset.
+    :param threshold: Similarity threshold for deduplication.
+    :param progress: Gradio progress tracker.
+    :return: Status updates and result text for the Gradio interface.
+    """
     try:
         threshold = float(threshold)

@@ -209,6 +245,8 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:

 demo.launch()

+
+
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np
@@ -232,16 +270,7 @@ demo.launch()
 #     batch_size: int = 1024,
 #     progress=None
 # ) -> tuple[np.ndarray, dict[int, int]]:
-#     """
-#     Deduplicate embeddings within one dataset or across two datasets.
-
-#     :param embeddings_a: Embeddings of Dataset 1.
-#     :param embeddings_b: Optional, embeddings of Dataset 2.
-#     :param threshold: Similarity threshold for deduplication.
-#     :param batch_size: Batch size for similarity computation.
-#     :param progress: Gradio progress tracker for feedback.
-#     :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
-#     """
+#     """Deduplicate embeddings within one dataset or across two datasets."""
 #     if embeddings_b is None:
 #         reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
 #         duplicate_to_original = {}
@@ -269,39 +298,13 @@ demo.launch()
 #     return duplicate_indices_in_b, duplicate_to_original

 # def display_word_differences(x: str, y: str) -> str:
-#     """
-#     Display the word-level differences between two texts, formatted to avoid
-#     misinterpretation of Markdown syntax.
-
-#     :param x: First text.
-#     :param y: Second text.
-#     :return: A string showing word-level differences, wrapped in a code block.
-#     """
+#     """Display word-level differences between two texts, avoiding Markdown issues."""
 #     diff = ndiff(x.split(), y.split())
-#     # Wrap differences in a code block to prevent interpretation as Markdown
 #     formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
 #     return f"```\n{formatted_diff}\n```"

-# # def display_word_differences(x: str, y: str) -> str:
-# #     """
-# #     Display the word-level differences between two texts.
-
-# #     :param x: First text.
-# #     :param y: Second text.
-# #     :return: A string showing word-level differences.
-# #     """
-# #     diff = ndiff(x.split(), y.split())
-# #     return " ".join(word for word in diff if word.startswith(("+", "-")))
-
 # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
-#     """
-#     Load texts from a specified dataset and split.
-
-#     :param dataset_name: Name of the dataset.
-#     :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
-#     :param text_column: Name of the text column.
-#     :return: A list of texts from the dataset.
-#     """
+#     """Load texts from a specified dataset and split."""
 #     ds = load_dataset(dataset_name, split=dataset_split)
 #     return [example[text_column] for example in ds]

@@ -316,20 +319,7 @@ demo.launch()
 #     threshold: float = default_threshold,
 #     progress: gr.Progress = gr.Progress(track_tqdm=True)
 # ):
-#     """
-#     Perform deduplication on one or two datasets based on the deduplication type.
-
-#     :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
-#     :param dataset1_name: Name of the first dataset.
-#     :param dataset1_split: Split of the first dataset.
-#     :param dataset1_text_column: Text column of the first dataset.
-#     :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
-#     :param dataset2_split: Optional, split of the second dataset.
-#     :param dataset2_text_column: Optional, text column of the second dataset.
-#     :param threshold: Similarity threshold for deduplication.
-#     :param progress: Gradio progress tracker.
-#     :return: Status updates and result text for the Gradio interface.
-#     """
+#     """Perform deduplication on one or two datasets."""
 #     try:
 #         threshold = float(threshold)

@@ -411,6 +401,7 @@ demo.launch()
 #             yield f"An error occurred: {e}", ""
 #             raise e

+# # Gradio app with stop button support
 # with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
 #     gr.Markdown("# Semantic Deduplication")
 #     gr.Markdown("""
@@ -440,7 +431,7 @@ demo.launch()
 #         dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")

 #     threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
-#     compute_button = gr.Button("
+#     compute_button = gr.Button("Deduplicate")
 #     status_output = gr.Markdown(elem_id="status_output")
 #     result_output = gr.Markdown()

@@ -464,5 +455,5 @@ demo.launch()
 #         outputs=[status_output, result_output],
 #     )

-# demo.launch()

+# demo.launch()
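
For orientation, the sketch below shows how the `deduplicate_embeddings` signature documented in this commit might be exercised. It is illustrative only: the random vectors, the 0.9 threshold, and the assumption that the function is available in scope (e.g. imported from this Space's `app.py`) are placeholders, not part of the change.

```python
# Minimal usage sketch for the signature documented above (illustrative only).
# Assumes deduplicate_embeddings from app.py is in scope; the random embeddings
# and the 0.9 threshold are placeholder values, not taken from the commit.
import numpy as np

rng = np.random.default_rng(0)
embeddings_a = rng.normal(size=(1_000, 256)).astype(np.float32)

# Single-dataset mode (embeddings_b is None): per the docstring, this returns the
# deduplicated indices and a mapping from removed indices to their originals.
kept_indices, duplicate_to_original = deduplicate_embeddings(
    embeddings_a,
    threshold=0.9,
    batch_size=1024,
)
print(f"Kept {len(kept_indices)} of {len(embeddings_a)} rows; "
      f"{len(duplicate_to_original)} flagged as duplicates.")
```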