Spaces:
Runtime error
Runtime error
| import json | |
| import os | |
| import random | |
| import string | |
| import gradio as gr | |
| import huggingface_hub | |
| from datasets import load_dataset | |
| from evaluate import load | |
| from guidelines import guidelines | |
# Local JSONL file that holds the collected human summaries (one JSON object
# per line, with the human summary stored under the "summary2" key). The
# AnnotationManager reads it at start-up and appends to it on every save.
human2_annotation_file = "test.jsonl"
def clean_text(text: str) -> str:
    """Return a normalised copy of *text*.

    ASCII punctuation (``string.punctuation``) is removed, every run of
    whitespace -- newlines included -- collapses to a single space, leading
    and trailing whitespace is dropped, and the result is lower-cased.
    """
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    without_punctuation = text.translate(punctuation_stripper)
    # split() with no argument breaks on any whitespace run (including "\n")
    # and discards leading/trailing whitespace, so one join does it all.
    collapsed = " ".join(without_punctuation.split())
    return collapsed.lower()
def html_progress_bar(completed_steps, total_steps):
    """Return a standalone HTML document that draws a horizontal progress bar.

    Args:
        completed_steps: Number of steps already done.
        total_steps: Total number of steps. When it is zero or negative the
            bar is rendered empty instead of raising ``ZeroDivisionError``.

    Returns:
        An HTML string whose bar width and label show the completion
        percentage, clamped to the 0-100 range.
    """
    if total_steps > 0:
        percentage = (completed_steps / total_steps) * 100
        # Keep the CSS width valid even if the caller passes odd counts.
        percentage = max(0.0, min(100.0, percentage))
    else:
        percentage = 0.0

    return f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Progress Bar</title>
    <style>
        .progress-container {{
            width: 100%;
            background-color: #ffffff;
        }}
        .progress-bar {{
            width: {percentage}%;
            height: 30px;
            background-color: #d1fae5;
            text-align: center;
            line-height: 30px;
            color: white;
        }}
    </style>
    </head>
    <body>
    <div class="progress-container">
        <div class="progress-bar">{percentage:.0f}%</div>
    </div>
    </body>
    </html>
    """
class AnnotationManager:
    """Serve dataset examples one at a time and persist the human summaries.

    The test split of ``Iker/NoticIA`` is loaded into memory. Summaries that
    were already collected are downloaded from the
    ``Iker/NoticIA_Human_Validation`` dataset repository so that annotation
    resumes after the last saved example. Every saved summary is appended to
    the local JSONL file and the whole file is uploaded back to the Hub.
    """

    # Hub dataset repository that stores the collected annotations.
    _ANNOTATION_REPO = "Iker/NoticIA_Human_Validation"
    # Placeholder texts shown in the UI when there is no article to display.
    _FINISHED_MESSAGE = "🎉 Anotación Finalizada 🎉"
    _START_MESSAGE = "Pulsa ▶️"

    # Class-level defaults so the state flags always exist.
    current = None  # example currently shown to the annotator, if any
    finished = False  # True once get_next() has run out of examples

    def __init__(self):
        """Load the dataset and skip the examples that are already annotated."""
        self.dataset = list(
            load_dataset("Iker/NoticIA", token=self._hub_token(), split="test")
        )
        self.total = len(self.dataset)
        self.predictions = []
        self.references = []
        self.current = None
        self.finished = False
        print(f"Total examples: {self.total}")

        try:
            annotations = self._download_annotations()
            # Build the lists before touching any attribute: a malformed
            # record must not leave the manager in a half-restored state.
            predictions = [clean_text(a["summary2"]) for a in annotations]
            references = [[clean_text(a["summary"])] for a in annotations]
        except Exception as error:
            print("Unable to download annotations. Starting from the beginning.")
            print(f"Reason: {error!r}")
        else:
            self.predictions = predictions
            self.references = references
            self.dataset = self.dataset[len(annotations) :]

    @staticmethod
    def _hub_token():
        """Return the ``TOKEN`` environment variable, or ``True`` when unset."""
        return os.environ.get("TOKEN") or True

    def _download_annotations(self):
        """Fetch a fresh copy of the annotations file and parse it."""
        # Drop any stale local copy first so the downloaded file is used.
        if os.path.exists(human2_annotation_file):
            os.remove(human2_annotation_file)
        huggingface_hub.hf_hub_download(
            repo_id=self._ANNOTATION_REPO,
            repo_type="dataset",
            token=self._hub_token(),
            filename="test.jsonl",
            local_dir=os.getcwd(),
        )
        # The file is written as utf8 below, so read it back the same way.
        with open(human2_annotation_file, "r", encoding="utf8") as f:
            return [json.loads(line) for line in f if line.strip()]

    @staticmethod
    def _append_example(example):
        """Append one annotated example to the local JSONL file."""
        parent = os.path.dirname(human2_annotation_file)
        # dirname() is "" for a bare filename and os.makedirs("") raises,
        # so only create the directory when there actually is one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        # Append mode creates the file when it does not exist yet.
        with open(human2_annotation_file, "a", encoding="utf8") as f:
            print(json.dumps(example, ensure_ascii=False), file=f)

    def get_next(self):
        """Advance to the next example.

        Returns:
            ``(headline, text)`` of the new current example, or the
            "finished" message twice when no examples are left.
        """
        if not self.dataset:
            # Forget the last example so it cannot be saved a second time.
            self.current = None
            self.finished = True
            return self._FINISHED_MESSAGE, self._FINISHED_MESSAGE
        self.current = self.dataset.pop(0)
        return self.current["web_headline"], self.current["web_text"]

    def save_annotation(self, annotation):
        """Store *annotation* for the current example and move to the next one.

        Nothing is stored when *annotation* is empty or whitespace-only, or
        when there is no current example (annotation not started yet, or
        already finished).

        Returns:
            ``(headline, text, rouge, progress_html, "")`` -- the values for
            the headline, article, ROUGE, progress and (cleared) summary
            widgets.
        """
        has_text = bool(annotation and annotation.strip())

        if has_text and self.current is not None:
            example = {
                "web_url": self.current["web_url"],
                "web_headline": self.current["web_headline"],
                "summary": self.current["summary"],
                "summary2": annotation,
                "web_text": self.current["web_text"],
                "clean_web_text": self.current["clean_web_text"],
            }
            self._append_example(example)
            self.predictions.append(clean_text(annotation))
            self.references.append([clean_text(example["summary"])])
            # The complete file is uploaded after every saved annotation.
            huggingface_hub.upload_file(
                repo_id=self._ANNOTATION_REPO,
                repo_type="dataset",
                token=self._hub_token(),
                path_in_repo="test.jsonl",
                path_or_fileobj=human2_annotation_file,
            )
            headline, text = self.get_next()
        elif self.current is not None:
            # Nothing to save: keep showing the same example.
            headline, text = self.current["web_headline"], self.current["web_text"]
        elif self.finished:
            headline = text = self._FINISHED_MESSAGE
        else:
            # Annotation has not been started yet.
            return (
                self._START_MESSAGE,
                self._START_MESSAGE,
                self._START_MESSAGE,
                self.progress(),
                "",
            )

        return headline, text, self.get_rouge(), self.progress(), ""

    def get_rouge(self):
        """Return the aggregated ROUGE-1 score of the annotations so far.

        Returns:
            The ``rouge1`` value computed by the ``rouge`` metric, or the
            string ``"N/A"`` when there are no annotations yet or the metric
            cannot be computed.
        """
        if not self.predictions:
            return "N/A"
        try:
            # A random experiment id is passed to every metric load.
            experiment_id = "".join(
                random.choice(string.ascii_uppercase + string.digits) for _ in range(6)
            )
            rouge = load("rouge", experiment_id=experiment_id)
            return rouge.compute(
                predictions=self.predictions,
                references=self.references,
                use_aggregator=True,
                rouge_types=["rouge1"],
            )["rouge1"]
        except Exception:
            return "N/A"

    def _completed_steps(self):
        """Return how many examples have actually been annotated."""
        # The example on screen has been popped from the queue but is not
        # done yet, so it must not count as completed.
        in_progress = 1 if self.current is not None else 0
        return self.total - len(self.dataset) - in_progress

    def progress(self):
        """Return the HTML progress bar for completed vs. total examples."""
        return html_progress_bar(self._completed_steps(), self.total)

    def gr_start(self):
        """Reload all state by re-running the constructor and show the first example.

        Returns:
            ``(headline, text, rouge, progress_html, "")``, the same layout
            as :meth:`save_annotation`.
        """
        self.__init__()
        headline, text = self.get_next()
        return headline, text, self.get_rouge(), self.progress(), ""
# Visual theme of the app: the Soft base theme with emerald/red hues, small
# text and spacing, and custom block background colours for light and dark mode.
theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="red",
    text_size="sm",
    spacing_size="sm",
    # Poppins once, followed by generic families as fallbacks; repeating the
    # same web font several times adds nothing to the font stack.
    font=[
        gr.themes.GoogleFont("Poppins"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
).set(block_background_fill="*neutral_50", block_background_fill_dark="*neutral_950")
# Single shared manager: every browser session works on the same queue of
# examples and the same annotations file.
manager = AnnotationManager()

with gr.Blocks(
    theme=theme, title="🖱️ Resumen de noticias Clickbait 🖱️", analytics_enabled=False,
) as demo:
    with gr.Tab("Guidelines") as tab_guidelines:
        gr.Markdown(guidelines)

    with gr.Tab("Anotación") as tab_annotation:
        gr_play = gr.Button("▶️ Empieza a anotar")
        gr_progress = gr.HTML(value=manager.progress(), label="Progreso")
        gr_rouge = gr.Textbox(
            value="Pulsa ▶️",
            label="Rouge-1",
            info="Rouge Score actual entre las anotaciones y los resúmenes de referencia.",
            lines=1,
            interactive=False,
        )
        gr_headline = gr.Textbox(
            value="Pulsa ▶️",
            label="Titular",
            info="El titular del artículo.",
            lines=2,
            interactive=False,
        )
        gr_body = gr.Textbox(
            value="Pulsa ▶️",
            label="Artículo",
            info="El cuerpo del artículo/noticia.",
            lines=10,
            interactive=False,
        )
        gr_summary = gr.Textbox(
            value="",
            label="Resumen",
            info="Escribe aquí el resumen del artículo. Recuerda leer las guidelines antes de empezar.",
            lines=2,
            interactive=True,
        )
        save = gr.Button(
            "💾 Guardar",
        )

        # Both handlers mutate the shared manager (current example, local
        # file, score lists). They share one concurrency group limited to a
        # single running call, so a double click or two sessions cannot save
        # the same example twice or interleave with a restart.
        save.click(
            fn=manager.save_annotation,
            inputs=[gr_summary],
            outputs=[gr_headline, gr_body, gr_rouge, gr_progress, gr_summary],
            concurrency_limit=1,
            concurrency_id="annotation",
        )
        gr_play.click(
            fn=manager.gr_start,
            inputs=None,
            outputs=[gr_headline, gr_body, gr_rouge, gr_progress, gr_summary],
            concurrency_limit=1,
            concurrency_id="annotation",
        )

demo.queue(default_concurrency_limit=None)

# The same secret is used as both user name and password of the login form.
password = os.environ.get("pass")
if not password:
    print(
        'WARNING: environment variable "pass" is not set; '
        "the login credentials passed to Gradio are empty."
    )
demo.launch(auth=(password, password))