Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| import os | |
| import re | |
| import tempfile | |
| from pathlib import Path | |
| import pdfplumber | |
| import gradio as gr | |
# Typographic characters and their plain-text replacements. Built once at
# import time and applied in a single pass with ``str.translate``.
_TYPOGRAPHIC_TABLE = str.maketrans({
    "’": "'", "‘": "'", "“": '"', "”": '"',
    "«": '"', "»": '"', "–": "-", "—": "-",
    "\u00A0": " ", "\u202F": " ",
    "…": "...", "œ": "oe", "Œ": "OE",
    "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
    "™": "TM", "±": "+/-", "×": "x", "÷": "/",
})

# Anything NOT in this class is deleted. ``+``, ``/``, ``§`` and ``°`` are
# part of the whitelist so that the output of the normalisation step above
# ("+/-", "/") and the symbols it deliberately leaves alone survive the
# filter; without them "±5" would be turned into "-5".
_DISALLOWED_CHARS_RE = re.compile(
    r"""[^a-zA-ZÀ-ÿæ-œ0-9\s.,:;!?\-_'"\\()+/§°]"""
)

_WHITESPACE_RUN_RE = re.compile(r"\s+")


def clean_text_for_rag(text: str) -> str:
    """Normalise and clean ``text`` for use in a RAG pipeline.

    Steps, in order:

    1. Replace typographic characters (curly quotes, guillemets, dashes,
       ellipsis, ligatures, non-breaking spaces, a few symbols) with plain
       equivalents.
    2. Delete every character that is not a letter (accented letters
       included), a digit, whitespace, or one of
       ``. , : ; ! ? - _ ' " \\ ( ) + / § °``.
    3. Collapse each run of whitespace (newlines included) into a single
       space and strip both ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text on a single line; an empty string if nothing
        survives the filter.
    """
    normalised = text.translate(_TYPOGRAPHIC_TABLE)
    filtered = _DISALLOWED_CHARS_RE.sub("", normalised)
    return _WHITESPACE_RUN_RE.sub(" ", filtered).strip()
def extract_and_clean_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF and return it cleaned.

    Every page is passed through ``extract_text()``; pages that yield no
    text are skipped. The remaining page texts are joined with a single
    space and handed to ``clean_text_for_rag``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned text of the whole document.
    """
    print(f"[+] Extraction du PDF : {pdf_path}")
    with pdfplumber.open(pdf_path) as document:
        extracted = [page.extract_text() for page in document.pages]
    # Drop pages for which nothing was extracted (None or empty string).
    page_texts = [content for content in extracted if content]
    joined = " ".join(page_texts)
    return clean_text_for_rag(joined)
def _read_text_lines(txt_path: str) -> list:
    """Return the lines of ``txt_path``, decoding as UTF-8 when possible."""
    try:
        with open(txt_path, "r", encoding="utf-8") as handle:
            return handle.readlines()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to Windows-1252 instead of crashing.
        # Bytes undefined in that code page become U+FFFD.
        with open(txt_path, "r", encoding="cp1252", errors="replace") as handle:
            return handle.readlines()


def extract_and_clean_txt(txt_path: str) -> str:
    """Read a text file (txt, md, ...) and clean it line by line.

    Each line is cleaned independently with ``clean_text_for_rag``. Lines
    that are empty, or that become empty once cleaned, are dropped, so the
    result never contains blank lines.

    Args:
        txt_path: Path of the text file to read. UTF-8 is tried first, with
            a Windows-1252 fallback for files that are not valid UTF-8.

    Returns:
        The cleaned lines joined with ``"\\n"``.
    """
    print(f"[+] Lecture du fichier texte : {txt_path}")
    cleaned_lines = []
    for raw_line in _read_text_lines(txt_path):
        # clean_text_for_rag already strips, so filter on its result: this
        # also discards lines made only of characters the cleaner removes.
        cleaned = clean_text_for_rag(raw_line)
        if cleaned:
            cleaned_lines.append(cleaned)
    return "\n".join(cleaned_lines)
def _safe_output_name(output_name: str) -> str:
    """Reduce a user-supplied name to a bare file name ending in ``.md``."""
    # Treat both "/" and "\\" as separators and keep only the last path
    # component, so names such as "../../x" or "/abs/x" cannot make the
    # output land outside the temporary directory.
    name = os.path.basename((output_name or "").strip().replace("\\", "/"))
    # Nothing usable left (empty, ".", ".."): fall back to a default name.
    if not name.strip(". "):
        name = "output.md"
    if not name.lower().endswith(".md"):
        name = f"{name}.md"
    return name


def process_file(input_file: gr.File, output_name: str) -> str:
    """Clean an uploaded file and write the result to a temporary ``.md`` file.

    - Detects the type from the extension (``.pdf`` vs. anything else,
      which is read as text).
    - Runs extraction + cleaning.
    - Writes the cleaned text into a fresh temporary directory under the
      requested file name (directory parts are discarded and ``.md`` is
      appended when missing).

    Args:
        input_file: Uploaded file as handed over by the ``gr.File`` input.
            Either an object exposing the path as ``.name`` or the path
            itself is accepted (presumably this depends on the Gradio
            version/configuration - verify against the one deployed).
        output_name: Desired name of the output file.

    Returns:
        Path of the written file, which the output component offers for
        download.

    Raises:
        gr.Error: If no file was provided.
    """
    if input_file is None:
        raise gr.Error("Aucun fichier fourni.")

    input_path = getattr(input_file, "name", input_file)
    _, ext = os.path.splitext(input_path.lower())
    if ext == ".pdf":
        cleaned_text = extract_and_clean_pdf(input_path)
    else:
        cleaned_text = extract_and_clean_txt(input_path)

    temp_dir = tempfile.mkdtemp()
    out_path = os.path.join(temp_dir, _safe_output_name(output_name))
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text)
    return out_path
# Gradio UI: upload on the left, cleaned file to download on the right.
with gr.Blocks(title="Nettoyage de texte pour RAG") as demo:
    gr.Markdown("# 📄 Nettoyage de texte pour RAG")
    gr.Markdown(
        "Déposez un fichier, le contenu textuel sera extrait, nettoyé "
        "et vous pourrez le télécharger sous **le nom que vous choisissez**."
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(
                label="Déposez votre fichier ici",
                # Extensions need a leading dot; the bare "file" entry is
                # kept so that any other file can still be uploaded.
                file_types=[".pdf", ".txt", ".md", "file"],
            )
            output_name = gr.Textbox(
                value="output.md",
                label="Nom du fichier de sortie (en .md)",
                placeholder="exemple.md",
                interactive=True,
            )
            submit_btn = gr.Button("Traiter le fichier", variant="primary")
        with gr.Column(scale=1):
            output_file = gr.File(
                label="Fichier nettoyé (.md)",
                file_types=[".md"],
            )
    # Run the cleaning pipeline when the button is clicked.
    submit_btn.click(
        fn=process_file,
        inputs=[input_file, output_name],
        outputs=output_file,
    )
    gr.Markdown(
        """
---
**Nettoyage effectué :**
- Suppression des symboles non imprimables / caractères parasites
- Conservation : lettres (avec accents), chiffres, espaces, ponctuation simple
- Normalisation des espaces
- Sortie toujours au format **`.md`**
"""
    )

if __name__ == "__main__":
    demo.launch(share=True)