#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import re
import tempfile

import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from readability import Document

# Replacement table for typographic characters. Note: '§' and '°' map to
# themselves here and are then dropped by the whitelist in
# clean_text_for_rag(); NBSP (\u00A0) and narrow NBSP (\u202F) fall through
# the table and are collapsed with the other whitespace at the end.
CHAR_MAP = {
    "’": "'", "‘": "'", "“": '"', "”": '"', "«": '"', "»": '"',
    "–": "-", "—": "-", "…": "...",
    "œ": "oe", "Œ": "OE", "æ": "ae", "Æ": "AE",
    "©": "(c)", "®": "(R)", "™": "TM",
    "§": "§", "°": "°", "±": "+/-", "×": "x", "÷": "/",
}


def clean_text_for_rag(text: str) -> str:
    # 1. Fold typographic characters to ASCII equivalents.
    text = re.sub(
        r"[’‘“”«»–—\u00A0\u202F…œŒæÆ©®™§°±×÷]",
        lambda m: CHAR_MAP.get(m.group(0), m.group(0)),
        text,
    )
    # 2. Whitelist: keep letters (including accented ones), digits and
    #    basic punctuation; drop everything else.
    text = re.sub(r'[^a-zA-ZÀ-ÿæ-œ0-9\s\.\,\:\;\!\?\-\_\'\"\\\(\)]', '', text)
    # 3. Collapse whitespace runs (including non-breaking spaces) to single spaces.
    return re.sub(r'\s+', ' ', text).strip()
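
# Illustration (not part of the original script) of what the cleaner does:
#
#   clean_text_for_rag("«\u00A0Déjà\u00A0»… cœur — 10\u202F€")
#   # -> '" Déjà "... coeur - 10'
#
# Curly quotes and dashes are folded to ASCII, the euro sign is dropped by
# the whitelist, and the non-breaking spaces collapse to plain spaces.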

def fetch_html(url: str) -> str:
    hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/124.0 Safari/537.36"}
    r = requests.get(url, headers=hdr, timeout=20)
    r.raise_for_status()
    # requests falls back to ISO-8859-1 when the server omits the charset;
    # in that case trust the detected encoding instead.
    if r.encoding == "ISO-8859-1":
        r.encoding = r.apparent_encoding
    return r.text


def extract_main(html: str) -> str:
    """
    Extract the main content of a web page, discarding the surrounding clutter.

    - Uses readability.Document to locate the article zone.
    - Removes navigation, comments, widgets, ads, and similar elements.
    - Keeps only semantically relevant tags (article, p, h1-h6, li, etc.).
    """
    doc = Document(html)
    # summary() returns the readability-extracted article HTML
    # (content() would return the whole raw body).
    article_html = doc.summary()
    soup = BeautifulSoup(article_html, "html.parser")

    # 🔹 Remove irrelevant elements (ads, menus, scripts, etc.)
    for tag in soup(["script", "style", "noscript", "footer", "header",
                     "nav", "form", "iframe", "aside", "button"]):
        tag.decompose()

    # 🔹 Remove divs or sections that are too small or carry no useful text
    for div in soup.find_all(["div", "section"]):
        text = div.get_text(strip=True)
        if len(text) < 80:  # adjustable threshold
            div.decompose()

    # 🔹 Keep only the relevant textual elements
    keep_tags = ["article", "p", "h1", "h2", "h3", "h4", "h5", "h6",
                 "blockquote", "ul", "ol", "li", "pre", "code"]
    clean_parts = []
    for tag in soup.find_all(keep_tags):
        # Skip elements nested inside another kept element, otherwise their
        # text would be emitted twice (e.g. each li inside a kept ul).
        if tag.find_parent(keep_tags):
            continue
        text = tag.get_text(" ", strip=True)
        # Filter out common French boilerplate labels
        # (Share / Follow / Comments / See also / Advertisement).
        if text and not re.match(r"^(Partager|Suivre|Commentaires|Lire aussi|Publicité)",
                                 text, re.I):
            clean_parts.append(text)

    return "\n\n".join(clean_parts).strip()


def to_markdown(text: str) -> str:
    # extract_main() returns plain text; markdownify parses it as (tag-free)
    # HTML and mostly passes it through before the RAG cleanup.
    md_raw = md(text, heading_style="ATX")
    return clean_text_for_rag(md_raw)


def process(url: str, out_name: str) -> str:
    html = fetch_html(url)
    main_text = extract_main(html)  # only the relevant content
    markdown = to_markdown(main_text)

    out_name = out_name.strip()
    if not out_name.lower().endswith(".md"):
        out_name += ".md"

    tmp_dir = tempfile.mkdtemp()
    out_path = os.path.join(tmp_dir, out_name)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    return out_path
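
# Illustrative usage (not in the original script): the pipeline also runs
# without the Gradio UI, e.g. for batch conversion. The URL is a placeholder.
#
#   out_path = process("https://example.com/article", "article.md")
#   with open(out_path, encoding="utf-8") as f:
#       print(f.read()[:500])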
", heading_style="ATX") return clean_text_for_rag(md_raw) def process(url: str, out_name: str) -> str: html = fetch_html(url) main_text = extract_main(html) # **seul le contenu pertinent** markdown = to_markdown(main_text) out_name = out_name.strip() if not out_name.lower().endswith(".md"): out_name += ".md" tmp_dir = tempfile.mkdtemp() out_path = os.path.join(tmp_dir, out_name) with open(out_path, "w", encoding="utf-8") as f: f.write(markdown) return out_path with gr.Blocks(title="Web → Markdown") as demo: gr.Markdown("# 🌐 Web Scraping — du web vers Markdown") with gr.Row(): with gr.Column(): url_in = gr.Textbox(label="URL à scraper") out_name = gr.Textbox(label="Nom du fichier (.md)", value="output.md") btn = gr.Button("🛠️ Générer", variant="primary") with gr.Column(): file_out = gr.File(label="Fichier Markdown") btn.click(fn=process, inputs=[url_in, out_name], outputs=file_out) gr.Markdown( """ --- **Opérations effectuées :** - Extraction ciblée du contenu utile (titre, texte, articles, etc.) - Suppression automatique des menus, pubs et scripts indésirables - Mise en forme fluide en Markdown lisible et propre - Téléchargement immédiat du résultat """ ) if __name__ == "__main__": demo.launch(share=True)