#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import tempfile

import requests
import gradio as gr
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from readability import Document

def clean_text_for_rag(text: str) -> str:
    """Transliterate typographic characters to ASCII and strip noise before RAG ingestion."""
    # Map curly quotes, dashes, ellipses, ligatures and common symbols to
    # ASCII equivalents.
    text = re.sub(
        r"[’‘“”«»–—…œŒæÆ©®™±×÷]",
        lambda m: {
            "’": "'", "‘": "'", "“": '"', "”": '"',
            "«": '"', "»": '"', "–": "-", "—": "-",
            "…": "...", "œ": "oe", "Œ": "OE",
            "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
            "™": "TM", "±": "+/-", "×": "x", "÷": "/",
        }[m.group(0)],
        text,
    )
    # Drop everything outside letters (incl. Latin-1 accents), digits and
    # basic punctuation.
    text = re.sub(r'[^a-zA-ZÀ-ÿ0-9\s.,:;!?\-_\'"\\()]', '', text)
    # Collapse whitespace runs; in Python 3, \s also matches NBSP (\u00A0)
    # and narrow NBSP (\u202F).
    return re.sub(r'\s+', ' ', text).strip()
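
# Illustrative check of the transliteration above (made-up sample input,
# result verified by hand):
#   clean_text_for_rag("L'œuvre «complète»… – 100 %")
#   -> 'L\'oeuvre "complète"... - 100'   ('%' falls outside the allowed set)
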
def fetch_html(url: str) -> str:
    """Fetch a page with a browser-like User-Agent and return its decoded HTML."""
    hdr = {"User-Agent":
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
           "AppleWebKit/537.36 (KHTML, like Gecko) "
           "Chrome/124.0 Safari/537.36"}
    r = requests.get(url, headers=hdr, timeout=20)
    r.raise_for_status()
    # requests falls back to ISO-8859-1 when the server sends no charset;
    # prefer the encoding sniffed from the body in that case.
    if r.encoding == "ISO-8859-1":
        r.encoding = r.apparent_encoding
    return r.text
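
# Usage sketch (network access assumed; the URL is a placeholder):
#   html = fetch_html("https://example.com")
# raise_for_status() converts 4xx/5xx responses into requests.HTTPError,
# so callers only ever receive successfully decoded HTML.
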
def extract_main(html: str) -> str:
    """
    Extract the main content of a web page, discarding surrounding clutter.
    - Uses readability.Document to locate the article zone.
    - Removes navigation, comments, widgets, ads, etc.
    - Keeps only semantically relevant tags (article, p, h1-h6, li, ...).
    """
    doc = Document(html)
    article_html = doc.summary()  # cleaned article HTML, not the raw body
    soup = BeautifulSoup(article_html, "html.parser")
    # Remove irrelevant elements (ads, menus, scripts, ...).
    for tag in soup(["script", "style", "noscript", "footer", "header", "nav", "form", "iframe", "aside", "button"]):
        tag.decompose()
    # Remove divs/sections that are too small to carry useful text.
    for div in soup.find_all(["div", "section"]):
        text = div.get_text(strip=True)
        if len(text) < 80:  # adjustable threshold
            div.decompose()
    # Keep only the relevant text-bearing elements.
    keep_tags = ["article", "p", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "ul", "ol", "li", "pre", "code"]
    clean_parts = []
    for tag in soup.find_all(keep_tags):
        # Skip nested matches (e.g. an <li> inside an already-kept <ul>)
        # so the same text is not emitted twice.
        if tag.find_parent(keep_tags):
            continue
        text = tag.get_text(" ", strip=True)
        # Filter common French boilerplate labels ("Share", "Follow",
        # "Comments", "Read also", "Advertisement").
        if text and not re.match(r"^(Partager|Suivre|Commentaires|Lire aussi|Publicité)", text, re.I):
            clean_parts.append(text)
    return "\n\n".join(clean_parts).strip()
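
# Filtering sketch (synthetic HTML, illustrative only): given a fragment like
#   <nav>Menu</nav><div>ok</div><article><p>...a long paragraph...</p></article>
# the <nav> is decomposed outright, the 2-character <div> falls under the
# 80-character threshold, and only the <p> text reaches clean_parts.
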
def to_markdown(text: str) -> str:
    # extract_main() returns plain text, so markdownify mostly passes it
    # through; heading_style="ATX" only matters if HTML fragments survive.
    md_raw = md(f"<div>{text}</div>", heading_style="ATX")
    return clean_text_for_rag(md_raw)
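
# Illustrative round trip (hand-checked): plain text passes through
# markdownify unchanged and is then normalised by clean_text_for_rag:
#   to_markdown("Bonjour – «monde»")  ->  'Bonjour - "monde"'
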
def process(url: str, out_name: str) -> str:
    html = fetch_html(url)
    main_text = extract_main(html)  # only the relevant content
    markdown = to_markdown(main_text)
    out_name = out_name.strip()
    if not out_name.lower().endswith(".md"):
        out_name += ".md"
    # Write into a fresh temporary directory so Gradio can serve the file.
    tmp_dir = tempfile.mkdtemp()
    out_path = os.path.join(tmp_dir, out_name)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    return out_path
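
# Usage sketch (URL and path are placeholders): process() returns the path
# of the generated file inside a fresh temporary directory, e.g.
#   process("https://example.com", "page")  ->  "/tmp/tmpa1b2c3d4/page.md"
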
with gr.Blocks(title="Web → Markdown") as demo:
    gr.Markdown("# 🌐 Web-to-Markdown scraper")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="URL to scrape")
            out_name = gr.Textbox(label="Output file name (.md)", value="output.md")
            btn = gr.Button("🛠️ Generate", variant="primary")
        with gr.Column():
            file_out = gr.File(label="Markdown file")
    btn.click(fn=process, inputs=[url_in, out_name], outputs=file_out)

if __name__ == "__main__":
    demo.launch(share=True)
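
# Headless smoke test (a sketch: assumes this file is saved as app.py and
# that the machine has network access):
#   python -c "from app import process; print(process('https://example.com', 'demo'))"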