#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import re
import tempfile

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from readability import Document  # provided by the readability-lxml package
import gradio as gr

def clean_text_for_rag(text: str) -> str:
    """Normalize typographic characters and strip stray symbols so the
    text stays clean for downstream RAG ingestion."""
    # 🔹 Map curly quotes, dashes, ligatures and common symbols to ASCII;
    # anything matched but absent from the dict passes through unchanged.
    text = re.sub(
        r"[’‘“”«»–—\u00A0\u202F…œŒæÆ©®™±×÷]",
        lambda m: {
            "’": "'", "‘": "'", "“": '"', "”": '"',
            "«": '"', "»": '"', "–": "-", "—": "-",
            "…": "...", "œ": "oe", "Œ": "OE",
            "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
            "™": "TM", "±": "+/-", "×": "x", "÷": "/",
            "\u00A0": " ", "\u202F": " ",  # non-breaking spaces -> plain space
        }.get(m.group(0), m.group(0)),
        text,
    )
    # 🔹 Drop anything outside letters, digits and basic punctuation, while
    # keeping the Markdown markers (#, *, >, `, [, ], /) produced upstream.
    text = re.sub(r'[^a-zA-ZÀ-ÿ0-9\s\.\,\:\;\!\?\-\_\'\"\\\(\)#*>`\[\]/]', '', text)
    # 🔹 Collapse runs of spaces/tabs but preserve newlines: flattening all
    # whitespace would destroy the Markdown structure.
    text = re.sub(r'[ \t]+', ' ', text)
    return re.sub(r'\n{3,}', '\n\n', text).strip()
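
# Illustrative behaviour of clean_text_for_rag (indicative values):
#
#   clean_text_for_rag('Voici  une “citation”…  avec  l’œuvre')
#   # -> 'Voici une "citation"... avec l\'oeuvre'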

def fetch_html(url: str) -> str:
    """Download a page with a browser-like User-Agent and return its HTML."""
    hdr = {"User-Agent":
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
           "AppleWebKit/537.36 (KHTML, like Gecko) "
           "Chrome/124.0 Safari/537.36"}
    r = requests.get(url, headers=hdr, timeout=20)
    r.raise_for_status()
    # requests falls back to ISO-8859-1 when the server sends no charset;
    # in that case, sniff the real encoding from the body instead.
    if r.encoding == "ISO-8859-1":
        r.encoding = r.apparent_encoding
    return r.text

def extract_main(html: str) -> str:
    """
    Extract the main content of a web page, removing the surrounding clutter.
    - Uses readability.Document to locate the article zone.
    - Strips navigation, comments, widgets, ads, etc.
    - Keeps only semantically relevant tags (article, p, h1-6, li, etc.),
      returned as an HTML fragment so headings and lists survive the
      Markdown conversion.
    """
    doc = Document(html)
    article_html = doc.summary()  # the cleaned article zone, not the full body
    soup = BeautifulSoup(article_html, "html.parser")

    # 🔹 Remove irrelevant elements (ads, menus, scripts, etc.)
    for tag in soup(["script", "style", "noscript", "footer", "header", "nav", "form", "iframe", "aside", "button"]):
        tag.decompose()

    # 🔹 Remove divs or sections that are too small or hold no useful text
    for div in soup.find_all(["div", "section"]):
        text = div.get_text(strip=True)
        if len(text) < 80:  # adjustable threshold
            div.decompose()

    # 🔹 Keep only the relevant textual elements
    keep_tags = ["article", "p", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "ul", "ol", "li", "pre", "code"]
    clean_parts = []

    for tag in soup.find_all(keep_tags):
        # Skip tags nested inside another kept tag (e.g. <li> in <ul>),
        # otherwise the same text would be collected twice.
        if tag.find_parent(keep_tags):
            continue
        text = tag.get_text(" ", strip=True)
        # Filter out common French share/comment boilerplate lines.
        if text and not re.match(r"^(Partager|Suivre|Commentaires|Lire aussi|Publicité)", text, re.I):
            clean_parts.append(str(tag))  # keep the markup for markdownify

    return "\n".join(clean_parts).strip()
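
# Illustrative only (hypothetical URL): readability is tuned for full-sized
# pages, so tiny hand-made snippets may come back empty or wrapped differently.
#
#   page = fetch_html("https://example.com")
#   extract_main(page)
#   # -> e.g. '<h1>Example Domain</h1>\n<p>This domain is for use in ...</p>'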

def to_markdown(html_fragment: str) -> str:
    # Convert the extracted HTML fragment to ATX-style Markdown,
    # then normalize it for RAG use.
    md_raw = md(f"<div>{html_fragment}</div>", heading_style="ATX")
    return clean_text_for_rag(md_raw)
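
# markdownify's default heading style is underlined (Setext); passing
# heading_style="ATX" yields "#"-prefixed headings instead, roughly:
#
#   md("<h2>Section</h2><p>Texte</p>", heading_style="ATX")
#   # -> '## Section\n\nTexte\n\n'  (exact surrounding whitespace may vary)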

def process(url: str, out_name: str) -> str:
    html      = fetch_html(url)
    main_html = extract_main(html)          # **only the relevant content**
    markdown  = to_markdown(main_html)

    # Normalize the file name and write it to a fresh temp directory
    # so Gradio can serve it for download.
    out_name = out_name.strip()
    if not out_name.lower().endswith(".md"):
        out_name += ".md"
    tmp_dir = tempfile.mkdtemp()
    out_path = os.path.join(tmp_dir, out_name)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    return out_path
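
# Quick smoke test without the UI (hypothetical URL):
#
#   path = process("https://example.com", "example")
#   print(open(path, encoding="utf-8").read()[:300])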

with gr.Blocks(title="Web → Markdown") as demo:
    gr.Markdown("# 🌐 Web Scraping — du web vers Markdown")
    with gr.Row():
        with gr.Column():
            url_in   = gr.Textbox(label="URL to scrape")
            out_name = gr.Textbox(label="Output file name (.md)", value="output.md")
            btn      = gr.Button("🛠️ Generate", variant="primary")
        with gr.Column():
            file_out = gr.File(label="Markdown file")
    btn.click(fn=process, inputs=[url_in, out_name], outputs=file_out)

    gr.Markdown(
        """
        ---
        **Operations performed:**
        - Targeted extraction of the useful content (title, text, articles, etc.)
        - Automatic removal of menus, ads and unwanted scripts
        - Smooth formatting into clean, readable Markdown
        - Immediate download of the result
        """
    )

if __name__ == "__main__":
    demo.launch(share=True)