lsottani committed on
Commit 6997817 · verified · 1 Parent(s): e934a66

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +2 -8
  2. requirements.txt +2 -0
  3. webscraping_cleaning.py +107 -0
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
 title: WebDetox
-emoji: 👀
-colorFrom: blue
-colorTo: red
+app_file: webscraping_cleaning.py
 sdk: gradio
-sdk_version: 5.49.0
-app_file: app.py
-pinned: false
+sdk_version: 4.44.1
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
requirements.txt ADDED
@@ -0,0 +1,2 @@
+markdownify
+readability-lxml
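
Note that the PyPI distribution names above differ from the import names used in the script: `markdownify` provides the `markdownify` module, while `readability-lxml` provides the `readability` module. A quick sanity-check sketch, assuming `beautifulsoup4`, `requests`, and `gradio` already ship with the Space's base image:

```python
# Sanity check (a sketch): verify every module the script imports is available.
# beautifulsoup4, requests, and gradio are assumed preinstalled in the Space image.
import importlib

for module in ("markdownify", "readability", "bs4", "requests", "gradio"):
    importlib.import_module(module)  # raises ImportError if a dependency is missing
print("All dependencies importable.")
```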
webscraping_cleaning.py ADDED
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import re, tempfile, requests, os
+from bs4 import BeautifulSoup
+from markdownify import markdownify as md
+from readability import Document
+import gradio as gr
+
+def clean_text_for_rag(text: str) -> str:
+    # Map typographic punctuation and symbols to ASCII-friendly equivalents.
+    text = re.sub(
+        r"[’‘“”«»–—\u00A0\u202F…œŒæÆ©®™§°±×÷]",
+        lambda m: {
+            "’": "'", "‘": "'", "“": '"', "”": '"',
+            "«": '"', "»": '"', "–": "-", "—": "-",
+            "…": "...", "œ": "oe", "Œ": "OE",
+            "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
+            "™": "TM", "§": "§", "°": "°", "±": "+/-",
+            "×": "x", "÷": "/"
+        }.get(m.group(0), m.group(0)),
+        text,
+    )
+    # Drop any remaining disallowed characters, then collapse whitespace.
+    text = re.sub(r'[^a-zA-ZÀ-ÿæ-œ0-9\s\.\,\:\;\!\?\-\_\'\"\\\(\)]', '', text)
+    return re.sub(r'\s+', ' ', text).strip()
+
+def fetch_html(url: str) -> str:
+    hdr = {"User-Agent":
+           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+           "AppleWebKit/537.36 (KHTML, like Gecko) "
+           "Chrome/124.0 Safari/537.36"}
+    r = requests.get(url, headers=hdr, timeout=20)
+    r.raise_for_status()
+    # requests falls back to ISO-8859-1 when no charset is declared; use the sniffed encoding instead.
+    if r.encoding == "ISO-8859-1":
+        r.encoding = r.apparent_encoding
+    return r.text
+
+def extract_main(html: str) -> str:
+    """
+    Extract the main content of a web page, discarding the noise around it.
+    - Uses readability.Document to locate the article area.
+    - Removes navigation, comments, widgets, ads, etc.
+    - Keeps only semantically relevant tags (p, h1-h6, li, etc.).
+    """
+    doc = Document(html)
+    article_html = doc.content()
+    soup = BeautifulSoup(article_html, "html.parser")
+
+    # 🔹 Remove irrelevant elements (ads, menus, scripts, etc.)
+    for tag in soup(["script", "style", "noscript", "footer", "header", "nav", "form", "iframe", "aside", "button"]):
+        tag.decompose()
+
+    # 🔹 Remove divs or sections that are too small or carry no useful text
+    for div in soup.find_all(["div", "section"]):
+        text = div.get_text(strip=True)
+        if len(text) < 80:  # adjustable threshold
+            div.decompose()
+
+    # 🔹 Keep only the relevant text elements
+    keep_tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "ul", "ol", "li", "pre", "code"]
+    clean_parts = []
+
+    for tag in soup.find_all(keep_tags):
+        # Skip tags nested inside another kept tag, so nested text is not emitted twice.
+        if tag.find_parent(keep_tags):
+            continue
+        text = tag.get_text(" ", strip=True)
+        if text and not re.match(r"^(Partager|Suivre|Commentaires|Lire aussi|Publicité)", text, re.I):
+            clean_parts.append(text)
+
+    clean_text = "\n\n".join(clean_parts).strip()
+
+    return clean_text
+
+def to_markdown(text: str) -> str:
+    md_raw = md(f"<div>{text}</div>", heading_style="ATX")
+    return clean_text_for_rag(md_raw)
+
+def process(url: str, out_name: str) -> str:
+    html = fetch_html(url)
+    main_text = extract_main(html)  # only the relevant content
+    markdown = to_markdown(main_text)
+
+    out_name = out_name.strip()
+    if not out_name.lower().endswith(".md"):
+        out_name += ".md"
+    tmp_dir = tempfile.mkdtemp()
+    out_path = os.path.join(tmp_dir, out_name)
+    with open(out_path, "w", encoding="utf-8") as f:
+        f.write(markdown)
+    return out_path
+
+with gr.Blocks(title="Web → Markdown") as demo:
+    gr.Markdown("# 🌐 Scraper web vers Markdown")
+    with gr.Row():
+        with gr.Column():
+            url_in = gr.Textbox(label="URL à scraper")
+            out_name = gr.Textbox(label="Nom du fichier (.md)", value="output.md")
+            btn = gr.Button("🛠️ Générer", variant="primary")
+        with gr.Column():
+            file_out = gr.File(label="Fichier Markdown")
+    btn.click(fn=process, inputs=[url_in, out_name], outputs=file_out)
+
+if __name__ == "__main__":
+    demo.launch(share=True)
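
For reference, the scrape → extract → Markdown pipeline can also be exercised without the Gradio UI by calling `process()` directly. A minimal sketch, assuming `webscraping_cleaning.py` is on the import path and using a placeholder URL:

```python
# Minimal headless sketch: run the full pipeline and preview the output file.
# Assumes webscraping_cleaning.py is importable; the URL is a placeholder.
from webscraping_cleaning import process

out_path = process("https://example.com/article", "example.md")
with open(out_path, encoding="utf-8") as f:
    print(f.read()[:500])  # preview the start of the generated Markdown
```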