#!/usr/bin/env python
# -*- coding: utf-8 -*-
# web_to_md/webscraping_cleaning.py
import os
import re
import tempfile

import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from readability import Document
def clean_text_for_rag(text: str) -> str:
    """Normalize typographic characters and strip noise before RAG ingestion."""
    # Map typographic punctuation and ligatures to plain ASCII equivalents.
    text = re.sub(
        r"[’‘“”«»–—\u00A0\u202F…œŒæÆ©®™§°±×÷]",
        lambda m: {
            "’": "'", "‘": "'", "“": '"', "”": '"',
            "«": '"', "»": '"', "–": "-", "—": "-",
            "…": "...", "œ": "oe", "Œ": "OE",
            "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
            "™": "TM", "§": "§", "°": "°", "±": "+/-",
            "×": "x", "÷": "/",
            "\u00A0": " ", "\u202F": " ",  # non-breaking spaces -> regular space
        }.get(m.group(0), m.group(0)),
        text,
    )
    # Drop anything outside the allowed set (Latin letters, accented letters,
    # digits, whitespace and basic punctuation).
    text = re.sub(r'[^a-zA-ZÀ-ÿæ-œ0-9\s\.\,\:\;\!\?\-\_\'\"\\\(\)]', '', text)
    # Collapse runs of whitespace into single spaces.
    return re.sub(r'\s+', ' ', text).strip()
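# Illustrative round trip (a sketch, not called by the app): typographic
# punctuation and ligatures fold to ASCII, then whitespace collapses:
#   clean_text_for_rag("L’œuvre — dite « majeure »…")
#   returns: L'oeuvre - dite " majeure "...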
def fetch_html(url: str) -> str:
    """Download a page with a browser-like User-Agent and a sane encoding."""
    hdr = {"User-Agent":
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
           "AppleWebKit/537.36 (KHTML, like Gecko) "
           "Chrome/124.0 Safari/537.36"}
    r = requests.get(url, headers=hdr, timeout=20)
    r.raise_for_status()
    # requests falls back to ISO-8859-1 when the server omits a charset;
    # in that case prefer the encoding detected from the body.
    if r.encoding == "ISO-8859-1":
        r.encoding = r.apparent_encoding
    return r.text
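# Usage sketch (the URL is a placeholder): network failures and non-2xx
# responses surface as requests exceptions, which callers may want to handle:
#   try:
#       html = fetch_html("https://example.com/article")
#   except requests.RequestException as exc:
#       print(f"fetch failed: {exc}")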
def extract_main(html: str) -> str:
    """
    Extract the main content of a web page, discarding boilerplate.
    - Uses readability.Document to locate the article area.
    - Removes navigation, comment, widget and ad elements.
    - Keeps only semantically relevant tags (article, p, h1-h6, li, etc.).
    """
    doc = Document(html)
    # summary() returns the cleaned article HTML; content() would return the
    # raw body without readability's extraction.
    article_html = doc.summary()
    soup = BeautifulSoup(article_html, "html.parser")
    # Remove irrelevant elements (ads, menus, scripts, etc.)
    for tag in soup(["script", "style", "noscript", "footer", "header",
                     "nav", "form", "iframe", "aside", "button"]):
        tag.decompose()
    # Remove divs or sections too small to carry useful text
    for div in soup.find_all(["div", "section"]):
        text = div.get_text(strip=True)
        if len(text) < 80:  # adjustable threshold
            div.decompose()
    # Keep only the relevant text-bearing elements
    keep_tags = ["article", "p", "h1", "h2", "h3", "h4", "h5", "h6",
                 "blockquote", "ul", "ol", "li", "pre", "code"]
    clean_parts = []
    for tag in soup.find_all(keep_tags):
        # Skip tags nested inside another kept tag (e.g. <li> inside <ul>,
        # <p> inside <article>), otherwise their text would be emitted twice.
        if tag.find_parent(keep_tags):
            continue
        text = tag.get_text(" ", strip=True)
        # Filter leftover French boilerplate labels (Share / Follow /
        # Comments / Read also / Advertisement).
        if text and not re.match(r"^(Partager|Suivre|Commentaires|Lire aussi|Publicité)", text, re.I):
            clean_parts.append(text)
    return "\n\n".join(clean_parts).strip()
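# Why the find_parent() check matters: given <ul><li>a</li><li>b</li></ul>,
# find_all(keep_tags) yields the <ul> and both <li> elements, so without the
# check the text would be collected once as "a b" from the list and again as
# "a" and "b" from each item.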
def to_markdown(text: str) -> str:
    # extract_main() returns plain text blocks separated by blank lines.
    # Clean each block separately: clean_text_for_rag collapses ALL whitespace,
    # so cleaning the whole string at once would flatten the file to one line.
    blocks = (clean_text_for_rag(md(f"<div>{b}</div>", heading_style="ATX"))
              for b in text.split("\n\n"))
    return "\n\n".join(b for b in blocks if b)
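# Quick illustration (output spacing is approximate and version-dependent):
#   md("<h1>Title</h1><p>Body</p>", heading_style="ATX")
#   returns roughly "# Title\n\nBody"; note that clean_text_for_rag then
#   strips the "#" marker, since "#" is not in its allowed character set.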
def process(url: str, out_name: str) -> str:
    html = fetch_html(url)
    main_text = extract_main(html)  # only the relevant content
    markdown = to_markdown(main_text)
    out_name = out_name.strip()
    if not out_name.lower().endswith(".md"):
        out_name += ".md"
    tmp_dir = tempfile.mkdtemp()
    out_path = os.path.join(tmp_dir, out_name)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    return out_path
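# Scripted usage (a sketch; URL and file name are placeholders): the function
# returns the path of the generated file inside a fresh temporary directory:
#   path = process("https://example.com/article", "article")
#   print(path)  # e.g. /tmp/tmpXXXXXXXX/article.md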
with gr.Blocks(title="Web → Markdown") as demo:
    gr.Markdown("# 🌐 Web Scraping: from the web to Markdown")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="URL to scrape")
            out_name = gr.Textbox(label="Output file name (.md)", value="output.md")
            btn = gr.Button("🛠️ Generate", variant="primary")
        with gr.Column():
            file_out = gr.File(label="Markdown file")
    btn.click(fn=process, inputs=[url_in, out_name], outputs=file_out)
    gr.Markdown(
        """
---
**Operations performed:**
- Targeted extraction of the useful content (title, body text, articles, etc.)
- Automatic removal of menus, ads and unwanted scripts
- Clean, readable Markdown formatting
- Immediate download of the result
"""
    )
if __name__ == "__main__":
    demo.launch(share=True)