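"""Web page to Markdown scraper with a small Gradio UI.

Fetches a URL, extracts the main article content (readability + BeautifulSoup),
converts it to Markdown, cleans it up for RAG ingestion, and serves the result
as a downloadable .md file.
"""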
import os
import re
import tempfile

import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from readability import Document


def clean_text_for_rag(text: str) -> str:
    """Normalize typographic characters and strip noise for RAG ingestion."""
    # Map typographic punctuation, ligatures, and symbols to ASCII equivalents.
    text = re.sub(
        r"[’‘“”«»–—\u00A0\u202F…œŒæÆ©®™±×÷]",
        lambda m: {
            "’": "'", "‘": "'", "“": '"', "”": '"',
            "«": '"', "»": '"', "–": "-", "—": "-",
            "…": "...", "œ": "oe", "Œ": "OE",
            "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
            "™": "TM", "±": "+/-",
            "×": "x", "÷": "/",
            "\u00A0": " ", "\u202F": " ",  # non-breaking spaces -> regular space
        }.get(m.group(0), m.group(0)),
        text,
    )
    # Drop any remaining character outside the allowed set. "#" is kept so the
    # ATX headings produced by markdownify survive this filter.
    text = re.sub(r'[^a-zA-ZÀ-ÿæ-œ0-9\s#\.\,\:\;\!\?\-\_\'\"\\\(\)]', '', text)
    # Collapse horizontal whitespace but keep paragraph breaks, so the
    # resulting Markdown stays readable.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
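
# Illustrative example:
#   clean_text_for_rag('“Smart quotes” — and… more')
#   -> '"Smart quotes" - and... more'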


def fetch_html(url: str) -> str:
    """Download a page, presenting itself as a desktop browser."""
    hdr = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0 Safari/537.36"
    }
    r = requests.get(url, headers=hdr, timeout=20)
    r.raise_for_status()
    # requests falls back to ISO-8859-1 when the server sends no charset;
    # in that case, trust the encoding detected from the body instead.
    if r.encoding == "ISO-8859-1":
        r.encoding = r.apparent_encoding
    return r.text
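
# Illustrative call (hypothetical URL):
#   html = fetch_html("https://example.com")  # raises requests.HTTPError on 4xx/5xx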


def extract_main(html: str) -> str:
    """
    Extract the main content of a web page, dropping boilerplate.

    - Uses readability.Document to locate the article zone.
    - Removes navigation, comments, widgets, ads, and similar elements.
    - Keeps only semantically relevant tags (article, p, h1-h6, li, etc.).
    """
    doc = Document(html)
    # .summary() returns the readability-cleaned article markup
    # (.content() would return the raw, uncleaned body).
    article_html = doc.summary()
    soup = BeautifulSoup(article_html, "html.parser")

    # Drop structural noise outright.
    for tag in soup(["script", "style", "noscript", "footer", "header", "nav",
                     "form", "iframe", "aside", "button"]):
        tag.decompose()

    # Heuristic: containers with fewer than 80 characters of text are usually
    # widgets, share bars, or captions rather than article content.
    for div in soup.find_all(["div", "section"]):
        if div.decomposed:
            continue  # already destroyed along with a decomposed ancestor
        if len(div.get_text(strip=True)) < 80:
            div.decompose()

    keep_tags = ["article", "p", "h1", "h2", "h3", "h4", "h5", "h6",
                 "blockquote", "ul", "ol", "li", "pre", "code"]
    clean_parts = []
    for tag in soup.find_all(keep_tags):
        # Skip tags nested inside another kept tag, otherwise their text
        # would be collected twice (e.g. each li inside an already-kept ul).
        if tag.find_parent(keep_tags):
            continue
        text = tag.get_text(" ", strip=True)
        # Filter common French boilerplate strings ("Share", "Follow",
        # "Comments", "Read also", "Advertisement").
        if text and not re.match(r"^(Partager|Suivre|Commentaires|Lire aussi|Publicité)", text, re.I):
            clean_parts.append(text)

    return "\n\n".join(clean_parts).strip()
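
# Illustrative sketch (hypothetical input): given article-sized HTML such as
#   "<html><body><article><p>...a paragraph of 80+ characters...</p></article></body></html>",
# extract_main returns the paragraph texts joined by blank lines; snippets
# shorter than the 80-character heuristic may come back empty.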


def to_markdown(text: str) -> str:
    # Wrap the extracted text in a <div> so markdownify treats it as a single
    # HTML fragment, then run the final cleanup pass.
    md_raw = md(f"<div>{text}</div>", heading_style="ATX")
    return clean_text_for_rag(md_raw)


def process(url: str, out_name: str) -> str:
    """Full pipeline: fetch -> extract -> Markdown -> temp file for download."""
    html = fetch_html(url)
    main_text = extract_main(html)
    markdown = to_markdown(main_text)

    out_name = out_name.strip()
    if not out_name.lower().endswith(".md"):
        out_name += ".md"
    # Write into a fresh temp directory so Gradio can serve the file.
    tmp_dir = tempfile.mkdtemp()
    out_path = os.path.join(tmp_dir, out_name)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    return out_path
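
# Programmatic use (illustrative, hypothetical URL):
#   path = process("https://example.com/article", "article.md")
#   print(open(path, encoding="utf-8").read())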


with gr.Blocks(title="Web → Markdown") as demo:
    gr.Markdown("# 🌐 Web Scraping: from the web to Markdown")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="URL to scrape")
            out_name = gr.Textbox(label="Output file name (.md)", value="output.md")
            btn = gr.Button("🛠️ Generate", variant="primary")
        with gr.Column():
            file_out = gr.File(label="Markdown file")
    btn.click(fn=process, inputs=[url_in, out_name], outputs=file_out)

    gr.Markdown(
        """
        ---
        **What this tool does:**
        - Targeted extraction of the useful content (title, text, articles, etc.)
        - Automatic removal of menus, ads, and unwanted scripts
        - Clean, readable Markdown formatting
        - Immediate download of the result
        """
    )


if __name__ == "__main__":
    # share=True exposes a temporary public Gradio link in addition to the
    # local server.
    demo.launch(share=True)