#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import tempfile

import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from readability import Document

# Typographic characters and the plain-text form each one is rewritten to.
_TYPOGRAPHIC_REPLACEMENTS = {
    "’": "'",
    "‘": "'",
    "“": '"',
    "”": '"',
    "«": '"',
    "»": '"',
    "–": "-",
    "—": "-",
    "…": "...",
    "œ": "oe",
    "Œ": "OE",
    "æ": "ae",
    "Æ": "AE",
    "©": "(c)",
    "®": "(R)",
    "™": "TM",
    "§": "§",
    "°": "°",
    "±": "+/-",
    "×": "x",
    "÷": "/",
}
_TYPOGRAPHIC_RE = re.compile(r"[’‘“”«»–—…œŒæÆ©®™§°±×÷]")

# Everything outside this whitelist is dropped. The characters handled by
# _TYPOGRAPHIC_REPLACEMENTS are exempted here so they reach the substitution
# step instead of being deleted first.
_DISALLOWED_RE = re.compile(
    r"[^a-zA-ZÀ-ÿæ-œ0-9\s\.\,\:\;\!\?\-\_\'\"\\\(\)’‘“”«»–—…©®™§°±]"
)
_WHITESPACE_RE = re.compile(r"\s+")

_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0 Safari/537.36"
)

# Tags removed outright before any text is collected.
_BOILERPLATE_TAGS = ["script", "style", "noscript", "footer", "header",
                     "nav", "form", "iframe", "aside", "button"]
# Tags whose text is kept in the output.
_CONTENT_TAGS = frozenset([
    "article", "p", "h1", "h2", "h3", "h4", "h5", "h6",
    "blockquote", "ul", "ol", "li", "pre", "code",
])
# Text blocks starting with one of these (French) UI labels are discarded.
_BOILERPLATE_PREFIX_RE = re.compile(
    r"^(Partager|Suivre|Commentaires|Lire aussi|Publicité)", re.I
)
# <div>/<section> elements holding fewer characters than this are removed.
_MIN_CONTAINER_TEXT = 80


def clean_text_for_rag(text: str) -> str:
    """Normalise *text* to a restricted, plain character set.

    Steps, in order:

    1. Drop every character that is neither whitelisted (ASCII letters and
       digits, accented Latin letters, whitespace, basic punctuation) nor
       one of the typographic characters handled in step 2.
    2. Rewrite typographic characters (curly quotes, dashes, ellipsis,
       ligatures, (c)/(R)/TM, ...) to their plain equivalents.
    3. Collapse every whitespace run to a single space and trim the ends.

    The filter runs *before* the substitution so that replacement text such
    as ``"+/-"`` or ``"/"`` is not itself stripped afterwards (previously
    ``"±5"`` came out as ``"-5"`` and ``"§"``/``"°"`` were deleted despite
    being mapped to themselves).

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-line text.
    """
    text = _DISALLOWED_RE.sub("", text)
    text = _TYPOGRAPHIC_RE.sub(
        lambda match: _TYPOGRAPHIC_REPLACEMENTS[match.group(0)], text
    )
    return _WHITESPACE_RE.sub(" ", text).strip()


def fetch_html(url: str) -> str:
    """Download *url* and return the response body as text.

    The request is sent with a desktop-browser User-Agent and a 20 second
    timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    response = requests.get(url, headers={"User-Agent": _USER_AGENT}, timeout=20)
    response.raise_for_status()
    # When the reported encoding is ISO-8859-1, prefer the encoding detected
    # from the body itself.
    if response.encoding == "ISO-8859-1":
        response.encoding = response.apparent_encoding
    return response.text


def _iter_text_blocks(soup):
    """Yield the text of each content tag exactly once, in document order.

    Every non-empty string is attributed to its *nearest* ancestor listed in
    ``_CONTENT_TAGS``; consecutive strings with the same owner are joined
    with a single space. Strings with no such ancestor are skipped.
    """
    owner = None
    words = []
    for node in soup.strings:
        fragment = node.strip()
        if not fragment:
            continue
        nearest = next(
            (parent for parent in node.parents if parent.name in _CONTENT_TAGS),
            None,
        )
        if nearest is not owner:
            if owner is not None and words:
                yield " ".join(words)
            owner, words = nearest, []
        if nearest is not None:
            words.append(fragment)
    if owner is not None and words:
        yield " ".join(words)


def extract_main(html: str) -> str:
    """Extract the readable text of a web page, dropping page furniture.

    - Parses the HTML returned by ``readability.Document(html).content()``.
    - Removes scripts, styles, navigation, forms, iframes and similar tags.
    - Removes ``<div>``/``<section>`` elements with fewer than 80 characters
      of text.
    - Collects the text found inside content tags (article, p, h1-h6,
      blockquote, lists, pre, code), skipping blocks that start with a
      sharing/comment/advert label.

    Text inside nested content tags (``<ul><li>``, ``<article><p>``,
    ``<pre><code>`` ...) is emitted once, under its innermost content tag,
    rather than once per enclosing tag.

    Args:
        html: Full HTML source of the page.

    Returns:
        The retained text blocks joined by blank lines.
    """
    # NOTE(review): in readability-lxml, Document.content() appears to return
    # the whole document body, while Document.summary() returns the extracted
    # article. Confirm that content() is the intended call here.
    doc = Document(html)
    soup = BeautifulSoup(doc.content(), "html.parser")

    # Remove non-content elements (ads, menus, scripts, ...). The result list
    # is a snapshot, so a tag may already have been destroyed together with
    # its parent; skip those.
    for tag in soup(_BOILERPLATE_TAGS):
        if getattr(tag, "decomposed", False):
            continue
        tag.decompose()

    # Remove div/section containers that hold too little text to be useful.
    for container in soup.find_all(["div", "section"]):
        if getattr(container, "decomposed", False):
            continue
        if len(container.get_text(strip=True)) < _MIN_CONTAINER_TEXT:
            container.decompose()

    # Keep only the text of relevant tags, minus UI-label blocks.
    kept_blocks = [
        block
        for block in _iter_text_blocks(soup)
        if not _BOILERPLATE_PREFIX_RE.match(block)
    ]
    return "\n\n".join(kept_blocks).strip()


# NOTE(review): `to_markdown` starts at this point in the original text but
# continues beyond the reviewed lines. Its opening fragment is preserved
# verbatim below (untouched) so the remainder of the file can be re-attached.
# def to_markdown(text: str) -> str: md_raw = md(f"