#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import tempfile

import requests
import gradio as gr
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from readability import Document

def clean_text_for_rag(text: str) -> str:
    """Transliterate typographic characters to ASCII and strip noise before RAG ingestion."""
    # Map curly quotes, dashes, ellipses, ligatures and common symbols to
    # ASCII equivalents.
    text = re.sub(
        r"[’‘“”«»–—…œŒæÆ©®™±×÷]",
        lambda m: {
            "’": "'", "‘": "'", "“": '"', "”": '"',
            "«": '"', "»": '"', "–": "-", "—": "-",
            "…": "...", "œ": "oe", "Œ": "OE",
            "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
            "™": "TM", "±": "+/-", "×": "x", "÷": "/",
        }[m.group(0)],
        text,
    )
    # Drop everything outside letters (incl. Latin-1 accents), digits and
    # basic punctuation.
    text = re.sub(r'[^a-zA-ZÀ-ÿ0-9\s.,:;!?\-_\'"\\()]', '', text)
    # Collapse whitespace runs; in Python 3, \s also matches NBSP (\u00A0)
    # and narrow NBSP (\u202F).
    return re.sub(r'\s+', ' ', text).strip()
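
# Illustrative check of the transliteration above (made-up sample input,
# result verified by hand):
#   clean_text_for_rag("L'œuvre «complète»… – 100 %")
#   -> 'L\'oeuvre "complète"... - 100'   ('%' falls outside the allowed set)
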
def fetch_html(url: str) -> str:
    """Fetch a page with a browser-like User-Agent and return its decoded HTML."""
    hdr = {"User-Agent":
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
           "AppleWebKit/537.36 (KHTML, like Gecko) "
           "Chrome/124.0 Safari/537.36"}
    r = requests.get(url, headers=hdr, timeout=20)
    r.raise_for_status()
    # requests falls back to ISO-8859-1 when the server sends no charset;
    # prefer the encoding sniffed from the body in that case.
    if r.encoding == "ISO-8859-1":
        r.encoding = r.apparent_encoding
    return r.text
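
# Usage sketch (network access assumed; the URL is a placeholder):
#   html = fetch_html("https://example.com")
# raise_for_status() converts 4xx/5xx responses into requests.HTTPError,
# so callers only ever receive successfully decoded HTML.
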
def extract_main(html: str) -> str:
    """
    Extract the main content of a web page, discarding surrounding clutter.
    - Uses readability.Document to locate the article zone.
    - Removes navigation, comments, widgets, ads, etc.
    - Keeps only semantically relevant tags (article, p, h1-h6, li, ...).
    """
    doc = Document(html)
    article_html = doc.summary()  # cleaned article HTML, not the raw body
    soup = BeautifulSoup(article_html, "html.parser")
    # Remove irrelevant elements (ads, menus, scripts, ...).
    for tag in soup(["script", "style", "noscript", "footer", "header", "nav", "form", "iframe", "aside", "button"]):
        tag.decompose()
    # Remove divs/sections that are too small to carry useful text.
    for div in soup.find_all(["div", "section"]):
        text = div.get_text(strip=True)
        if len(text) < 80:  # adjustable threshold
            div.decompose()
    # Keep only the relevant text-bearing elements.
    keep_tags = ["article", "p", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "ul", "ol", "li", "pre", "code"]
    clean_parts = []
    for tag in soup.find_all(keep_tags):
        # Skip nested matches (e.g. an <li> inside an already-kept <ul>)
        # so the same text is not emitted twice.
        if tag.find_parent(keep_tags):
            continue
        text = tag.get_text(" ", strip=True)
        # Filter common French boilerplate labels ("Share", "Follow",
        # "Comments", "Read also", "Advertisement").
        if text and not re.match(r"^(Partager|Suivre|Commentaires|Lire aussi|Publicité)", text, re.I):
            clean_parts.append(text)
    return "\n\n".join(clean_parts).strip()
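
# Filtering sketch (synthetic HTML, illustrative only): given a fragment like
#   <nav>Menu</nav><div>ok</div><article><p>...a long paragraph...</p></article>
# the <nav> is decomposed outright, the 2-character <div> falls under the
# 80-character threshold, and only the <p> text reaches clean_parts.
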
def to_markdown(text: str) -> str:
    # extract_main() returns plain text, so markdownify mostly passes it
    # through; heading_style="ATX" only matters if HTML fragments survive.
    md_raw = md(f"<div>{text}</div>", heading_style="ATX")
    return clean_text_for_rag(md_raw)
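
# Illustrative round trip (hand-checked): plain text passes through
# markdownify unchanged and is then normalised by clean_text_for_rag:
#   to_markdown("Bonjour – «monde»")  ->  'Bonjour - "monde"'
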
def process(url: str, out_name: str) -> str:
    html = fetch_html(url)
    main_text = extract_main(html)  # only the relevant content
    markdown = to_markdown(main_text)
    out_name = out_name.strip()
    if not out_name.lower().endswith(".md"):
        out_name += ".md"
    # Write into a fresh temporary directory so Gradio can serve the file.
    tmp_dir = tempfile.mkdtemp()
    out_path = os.path.join(tmp_dir, out_name)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    return out_path
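
# Usage sketch (URL and path are placeholders): process() returns the path
# of the generated file inside a fresh temporary directory, e.g.
#   process("https://example.com", "page")  ->  "/tmp/tmpa1b2c3d4/page.md"
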
with gr.Blocks(title="Web → Markdown") as demo:
    gr.Markdown("# 🌐 Web-to-Markdown scraper")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="URL to scrape")
            out_name = gr.Textbox(label="Output file name (.md)", value="output.md")
            btn = gr.Button("🛠️ Generate", variant="primary")
        with gr.Column():
            file_out = gr.File(label="Markdown file")
    btn.click(fn=process, inputs=[url_in, out_name], outputs=file_out)

if __name__ == "__main__":
    demo.launch(share=True)
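
# Headless smoke test (a sketch: assumes this file is saved as app.py and
# that the machine has network access):
#   python -c "from app import process; print(process('https://example.com', 'demo'))"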