lsottani committed on
Commit 6997817 · verified · 1 Parent(s): e934a66

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +2 -8
  2. requirements.txt +2 -0
  3. webscraping_cleaning.py +107 -0
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
 title: WebDetox
-emoji: 👀
-colorFrom: blue
-colorTo: red
+app_file: webscraping_cleaning.py
 sdk: gradio
-sdk_version: 5.49.0
-app_file: app.py
-pinned: false
+sdk_version: 4.44.1
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
requirements.txt ADDED
@@ -0,0 +1,2 @@
+markdownify
+readability-lxml
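
Note that the PyPI distribution names above differ from the import names used in the script: `markdownify` provides the `markdownify` module, while `readability-lxml` provides the `readability` module. A quick sanity-check sketch, assuming `beautifulsoup4`, `requests`, and `gradio` already ship with the Space's base image:

```python
# Sanity check (a sketch): verify every module the script imports is available.
# beautifulsoup4, requests, and gradio are assumed preinstalled in the Space image.
import importlib

for module in ("markdownify", "readability", "bs4", "requests", "gradio"):
    importlib.import_module(module)  # raises ImportError if a dependency is missing
print("All dependencies importable.")
```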
webscraping_cleaning.py ADDED
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import re, tempfile, requests, os
+from bs4 import BeautifulSoup
+from markdownify import markdownify as md
+from readability import Document
+import gradio as gr
+
+def clean_text_for_rag(text: str) -> str:
+    # Map typographic punctuation and symbols to ASCII-friendly equivalents.
+    text = re.sub(
+        r"[’‘“”«»–—\u00A0\u202F…œŒæÆ©®™§°±×÷]",
+        lambda m: {
+            "’": "'", "‘": "'", "“": '"', "”": '"',
+            "«": '"', "»": '"', "–": "-", "—": "-",
+            "…": "...", "œ": "oe", "Œ": "OE",
+            "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
+            "™": "TM", "§": "§", "°": "°", "±": "+/-",
+            "×": "x", "÷": "/"
+        }.get(m.group(0), m.group(0)),
+        text,
+    )
+    # Drop any remaining disallowed characters, then collapse whitespace.
+    text = re.sub(r'[^a-zA-ZÀ-ÿæ-œ0-9\s\.\,\:\;\!\?\-\_\'\"\\\(\)]', '', text)
+    return re.sub(r'\s+', ' ', text).strip()
+
+def fetch_html(url: str) -> str:
+    hdr = {"User-Agent":
+           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+           "AppleWebKit/537.36 (KHTML, like Gecko) "
+           "Chrome/124.0 Safari/537.36"}
+    r = requests.get(url, headers=hdr, timeout=20)
+    r.raise_for_status()
+    # requests falls back to ISO-8859-1 when no charset is declared; use the sniffed encoding instead.
+    if r.encoding == "ISO-8859-1":
+        r.encoding = r.apparent_encoding
+    return r.text
+
+def extract_main(html: str) -> str:
+    """
+    Extract the main content of a web page, discarding the noise around it.
+    - Uses readability.Document to locate the article area.
+    - Removes navigation, comments, widgets, ads, etc.
+    - Keeps only semantically relevant tags (p, h1-h6, li, etc.).
+    """
+    doc = Document(html)
+    article_html = doc.content()
+    soup = BeautifulSoup(article_html, "html.parser")
+
+    # 🔹 Remove irrelevant elements (ads, menus, scripts, etc.)
+    for tag in soup(["script", "style", "noscript", "footer", "header", "nav", "form", "iframe", "aside", "button"]):
+        tag.decompose()
+
+    # 🔹 Remove divs or sections that are too small or carry no useful text
+    for div in soup.find_all(["div", "section"]):
+        text = div.get_text(strip=True)
+        if len(text) < 80:  # adjustable threshold
+            div.decompose()
+
+    # 🔹 Keep only the relevant text elements
+    keep_tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "ul", "ol", "li", "pre", "code"]
+    clean_parts = []
+
+    for tag in soup.find_all(keep_tags):
+        # Skip tags nested inside another kept tag, so nested text is not emitted twice.
+        if tag.find_parent(keep_tags):
+            continue
+        text = tag.get_text(" ", strip=True)
+        if text and not re.match(r"^(Partager|Suivre|Commentaires|Lire aussi|Publicité)", text, re.I):
+            clean_parts.append(text)
+
+    clean_text = "\n\n".join(clean_parts).strip()
+
+    return clean_text
+
+def to_markdown(text: str) -> str:
+    md_raw = md(f"<div>{text}</div>", heading_style="ATX")
+    return clean_text_for_rag(md_raw)
+
+def process(url: str, out_name: str) -> str:
+    html = fetch_html(url)
+    main_text = extract_main(html)  # only the relevant content
+    markdown = to_markdown(main_text)
+
+    out_name = out_name.strip()
+    if not out_name.lower().endswith(".md"):
+        out_name += ".md"
+    tmp_dir = tempfile.mkdtemp()
+    out_path = os.path.join(tmp_dir, out_name)
+    with open(out_path, "w", encoding="utf-8") as f:
+        f.write(markdown)
+    return out_path
+
+with gr.Blocks(title="Web → Markdown") as demo:
+    gr.Markdown("# 🌐 Scraper web vers Markdown")
+    with gr.Row():
+        with gr.Column():
+            url_in = gr.Textbox(label="URL à scraper")
+            out_name = gr.Textbox(label="Nom du fichier (.md)", value="output.md")
+            btn = gr.Button("🛠️ Générer", variant="primary")
+        with gr.Column():
+            file_out = gr.File(label="Fichier Markdown")
+    btn.click(fn=process, inputs=[url_in, out_name], outputs=file_out)
+
+if __name__ == "__main__":
+    demo.launch(share=True)
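
For reference, the scrape → extract → Markdown pipeline can also be exercised without the Gradio UI by calling `process()` directly. A minimal sketch, assuming `webscraping_cleaning.py` is on the import path and using a placeholder URL:

```python
# Minimal headless sketch: run the full pipeline and preview the output file.
# Assumes webscraping_cleaning.py is importable; the URL is a placeholder.
from webscraping_cleaning import process

out_path = process("https://example.com/article", "example.md")
with open(out_path, encoding="utf-8") as f:
    print(f.read()[:500])  # preview the start of the generated Markdown
```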