Spaces:
Runtime error
Runtime error
Commit
·
24d01e0
1
Parent(s):
e8893c1
✨ Add HTML purification functionality: integrate PurifyHtml in app and create Gradio interface for user input; update requirements to include transformers.
Browse files
- Purify.py +41 -42
- app.py +30 -3
- requirements.txt +1 -0
Purify.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
|
|
| 1 |
from bs4 import BeautifulSoup, Tag
|
| 2 |
import datetime
|
| 3 |
import requests
|
| 4 |
import re
|
| 5 |
|
| 6 |
-
Url = 'https://huggingface.co'
|
| 7 |
-
|
| 8 |
NoisePatterns = {
|
| 9 |
'(No)Script': r'<[ ]*(script|noscript)[^>]*?>.*?<\/[ ]*\1[ ]*>',
|
| 10 |
'Style': r'<[ ]*(style)[^>]*?>.*?<\/[ ]*\1[ ]*>',
|
|
@@ -32,14 +31,11 @@ def RemoveNoise(RawHtml: str) -> str:
|
|
| 32 |
str: Cleaned HTML content without noise.
|
| 33 |
'''
|
| 34 |
CleanedHtml = RawHtml
|
| 35 |
-
OriginalCharCount = len(RawHtml)
|
| 36 |
for PatternName, Pattern in NoisePatterns.items():
|
| 37 |
if PatternName in ['EmptyLines', 'EmptyTags']: # These patterns are line-based
|
| 38 |
CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.MULTILINE)
|
| 39 |
else:
|
| 40 |
CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
| 41 |
-
print(f'• Removed {PatternName} noise. Removed {OriginalCharCount - len(CleanedHtml)} characters.')
|
| 42 |
-
OriginalCharCount = len(CleanedHtml)
|
| 43 |
return CleanedHtml
|
| 44 |
|
| 45 |
def FetchHtmlContent(Url: str) -> str | int:
|
|
@@ -58,43 +54,46 @@ def FetchHtmlContent(Url: str) -> str | int:
|
|
| 58 |
else:
|
| 59 |
return Response.status_code
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
|
| 83 |
-
'<!-- --- Purification Summary ---',
|
| 84 |
-
f'URL: {Url}',
|
| 85 |
-
f'Title: {Title}',
|
| 86 |
-
f'Description: {Description}',
|
| 87 |
-
f'Time of Fetch: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} (Took {datetime.datetime.now() - Start})',
|
| 88 |
-
f'Noise Removal Ratio: {Ratio:.2%} (lower is better)',
|
| 89 |
-
f'Characters: {RawCharCount} -> {CleanedCharCount} ({RawCharCount - CleanedCharCount} characters removed)',
|
| 90 |
-
'----------------------------- -->'
|
| 91 |
-
]
|
| 92 |
-
for Line in Summary:
|
| 93 |
-
print(Line)
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
File.write(Line + '\n')
|
| 98 |
-
File.write(CleanedHtml)
|
| 99 |
-
else:
|
| 100 |
-
print(f'Failed to fetch HTML content. Status code: {RawHtml}')
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 2 |
from bs4 import BeautifulSoup, Tag
|
| 3 |
import datetime
|
| 4 |
import requests
|
| 5 |
import re
|
| 6 |
|
|
|
|
|
|
|
| 7 |
NoisePatterns = {
|
| 8 |
'(No)Script': r'<[ ]*(script|noscript)[^>]*?>.*?<\/[ ]*\1[ ]*>',
|
| 9 |
'Style': r'<[ ]*(style)[^>]*?>.*?<\/[ ]*\1[ ]*>',
|
|
|
|
| 31 |
str: Cleaned HTML content without noise.
|
| 32 |
'''
|
| 33 |
CleanedHtml = RawHtml
|
|
|
|
| 34 |
for PatternName, Pattern in NoisePatterns.items():
|
| 35 |
if PatternName in ['EmptyLines', 'EmptyTags']: # These patterns are line-based
|
| 36 |
CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.MULTILINE)
|
| 37 |
else:
|
| 38 |
CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
|
|
|
|
|
|
| 39 |
return CleanedHtml
|
| 40 |
|
| 41 |
def FetchHtmlContent(Url: str) -> str | int:
|
|
|
|
| 54 |
else:
|
| 55 |
return Response.status_code
|
| 56 |
|
| 57 |
+
_ReaderModelCache = {}


def _LoadReaderModel():
    '''Load the ReaderLM-v2 tokenizer and model once and reuse them afterwards.

    Returns:
        tuple: The (Tokenizer, Model) pair for 'jinaai/ReaderLM-v2'.
    '''
    # Loading the checkpoint is expensive; keep it for the life of the process
    # instead of reloading it on every request.
    if 'Reader' not in _ReaderModelCache:
        _ReaderModelCache['Reader'] = (
            AutoTokenizer.from_pretrained('jinaai/ReaderLM-v2'),
            AutoModelForCausalLM.from_pretrained('jinaai/ReaderLM-v2'),
        )
    return _ReaderModelCache['Reader']


def PurifyHtml(Url: str) -> str:
    '''Fetch a page, strip noise from its HTML and summarize it as markdown.

    Args:
        Url (str): Address of the page to fetch.
    Returns:
        str: The model's markdown summary, or a failure message when the page could not be fetched.
    '''
    Start = datetime.datetime.now()
    RawHtml = FetchHtmlContent(Url)
    if not isinstance(RawHtml, str):
        # FetchHtmlContent hands back the HTTP status code instead of text on failure.
        # Return the message as well as printing it so the caller always gets a str.
        Failure = f'Failed to fetch HTML content. Status code: {RawHtml}'
        print(Failure)
        return Failure

    RawCharCount = len(RawHtml)

    Soup = BeautifulSoup(RawHtml, 'html.parser')
    PrettifiedHtml = str(Soup.prettify())

    # .string is None when <title> is empty or holds nested tags, so fall back in that case too.
    Title = Soup.title.string if Soup.title and Soup.title.string else 'No title found'
    MetaDesc = Soup.find('meta', attrs={'name': 'description'})
    Description = MetaDesc.get('content', 'No description found') if isinstance(MetaDesc, Tag) else 'No description found'

    CleanedHtml = RemoveNoise(PrettifiedHtml)

    CleanedCharCount = len(CleanedHtml)
    Ratio = CleanedCharCount / RawCharCount if RawCharCount > 0 else 0

    Now = datetime.datetime.now()
    Summary = [
        '<!-- --- Purification Summary ---',
        f'URL: {Url}',
        f'Title: {Title}',
        f'Description: {Description}',
        f'Time of Fetch: {Now.strftime("%Y-%m-%d %H:%M:%S")} (Took {Now - Start})',
        f'Noise Removal Ratio: {Ratio:.2%} (lower is better)',
        f'Characters: {RawCharCount} -> {CleanedCharCount} ({RawCharCount - CleanedCharCount} characters removed)',
        '----------------------------- -->'
    ]
    for Line in Summary:
        print(Line)

    Tokenizer, Model = _LoadReaderModel()

    Message = [
        {'role': 'user', 'content': f'Please summarize the following HTML content in clean markdown:\n\n{CleanedHtml}'},
    ]
    # Render the chat turn with the tokenizer's template and call generate(),
    # the generation entry point that transformers causal-LM models provide.
    # NOTE(review): follows the usage shown on the ReaderLM-v2 model card - confirm.
    Prompt = Tokenizer.apply_chat_template(Message, tokenize=False, add_generation_prompt=True)
    Inputs = Tokenizer(Prompt, return_tensors='pt')
    OutputIds = Model.generate(**Inputs, max_new_tokens=500, do_sample=False)

    # generate() returns prompt + continuation; keep only the newly generated tokens.
    PromptLength = Inputs['input_ids'].shape[1]
    return Tokenizer.decode(OutputIds[0][PromptLength:], skip_special_tokens=True)
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from pymongo import MongoClient
|
| 2 |
from dotenv import load_dotenv
|
|
|
|
| 3 |
from typing import Literal
|
| 4 |
from bson import ObjectId
|
| 5 |
from io import StringIO
|
|
@@ -143,6 +144,16 @@ def Ping(Host: str, Count: int = 8) -> str:
|
|
| 143 |
else:
|
| 144 |
return f'Ping to {Host} failed: No successful responses'
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
# ╭───────────────────────────────────╮
|
| 147 |
# │ Fun and Entertainment Tools │
|
| 148 |
# ╰───────────────────────────────────╯
|
|
@@ -163,15 +174,22 @@ def Fact() -> str:
|
|
| 163 |
'''
|
| 164 |
return requests.get('https://uselessfacts.jsph.pl/random.json?language=en').json()['text']
|
| 165 |
|
| 166 |
-
def Plot() -> str:
|
| 167 |
'''Generate a random plot for a movie or story.
|
|
|
|
|
|
|
| 168 |
Returns:
|
| 169 |
str: A random plot description.
|
| 170 |
'''
|
| 171 |
with open(r'Data/human-writing-dpo.json', 'r', encoding='utf-8') as PlotFile:
|
| 172 |
Data = json.load(PlotFile)
|
| 173 |
Plot = random.choice(Data)
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
# ╭─────────────────────────────╮
|
| 177 |
# │ Text Processing Tools │
|
|
@@ -398,6 +416,13 @@ with gradio.Blocks(
|
|
| 398 |
PingOutput = gradio.Text(label='Ping Result 📡', interactive=False)
|
| 399 |
PingBtn = gradio.Button('Ping Host 📡', variant='primary')
|
| 400 |
PingBtn.click(Ping, inputs=[PingInput, PingCount], outputs=PingOutput)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
|
| 402 |
with gradio.TabItem('Fun & Entertainment 🎭'):
|
| 403 |
with gradio.TabItem('Random Joke 😂'):
|
|
@@ -416,8 +441,10 @@ with gradio.Blocks(
|
|
| 416 |
with gradio.TabItem('Random Plot 🎬'):
|
| 417 |
with gradio.Group():
|
| 418 |
PlotOutput = gradio.Text(label='Random Plot 🎬', interactive=False)
|
|
|
|
|
|
|
| 419 |
PlotBtn = gradio.Button('Get Plot 🎥', variant='primary')
|
| 420 |
-
PlotBtn.click(Plot, outputs=PlotOutput)
|
| 421 |
|
| 422 |
with gradio.TabItem('Text Processing 📝'):
|
| 423 |
with gradio.TabItem('Text Reversal 🔄'):
|
|
|
|
| 1 |
from pymongo import MongoClient
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
+
from Purify import PurifyHtml
|
| 4 |
from typing import Literal
|
| 5 |
from bson import ObjectId
|
| 6 |
from io import StringIO
|
|
|
|
| 144 |
else:
|
| 145 |
return f'Ping to {Host} failed: No successful responses'
|
| 146 |
|
| 147 |
+
def Purify(Url: str) -> str:
    '''Fetch a URL and return the purified, model-written summary of its HTML.

    Args:
        Url (str): The URL to fetch and purify HTML content from.
    Returns:
        str: The text produced by PurifyHtml, or an error message when it produced nothing.
    '''
    Result = PurifyHtml(Url)
    # Guard against a missing result so the declared str return always holds
    # and the output box shows an explanation instead of staying empty.
    if Result is None:
        return f'Failed to fetch or purify HTML content from {Url}'
    return Result
|
| 156 |
+
|
| 157 |
# ╭───────────────────────────────────╮
|
| 158 |
# │ Fun and Entertainment Tools │
|
| 159 |
# ╰───────────────────────────────────╯
|
|
|
|
| 174 |
'''
|
| 175 |
return requests.get('https://uselessfacts.jsph.pl/random.json?language=en').json()['text']
|
| 176 |
|
| 177 |
+
def Plot(GiveExamplePrompt: bool = True) -> list[str]:
    '''Pick a random story prompt, optionally paired with its example text.

    Args:
        GiveExamplePrompt (bool): If True, the entry's 'chosen' text is returned alongside the prompt; otherwise an empty string takes its place.
    Returns:
        list[str]: Two items: the prompt, then the example text or ''.
    '''
    with open(r'Data/human-writing-dpo.json', 'r', encoding='utf-8') as PlotFile:
        Data = json.load(PlotFile)
    # Named Entry rather than Plot so the local does not shadow this function.
    Entry = random.choice(Data)
    Prompt = Entry['prompt']
    Chosen = Entry['chosen']
    return [Prompt, Chosen if GiveExamplePrompt else '']
|
| 193 |
|
| 194 |
# ╭─────────────────────────────╮
|
| 195 |
# │ Text Processing Tools │
|
|
|
|
| 416 |
PingOutput = gradio.Text(label='Ping Result 📡', interactive=False)
|
| 417 |
PingBtn = gradio.Button('Ping Host 📡', variant='primary')
|
| 418 |
PingBtn.click(Ping, inputs=[PingInput, PingCount], outputs=PingOutput)
|
| 419 |
+
|
| 420 |
+
with gradio.TabItem('Web Scraping & Purification 🌐'):
|
| 421 |
+
with gradio.Group():
|
| 422 |
+
PurifyInput = gradio.Textbox(label='URL to Purify 🌐', placeholder='Enter URL to fetch and purify HTML', lines=1, max_lines=1)
|
| 423 |
+
PurifyOutput = gradio.Text(label='Purified HTML Content 📝', interactive=False)
|
| 424 |
+
PurifyBtn = gradio.Button('Purify HTML 🧹', variant='primary')
|
| 425 |
+
PurifyBtn.click(Purify, inputs=PurifyInput, outputs=PurifyOutput)
|
| 426 |
|
| 427 |
with gradio.TabItem('Fun & Entertainment 🎭'):
|
| 428 |
with gradio.TabItem('Random Joke 😂'):
|
|
|
|
| 441 |
with gradio.TabItem('Random Plot 🎬'):
|
| 442 |
with gradio.Group():
|
| 443 |
PlotOutput = gradio.Text(label='Random Plot 🎬', interactive=False)
|
| 444 |
+
PlotExample = gradio.Checkbox(label='Give Example Plot Prompt 📜', value=True, interactive=True)
|
| 445 |
+
PlotExampleOutput = gradio.Text(label='Example Plot Prompt 📜', interactive=False)
|
| 446 |
PlotBtn = gradio.Button('Get Plot 🎥', variant='primary')
|
| 447 |
+
PlotBtn.click(Plot, inputs=[PlotExample], outputs=[PlotOutput, PlotExampleOutput])
|
| 448 |
|
| 449 |
with gradio.TabItem('Text Processing 📝'):
|
| 450 |
with gradio.TabItem('Text Reversal 🔄'):
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
|
|
|
| 1 |
pymongo
|
|
|
|
| 1 |
+
transformers
|
| 2 |
pymongo
|