import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os
from markitdown import MarkItDown
from typing import Tuple, List, Optional
import validators

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Initialize MarkItDown
md_converter = MarkItDown()

def validate_url(url: str) -> Tuple[bool, str]:
    """Validate URL format, adding an https:// scheme if one is missing."""
    if not url:
        return False, "URL is required"
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    if not validators.url(url):
        return False, "Invalid URL format"
    return True, url

def safe_crawl(url: str, output_file: str) -> bool:
    """Safely perform a web crawl with timeout and error handling."""
    try:
        adv.crawl(
            url,
            output_file,
            follow_links=False,
            custom_settings={
                # Conservative Scrapy settings: stop the spider after 30s,
                # respect robots.txt, and keep request rate and timeouts low
                "CLOSESPIDER_TIMEOUT": 30,
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
                "USER_AGENT": "Mozilla/5.0 (compatible; LLMContentBot/1.0)",
                "DOWNLOAD_TIMEOUT": 10,
            },
        )
        return True
    except Exception as e:
        logger.error(f"Crawl error for {url}: {str(e)}")
        return False

def clean_text(text: str) -> str:
    """Clean and format text by removing extra whitespace and normalizing spacing."""
    if not text:
        return ""
    # Remove extra whitespace and newlines
    text = re.sub(r"[\n\s]+", " ", text)
    # Split camelCase words
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    # Clean extra spaces
    text = " ".join(text.split())
    return text.strip()

def process_link_pair(url: str, text: str, seen_links: set) -> Optional[str]:
    """Process a single link-text pair and return markdown if valid."""
    if not url or not text:
        return None
    url = url.strip()
    text = clean_text(text)
    if not text or not url or url in seen_links:
        return None
    seen_links.add(url)
    return f"## {text}\n[{text}]({url})"

def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
    """Process links based on selected types with deduplication."""
    try:
        all_links = []
        seen_links = set()  # Track unique URLs
        if "All links" in link_types or not link_types:
            link_df = adv.crawlytics.links(crawl_df)
            for link, text in link_df[["link", "text"]].dropna().values:
                if md_link := process_link_pair(link, text, seen_links):
                    all_links.append(md_link)
        else:
            for link_type in link_types:
                type_match = re.findall(r"header|footer|nav", link_type.lower())
                if type_match:
                    col_prefix = type_match[0]
                    urls = crawl_df[f"{col_prefix}_links_url"].iloc[0]
                    texts = crawl_df[f"{col_prefix}_links_text"].iloc[0]
                    if urls and texts:
                        # The crawl output stores multiple values in one cell, joined by "@@"
                        urls = urls.split("@@")
                        texts = texts.split("@@")
                        for url, text in zip(urls, texts):
                            if md_link := process_link_pair(url, text, seen_links):
                                all_links.append(md_link)
        return "\n\n".join(all_links)
    except Exception as e:
        logger.error(f"Link processing error: {str(e)}")
        return ""

def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
    """Process website URL and generate markdown content."""
    valid, result = validate_url(url)
    if not valid:
        return "", result
    url = result
    # Unique temporary output file so concurrent requests don't collide
    output_file = f"crawl_{token_hex(6)}.jsonl"
    try:
        if not safe_crawl(url, output_file):
            return "", "Crawl failed or timed out"
        crawl_df = pd.read_json(output_file, lines=True)
        if crawl_df.empty:
            return "", "No data found for the URL"
        # Extract and clean title and description
        title = (
            clean_text(crawl_df["title"].iloc[0])
            if "title" in crawl_df.columns
            else "Untitled"
        )
        meta_desc = (
            clean_text(crawl_df["meta_desc"].iloc[0])
            if "meta_desc" in crawl_df.columns
            else ""
        )
        # Process links
        links_content = process_links(crawl_df, link_types)
        # Generate final markdown: H1 title, optional blockquote description, then links
        content = f"# {title}\n\n"
        if meta_desc:
            content += f"> {meta_desc}\n\n"
        content += links_content
        return content, f"Successfully processed {url}"
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
    finally:
        if os.path.exists(output_file):
            os.remove(output_file)

def process_file(file: gr.File) -> Tuple[str, str]:
    """Convert uploaded file to markdown."""
    if not file:
        return "", "No file uploaded"
    supported_extensions = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt"}
    file_ext = os.path.splitext(file.name)[1].lower()
    if file_ext not in supported_extensions:
        return "", f"Unsupported file type: {file_ext}"
    try:
        result = md_converter.convert(file.name)
        return result.text_content, "File processed successfully"
    except Exception as e:
        logger.error(f"File processing error: {str(e)}")
        return "", f"Error processing file: {str(e)}"

# Custom CSS for styling
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""

# Create a custom theme
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)

# Create interface
with gr.Blocks(
    theme=theme,  # Use the custom theme defined above
    css=css,
    head="""
    <link rel="canonical" href="https://wordlift.io/generate-llms-txt/" />
    <meta name="description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
    <meta property="og:title" content="LLMs.txt Generator by WordLift" />
    <meta property="og:description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
    <meta property="og:url" content="https://wordlift.io/generate-llms-txt/" />
    """,
) as iface:
    gr.Markdown("# LLMs.txt Generator")
    with gr.Tab("Website URL"):
        url_input = gr.Textbox(label="Website URL", placeholder="example.com")
        link_types = gr.Dropdown(
            choices=["All links", "<header> links", "<nav> links", "<footer> links"],
            multiselect=True,
            value=["All links"],
            label="Link Types to Extract",
        )
        url_button = gr.Button("Process URL", variant="primary")
        url_output = gr.Textbox(
            label="Generated Content", lines=20, show_copy_button=True
        )
        url_status = gr.Textbox(label="Status")
        url_button.click(
            process_url,
            inputs=[url_input, link_types],
            outputs=[url_output, url_status],
        )
    with gr.Tab("File Converter"):
        file_input = gr.File(label="Upload Document")
        file_button = gr.Button("Convert to Markdown", variant="primary")
        file_output = gr.Textbox(
            label="Converted Content", lines=20, show_copy_button=True
        )
        file_status = gr.Textbox(label="Status")
        file_button.click(
            process_file, inputs=[file_input], outputs=[file_output, file_status]
        )

if __name__ == "__main__":
    iface.launch()
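
# A minimal usage sketch (an assumption: calling the helpers above directly,
# outside the Gradio UI) showing how the generated llms.txt-style content
# could be written to disk. Uncomment to try locally:
#
#     content, status = process_url("example.com", ["All links"])
#     print(status)
#     if content:
#         with open("llms.txt", "w", encoding="utf-8") as f:
#             f.write(content)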