import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os
from markitdown import MarkItDown
from typing import Tuple, List, Optional
import validators

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Initialize MarkItDown
md_converter = MarkItDown()

def validate_url(url: str) -> Tuple[bool, str]:
    """Validate URL format, adding an https:// scheme if one is missing."""
    if not url:
        return False, "URL is required"
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    if not validators.url(url):
        return False, "Invalid URL format"
    return True, url

def safe_crawl(url: str, output_file: str) -> bool:
    """Safely perform a web crawl with timeout and error handling."""
    try:
        adv.crawl(
            url,
            output_file,
            follow_links=False,
            custom_settings={
                # Conservative Scrapy settings: stop the spider after 30s,
                # respect robots.txt, and keep request rate and timeouts low
                "CLOSESPIDER_TIMEOUT": 30,
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
                "USER_AGENT": "Mozilla/5.0 (compatible; LLMContentBot/1.0)",
                "DOWNLOAD_TIMEOUT": 10,
            },
        )
        return True
    except Exception as e:
        logger.error(f"Crawl error for {url}: {str(e)}")
        return False

def clean_text(text: str) -> str:
    """Clean and format text by removing extra whitespace and normalizing spacing."""
    if not text:
        return ""
    # Remove extra whitespace and newlines
    text = re.sub(r"[\n\s]+", " ", text)
    # Split camelCase words
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    # Clean extra spaces
    text = " ".join(text.split())
    return text.strip()

def process_link_pair(url: str, text: str, seen_links: set) -> Optional[str]:
    """Process a single link-text pair and return markdown if valid."""
    if not url or not text:
        return None
    url = url.strip()
    text = clean_text(text)
    if not text or not url or url in seen_links:
        return None
    seen_links.add(url)
    return f"## {text}\n[{text}]({url})"

def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
    """Process links based on selected types with deduplication."""
    try:
        all_links = []
        seen_links = set()  # Track unique URLs
        if "All links" in link_types or not link_types:
            link_df = adv.crawlytics.links(crawl_df)
            for link, text in link_df[["link", "text"]].dropna().values:
                if md_link := process_link_pair(link, text, seen_links):
                    all_links.append(md_link)
        else:
            for link_type in link_types:
                type_match = re.findall(r"header|footer|nav", link_type.lower())
                if type_match:
                    col_prefix = type_match[0]
                    urls = crawl_df[f"{col_prefix}_links_url"].iloc[0]
                    texts = crawl_df[f"{col_prefix}_links_text"].iloc[0]
                    if urls and texts:
                        # The crawl output stores multiple values in one cell, joined by "@@"
                        urls = urls.split("@@")
                        texts = texts.split("@@")
                        for url, text in zip(urls, texts):
                            if md_link := process_link_pair(url, text, seen_links):
                                all_links.append(md_link)
        return "\n\n".join(all_links)
    except Exception as e:
        logger.error(f"Link processing error: {str(e)}")
        return ""

def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
    """Process website URL and generate markdown content."""
    valid, result = validate_url(url)
    if not valid:
        return "", result
    url = result
    # Unique temporary output file so concurrent requests don't collide
    output_file = f"crawl_{token_hex(6)}.jsonl"
    try:
        if not safe_crawl(url, output_file):
            return "", "Crawl failed or timed out"
        crawl_df = pd.read_json(output_file, lines=True)
        if crawl_df.empty:
            return "", "No data found for the URL"
        # Extract and clean title and description
        title = (
            clean_text(crawl_df["title"].iloc[0])
            if "title" in crawl_df.columns
            else "Untitled"
        )
        meta_desc = (
            clean_text(crawl_df["meta_desc"].iloc[0])
            if "meta_desc" in crawl_df.columns
            else ""
        )
        # Process links
        links_content = process_links(crawl_df, link_types)
        # Generate final markdown: H1 title, optional blockquote description, then links
        content = f"# {title}\n\n"
        if meta_desc:
            content += f"> {meta_desc}\n\n"
        content += links_content
        return content, f"Successfully processed {url}"
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
    finally:
        if os.path.exists(output_file):
            os.remove(output_file)

def process_file(file: gr.File) -> Tuple[str, str]:
    """Convert uploaded file to markdown."""
    if not file:
        return "", "No file uploaded"
    supported_extensions = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt"}
    file_ext = os.path.splitext(file.name)[1].lower()
    if file_ext not in supported_extensions:
        return "", f"Unsupported file type: {file_ext}"
    try:
        result = md_converter.convert(file.name)
        return result.text_content, "File processed successfully"
    except Exception as e:
        logger.error(f"File processing error: {str(e)}")
        return "", f"Error processing file: {str(e)}"

# Custom CSS for styling
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""

# Create a custom theme
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)

# Create interface
with gr.Blocks(
    theme=theme,  # Use the custom theme defined above
    css=css,
    head="""
    <link rel="canonical" href="https://wordlift.io/generate-llms-txt/" />
    <meta name="description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
    <meta property="og:title" content="LLMs.txt Generator by WordLift" />
    <meta property="og:description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
    <meta property="og:url" content="https://wordlift.io/generate-llms-txt/" />
    """,
) as iface:
    gr.Markdown("# LLMs.txt Generator")
    with gr.Tab("Website URL"):
        url_input = gr.Textbox(label="Website URL", placeholder="example.com")
        link_types = gr.Dropdown(
            choices=["All links", "<header> links", "<nav> links", "<footer> links"],
            multiselect=True,
            value=["All links"],
            label="Link Types to Extract",
        )
        url_button = gr.Button("Process URL", variant="primary")
        url_output = gr.Textbox(
            label="Generated Content", lines=20, show_copy_button=True
        )
        url_status = gr.Textbox(label="Status")
        url_button.click(
            process_url,
            inputs=[url_input, link_types],
            outputs=[url_output, url_status],
        )
    with gr.Tab("File Converter"):
        file_input = gr.File(label="Upload Document")
        file_button = gr.Button("Convert to Markdown", variant="primary")
        file_output = gr.Textbox(
            label="Converted Content", lines=20, show_copy_button=True
        )
        file_status = gr.Textbox(label="Status")
        file_button.click(
            process_file, inputs=[file_input], outputs=[file_output, file_status]
        )

if __name__ == "__main__":
    iface.launch()
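
# A minimal usage sketch (an assumption: calling the helpers above directly,
# outside the Gradio UI) showing how the generated llms.txt-style content
# could be written to disk. Uncomment to try locally:
#
#     content, status = process_url("example.com", ["All links"])
#     print(status)
#     if content:
#         with open("llms.txt", "w", encoding="utf-8") as f:
#             f.write(content)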