Spaces:
Build error
Build error
| import os | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| from fpdf import FPDF | |
| def content_crawler(url, file_format='txt', output_file='privacy_policy'): | |
| # Send an HTTP GET request to the URL | |
| response = requests.get(url) | |
| # Check if the request was successful | |
| if response.status_code == 200: | |
| # Parse the HTML content using BeautifulSoup | |
| soup = BeautifulSoup(response.text, "html.parser") | |
| text = soup.find_all(['h2', 'p', 'i', 'ul']) | |
| # Create output folder if it doesn't exist | |
| if not os.path.exists('../learning_documents'): | |
| os.makedirs('../learning_documents') | |
| # Save content based on the specified file format | |
| output_path = os.path.join('../learning_documents', output_file) | |
| if file_format == 'txt': | |
| with open(f"{output_path}.txt", "w", encoding="utf-8") as file: | |
| for t in text: | |
| file.write(f'{t.text}\n') | |
| print(f"Content saved to {output_path}.txt") | |
| elif file_format == 'pdf': | |
| pdf = FPDF() | |
| pdf.set_auto_page_break(auto=True, margin=15) | |
| pdf.add_page() | |
| pdf.set_font("Arial", "B", 8) | |
| for t in text: | |
| pdf.cell(0, 10, t.text, ln=True) | |
| pdf.output(f"{output_path}.pdf") | |
| print(f"Content saved to {output_path}.pdf") | |
| elif file_format == 'csv': | |
| df = pd.DataFrame({'Content': [t.text for t in text]}) | |
| df.to_csv(f"{output_path}.csv", index=False) | |
| print(f"Content saved to {output_path}.csv") | |
| elif file_format == 'xml': | |
| xml_content = ''.join([f'<item>{t.text}</item>' for t in text]) | |
| with open(f"{output_path}.xml", "w", encoding="utf-8") as file: | |
| file.write(f'<root>{xml_content}</root>') | |
| print(f"Content saved to {output_path}.xml") | |
| else: | |
| print("Invalid file format. Supported formats: txt, pdf, csv, xml") | |
| else: | |
| print("Failed to retrieve content from the URL.") | |
| if __name__ == '__main__': | |
| pass | |
| # Example usage: | |
| # content_crawler("https://www.presight.io/privacy-policy.html", file_format='pdf', output_file='privacy_policy') | |