Spaces:

myn0908
/

Own-Knowledge-GPT

Build error

App Files Files Community

Own-Knowledge-GPT / bot /web_scrapping /single_crawler.py

myn0908

own knowledge gpt

d97a6fa about 2 years ago

raw

history blame contribute delete

2.25 kB

	import os
	import requests
	from bs4 import BeautifulSoup
	import pandas as pd
	from fpdf import FPDF


	def content_crawler(url, file_format='txt', output_file='privacy_policy'):
	    """Download a web page and save its text content to a file.

	    The page at ``url`` is fetched, every ``<h2>``, ``<p>``, ``<i>`` and
	    ``<ul>`` element is collected, and the text of each element is written
	    as one entry to ``../learning_documents/<output_file>.<file_format>``.
	    The directory is resolved relative to the current working directory and
	    is created when missing.

	    Args:
	        url: Address of the page to download.
	        file_format: Output format: ``'txt'``, ``'pdf'``, ``'csv'`` or
	            ``'xml'``.
	        output_file: Output file name without extension.

	    Returns:
	        None. Success and failure are both reported with ``print``; an
	        unsupported ``file_format`` or a non-200 response writes nothing.

	    Raises:
	        Nothing is caught here: errors raised by ``requests.get`` (for
	        example connection failures or the 30 second timeout) and by the
	        file writers propagate to the caller.
	    """
	    # Function-scope import so this block does not depend on the file's
	    # top-level import list.
	    from xml.sax.saxutils import escape

	    # Reject an unknown format before doing any network or disk work.
	    if file_format not in ('txt', 'pdf', 'csv', 'xml'):
	        print("Invalid file format. Supported formats: txt, pdf, csv, xml")
	        return

	    # Without a timeout an unresponsive server would block the caller forever.
	    response = requests.get(url, timeout=30)
	    if response.status_code != 200:
	        print("Failed to retrieve content from the URL.")
	        return

	    soup = BeautifulSoup(response.text, "html.parser")
	    lines = [tag.text for tag in soup.find_all(['h2', 'p', 'i', 'ul'])]

	    output_dir = '../learning_documents'
	    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
	    os.makedirs(output_dir, exist_ok=True)
	    output_path = f"{os.path.join(output_dir, output_file)}.{file_format}"

	    if file_format == 'txt':
	        with open(output_path, "w", encoding="utf-8") as file:
	            file.writelines(f'{line}\n' for line in lines)
	    elif file_format == 'pdf':
	        pdf = FPDF()
	        pdf.set_auto_page_break(auto=True, margin=15)
	        pdf.add_page()
	        pdf.set_font("Arial", "B", 8)
	        for line in lines:
	            # The built-in Arial font only covers Latin-1; characters outside
	            # it (curly quotes, dashes, ...) are replaced with '?' instead of
	            # aborting the export with an encoding error.
	            printable = line.encode('latin-1', 'replace').decode('latin-1')
	            pdf.cell(0, 10, printable, ln=True)
	        pdf.output(output_path)
	    elif file_format == 'csv':
	        pd.DataFrame({'Content': lines}).to_csv(output_path, index=False)
	    else:  # 'xml' -- the only remaining supported format
	        # Escape &, < and > so page text cannot break the XML structure.
	        items = ''.join(f'<item>{escape(line)}</item>' for line in lines)
	        with open(output_path, "w", encoding="utf-8") as file:
	            file.write(f'<root>{items}</root>')

	    print(f"Content saved to {output_path}")


	if __name__ == "__main__":
	    # Running this module as a script deliberately does nothing.
	    # To try the crawler by hand, call for example:
	    #   content_crawler("https://www.presight.io/privacy-policy.html",
	    #                   file_format='pdf', output_file='privacy_policy')
	    pass