Spaces:

myn0908
/

Own-Knowledge-GPT

Build error

App Files Files Community

Own-Knowledge-GPT / bot /web_scrapping /crawler_and_indexer.py

myn0908

own knowledge gpt

d97a6fa about 2 years ago

raw

history blame

3.57 kB

	from bs4 import BeautifulSoup
	from urllib import request
	from bot.web_scrapping.searchable_index import SearchableIndex
	from bot.utils.show_log import logger
	from bot.utils.constanst import set_api_key
	import pandas as pd
	import requests
	import os

	# SECURITY: the API key used to be hard-coded on this line. A key that has
	# been committed to source control must be treated as leaked and revoked.
	# The key is now read from the environment instead.
	# NOTE(review): the variable name OPENAI_API_KEY is an assumption based on
	# the "sk-" key format; confirm it matches what set_api_key expects.
	_api_key = os.environ.get('OPENAI_API_KEY')
	if _api_key:
	    set_api_key(api_key=_api_key)
	else:
	    logger.warning("OPENAI_API_KEY is not set; the API key was not configured.")


	def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
	    """Write scraped content to ``combined_content.<file_format>`` in *output_folder*.

	    Args:
	        url: Source URL. Only used for ``file_format='pdf'``, where the
	            resource is downloaded straight to the target path.
	        text: Iterable of objects exposing a ``.text`` attribute (the caller
	            in this module passes the result of ``soup.find_all``). ``None``
	            is treated as "no items". Ignored for ``'pdf'``.
	        output_folder: Directory that will contain the file. It is not
	            created here; the caller is expected to have done so.
	        file_format: One of ``'txt'``, ``'pdf'``, ``'csv'`` or ``'xml'``.

	    Returns:
	        The path of the target file. The path is returned even when
	        *file_format* is unsupported, in which case only a warning is logged
	        and nothing is written.
	    """
	    # Local import keeps this block self-contained without touching the
	    # module-level import list.
	    from xml.sax.saxutils import escape

	    file_path = os.path.join(output_folder, f"combined_content.{file_format}")
	    # The caller may omit ``text`` (e.g. when nothing was scraped); iterating
	    # ``None`` would raise TypeError, so fall back to an empty sequence.
	    items = text if text is not None else ()

	    if file_format == 'txt':
	        with open(file_path, "a", encoding="utf-8") as file:
	            file.writelines(f'{item.text}\n' for item in items)
	        logger.info(f"Content appended to {file_path}")
	    elif file_format == 'pdf':
	        request.urlretrieve(url, file_path)
	        logger.info(f"Content appended to {file_path}")
	    elif file_format == 'csv':
	        df = pd.DataFrame({'Content': [item.text for item in items]})
	        df.to_csv(file_path, mode='a', index=False, header=False)
	        logger.info(f"Content appended to {file_path}")
	    elif file_format == 'xml':
	        # Escape &, < and > so scraped text cannot break the XML markup.
	        xml_content = ''.join(
	            f'<item>{escape(str(item.text))}</item>' for item in items
	        )
	        with open(file_path, "a", encoding="utf-8") as file:
	            file.write(xml_content)
	        logger.info(f"Content appended to {file_path}")
	    else:
	        logger.warning("Invalid file format. Supported formats: txt, pdf, csv, xml")
	    return file_path


	def content_crawler_and_index(url, file_format='txt', output_folder='learning_documents'):
	    """Fetch *url*, save its content to a temporary file and build an index from it.

	    When *url* is the sentinel string ``'NO_URL'`` nothing is downloaded and
	    the index is built directly from *output_folder*.

	    Args:
	        url: Page or document URL to crawl, or ``'NO_URL'``.
	        file_format: Format handed to ``save_content_to_file``
	            (``'txt'``, ``'pdf'``, ``'csv'`` or ``'xml'``).
	        output_folder: Directory used for the intermediate file; created if
	            it does not exist.

	    Returns:
	        Whatever ``SearchableIndex.embed_index`` returns, or ``None`` when the
	        HTTP request does not answer with status 200.

	    Raises:
	        requests.RequestException: Propagated from ``requests.get`` (including
	            a timeout after 30 seconds).
	    """
	    if url == 'NO_URL':
	        # Nothing to crawl: index the folder as it is.
	        return SearchableIndex.embed_index(url=url, path=output_folder)

	    # Without a timeout an unresponsive server would block this call forever.
	    responses = requests.get(url, timeout=30)
	    if responses.status_code != 200:
	        logger.warning("Failed to retrieve content from the URL.")
	        return None

	    # exist_ok avoids the check-then-create race of a separate os.path.exists.
	    os.makedirs(output_folder, exist_ok=True)

	    soup = BeautifulSoup(responses.text, "html.parser")
	    text = soup.find_all(['h2', 'p', 'i', 'ul'])
	    if not text and file_format != 'pdf':
	        logger.warning("No text elements found at the URL; the saved file will be empty.")

	    # Always forward both the URL and the scraped tags: the 'pdf' format needs
	    # the URL to download from, while the text formats need the (possibly
	    # empty) tag list rather than None.
	    file_path = save_content_to_file(
	        url=url, text=text, output_folder=output_folder, file_format=file_format
	    )
	    try:
	        return SearchableIndex.embed_index(url, file_path)
	    finally:
	        # The file is only an intermediate artifact; remove it even when
	        # indexing fails so stale content is not left behind.
	        if os.path.isfile(file_path):
	            os.remove(file_path)


	if __name__ == '__main__':
	    # Running this module directly does nothing; the commented sketch below
	    # only illustrates how the functions above are meant to be called.
	    #
	    # Crawl an HTML page and index its text:
	    #   idx = content_crawler_and_index("https://www.presight.io/terms-of-use.html", file_format='txt')
	    #
	    # Download a PDF and index it:
	    #   idx = content_crawler_and_index(url='https://arxiv.org/pdf/2309.11235v1.pdf', file_format='pdf')
	    #
	    # Query the resulting index with a chat model (ChatOpenAI is not imported
	    # by this module and would have to be imported first):
	    #   prompt = 'explain the paper'
	    #   llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
	    #   response = SearchableIndex.query(prompt, llm, idx)
	    #   print(response)
	    #   logger.info(response)
	    pass