Spaces:

Shreyas094
/

GPT-Researcher

Paused

Upload 528 files

372531f verified 10 months ago

860 Bytes

	from langchain_community.document_loaders import PyMuPDFLoader
	from langchain_community.retrievers import ArxivRetriever


	def scrape_pdf_with_pymupdf(url) -> str:
	    """Scrape the text of a pdf with PyMuPDF.

	    Args:
	        url (str): The url of the pdf to scrape

	    Returns:
	        str: The text content of every loaded document, joined with
	            newlines (an empty string when nothing was loaded).
	    """
	    pages = PyMuPDFLoader(url).load()
	    # ``load()`` returns a list of documents; calling ``str()`` on that list
	    # yields its Python repr (brackets, metadata, escaped newlines) rather
	    # than the pdf text, so extract the text of each document explicitly.
	    return "\n".join(page.page_content for page in pages)


	def scrape_pdf_with_arxiv(query) -> str:
	    """Fetch the text of the top arXiv result for a query.

	    The retriever is created with ``doc_content_chars_max=None`` (no limit
	    on document length) and ``load_max_docs=2``; only the first returned
	    document is used.

	    Args:
	        query (str): The query to search for

	    Returns:
	        str: The text of the first document returned for the query

	    Raises:
	        IndexError: If the retriever returns no documents for the query.
	    """
	    retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None)
	    results = retriever.get_relevant_documents(query=query)
	    if not results:
	        # Keep the exception type that indexing an empty list would raise,
	        # but say which query produced no results.
	        raise IndexError(f"No arXiv documents found for query: {query!r}")
	    return results[0].page_content