Spaces:

rbiswasfc
/

arxiv-extract-from-pdf

Sleeping

arxiv-extract-from-pdf / app.py

zero gpu

2a910d7 about 1 year ago

1.67 kB

	import os
	import tempfile

	import gradio as gr
	import requests
	import spaces
	from marker.convert import convert_single_pdf
	from marker.logger import configure_logging
	from marker.models import load_all_models

	# Set up marker's logging before anything else runs
	# (NOTE(review): exact configuration lives in marker.logger — confirm there).
	configure_logging()
	# Load the marker models once, at import time; extract_from_pdf passes this
	# same object to every convert_single_pdf call instead of reloading per request.
	MARKER_MODEL_LST = load_all_models()


	@spaces.GPU
	def extract_from_pdf(arxiv_id):
	    """Download an arXiv paper as PDF and convert it to text with marker.

	    Args:
	        arxiv_id: arXiv identifier; it is interpolated into
	            ``https://arxiv.org/pdf/{arxiv_id}.pdf``.

	    Returns:
	        The first element of the tuple returned by ``convert_single_pdf``
	        (the converted text), with conversion capped at ``max_pages=20``.

	    Raises:
	        requests.RequestException: if the HTTP request itself fails or
	            times out.
	        RuntimeError: if arXiv answers with a status code other than 200.
	    """
	    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
	    # A timeout keeps a stalled connection from hanging the request forever.
	    response = requests.get(pdf_url, timeout=60)
	    if response.status_code != 200:
	        message = f"Failed to download PDF. Status code: {response.status_code}"
	        print(message)
	        # Stop here: there is no PDF to convert, so continuing would only
	        # fail later with a less helpful error (or read a stale file).
	        raise RuntimeError(message)

	    # Use a unique temporary file per call so overlapping requests cannot
	    # overwrite or delete each other's download.
	    fd, tmp_pdf = tempfile.mkstemp(suffix=".pdf")
	    try:
	        with os.fdopen(fd, "wb") as file:
	            file.write(response.content)
	        print("PDF downloaded and saved as ", tmp_pdf)

	        full_text, _doc_images, _out_meta = convert_single_pdf(
	            tmp_pdf, MARKER_MODEL_LST, max_pages=20
	        )
	    finally:
	        # Always clean up, even when writing or conversion raises.
	        os.remove(tmp_pdf)
	        print("Temporary PDF file removed.")

	    return full_text


	def extract(arxiv_id):
	    """Gradio callback: extract the text of an arXiv paper.

	    Args:
	        arxiv_id: arXiv identifier as typed by the user. Leading and
	            trailing whitespace is ignored.

	    Returns:
	        ``{"arxiv_id": ..., "text": ...}`` on success, or
	        ``{"error": ...}`` when the id is missing/blank or extraction fails.
	        This function never raises; failures are reported in the dict.
	    """
	    # Normalise first so "  2301.00001 " works and a whitespace-only value
	    # is rejected instead of being sent to arXiv as part of the URL.
	    cleaned_id = str(arxiv_id).strip() if arxiv_id else ""
	    if not cleaned_id:
	        return {"error": "ArXiv ID is required"}

	    try:
	        full_text = extract_from_pdf(cleaned_id)
	    except Exception as e:
	        # UI boundary: surface any failure as a JSON error payload rather
	        # than letting the exception propagate out of the callback.
	        return {"error": str(e)}

	    return {"arxiv_id": cleaned_id, "text": full_text}


	# Build the Gradio UI: a single text box as input and a JSON panel as output.
	with gr.Blocks() as app:
	# Text box where the user types the arXiv identifier
	text_input = gr.Textbox(label="Enter arxiv id")

	# JSON component that displays the dict returned by extract()
	output = gr.JSON(label="Extracted text")

	# When the input text is submitted, call extract() and display its result
	text_input.submit(extract, inputs=text_input, outputs=output)


	# Start the server only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	# queue() is applied before launch(); the server is told to listen on
	# 0.0.0.0 (all interfaces), port 7860, with show_error=True.
	app.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)