Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| import requests | |
| import spaces | |
| from marker.convert import convert_single_pdf | |
| from marker.logger import configure_logging | |
| from marker.models import load_all_models | |
# Import-time setup: call marker's configure_logging(), then load the marker
# models once so the same MARKER_MODEL_LST object is passed to
# convert_single_pdf() on every request instead of being reloaded per call.
| configure_logging() | |
| MARKER_MODEL_LST = load_all_models() | |
def extract_from_pdf(arxiv_id):
    """Download an arXiv PDF and convert it to text with marker.

    Args:
        arxiv_id: arXiv identifier; it is interpolated into
            ``https://arxiv.org/pdf/{arxiv_id}.pdf``.

    Returns:
        The first element returned by ``convert_single_pdf`` (the full text),
        converted with ``max_pages=20``.

    Raises:
        RuntimeError: If the download does not answer with HTTP 200.
        requests.RequestException: On connection errors or timeout.
    """
    # Local import so this block stays self-contained within the file.
    import tempfile

    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # A timeout keeps a stalled connection from blocking the request forever.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code != 200:
        message = f"Failed to download PDF. Status code: {response.status_code}"
        print(message)
        # Stop here: converting without a freshly downloaded file would
        # either fail obscurely or process a stale file.
        raise RuntimeError(message)

    # Use a unique file per call so concurrent requests never share a path.
    tmp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    tmp_pdf = tmp_file.name
    try:
        with tmp_file:
            tmp_file.write(response.content)
        print("PDF downloaded and saved as ", tmp_pdf)

        # Only the text is needed; the other two return values are ignored.
        full_text, _, _ = convert_single_pdf(
            tmp_pdf, MARKER_MODEL_LST, max_pages=20
        )
    finally:
        # Always clean up, even when writing or conversion raises.
        os.remove(tmp_pdf)
        print("Temporary PDF file removed.")
    return full_text
def extract(arxiv_id):
    """Gradio callback: return the extracted text for an arXiv ID as a dict.

    Args:
        arxiv_id: The identifier typed by the user. Surrounding whitespace
            is stripped before use.

    Returns:
        ``{"arxiv_id": ..., "text": ...}`` on success, or
        ``{"error": <message>}`` when the ID is missing/blank or when
        extraction raises.
    """
    # Text typed into a textbox often carries stray whitespace; a blank or
    # padded ID would otherwise be interpolated into the download URL.
    if isinstance(arxiv_id, str):
        arxiv_id = arxiv_id.strip()
    if not arxiv_id:
        return {"error": "ArXiv ID is required"}

    try:
        full_text = extract_from_pdf(arxiv_id)
    except Exception as e:
        # UI boundary: report any failure as a JSON error payload rather
        # than letting the exception propagate out of the callback.
        return {"error": str(e)}
    return {"arxiv_id": arxiv_id, "text": full_text}
# Build the Gradio UI: a textbox for the arXiv id and a JSON panel for the result.
| with gr.Blocks() as app: | |
| # Create an input text box | |
| text_input = gr.Textbox(label="Enter arxiv id") | |
| # Create a JSON output component | |
| output = gr.JSON(label="Extracted text") | |
| # When the input text is submitted, call extract() and display the dict it returns | |
| text_input.submit(extract, inputs=text_input, outputs=output) | |
# Script entry point: enable the request queue and launch the app bound to
# 0.0.0.0 on port 7860, passing show_error=True to launch().
| if __name__ == "__main__": | |
| app.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860) | |