Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.responses import FileResponse | |
| from fastapi.staticfiles import StaticFiles | |
| from contextlib import asynccontextmanager | |
| import xml.etree.ElementTree as xmlparser | |
| import requests | |
| from pydantic import BaseModel | |
| import sys | |
| import random | |
| import fitz | |
| import re,os | |
| from io import BytesIO | |
| from datetime import datetime | |
def receive_signal(signalNumber, frame):
    """Signal handler: report the received signal number and exit.

    The signature is the one required by ``signal.signal``; ``frame`` is
    accepted but not used.
    """
    print(f"Received: {signalNumber!s}")
    sys.exit()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context for the application.

    Installs ``receive_signal`` as the SIGINT handler before the application
    starts serving, then yields control for the lifetime of the app. There is
    no teardown step after the ``yield``.

    The function is wrapped with ``asynccontextmanager`` so that
    ``FastAPI(lifespan=...)`` receives an async context manager rather than a
    bare async generator function.
    """
    # Imported locally, as in the original code, because only this function
    # needs the module.
    import signal

    signal.signal(signal.SIGINT, receive_signal)
    yield
# Application instance; startup/shutdown handling is delegated to `lifespan`.
app = FastAPI(lifespan=lifespan)

# Expose the local "static" directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Origins allowed by the CORS middleware ("*" = any origin).
origins = ["*"]

# NOTE(review): wildcard origins are combined with allow_credentials=True.
# The FastAPI CORS documentation asks for explicit origins when credentials
# are allowed -- confirm this configuration is intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
async def root():
    """Return the HTML page stored at ``templates/index.html``."""
    index_page = os.path.join("templates", "index.html")
    return FileResponse(index_page)
class Query(BaseModel):
    # Parameters of an arXiv search, as consumed by get_articles.
    keyword: str  # search term, used in the arXiv "all:" query
    limit: int  # forwarded to arXiv as "max_results"
class DocumentID(BaseModel):
    # Identifies one arXiv document, as consumed by extract_arxiv_pdf.
    doc_id: str  # appended to "http://arxiv.org/pdf/" to build the PDF URL
class WebPDF(BaseModel):
    # Points at an arbitrary PDF on the web, as consumed by extract_pdf.
    url: str  # address that extract_pdf downloads
async def get_articles(query: Query):
    """Search arXiv and return metadata for every matching publication.

    Args:
        query: ``keyword`` is searched in all arXiv fields (``all:`` prefix)
            and ``limit`` is sent as ``max_results``.

    Returns:
        ``{"error": False, "message": {<arxiv id>: {...}}}`` on success, where
        each entry holds ``title``, ``authors`` (names joined with " and "),
        ``date`` (``dd/mm/YYYY``), ``abstract`` and ``pdf`` (PDF URL).
        ``{"error": True, "message": <str>}`` if the request or the parsing
        fails for any reason.
    """
    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    content = {}
    try:
        # Let requests build the query string so that keywords containing
        # spaces, "&" or "#" are percent-encoded instead of corrupting the URL.
        # NOTE(review): verify=False is kept from the original code; it
        # disables TLS certificate checks if the request is redirected to
        # HTTPS -- confirm it is still required.
        # NOTE(review): requests is synchronous, so this call blocks the
        # event loop while waiting; the timeout bounds that wait.
        arxiv_search_result = requests.get(
            "http://export.arxiv.org/api/query",
            params={
                "search_query": f"all:{query.keyword}",
                "max_results": query.limit,
            },
            verify=False,
            timeout=30,
        )
        # Surface HTTP errors here instead of trying to parse an error page.
        arxiv_search_result.raise_for_status()

        response = xmlparser.fromstring(arxiv_search_result.text)
        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            entry_url = pub.find(f"{XML_NAMESPACE}id").text
            # Keep everything after "/abs/" so that old-style identifiers such
            # as "hep-th/9901001v1" keep their archive prefix; fall back to
            # the last path segment if the URL has an unexpected shape.
            _, separator, after_abs = entry_url.partition("/abs/")
            id_pub = after_abs if separator else entry_url.split("/")[-1]

            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            published = pub.find(f"{XML_NAMESPACE}published").text
            pub_date = datetime.strptime(
                published, "%Y-%m-%dT%H:%M:%SZ"
            ).strftime("%d/%m/%Y")

            content[id_pub] = {
                "title": pub.find(f"{XML_NAMESPACE}title").text,
                "authors": authors,
                "date": pub_date,
                "abstract": pub.find(f"{XML_NAMESPACE}summary").text,
                "pdf": f"http://arxiv.org/pdf/{id_pub}",
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Top-level boundary: every failure is reported to the client as an
        # error payload rather than propagated.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
async def extract_arxiv_pdf(document: DocumentID):
    """Download an arXiv PDF and return its cleaned text and section titles.

    The text of all pages is joined, cut at the bibliography heading when one
    is found, stripped of bracketed/parenthesised spans and of a fixed set of
    punctuation characters, and whitespace-normalised.

    Titles come from the PDF table of contents (levels 1 and 2). When the PDF
    has no table of contents, numbered upper-case headings are looked up in
    the extracted text and reported with level ``-1``.

    Args:
        document: ``doc_id`` is appended to ``http://arxiv.org/pdf/``.

    Returns:
        On success ``{"pub_id", "titles", "text", "error": False}`` where
        ``titles`` is a list of ``(level, title)`` tuples, or the string
        ``"No titles found !"`` when none were detected.
        On download failure ``{"error": True, "message": <str>}``.
    """
    pdf_url = f"http://arxiv.org/pdf/{document.doc_id}"
    try:
        # NOTE(review): verify=False is kept from the original code; confirm
        # it is still required. The timeout prevents an unbounded wait.
        pdf_req = requests.get(pdf_url, verify=False, timeout=60)
    except requests.RequestException as exc:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print(f"Error while downloading PDF : {exc}")
        return {"error": True, "message": f"Error while downloading PDF: {exc}"}

    if pdf_req.status_code != 200:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    def remove_in_betweens(text):
        # Drop "[...]" and "(...)" spans (citations, asides).
        removed_brackets = re.sub(r'\[.*?\]', ' ', text)
        return re.sub(r'\(.*?\)', ' ', removed_brackets)

    def remove_punctuations(text):
        return re.sub(r"[\,\;\:\?\!\'\β\"\(\)\{\}\[\]\/\\\*]", '', text)

    def clean(text):
        # Same pipeline for the body text and for every title.
        cleaned = remove_punctuations(remove_in_betweens(text))
        return re.sub(r"\s+", " ", cleaned).strip()

    # The context manager closes the document once text and TOC are read.
    with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        titles = doc.get_toc()

    # Cut the bibliography. Only a line that consists of an (optionally
    # numbered) "References" heading counts, so in-text mentions of the word
    # and words containing it (e.g. "preferences") no longer truncate the
    # document. The last such heading is used.
    references_heading = (
        r"^[ \t]*(?:(?:[IVX]+|[0-9]+)\.?[ \t]+)?references[ \t]*$"
    )
    heading_matches = list(
        re.finditer(references_heading, pdf_text, re.IGNORECASE | re.MULTILINE)
    )
    if heading_matches:
        pdf_text = pdf_text[:heading_matches[-1].start()]

    postprocess_text = clean(pdf_text)

    regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
    if not titles:
        # The pattern anchors on line ends, so it has to run on the text that
        # still contains its newlines, not on the whitespace-collapsed one.
        main_titles = [
            (-1, found)
            for found in re.findall(regex_titles, pdf_text, flags=re.MULTILINE)
        ]
    else:
        # TOC rows start with (level, title, ...); keep the top two levels.
        main_titles = [(entry[0], entry[1]) for entry in titles if entry[0] in (1, 2)]

    if not main_titles:
        return {"pub_id": document.doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
    return {
        "pub_id": document.doc_id,
        "titles": [(level, clean(title)) for level, title in main_titles],
        "text": postprocess_text,
        "error": False,
    }
async def extract_random_arxiv_pdf(query: Query):
    """Run an arXiv search and extract the PDF of one randomly chosen hit.

    Args:
        query: search parameters, passed unchanged to ``get_articles``.

    Returns:
        The result of ``extract_arxiv_pdf`` for the chosen publication, or an
        ``{"error": True, "message": <str>}`` payload when the search failed
        or returned no publication.
    """
    pubs = await get_articles(query)
    if pubs["error"]:
        # On failure "message" is an error string, not a dict of results.
        return pubs

    doc_ids = list(pubs["message"])
    if not doc_ids:
        # random.choice would raise IndexError on an empty result set.
        return {"error": True, "message": "No publication found for this query"}

    # extract_arxiv_pdf reads `.doc_id`, so the identifier must be wrapped in
    # the request model rather than passed as a bare string.
    return await extract_arxiv_pdf(DocumentID(doc_id=random.choice(doc_ids)))
async def extract_pdf(pdf: WebPDF):
    """Download a PDF from an arbitrary URL and return its title and text.

    Args:
        pdf: ``url`` is the address to download.

    Returns:
        ``{"error": False, "title": <str>, "text": <str>}`` on success; the
        title comes from the PDF metadata and is ``""`` when absent.
        ``{"error": True, "message": <str>}`` when the download fails.
    """
    # NOTE(review): the URL is caller-supplied and fetched server-side;
    # consider restricting the allowed schemes/hosts.
    try:
        pdf_req = requests.get(pdf.url, timeout=60)
    except requests.RequestException as exc:
        # Invalid URL, connection failure, timeout, ...
        print("URL: " + pdf.url)
        print(f"Error while downloading PDF : {exc}")
        return {"error": True, "message": f"Error while downloading PDF: {exc}"}

    if pdf_req.status_code != 200:
        print("URL: " + pdf.url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    # The context manager closes the document once its content is read.
    with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        # Both the metadata dict and its individual values may be None.
        pdf_metadata = doc.metadata or {}

    title = (pdf_metadata.get("title") or "").strip()
    return {"error": False, "title": title, "text": pdf_text}