Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
577d055
1
Parent(s):
959f2b1
Add POST extract PDF
Browse files
app.py
CHANGED
|
@@ -52,6 +52,9 @@ class Query(BaseModel):
|
|
| 52 |
class DocumentID(BaseModel):
|
| 53 |
doc_id: str
|
| 54 |
|
|
|
|
|
|
|
|
|
|
| 55 |
@app.post("/search")
|
| 56 |
async def get_articles(query: Query):
|
| 57 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
|
@@ -78,8 +81,8 @@ async def get_articles(query: Query):
|
|
| 78 |
print(f"Error while downloading data : {str(e)}")
|
| 79 |
return {"error": True, "message": str(e)}
|
| 80 |
|
| 81 |
-
@app.post("/
|
| 82 |
-
async def
|
| 83 |
pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
|
| 84 |
if pdf_req.status_code == 200:
|
| 85 |
pdf_data = BytesIO(pdf_req.content)
|
|
@@ -121,7 +124,21 @@ async def extract_text_pdf(document: DocumentID):
|
|
| 121 |
print("Status code: " + str(pdf_req.status_code))
|
| 122 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
| 123 |
|
| 124 |
-
@app.post("/
|
| 125 |
-
async def
|
| 126 |
pubs = await get_articles(query)
|
| 127 |
-
return await
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
class DocumentID(BaseModel):
|
| 53 |
doc_id: str
|
| 54 |
|
| 55 |
+
class URL(BaseModel):
    # Request body of POST /extract_pdf/url: the address of the PDF to fetch.
    url: str
|
| 57 |
+
|
| 58 |
@app.post("/search")
|
| 59 |
async def get_articles(query: Query):
|
| 60 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
|
|
|
| 81 |
print(f"Error while downloading data : {str(e)}")
|
| 82 |
return {"error": True, "message": str(e)}
|
| 83 |
|
| 84 |
+
@app.post("/extract_pdf/arxiv_id")
|
| 85 |
+
async def extract_arxiv_pdf(document: DocumentID):
|
| 86 |
pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
|
| 87 |
if pdf_req.status_code == 200:
|
| 88 |
pdf_data = BytesIO(pdf_req.content)
|
|
|
|
| 124 |
print("Status code: " + str(pdf_req.status_code))
|
| 125 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
| 126 |
|
| 127 |
+
@app.post("/extract_pdf/arxiv_id/random")
async def extract_random_arxiv_pdf(query: Query):
    """Search arXiv with ``query`` and extract the PDF of one random result.

    Runs the same search as ``get_articles``, picks one key of the returned
    ``message`` mapping at random and hands it to ``extract_arxiv_pdf``.

    Returns whatever ``extract_arxiv_pdf`` returns, or an
    ``{"error": True, "message": ...}`` dict when the search failed or
    produced no result.
    """
    pubs = await get_articles(query)

    # On failure get_articles returns {"error": True, "message": <str>};
    # forward it unchanged instead of calling .keys() on a string.
    if pubs.get("error"):
        return pubs

    articles = pubs["message"]
    if not articles:
        # random.choice() raises IndexError on an empty sequence.
        return {"error": True, "message": "No article found for this query"}

    # extract_arxiv_pdf reads `document.doc_id`, so the chosen key must be
    # wrapped in the request model rather than passed as a bare string.
    chosen_id = random.choice(list(articles.keys()))
    return await extract_arxiv_pdf(DocumentID(doc_id=chosen_id))
|
| 131 |
+
|
| 132 |
+
@app.post("/extract_pdf/url")
async def extract_pdf(url: URL):
    """Download the PDF located at ``url.url`` and return its title and text.

    Args:
        url: Request body whose ``url`` field holds the address of the PDF.

    Returns:
        ``{"error": False, "title": ..., "text": ...}`` when the download
        answers HTTP 200, otherwise ``{"error": True, "message": ...}``
        carrying the HTTP status code.
    """
    # `url` is the request model; the address itself lives in its `url` field.
    # Handing the model to requests.get() or concatenating it with a str fails.
    target = url.url

    # NOTE(review): verify=False disables TLS certificate verification and
    # `target` is caller-supplied — confirm both are acceptable here.
    pdf_req = requests.get(target, verify=False)
    if pdf_req.status_code == 200:
        pdf_data = BytesIO(pdf_req.content)
        doc = fitz.open(stream=pdf_data, filetype="pdf")
        pdf_text = " ".join([page.get_text("text") for page in doc])
        # Metadata (or any of its values) may be None, so fall back to ""
        # before stripping.
        pdf_metadata = doc.metadata or {}
        title = (pdf_metadata.get("title") or "").strip()
        return {"error": False, "title": title, "text": pdf_text}
    else:
        print("URL: " + target)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|