Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
ca2c7e8
1
Parent(s):
9513d18
Fix extract
Browse files
app.py
CHANGED
|
@@ -45,7 +45,9 @@ class Query(BaseModel):
|
|
| 45 |
keyword: str
|
| 46 |
limit: int
|
| 47 |
|
| 48 |
-
|
|
|
|
|
|
|
| 49 |
@app.post("/search")
|
| 50 |
async def get_articles(query: Query):
|
| 51 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
|
@@ -72,8 +74,8 @@ async def get_articles(query: Query):
|
|
| 72 |
return {"error": True, "message": str(e)}
|
| 73 |
|
| 74 |
@app.post("/extract")
|
| 75 |
-
async def extract_text_pdf(
|
| 76 |
-
pdf_req = requests.get(f"http://arxiv.org/pdf/{doc_id}", verify=False)
|
| 77 |
if pdf_req.status_code == 200:
|
| 78 |
pdf_data = BytesIO(pdf_req.content)
|
| 79 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
|
@@ -106,10 +108,10 @@ async def extract_text_pdf(doc_id: str):
|
|
| 106 |
for title in titles:
|
| 107 |
if title[0] == 1:
|
| 108 |
main_titles.append(title[1])
|
| 109 |
-
return {"pub_id": doc_id, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
|
| 110 |
else:
|
| 111 |
-
print("ID: " + doc_id)
|
| 112 |
-
print("URL: " + f"http://arxiv.org/pdf/{doc_id}")
|
| 113 |
print("Status code: " + str(pdf_req.status_code))
|
| 114 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
| 115 |
|
|
|
|
| 45 |
keyword: str
|
| 46 |
limit: int
|
| 47 |
|
| 48 |
+
class DocumentID(BaseModel):
|
| 49 |
+
doc_id: str
|
| 50 |
+
|
| 51 |
@app.post("/search")
|
| 52 |
async def get_articles(query: Query):
|
| 53 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
|
|
|
| 74 |
return {"error": True, "message": str(e)}
|
| 75 |
|
| 76 |
@app.post("/extract")
|
| 77 |
+
async def extract_text_pdf(document: DocumentID):
|
| 78 |
+
pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
|
| 79 |
if pdf_req.status_code == 200:
|
| 80 |
pdf_data = BytesIO(pdf_req.content)
|
| 81 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
|
|
|
| 108 |
for title in titles:
|
| 109 |
if title[0] == 1:
|
| 110 |
main_titles.append(title[1])
|
| 111 |
+
return {"pub_id": document.doc_id, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": document.doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
|
| 112 |
else:
|
| 113 |
+
print("ID: " + document.doc_id)
|
| 114 |
+
print("URL: " + f"http://arxiv.org/pdf/{document.doc_id}")
|
| 115 |
print("Status code: " + str(pdf_req.status_code))
|
| 116 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
| 117 |
|