Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
9513d18
1
Parent(s):
5e9984e
Change CRUD: GET -> POST
Browse files
app.py
CHANGED
|
@@ -3,6 +3,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
| 3 |
from contextlib import asynccontextmanager
|
| 4 |
import xml.etree.ElementTree as xmlparser
|
| 5 |
import requests
|
|
|
|
| 6 |
import sys
|
| 7 |
import random
|
| 8 |
import fitz
|
|
@@ -40,12 +41,17 @@ app.add_middleware(
|
|
| 40 |
async def root():
|
| 41 |
return {"message": "API started successfully"}
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
| 46 |
content = {}
|
| 47 |
try:
|
| 48 |
-
arxiv_search_result = requests.get(f"http://export.arxiv.org/api/query?search_query=all:{keyword}&max_results={limit}", verify=False)
|
| 49 |
response = xmlparser.fromstring(arxiv_search_result.text)
|
| 50 |
publications = response.findall(f"{XML_NAMESPACE}entry")
|
| 51 |
for pub in publications:
|
|
@@ -64,10 +70,10 @@ async def get_articles(keyword: str, limit: int):
|
|
| 64 |
except Exception as e:
|
| 65 |
print(f"Error while downloading data : {str(e)}")
|
| 66 |
return {"error": True, "message": str(e)}
|
| 67 |
-
|
| 68 |
-
@app.
|
| 69 |
-
async def extract_text_pdf(
|
| 70 |
-
pdf_req = requests.get(f"http://arxiv.org/pdf/{
|
| 71 |
if pdf_req.status_code == 200:
|
| 72 |
pdf_data = BytesIO(pdf_req.content)
|
| 73 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
|
@@ -100,14 +106,14 @@ async def extract_text_pdf(id_doc: str):
|
|
| 100 |
for title in titles:
|
| 101 |
if title[0] == 1:
|
| 102 |
main_titles.append(title[1])
|
| 103 |
-
return {"pub_id":
|
| 104 |
else:
|
| 105 |
-
print("ID: " +
|
| 106 |
-
print("URL: " + f"http://arxiv.org/pdf/{
|
| 107 |
print("Status code: " + str(pdf_req.status_code))
|
| 108 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
| 109 |
|
| 110 |
-
@app.
|
| 111 |
-
async def extract_random_pdf(
|
| 112 |
-
pubs = await get_articles(
|
| 113 |
return await extract_text_pdf(random.choice(list(pubs["message"].keys())))
|
|
|
|
| 3 |
from contextlib import asynccontextmanager
|
| 4 |
import xml.etree.ElementTree as xmlparser
|
| 5 |
import requests
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
import sys
|
| 8 |
import random
|
| 9 |
import fitz
|
|
|
|
| 41 |
async def root():
|
| 42 |
return {"message": "API started successfully"}
|
| 43 |
|
| 44 |
+
class Query(BaseModel):
|
| 45 |
+
keyword: str
|
| 46 |
+
limit: int
|
| 47 |
+
|
| 48 |
+
# The endpoints below were changed from GET to POST
|
| 49 |
+
@app.post("/search")
|
| 50 |
+
async def get_articles(query: Query):
|
| 51 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
| 52 |
content = {}
|
| 53 |
try:
|
| 54 |
+
arxiv_search_result = requests.get(f"http://export.arxiv.org/api/query?search_query=all:{query.keyword}&max_results={query.limit}", verify=False)
|
| 55 |
response = xmlparser.fromstring(arxiv_search_result.text)
|
| 56 |
publications = response.findall(f"{XML_NAMESPACE}entry")
|
| 57 |
for pub in publications:
|
|
|
|
| 70 |
except Exception as e:
|
| 71 |
print(f"Error while downloading data : {str(e)}")
|
| 72 |
return {"error": True, "message": str(e)}
|
| 73 |
+
|
| 74 |
+
@app.post("/extract")
|
| 75 |
+
async def extract_text_pdf(doc_id: str):
|
| 76 |
+
pdf_req = requests.get(f"http://arxiv.org/pdf/{doc_id}", verify=False)
|
| 77 |
if pdf_req.status_code == 200:
|
| 78 |
pdf_data = BytesIO(pdf_req.content)
|
| 79 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
|
|
|
| 106 |
for title in titles:
|
| 107 |
if title[0] == 1:
|
| 108 |
main_titles.append(title[1])
|
| 109 |
+
return {"pub_id": doc_id, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
|
| 110 |
else:
|
| 111 |
+
print("ID: " + doc_id)
|
| 112 |
+
print("URL: " + f"http://arxiv.org/pdf/{doc_id}")
|
| 113 |
print("Status code: " + str(pdf_req.status_code))
|
| 114 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
| 115 |
|
| 116 |
+
@app.post("/extract/random")
|
| 117 |
+
async def extract_random_pdf(query: Query):
|
| 118 |
+
pubs = await get_articles(query)
|
| 119 |
return await extract_text_pdf(random.choice(list(pubs["message"].keys())))
|