Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
848b14f
1
Parent(s):
26ddf5d
Add page limitation for PDF url extraction
Browse files
app.py
CHANGED
|
@@ -58,8 +58,9 @@ class Query(BaseModel):
|
|
| 58 |
class DocumentID(BaseModel):
|
| 59 |
doc_id: str
|
| 60 |
|
| 61 |
-
class
|
| 62 |
url: str
|
|
|
|
| 63 |
|
| 64 |
@app.post("/search")
|
| 65 |
async def get_articles(query: Query):
|
|
@@ -113,11 +114,11 @@ async def extract_arxiv_pdf(document: DocumentID):
|
|
| 113 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
| 114 |
|
| 115 |
@app.post("/extract_pdf/url")
|
| 116 |
-
async def extract_pdf(pdf:
|
| 117 |
pdf_req = requests.get(pdf.url)
|
| 118 |
if pdf_req.status_code == 200:
|
| 119 |
pdf_data = BytesIO(pdf_req.content)
|
| 120 |
-
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
| 121 |
pdf_text = " ".join([page.get_text("text") for page in doc])
|
| 122 |
pdf_metadata = doc.metadata
|
| 123 |
print(pdf_metadata)
|
|
|
|
| 58 |
class DocumentID(BaseModel):
|
| 59 |
doc_id: str
|
| 60 |
|
| 61 |
+
class PDF(BaseModel):
|
| 62 |
url: str
|
| 63 |
+
page_num: str = -1
|
| 64 |
|
| 65 |
@app.post("/search")
|
| 66 |
async def get_articles(query: Query):
|
|
|
|
| 114 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
| 115 |
|
| 116 |
@app.post("/extract_pdf/url")
|
| 117 |
+
async def extract_pdf(pdf: PDF):
|
| 118 |
pdf_req = requests.get(pdf.url)
|
| 119 |
if pdf_req.status_code == 200:
|
| 120 |
pdf_data = BytesIO(pdf_req.content)
|
| 121 |
+
doc = fitz.open(stream=pdf_data, filetype="pdf")[:pdf.page_num]
|
| 122 |
pdf_text = " ".join([page.get_text("text") for page in doc])
|
| 123 |
pdf_metadata = doc.metadata
|
| 124 |
print(pdf_metadata)
|