Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
a13fabc
1
Parent(s):
0bf43b3
Add postprocessed text
Browse files
app.py
CHANGED
|
@@ -13,6 +13,13 @@ import re,os
|
|
| 13 |
from io import BytesIO
|
| 14 |
from datetime import datetime
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
def receive_signal(signalNumber, frame):
|
| 18 |
print('Received:', signalNumber)
|
|
@@ -95,14 +102,6 @@ async def extract_arxiv_pdf(document: DocumentID):
|
|
| 95 |
if ref_pos is not None:
|
| 96 |
pdf_text = pdf_text[:ref_pos - 10]
|
| 97 |
|
| 98 |
-
def remove_in_betweens(text):
|
| 99 |
-
removed_brackets = re.sub(r'\[.*?\]', ' ', text)
|
| 100 |
-
removed_parentheses = re.sub(r'\(.*?\)', ' ', removed_brackets)
|
| 101 |
-
return removed_parentheses
|
| 102 |
-
|
| 103 |
-
def remove_punctuations(text):
|
| 104 |
-
return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*]", '', text)
|
| 105 |
-
|
| 106 |
postprocess_text = remove_in_betweens(pdf_text)
|
| 107 |
postprocess_text = remove_punctuations(postprocess_text)
|
| 108 |
postprocess_text = re.sub(r"\s+", " ", postprocess_text)
|
|
@@ -137,7 +136,13 @@ async def extract_pdf(pdf: WebPDF):
|
|
| 137 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
| 138 |
pdf_text = " ".join([page.get_text("text") for page in doc])
|
| 139 |
pdf_metadata = doc.metadata
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
else:
|
| 142 |
print("URL: " + pdf.url)
|
| 143 |
print("Status code: " + str(pdf_req.status_code))
|
|
|
|
| 13 |
from io import BytesIO
|
| 14 |
from datetime import datetime
|
| 15 |
|
| 16 |
+
def remove_in_betweens(text):
|
| 17 |
+
removed_brackets = re.sub(r'\[.*?\]', ' ', text)
|
| 18 |
+
removed_parentheses = re.sub(r'\(.*?\)', ' ', removed_brackets)
|
| 19 |
+
return removed_parentheses
|
| 20 |
+
|
| 21 |
+
def remove_punctuations(text):
|
| 22 |
+
return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*]", '', text)
|
| 23 |
|
| 24 |
def receive_signal(signalNumber, frame):
|
| 25 |
print('Received:', signalNumber)
|
|
|
|
| 102 |
if ref_pos is not None:
|
| 103 |
pdf_text = pdf_text[:ref_pos - 10]
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
postprocess_text = remove_in_betweens(pdf_text)
|
| 106 |
postprocess_text = remove_punctuations(postprocess_text)
|
| 107 |
postprocess_text = re.sub(r"\s+", " ", postprocess_text)
|
|
|
|
| 136 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
| 137 |
pdf_text = " ".join([page.get_text("text") for page in doc])
|
| 138 |
pdf_metadata = doc.metadata
|
| 139 |
+
print(pdf_metadata)
|
| 140 |
+
|
| 141 |
+
postprocess_text = remove_in_betweens(pdf_text)
|
| 142 |
+
postprocess_text = remove_punctuations(postprocess_text)
|
| 143 |
+
postprocess_text = re.sub(r"\s+", " ", postprocess_text)
|
| 144 |
+
postprocess_text = postprocess_text.strip()
|
| 145 |
+
return {"error": False, "title": pdf_metadata.get("title", "").strip(), "text": postprocess_text}
|
| 146 |
else:
|
| 147 |
print("URL: " + pdf.url)
|
| 148 |
print("Status code: " + str(pdf_req.status_code))
|