Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
07e2819
1
Parent(s):
c2b2088
Fixing missing references case
Browse files
app.py
CHANGED
|
@@ -77,7 +77,7 @@ async def extract_text_pdf(id_doc: str):
|
|
| 77 |
ref_pos = ref_pos.end()
|
| 78 |
|
| 79 |
if ref_pos is not None:
|
| 80 |
-
|
| 81 |
|
| 82 |
def remove_in_betweens(text):
|
| 83 |
removed_brackets = re.sub(r'\[.*?\]', ' ', text)
|
|
@@ -85,9 +85,9 @@ async def extract_text_pdf(id_doc: str):
|
|
| 85 |
return removed_parentheses
|
| 86 |
|
| 87 |
def remove_punctuations(text):
|
| 88 |
-
return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*\-]", '
|
| 89 |
|
| 90 |
-
postprocess_text = remove_in_betweens(
|
| 91 |
postprocess_text = remove_punctuations(postprocess_text)
|
| 92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
| 93 |
titles = doc.get_toc()
|
|
|
|
| 77 |
ref_pos = ref_pos.end()
|
| 78 |
|
| 79 |
if ref_pos is not None:
|
| 80 |
+
pdf_text = pdf_text[:ref_pos - 10]
|
| 81 |
|
| 82 |
def remove_in_betweens(text):
|
| 83 |
removed_brackets = re.sub(r'\[.*?\]', ' ', text)
|
|
|
|
| 85 |
return removed_parentheses
|
| 86 |
|
| 87 |
def remove_punctuations(text):
|
| 88 |
+
return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*\-]", '', text)
|
| 89 |
|
| 90 |
+
postprocess_text = remove_in_betweens(pdf_text)
|
| 91 |
postprocess_text = remove_punctuations(postprocess_text)
|
| 92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
| 93 |
titles = doc.get_toc()
|