Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
5e9984e
1
Parent(s):
a5f46a9
Update formatting method
Browse files
app.py
CHANGED
|
@@ -89,7 +89,8 @@ async def extract_text_pdf(id_doc: str):
|
|
| 89 |
|
| 90 |
postprocess_text = remove_in_betweens(pdf_text)
|
| 91 |
postprocess_text = remove_punctuations(postprocess_text)
|
| 92 |
-
postprocess_text = re.sub(r"\
|
|
|
|
| 93 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
| 94 |
titles = doc.get_toc()
|
| 95 |
main_titles = []
|
|
|
|
| 89 |
|
| 90 |
postprocess_text = remove_in_betweens(pdf_text)
|
| 91 |
postprocess_text = remove_punctuations(postprocess_text)
|
| 92 |
+
postprocess_text = re.sub(r"\s+", " ", postprocess_text)
|
| 93 |
+
postprocess_text = postprocess_text.strip()
|
| 94 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
| 95 |
titles = doc.get_toc()
|
| 96 |
main_titles = []
|