Spaces:
Running
Running
Commit
·
835c83e
1
Parent(s):
55cbf6a
formatted text cleaning
Browse files
app.py
CHANGED
|
@@ -38,12 +38,18 @@ label_mapping = {
|
|
| 38 |
def clean_text(text):
|
| 39 |
|
| 40 |
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
text = re.sub(r"\n\
|
| 43 |
|
| 44 |
-
text = re.sub(r"
|
| 45 |
|
| 46 |
text = text.strip()
|
|
|
|
| 47 |
return text
|
| 48 |
|
| 49 |
def classify_text(text):
|
|
|
|
| 38 |
def clean_text(text: str) -> str:
    """Normalize whitespace and line breaks in free-form text.

    The steps, in order:

    1. Convert ``\\r\\n`` and ``\\r`` line endings to ``\\n``.
    2. Collapse runs of spaces/tabs to a single space.
    3. Drop the (now at most one) space on either side of every newline.
    4. Collapse any run of blank lines into one paragraph break (``\\n\\n``).
    5. Re-join words that were hyphenated across a line break.
    6. Turn the remaining single newlines into spaces.
    7. Strip leading/trailing whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; paragraphs are separated by exactly one blank
        line and contain no internal line breaks.
    """
    # Step 1: unify line endings so later patterns only deal with "\n".
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Steps 2-3: collapse horizontal whitespace *before* touching newlines,
    # then trim it around each newline. Doing this first keeps stray spaces
    # from (a) surviving next to paragraph breaks, (b) blocking the
    # de-hyphenation below, and (c) producing double spaces once single
    # newlines are turned into spaces. Collapsing first also keeps the trim
    # pattern linear (each newline has at most one space on each side).
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" ?\n ?", "\n", text)

    # Step 4: two or more line breaks (optionally separated by other
    # whitespace) become a single paragraph break.
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    # Step 5: remove a hyphen + newline that sits between two word
    # characters. Lookarounds are used instead of capturing the words so
    # that back-to-back breaks ("inter-\nnation-\nal") are all handled and
    # long words do not trigger repeated backtracking.
    text = re.sub(r"(?<=\w)-\n(?=\w)", "", text)

    # Step 6: a newline with no newline on either side is a soft wrap.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    return text.strip()
|
| 54 |
|
| 55 |
def classify_text(text):
|