Update utils/preprocessing.py
utils/preprocessing.py (+36 -20)
@@ -1,7 +1,8 @@
 from haystack.nodes.base import BaseComponent
 from haystack.schema import Document
-from haystack.nodes import
+from haystack.nodes import ImageToTextConverter, PDFToTextConverter
 from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
+from pdf2image import convert_from_path
 from typing import Callable, Dict, List, Optional, Text, Tuple, Union
 from typing_extensions import Literal
 import pandas as pd
@@ -9,7 +10,9 @@ import logging
 import re
 import string
 from haystack.pipelines import Pipeline
+import streamlit as st
 
+@st.cache_data
 def useOCR(file_path: str)-> Text:
     """
     Converts image pdfs into text, Using the Farm-haystack[OCR]
@@ -21,13 +24,30 @@ def useOCR(file_path: str)-> Text:
 
     Returns the text file as string.
     """
-
+    # we need pdf file to be first converted into image file
+    # this will create each page as image file
+    images = convert_from_path(pdf_path = file_path)
+    list_ = []
+    # save image file in cache and read them one by one to pass it to OCR
+    for i, pdf in enumerate(images):
+        # Save pages as images in the pdf
+        pdf.save(f'PDF\image_converted_{i+1}.png', 'PNG')
+        list_.append(f'PDF\image_converted_{i+1}.png')
 
-    converter =
+    converter = ImageToTextConverter(remove_numeric_tables=True,
                         valid_languages=["eng"])
-
-
+    # placeholder to collect the text from each page
+    placeholder = []
+    for file in list_:
+        document = converter.convert(
+                    file_path=file, meta=None,
+                    )[0]
 
+        text = document.content
+        placeholder.append(text)
+    # join the text from each page by page separator
+    text = '\x0c'.join(placeholder)
+    return text
 
 
 
@@ -37,13 +57,10 @@ class FileConverter(BaseComponent):
     Converter class, will use internally haystack PDFToTextOCR in case of image
     pdf. Cannot use the FileClassifier from haystack as its doesnt has any
     label/output class for image.
-
     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
     2. https://docs.haystack.deepset.ai/docs/file_converters
     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
     4. https://docs.haystack.deepset.ai/reference/file-converters-api
-
-
     """
 
     outgoing_edges = 1
@@ -84,8 +101,6 @@
 
         documents = []
 
-
-        # encoding is empty, probably should be utf-8
         document = converter.convert(
                       file_path=file_path, meta=None,
                       encoding=encoding, id_hash_keys=id_hash_keys
@@ -101,10 +116,12 @@
         if filtered == "":
             logging.info("Using OCR")
             text = useOCR(file_path)
-
+
         documents.append(Document(content=text,
                                   meta={"name": file_name},
                                   id_hash_keys=id_hash_keys))
+
+
 
         logging.info('file conversion succesful')
         output = {'documents': documents}
@@ -124,7 +141,6 @@ def basic(s:str, remove_punc:bool = False):
 
     """
     Performs basic cleaning of text.
-
     Params
     ----------
     s: string to be processed
@@ -150,6 +166,7 @@ def basic(s:str, remove_punc:bool = False):
 
     return s.strip()
 
+
 def paraLengthCheck(paraList, max_len = 100):
     """
     There are cases where preprocessor cannot respect word limit, when using
@@ -187,15 +204,13 @@ class UdfPreProcessor(BaseComponent):
     class to preprocess the document returned by FileConverter. It will check
     for splitting strategy and splits the document by word or sentences and then
     synthetically create the paragraphs.
-
     1. https://docs.haystack.deepset.ai/docs/preprocessor
     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
-
     """
     outgoing_edges = 1
 
-    def run(self, documents:List[Document], remove_punc:bool=False,
+    def run(self, documents:List[Document], remove_punc:bool=False, apply_clean = True,
             split_by: Literal["sentence", "word"] = 'sentence',
             split_length:int = 2, split_respect_sentence_boundary:bool = False,
             split_overlap:int = 0):
@@ -250,8 +265,11 @@ class UdfPreProcessor(BaseComponent):
             # # basic cleaning before passing it to preprocessor.
             # i = basic(i)
             docs_processed = preprocessor.process([i])
-
-
+            if apply_clean:
+                for item in docs_processed:
+                    item.content = basic(item.content, remove_punc= remove_punc)
+            else:
+                pass
 
         df = pd.DataFrame(docs_processed)
         all_text = " ".join(df.content.to_list())
@@ -275,7 +293,6 @@ def processingpipeline():
     """
     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
     from utils.preprocessing
-
     """
 
     preprocessing_pipeline = Pipeline()
@@ -287,5 +304,4 @@ def processingpipeline():
     preprocessing_pipeline.add_node(component = custom_preprocessor,
                     name ='UdfPreProcessor', inputs=["FileConverter"])
 
-    return preprocessing_pipeline
-
+    return preprocessing_pipeline
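A note on the page-saving loop added to useOCR: the literal backslash in f'PDF\image_converted_{i+1}.png' acts as a path separator only on Windows, and the string survives at all only because '\i' is not a recognized escape sequence. A portable sketch of the same loop; the PDF/ directory name is kept from the original, and the function name pdf_pages_to_images is hypothetical:

    from pathlib import Path
    from typing import List

    from pdf2image import convert_from_path

    def pdf_pages_to_images(file_path: str) -> List[str]:
        # Same behaviour as the loop in useOCR: render each PDF page to a PNG
        # in the PDF/ cache directory and collect the file paths for OCR.
        out_dir = Path("PDF")
        out_dir.mkdir(exist_ok=True)  # the original assumes this directory already exists
        image_paths = []
        for i, page in enumerate(convert_from_path(file_path)):
            out_path = out_dir / f"image_converted_{i + 1}.png"
            page.save(out_path, "PNG")
            image_paths.append(str(out_path))
        return image_paths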
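ImageToTextConverter is farm-haystack's pytesseract-based OCR node; the entries in valid_languages correspond to installed Tesseract language packs. A minimal standalone check of the converter configured above, with page.png as a hypothetical page image:

    from haystack.nodes import ImageToTextConverter

    converter = ImageToTextConverter(remove_numeric_tables=True,
                                     valid_languages=["eng"])
    # convert() returns a list of Document objects; .content holds the OCR text.
    document = converter.convert(file_path="page.png", meta=None)[0]
    print(document.content[:200])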
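The rewritten useOCR can also be exercised on its own. A minimal sketch, assuming farm-haystack[ocr], pdf2image and their system dependencies (Tesseract, Poppler) are installed, that the PDF/ directory the function saves page images into exists, and with scanned.pdf as a hypothetical image-only PDF:

    from utils.preprocessing import useOCR

    text = useOCR("scanned.pdf")  # hypothetical input file

    # Pages were joined with the form-feed character ('\x0c'), so they can
    # be split back out downstream.
    pages = text.split('\x0c')
    print(f"extracted {len(pages)} pages")

Since @st.cache_data memoizes on the function arguments, a second call with the same file_path string returns the cached text without re-running OCR; if the file at that path changes on disk, the stale result is still served until the cache is cleared.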
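Finally, the new apply_clean flag on UdfPreProcessor.run is reachable through Haystack's per-node params. A hypothetical end-to-end invocation: the UdfPreProcessor keys mirror the run() signature above, while the FileConverter keys and the sample.pdf path are assumptions, since FileConverter's full run() signature is not shown in this diff:

    from utils.preprocessing import processingpipeline

    pipeline = processingpipeline()
    result = pipeline.run(
        file_paths=["sample.pdf"],  # hypothetical input document
        params={
            "FileConverter": {"file_path": "sample.pdf", "file_name": "sample.pdf"},
            "UdfPreProcessor": {"apply_clean": True, "remove_punc": False,
                                "split_by": "sentence", "split_length": 2},
        },
    )
    # The output schema depends on UdfPreProcessor.run, which is not shown here.
    print(list(result.keys()))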