Update utils/preprocessing.py
utils/preprocessing.py (+36 -20)
@@ -1,7 +1,8 @@
 from haystack.nodes.base import BaseComponent
 from haystack.schema import Document
-from haystack.nodes import
+from haystack.nodes import ImageToTextConverter, PDFToTextConverter
 from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
+from pdf2image import convert_from_path
 from typing import Callable, Dict, List, Optional, Text, Tuple, Union
 from typing_extensions import Literal
 import pandas as pd
@@ -9,7 +10,9 @@ import logging
 import re
 import string
 from haystack.pipelines import Pipeline
+import streamlit as st
 
+@st.cache_data
 def useOCR(file_path: str)-> Text:
     """
     Converts image pdfs into text, Using the Farm-haystack[OCR]
@@ -21,13 +24,30 @@ def useOCR(file_path: str)-> Text:
 
     Returns the text file as string.
     """
-
+    # we need pdf file to be first converted into image file
+    # this will create each page as image file
+    images = convert_from_path(pdf_path = file_path)
+    list_ = []
+    # save image file in cache and read them one by one to pass it to OCR
+    for i, pdf in enumerate(images):
+        # Save pages as images in the pdf
+        pdf.save(f'PDF\image_converted_{i+1}.png', 'PNG')
+        list_.append(f'PDF\image_converted_{i+1}.png')
 
-    converter =
+    converter = ImageToTextConverter(remove_numeric_tables=True,
                         valid_languages=["eng"])
-
-
+    # placeholder to collect the text from each page
+    placeholder = []
+    for file in list_:
+        document = converter.convert(
+                    file_path=file, meta=None,
+                    )[0]
 
+        text = document.content
+        placeholder.append(text)
+    # join the text from each page by page separator
+    text = '\x0c'.join(placeholder)
+    return text
 
 
 
@@ -37,13 +57,10 @@ class FileConverter(BaseComponent):
     Converter class, will use internally haystack PDFToTextOCR in case of image
     pdf. Cannot use the FileClassifier from haystack as its doesnt has any
     label/output class for image.
-
     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
     2. https://docs.haystack.deepset.ai/docs/file_converters
     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
     4. https://docs.haystack.deepset.ai/reference/file-converters-api
-
-
     """
 
     outgoing_edges = 1
@@ -84,8 +101,6 @@
 
         documents = []
 
-
-        # encoding is empty, probably should be utf-8
         document = converter.convert(
                       file_path=file_path, meta=None,
                       encoding=encoding, id_hash_keys=id_hash_keys
@@ -101,10 +116,12 @@
         if filtered == "":
             logging.info("Using OCR")
             text = useOCR(file_path)
-
+
         documents.append(Document(content=text,
                                   meta={"name": file_name},
                                   id_hash_keys=id_hash_keys))
+
+
 
         logging.info('file conversion succesful')
         output = {'documents': documents}
@@ -124,7 +141,6 @@ def basic(s:str, remove_punc:bool = False):
 
     """
     Performs basic cleaning of text.
-
     Params
     ----------
     s: string to be processed
@@ -150,6 +166,7 @@ def basic(s:str, remove_punc:bool = False):
 
     return s.strip()
 
+
 def paraLengthCheck(paraList, max_len = 100):
     """
     There are cases where preprocessor cannot respect word limit, when using
@@ -187,15 +204,13 @@ class UdfPreProcessor(BaseComponent):
     class to preprocess the document returned by FileConverter. It will check
     for splitting strategy and splits the document by word or sentences and then
     synthetically create the paragraphs.
-
     1. https://docs.haystack.deepset.ai/docs/preprocessor
     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
-
     """
     outgoing_edges = 1
 
-    def run(self, documents:List[Document], remove_punc:bool=False,
+    def run(self, documents:List[Document], remove_punc:bool=False, apply_clean = True,
             split_by: Literal["sentence", "word"] = 'sentence',
             split_length:int = 2, split_respect_sentence_boundary:bool = False,
             split_overlap:int = 0):
@@ -250,8 +265,11 @@ class UdfPreProcessor(BaseComponent):
             # # basic cleaning before passing it to preprocessor.
             # i = basic(i)
             docs_processed = preprocessor.process([i])
-
-
+            if apply_clean:
+                for item in docs_processed:
+                    item.content = basic(item.content, remove_punc= remove_punc)
+            else:
+                pass
 
         df = pd.DataFrame(docs_processed)
         all_text = " ".join(df.content.to_list())
@@ -275,7 +293,6 @@ def processingpipeline():
     """
     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
     from utils.preprocessing
-
     """
 
     preprocessing_pipeline = Pipeline()
@@ -287,5 +304,4 @@ def processingpipeline():
     preprocessing_pipeline.add_node(component = custom_preprocessor,
                     name ='UdfPreProcessor', inputs=["FileConverter"])
 
-    return preprocessing_pipeline
-
+    return preprocessing_pipeline
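A note on the page-saving loop added to useOCR: the literal backslash in f'PDF\image_converted_{i+1}.png' acts as a path separator only on Windows, and the string survives at all only because '\i' is not a recognized escape sequence. A portable sketch of the same loop; the PDF/ directory name is kept from the original, and the function name pdf_pages_to_images is hypothetical:

    from pathlib import Path
    from typing import List

    from pdf2image import convert_from_path

    def pdf_pages_to_images(file_path: str) -> List[str]:
        # Same behaviour as the loop in useOCR: render each PDF page to a PNG
        # in the PDF/ cache directory and collect the file paths for OCR.
        out_dir = Path("PDF")
        out_dir.mkdir(exist_ok=True)  # the original assumes this directory already exists
        image_paths = []
        for i, page in enumerate(convert_from_path(file_path)):
            out_path = out_dir / f"image_converted_{i + 1}.png"
            page.save(out_path, "PNG")
            image_paths.append(str(out_path))
        return image_paths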
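ImageToTextConverter is farm-haystack's pytesseract-based OCR node; the entries in valid_languages correspond to installed Tesseract language packs. A minimal standalone check of the converter configured above, with page.png as a hypothetical page image:

    from haystack.nodes import ImageToTextConverter

    converter = ImageToTextConverter(remove_numeric_tables=True,
                                     valid_languages=["eng"])
    # convert() returns a list of Document objects; .content holds the OCR text.
    document = converter.convert(file_path="page.png", meta=None)[0]
    print(document.content[:200])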
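The rewritten useOCR can also be exercised on its own. A minimal sketch, assuming farm-haystack[ocr], pdf2image and their system dependencies (Tesseract, Poppler) are installed, that the PDF/ directory the function saves page images into exists, and with scanned.pdf as a hypothetical image-only PDF:

    from utils.preprocessing import useOCR

    text = useOCR("scanned.pdf")  # hypothetical input file

    # Pages were joined with the form-feed character ('\x0c'), so they can
    # be split back out downstream.
    pages = text.split('\x0c')
    print(f"extracted {len(pages)} pages")

Since @st.cache_data memoizes on the function arguments, a second call with the same file_path string returns the cached text without re-running OCR; if the file at that path changes on disk, the stale result is still served until the cache is cleared.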
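Finally, the new apply_clean flag on UdfPreProcessor.run is reachable through Haystack's per-node params. A hypothetical end-to-end invocation: the UdfPreProcessor keys mirror the run() signature above, while the FileConverter keys and the sample.pdf path are assumptions, since FileConverter's full run() signature is not shown in this diff:

    from utils.preprocessing import processingpipeline

    pipeline = processingpipeline()
    result = pipeline.run(
        file_paths=["sample.pdf"],  # hypothetical input document
        params={
            "FileConverter": {"file_path": "sample.pdf", "file_name": "sample.pdf"},
            "UdfPreProcessor": {"apply_clean": True, "remove_punc": False,
                                "split_by": "sentence", "split_length": 2},
        },
    )
    # The output schema depends on UdfPreProcessor.run, which is not shown here.
    print(list(result.keys()))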