Spaces:

leavoigt
/

vulnerability

Sleeping

App Files Files Community

leavoigt committed on Aug 1, 2023

Commit

32a1b64

1 Parent(s): 5b4a98a

Create file_processing.py

Browse files

Files changed (1) hide show

file_processing.py +42 -0

file_processing.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import utils
+from utils.preprocessing import processingpipeline
+def get_paragraphs(file_input):
+    # Declare params
+    SPLIT_BY = 'word'
+    # usually models have max-length of 384/512
+    SPLIT_LENGTH = 100
+    # too much overlap can lead to repeatitive text
+    # but as a rule fo thumb we keep (20% of Split Length)
+    SPLIT_OVERLAP = 10
+    # the text is cleaned for removing htmls and other annoying texts
+    # but if you need to remove all punctuations like ,.; etc.
+    # good to use for non-Transformers based models.
+    REMOVE_PUNC = False
+    # This param is used only for split_by ='word'
+    RESPECT_SENTENCE_BOUNDARY = True
+    # initialize the preprocessing pipeline and pass params for Preprocessor either
+    # on go or as per delcared variables above.
+    prep_pipeline  = processingpipeline()
+    output_pre = prep_pipeline.run(file_paths = file_path,
+                          params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
+                                   "UdfPreProcessor": {"remove_punc": REMOVE_PUNC, \
+                                        "split_by": SPLIT_BY, \
+                                        "split_length":SPLIT_LENGTH,\
+                                        "split_overlap": SPLIT_OVERLAP, \
+                                        "split_respect_sentence_boundary":RESPECT_SENTENCE_BOUNDARY}})
+    output_pre.keys()
+    par_list = output_pre['paraList']
+    #print(par_list)
+    return par_list