```python
import os

from utils.preprocessing import processingpipeline


def get_paragraphs(file_path_input):
    # Declare params
    SPLIT_BY = 'word'
    # most models have a max input length of 384/512 tokens
    SPLIT_LENGTH = 100
    # too much overlap can lead to repetitive text,
    # but as a rule of thumb we keep it at ~20% of SPLIT_LENGTH
    SPLIT_OVERLAP = 10
    # the text is cleaned of HTML and other unwanted artifacts;
    # set this to True if you also need to remove punctuation (,.; etc.),
    # which is useful for non-Transformer-based models
    REMOVE_PUNC = False
    # this param is used only when split_by = 'word'
    RESPECT_SENTENCE_BOUNDARY = True

    # initialize the preprocessing pipeline; params for the Preprocessor can be
    # passed inline or via the variables declared above
    prep_pipeline = processingpipeline()
    file_name = os.path.basename(file_path_input)
    output_pre = prep_pipeline.run(
        file_paths=file_path_input,
        params={"FileConverter": {"file_path": file_path_input,
                                  "file_name": file_name},
                "UdfPreProcessor": {"remove_punc": REMOVE_PUNC,
                                    "split_by": SPLIT_BY,
                                    "split_length": SPLIT_LENGTH,
                                    "split_overlap": SPLIT_OVERLAP,
                                    "split_respect_sentence_boundary": RESPECT_SENTENCE_BOUNDARY}})
    par_list = output_pre['paraList']
    return par_list
```
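Keeping `SPLIT_OVERLAP` at roughly 20% of `SPLIT_LENGTH` preserves context across chunk boundaries without duplicating too much text between neighbouring passages.

For reference, here is a minimal usage sketch of `get_paragraphs`. The file name `report.pdf` is a hypothetical example, and the assumption that `paraList` holds plain-text paragraph chunks is inferred from the code above rather than stated elsewhere in this document.

```python
# Minimal usage sketch (assumptions: 'report.pdf' is a placeholder document
# path, and each returned entry is a plain-text chunk of ~100 words with
# ~10 words of overlap to its neighbour).
if __name__ == "__main__":
    paragraphs = get_paragraphs("report.pdf")
    print(f"Extracted {len(paragraphs)} paragraph chunks")
    for i, para in enumerate(paragraphs[:3]):
        print(f"--- chunk {i} ---")
        print(para)
```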