mtyrrell committed on
Commit
01327a1
·
1 Parent(s): 8404bb4

params.cfg

Browse files
Files changed (3) hide show
  1. app/main.py +14 -3
  2. app/utils.py +18 -0
  3. params.cfg +7 -0
app/main.py CHANGED
@@ -17,6 +17,11 @@ import PyPDF2
17
  from docx import Document as DocxDocument
18
  from langchain_text_splitters import RecursiveCharacterTextSplitter
19
 
 
 
 
 
 
20
  # Configure logging
21
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
22
  logger = logging.getLogger(__name__)
@@ -86,12 +91,18 @@ def clean_and_chunk_text(text: str, doc_id: str) -> List[DocumentChunk]:
86
  text = re.sub(r'\s+', ' ', text)
87
  text = text.strip()
88
 
 
 
 
 
 
 
89
  # Split text into chunks
90
  text_splitter = RecursiveCharacterTextSplitter(
91
- chunk_size=700,
92
- chunk_overlap=50,
93
  length_function=len,
94
- separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
95
  is_separator_regex=False
96
  )
97
 
 
17
  from docx import Document as DocxDocument
18
  from langchain_text_splitters import RecursiveCharacterTextSplitter
19
 
20
+ # Local imports
21
+ from .utils import getconfig
22
+
23
+ config = getconfig("params.cfg")
24
+
25
  # Configure logging
26
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
27
  logger = logging.getLogger(__name__)
 
91
  text = re.sub(r'\s+', ' ', text)
92
  text = text.strip()
93
 
94
+ # Get chunking parameters from config
95
+ chunk_size = config.getint('chunking', 'chunk_size', fallback=700)
96
+ chunk_overlap = config.getint('chunking', 'chunk_overlap', fallback=50)
97
+ separators_str = config.get('chunking', 'separators', fallback='\n\n,\n,. ,! ,? , ,')
98
+ separators = [s.strip() for s in separators_str.split(',')]
99
+
100
  # Split text into chunks
101
  text_splitter = RecursiveCharacterTextSplitter(
102
+ chunk_size=chunk_size,
103
+ chunk_overlap=chunk_overlap,
104
  length_function=len,
105
+ separators=separators,
106
  is_separator_regex=False
107
  )
108
 
app/utils.py CHANGED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import configparser
3
+ import logging
4
+
5
+
6
def getconfig(configfile_path: str) -> configparser.ConfigParser:
    """
    Read the config file and return the parsed configuration.

    A parser object is always returned. If the file is missing, unreadable,
    or malformed, a warning is logged and an *empty* parser is returned, so
    callers that use ``config.get(..., fallback=...)`` or
    ``config.getint(..., fallback=...)`` fall back to their defaults instead
    of failing on ``None``.

    Params
    ----------------
    configfile_path: file path of .cfg file

    Returns
    ----------------
    configparser.ConfigParser holding the file's sections, or an empty
    parser when the file could not be loaded.
    """
    config = configparser.ConfigParser()
    try:
        # The context manager guarantees the file handle is closed even when
        # parsing raises.
        with open(configfile_path, encoding="utf-8") as config_file:
            config.read_file(config_file)
    except FileNotFoundError:
        logging.warning("config file not found: %s", configfile_path)
    except (OSError, UnicodeDecodeError, configparser.Error) as err:
        logging.warning(
            "config file %s could not be read: %s", configfile_path, err
        )
    else:
        return config
    # Discard anything that was partially parsed before the failure so the
    # caller sees either the full file or nothing at all.
    return configparser.ConfigParser()
params.cfg CHANGED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [chunking]
2
+ # Size of each text chunk in characters
3
+ chunk_size = 700
4
+ # Overlap between consecutive chunks in characters
5
+ chunk_overlap = 50
6
+ # Text separators for splitting, comma-separated (order of preference)
7
+ separators = \n\n,\n,. ,! ,? , ,