Update digestor.py
digestor.py  CHANGED  +2 -2
@@ -159,10 +159,10 @@ class Digestor:
         # Finally, chunk the piece, adjusting the chunks if too long.
         for i, j in range_list:
             if (tokenized_len := len(tokenizer(chunk := ' '.join(fractured[i:j])))) <= self.token_limit: # d[i:j]).replace('\n',' ')))) <= self.token_limit:
-                chunk_list.append(chunk
+                chunk_list.append(chunk)
             else: # if chunks of <limit> words are too long, back them off.
                 chunk_list.append(' '.join(chunk.split(' ')[: self.token_limit - tokenized_len ])) # tokenized_len ]).replace('\n',' '))
-
+        chunk_list = [i.replace(' . ','. ') for i in chunk_list]
         return chunk_list
 
 
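The back-off in the else branch works through a negative slice: when tokenized_len exceeds self.token_limit, the bound self.token_limit - tokenized_len is negative, so the slice keeps all but that many trailing words of the chunk. A minimal standalone sketch of that behaviour, using a made-up whitespace tokenizer and token limit in place of Digestor's real tokenizer and self.token_limit:

# Minimal sketch of the back-off in the else branch above.
# The whitespace tokenizer and token_limit here are stand-ins chosen
# only to make the example runnable, not Digestor's actual ones.

def tokenizer(text):
    return text.split()

token_limit = 8
chunk = "one two three four five six seven eight nine ten"

tokenized_len = len(tokenizer(chunk))  # 10 tokens with this toy tokenizer
if tokenized_len > token_limit:
    # token_limit - tokenized_len == -2, so the slice keeps everything
    # except the last two words, backing the chunk off toward the limit.
    chunk = ' '.join(chunk.split(' ')[: token_limit - tokenized_len])

print(chunk)  # one two three four five six seven eight

Because the toy tokenizer and the whitespace split agree here, the trimmed chunk lands exactly on the limit; in general the slice trims one word per excess token, which is the rough back-off the original comment describes.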