Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,9 +17,8 @@ def length_tokens(txt):
|
|
| 17 |
|
| 18 |
|
| 19 |
def extract_separators_from_string(separators_str):
|
| 20 |
-
print('Received:', type(separators_str), 'with value', repr(separators_str))
|
| 21 |
try:
|
| 22 |
-
separators_str = separators_str.replace("\\n", "\n").replace("\\t", "\t") # fix special characters
|
| 23 |
separators = separators_str[1:-1].split(", ")
|
| 24 |
return [separator.replace('"', "").replace("'", "") for separator in separators]
|
| 25 |
except Exception as e:
|
|
@@ -47,7 +46,6 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
|
|
| 47 |
separator=" ",
|
| 48 |
)
|
| 49 |
elif splitter_selection == LABEL_RECURSIVE:
|
| 50 |
-
print('Splitting with separators:', ',,'.join([repr(el) for el in separators]), f',and chunk length {length} and chunk overlap {chunk_overlap}')
|
| 51 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 52 |
chunk_size=length,
|
| 53 |
chunk_overlap=int(chunk_overlap),
|
|
@@ -55,14 +53,9 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
|
|
| 55 |
strip_whitespace=False,
|
| 56 |
separators=separators,
|
| 57 |
)
|
| 58 |
-
print(text_splitter._separators)
|
| 59 |
splits = text_splitter.create_documents([text])
|
| 60 |
text_splits = [split.page_content for split in splits]
|
| 61 |
-
print('I did splits:')
|
| 62 |
-
print(text_splits)
|
| 63 |
-
|
| 64 |
unoverlapped_text_splits = unoverlap_list(text_splits)
|
| 65 |
-
|
| 66 |
output = [((split[0], 'Overlap') if split[1] else (split[0], f"Chunk {str(i)}")) for i, split in enumerate(unoverlapped_text_splits)]
|
| 67 |
return output
|
| 68 |
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
def extract_separators_from_string(separators_str):
|
|
|
|
| 20 |
try:
|
| 21 |
+
separators_str = separators_str.replace("\\n", "\n").replace("\\t", "\t").replace("\\\\", "\\") # fix special characters
|
| 22 |
separators = separators_str[1:-1].split(", ")
|
| 23 |
return [separator.replace('"', "").replace("'", "") for separator in separators]
|
| 24 |
except Exception as e:
|
|
|
|
| 46 |
separator=" ",
|
| 47 |
)
|
| 48 |
elif splitter_selection == LABEL_RECURSIVE:
|
|
|
|
| 49 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 50 |
chunk_size=length,
|
| 51 |
chunk_overlap=int(chunk_overlap),
|
|
|
|
| 53 |
strip_whitespace=False,
|
| 54 |
separators=separators,
|
| 55 |
)
|
|
|
|
| 56 |
splits = text_splitter.create_documents([text])
|
| 57 |
text_splits = [split.page_content for split in splits]
|
|
|
|
|
|
|
|
|
|
| 58 |
unoverlapped_text_splits = unoverlap_list(text_splits)
|
|
|
|
| 59 |
output = [((split[0], 'Overlap') if split[1] else (split[0], f"Chunk {str(i)}")) for i, split in enumerate(unoverlapped_text_splits)]
|
| 60 |
return output
|
| 61 |
|