Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -25,19 +25,19 @@ def extract_separators_from_string(separators_str):
|
|
| 25 |
Please type it in the correct format: "['separator_1', 'separator_2', etc]"
|
| 26 |
""")
|
| 27 |
|
| 28 |
-
def change_split_selection(text, slider_count, split_selection, separator_selection, length_unit_selection):
|
| 29 |
return (
|
| 30 |
gr.Textbox.update(visible=(split_selection==LABEL_RECURSIVE)),
|
| 31 |
-
chunk(text, slider_count, split_selection, separator_selection, length_unit_selection)
|
| 32 |
)
|
| 33 |
|
| 34 |
-
def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
|
| 35 |
separators = extract_separators_from_string(separators_str)
|
| 36 |
length_function = (length_tokens if "token" in length_unit_selection.lower() else len)
|
| 37 |
if splitter_selection == LABEL_TEXTSPLITTER:
|
| 38 |
text_splitter = CharacterTextSplitter(
|
| 39 |
chunk_size=length,
|
| 40 |
-
chunk_overlap=
|
| 41 |
length_function=length_function,
|
| 42 |
strip_whitespace=False,
|
| 43 |
is_separator_regex=False,
|
|
@@ -46,7 +46,7 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
|
|
| 46 |
elif splitter_selection == LABEL_RECURSIVE:
|
| 47 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 48 |
chunk_size=length,
|
| 49 |
-
chunk_overlap=
|
| 50 |
length_function=length_function,
|
| 51 |
strip_whitespace=False,
|
| 52 |
separators=separators,
|
|
@@ -54,7 +54,9 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
|
|
| 54 |
splits = text_splitter.create_documents([text])
|
| 55 |
text_splits = [split.page_content for split in splits]
|
| 56 |
|
| 57 |
-
|
|
|
|
|
|
|
| 58 |
return output
|
| 59 |
|
| 60 |
|
|
@@ -134,6 +136,9 @@ with gr.Blocks(theme=gr.themes.Soft(text_size='lg', font=["monospace"], primary_
|
|
| 134 |
slider_count = gr.Slider(
|
| 135 |
20, 500, value=200, label="Chunk length π", info="In the chosen unit."
|
| 136 |
)
|
|
|
|
|
|
|
|
|
|
| 137 |
out = gr.HighlightedText(
|
| 138 |
label="Output",
|
| 139 |
show_legend=True,
|
|
@@ -141,22 +146,27 @@ with gr.Blocks(theme=gr.themes.Soft(text_size='lg', font=["monospace"], primary_
|
|
| 141 |
)
|
| 142 |
text.change(
|
| 143 |
fn=chunk,
|
| 144 |
-
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
| 145 |
outputs=out,
|
| 146 |
)
|
| 147 |
length_unit_selection.change(
|
| 148 |
fn=chunk,
|
| 149 |
-
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
| 150 |
outputs=out,
|
| 151 |
)
|
| 152 |
split_selection.change(
|
| 153 |
fn=change_split_selection,
|
| 154 |
-
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
| 155 |
outputs=[separator_selection, out],
|
| 156 |
)
|
| 157 |
slider_count.change(
|
| 158 |
fn=chunk,
|
| 159 |
-
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
outputs=out,
|
| 161 |
)
|
| 162 |
demo.launch()
|
|
|
|
| 25 |
Please type it in the correct format: "['separator_1', 'separator_2', etc]"
|
| 26 |
""")
|
| 27 |
|
| 28 |
+
def change_split_selection(text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap):
    """Handle a change of the selected splitter.

    Toggles the visibility of the separators textbox (it is shown only when
    the recursive splitter is selected) and re-chunks the text with the
    current settings.

    Args:
        text: The text to split.
        slider_count: Chunk length, forwarded to ``chunk`` as ``length``.
        split_selection: Label of the selected splitter.
        separator_selection: Separators string, forwarded to ``chunk``.
        length_unit_selection: Selected length unit, forwarded to ``chunk``.
        chunk_overlap: Overlap between chunks, forwarded to ``chunk``.

    Returns:
        A 2-tuple ``(textbox_update, chunk_output)``: an update for the
        separators textbox followed by whatever ``chunk`` returns.
    """
    show_separators = split_selection == LABEL_RECURSIVE
    return (
        # gr.update is component-agnostic and is available in both Gradio 3.x
        # and 4.x; the per-component gr.Textbox.update classmethod was
        # deprecated and later removed.
        gr.update(visible=show_separators),
        chunk(text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap),
    )
|
| 33 |
|
| 34 |
+
def chunk(text, length, splitter_selection, separators_str, length_unit_selection, chunk_overlap):
|
| 35 |
separators = extract_separators_from_string(separators_str)
|
| 36 |
length_function = (length_tokens if "token" in length_unit_selection.lower() else len)
|
| 37 |
if splitter_selection == LABEL_TEXTSPLITTER:
|
| 38 |
text_splitter = CharacterTextSplitter(
|
| 39 |
chunk_size=length,
|
| 40 |
+
chunk_overlap=chunk_overlap,
|
| 41 |
length_function=length_function,
|
| 42 |
strip_whitespace=False,
|
| 43 |
is_separator_regex=False,
|
|
|
|
| 46 |
elif splitter_selection == LABEL_RECURSIVE:
|
| 47 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 48 |
chunk_size=length,
|
| 49 |
+
chunk_overlap=chunk_overlap,
|
| 50 |
length_function=length_function,
|
| 51 |
strip_whitespace=False,
|
| 52 |
separators=separators,
|
|
|
|
| 54 |
splits = text_splitter.create_documents([text])
|
| 55 |
text_splits = [split.page_content for split in splits]
|
| 56 |
|
| 57 |
+
unoverlapped_text_splits = unoverlap_list(text_splits)
|
| 58 |
+
|
| 59 |
+
output = [((split[0], 0) if split[1] else (split[0], str(i+1))) for i, split in enumerate(unoverlapped_text_splits)]
|
| 60 |
return output
|
| 61 |
|
| 62 |
|
|
|
|
| 136 |
slider_count = gr.Slider(
|
| 137 |
20, 500, value=200, label="Chunk length π", info="In the chosen unit."
|
| 138 |
)
|
| 139 |
+
chunk_overlap = gr.Slider(
|
| 140 |
+
0, 30, value=10, label="Overlap between chunks", info="In the chosen unit."
|
| 141 |
+
)
|
| 142 |
out = gr.HighlightedText(
|
| 143 |
label="Output",
|
| 144 |
show_legend=True,
|
|
|
|
| 146 |
)
|
| 147 |
text.change(
|
| 148 |
fn=chunk,
|
| 149 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
| 150 |
outputs=out,
|
| 151 |
)
|
| 152 |
length_unit_selection.change(
|
| 153 |
fn=chunk,
|
| 154 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
| 155 |
outputs=out,
|
| 156 |
)
|
| 157 |
split_selection.change(
|
| 158 |
fn=change_split_selection,
|
| 159 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
| 160 |
outputs=[separator_selection, out],
|
| 161 |
)
|
| 162 |
slider_count.change(
|
| 163 |
fn=chunk,
|
| 164 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
| 165 |
+
outputs=out,
|
| 166 |
+
)
|
| 167 |
+
chunk_overlap.change(
|
| 168 |
+
fn=chunk,
|
| 169 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
| 170 |
outputs=out,
|
| 171 |
)
|
| 172 |
demo.launch()
|