Spaces:

m-ric
/

chunk_visualizer

Running

App Files Community

Aymeric Roucher committed on Jan 11, 2024

Commit

cb842ed

verified ·

1 Parent(s): fadcc20

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -46

app.py CHANGED Viewed

@@ -13,41 +13,14 @@ preprocessor = PreProcessor(
     remove_substrings=None,
     max_chars_check=10_000,
 )
-import difflib
-def separate_overlap(s1, s2):
-    for i in range(len(s1) - len(s2), len(s1)):
-        if s1[i:] == s2[: len(s1) - i]:
-            overlap = s1[i:]
-            return [s1[:i], overlap, s2[len(s1) - i :]]
-    # if no overlap is found, return the strings
-    return [s1, s2]
-def extract_overlaps(list):
-    i = 0
-    annotated_list = [[el, i] for i, el in enumerate(list)]
-    while i < len(annotated_list) - 1:
-        separated = separate_overlap(annotated_list[i][0], annotated_list[i + 1][0])
-        if len(separated) == 2:
-            i += 1
-        elif len(separated) == 3:
-            annotated_list[i][0] = separated[0]
-            annotated_list.insert(i + 1, [separated[1], "overlap"])
-            annotated_list[i + 2][0] = separated[2]
-            i += 2
-    return annotated_list
-def chunk(text, words, splitter_selection, slider_overlap):
     if "Word" in splitter_selection:
         splits = preprocessor.split(
             Document(text),
             split_length=words,
             split_by="word",
-            split_overlap=slider_overlap,
             split_respect_sentence_boundary=(
                 "respect sentence boundaries" in splitter_selection
             ),
@@ -57,7 +30,6 @@ def chunk(text, words, splitter_selection, slider_overlap):
         text_splitter = CharacterTextSplitter(
             separator="",
             chunk_size=words,
-            chunk_overlap=slider_overlap,
             length_function=len,
             is_separator_regex=False,
         )
@@ -66,7 +38,6 @@ def chunk(text, words, splitter_selection, slider_overlap):
     elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - vanilla":
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=words,
-            chunk_overlap=slider_overlap,
             length_function=len,
             add_start_index=True,
         )
@@ -75,7 +46,6 @@ def chunk(text, words, splitter_selection, slider_overlap):
     elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - with '.'":
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=words,
-            chunk_overlap=slider_overlap,
             length_function=len,
             add_start_index=True,
             separators=["\n\n", "\n", ".", " ", ""],
@@ -151,13 +121,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     slider_count = gr.Slider(
         20, 500, value=50, label="Count 🧮", info="Chunk size, in the chosen unit."
     )
-    slider_overlap = gr.Slider(
-        0,
-        100,
-        value=0,
-        label="Overlap 🔀",
-        info="Size of overlap between adjacent chunks.",
-    )
     out = gr.HighlightedText(
         label="Output",
         show_legend=True,
@@ -165,22 +128,17 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     )
     text.change(
         fn=chunk,
-        inputs=[text, slider_count, split_selection, slider_overlap],
         outputs=out,
     )
     split_selection.change(
         fn=chunk,
-        inputs=[text, slider_count, split_selection, slider_overlap],
         outputs=out,
     )
     slider_count.change(
         fn=chunk,
-        inputs=[text, slider_count, split_selection, slider_overlap],
-        outputs=out,
-    )
-    slider_overlap.change(
-        fn=chunk,
-        inputs=[text, slider_count, split_selection, slider_overlap],
         outputs=out,
     )
 demo.launch()

     remove_substrings=None,
     max_chars_check=10_000,
 )
+def chunk(text, words, splitter_selection):
     if "Word" in splitter_selection:
         splits = preprocessor.split(
             Document(text),
             split_length=words,
             split_by="word",
             split_respect_sentence_boundary=(
                 "respect sentence boundaries" in splitter_selection
             ),
         text_splitter = CharacterTextSplitter(
             separator="",
             chunk_size=words,
             length_function=len,
             is_separator_regex=False,
         )
     elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - vanilla":
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=words,
             length_function=len,
             add_start_index=True,
         )
     elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - with '.'":
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=words,
             length_function=len,
             add_start_index=True,
             separators=["\n\n", "\n", ".", " ", ""],
     slider_count = gr.Slider(
         20, 500, value=50, label="Count 🧮", info="Chunk size, in the chosen unit."
     )
     out = gr.HighlightedText(
         label="Output",
         show_legend=True,
     )
     text.change(
         fn=chunk,
+        inputs=[text, slider_count, split_selection],
         outputs=out,
     )
     split_selection.change(
         fn=chunk,
+        inputs=[text, slider_count, split_selection],
         outputs=out,
     )
     slider_count.change(
         fn=chunk,
+        inputs=[text, slider_count, split_selection],
         outputs=out,
     )
 demo.launch()