Spaces:

m-ric
/

chunk_visualizer

Running

App Files Files Community

m-ric committed on Feb 15, 2024

Commit

1fa958e

verified ·

1 Parent(s): e13bfd4

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -34

app.py CHANGED Viewed

@@ -10,6 +10,10 @@ LABEL_RECURSIVE = "🦜🔗 LangChain's RecursiveCharacterTextSplitter"
 bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
 def extract_separators_from_string(separators_str):
     try:
         separators = separators_str[1:-1].split(", ")
@@ -31,42 +35,24 @@ def change_split_selection(text, slider_count, split_selection, separator_select
 def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
     separators = extract_separators_from_string(separators_str)
     print(splitter_selection, length_unit_selection.lower())
     if splitter_selection == LABEL_TEXTSPLITTER:
-        if "token" in length_unit_selection.lower():
-            text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
-                bert_tokenizer,
-                separator="",
-                chunk_size=length,
-                chunk_overlap=0,
-                is_separator_regex=False,
-            )
-        else:
-            text_splitter = CharacterTextSplitter(
-                separator="",
-                chunk_size=length,
-                chunk_overlap=0,
-                length_function=len,
-                is_separator_regex=False,
-            )
     elif splitter_selection == LABEL_RECURSIVE:
-        if "token" in length_unit_selection.lower():
-            text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
-                bert_tokenizer,
-                chunk_size=length,
-                chunk_overlap=0,
-                add_start_index=True,
-                strip_whitespace=False,
-                separators=separators,
-            )
-        else:
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=length,
-                chunk_overlap=0,
-                length_function=len,
-                add_start_index=True,
-                strip_whitespace=False,
-                separators=separators,
-            )
     splits = text_splitter.create_documents([text])
     text_splits = [split.page_content for split in splits]

 bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
+def length_tokens(txt):
+    return len(bert_tokenizer.tokenize(txt))
 def extract_separators_from_string(separators_str):
     try:
         separators = separators_str[1:-1].split(", ")
 def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
     separators = extract_separators_from_string(separators_str)
     print(splitter_selection, length_unit_selection.lower())
+    length_function = (length_tokens if "token" in length_unit_selection.lower() else len)
     if splitter_selection == LABEL_TEXTSPLITTER:
+        text_splitter = CharacterTextSplitter(
+            chunk_size=length,
+            chunk_overlap=0,
+            length_function=length_function,
+            strip_whitespace=False,
+            is_separator_regex=False,
+            separator="",
+        )
     elif splitter_selection == LABEL_RECURSIVE:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=length,
+            chunk_overlap=0,
+            length_function=length_function,
+            strip_whitespace=False,
+            separators=separators,
+        )
     splits = text_splitter.create_documents([text])
     text_splits = [split.page_content for split in splits]