Spaces:

OrganizedProgrammers
/

TokenCounter

Runtime error

App Files Files Community

Almaatla committed on Jul 22, 2023

Commit

14a6f5b

1 Parent(s): 6cfb094

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -40

app.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import gradio as gr
 from PyPDF4 import PdfFileReader
 import tiktoken
@@ -10,71 +12,88 @@ def extract_text_from_pdf(file_path):
             text += pdf.getPage(page_num).extractText()
     return text
-def tokenize(text,model="gpt-3.5-turbo"):
     tokenizer = tiktoken.encoding_for_model(model)
-    tokens = tokenizer.encode(
-        text,
-        disallowed_special=()
-    )
     return tokens
 def count_tokens(text):
     return len(tokenize(text))
-def count_tokens_in_file(file):
-    # Extract text from the PDF file
     paper_text = extract_text_from_pdf(file.name)
-    return count_tokens(paper_text)
 def chunk_text(text, max_char, overlap):
     chunks = []
     start = 0
     end = max_char
-    print(f"max char: {max_char}")
     while start < len(text):
         if end >= len(text):
             end = len(text)
         chunk = text[start:end]
-        print(f"chunk[{start}:{end}] size: {count_tokens(chunk)} tokens")
-        chunks.append(chunk)
         start += max_char - overlap
         end = start + max_char
     return chunks
-def chunk_file(file, max_char,overlap):
-    # Extract text from the PDF file
     text = extract_text_from_pdf(file.name)
     chunks = chunk_text(text, max_char, overlap)
-    return '\n\n[xxxxxxxxxxxxxxxxx]\n\n'.join(chunks)
 with gr.Blocks() as demo:
-    gr.Markdown("Upload your document to count their tokens")
-    with gr.Tab("Upload PDF"):
-        docs_input = gr.File(file_count="single", file_types=[".pdf"])
-        tb_tokenCount = gr.Textbox(label='Number of tokens')
-        docs_input.upload(count_tokens_in_file,inputs=[docs_input],outputs=[tb_tokenCount])
-        sl_max_char_per_chunk = gr.Slider(1000, 30000, value=2000, label="Number of characters", info="Choose a number of characters per chunk")
-        sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
-        btn_chunk = gr.Button("Chunk text")
-        tb_chunked_text = gr.Textbox(label='Result')
-        btn_chunk.click(chunk_file,inputs=[docs_input,sl_max_char_per_chunk,sl_overlap],outputs=[tb_chunked_text])
-    with gr.Tab("Text"):
-        text_input = gr.Textbox(label='Insert your text here')
-        text_tb_tokenCount = gr.Textbox(label='Number of tokens')
-        text_input.change(count_tokens,inputs=[text_input],outputs=[text_tb_tokenCount])
-        text_sl_max_char_per_chunk = gr.Slider(1000, 30000, value=2000, label="Number of characters", info="Choose a number of characters per chunk")
-        text_sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
-        text_btn_chunk = gr.Button("Chunk text")
-        text_tb_chunked_text = gr.Textbox(label='Result')
-        def format_chunks(text,max_char,overlap):
-            return '\n\n[xxxxxxxxxxxxxxxx]\n\n'.join(chunk_text(text,max_char,overlap))
-        text_btn_chunk.click(format_chunks,
-                             inputs=[text_input,text_sl_max_char_per_chunk,text_sl_overlap],
-                             outputs=[text_tb_chunked_text])
-#demo.queue()
-demo.launch(debug=True,share=False)

+import os
+import zipfile
 import gradio as gr
 from PyPDF4 import PdfFileReader
 import tiktoken
             text += pdf.getPage(page_num).extractText()
     return text
def tokenize(text, model="gpt-3.5-turbo"):
    """Encode ``text`` into token ids with the tiktoken encoding for ``model``.

    ``disallowed_special=()`` switches off tiktoken's special-token check, so
    text that looks like a special token is encoded as ordinary text instead
    of raising.
    """
    encoding = tiktoken.encoding_for_model(model)
    return encoding.encode(text, disallowed_special=())
 def count_tokens(text):
     return len(tokenize(text))
def analyse_text(text):
    """Summarise ``text`` as a short report: length, tokens, chars per token.

    Args:
        text: The text to analyse; may be empty or ``None``.

    Returns:
        ``'no text'`` when there is nothing to analyse, otherwise three
        newline-separated lines: "Text length", "Token counts" and
        "Char per token".
    """
    # Nothing to tokenize: answer directly rather than dividing by zero below.
    if not text:
        return 'no text'
    num_tokens = count_tokens(text)
    # Guard the ratio explicitly instead of relying on a blanket except.
    if num_tokens == 0:
        return 'no text'
    return '\n'.join([
        f"Text length: {len(text)}",
        f"Token counts: {num_tokens}",
        f"Char per token: {len(text) / num_tokens:.1f}",
    ])
def analyse_file(file):
    """Return the text extracted from the PDF located at ``file.name``."""
    return extract_text_from_pdf(file.name)
def write_chunks_to_files(chunks):
    """Write each chunk to ``chunk_<n>.txt`` in the current working directory.

    Args:
        chunks: Iterable of strings; files are numbered from 1 in order.

    Returns:
        List of the file paths written, in chunk order.
    """
    file_paths = []
    for i, chunk in enumerate(chunks, start=1):
        file_path = f"chunk_{i}.txt"
        # Explicit UTF-8 so non-ASCII text is written the same way on every
        # platform instead of depending on the locale's default encoding.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        file_paths.append(file_path)
    return file_paths
def write_chunks_to_zip(chunks):
    """Pack ``chunks`` into ``chunks.zip`` as ``chunk_1.txt``, ``chunk_2.txt``, ...

    Entries are written straight into the archive, so no intermediate
    ``chunk_<n>.txt`` files are created on disk and none can be left behind
    if writing the archive fails part-way.

    Args:
        chunks: Iterable of strings; entries are numbered from 1 in order.

    Returns:
        The archive's path, ``"chunks.zip"``, relative to the current
        working directory.
    """
    zip_file_name = "chunks.zip"
    with zipfile.ZipFile(zip_file_name, 'w') as zipf:
        for i, chunk in enumerate(chunks, start=1):
            # writestr encodes str data as UTF-8.
            zipf.writestr(f"chunk_{i}.txt", chunk)
    return zip_file_name
 def chunk_text(text, max_char, overlap):
     chunks = []
     start = 0
     end = max_char
     while start < len(text):
         if end >= len(text):
             end = len(text)
         chunk = text[start:end]
+        num_tokens = count_tokens(chunk)
+        chunks.append((chunk, len(chunk), num_tokens))
         start += max_char - overlap
         end = start + max_char
     return chunks
def chunk_file(file, max_char, overlap):
    """Extract the text of the PDF at ``file.name``, then chunk and zip it.

    Delegates to ``chunk_and_zip_text`` so the file-based and text-based
    entry points share a single implementation of the report and archive.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF.
        max_char: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        Tuple ``(summary, zip_path)``: one "Chunk[i]: Size: ..." line per
        chunk joined with newlines, and the path of the zip archive.
    """
    text = extract_text_from_pdf(file.name)
    return chunk_and_zip_text(text, max_char, overlap)
def chunk_and_zip_text(text, max_char, overlap):
    """Chunk ``text``, zip the chunks, and describe each chunk in one line.

    Returns:
        Tuple ``(summary, zip_path)``: the newline-joined per-chunk size
        lines and the path returned by ``write_chunks_to_zip``.
    """
    pieces = chunk_text(text, max_char, overlap)
    summary_lines = []
    bodies = []
    for index, piece in enumerate(pieces, start=1):
        # Each piece is (chunk text, character count, token count).
        body, token_count = piece[0], piece[2]
        summary_lines.append(
            f"Chunk[{index}]: Size: {len(body)} chars, {token_count} tokens"
        )
        bodies.append(body)
    archive_path = write_chunks_to_zip(bodies)
    return '\n'.join(summary_lines), archive_path
 with gr.Blocks() as demo:
+    docs_input = gr.File(file_count="single", file_types=[".pdf"])
+    text_to_chunk = gr.Textbox(label='Text to chunk',show_copy_button=True)
+    tb_analysis = gr.Textbox(label='Text Analysis')
+    sl_max_char_per_chunk = gr.Slider(1000, 300000, value=10000, label="Number of characters", info="Choose a number of characters per chunk")
+    sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
+    btn_chunk = gr.Button("Chunk text")
+    tb_chunked_text = gr.Textbox(label='Chunks Info')
+    download_link = gr.File(label='Download Chunks')
+    # Call analyse_file when a file is uploaded and display the results in tb_analysis
+    docs_input.upload(analyse_file,inputs=[docs_input], outputs=[text_to_chunk])
+    text_to_chunk.change(analyse_text,inputs=[text_to_chunk],outputs=[tb_analysis])
+    btn_chunk.click(chunk_and_zip_text, inputs=[text_to_chunk, sl_max_char_per_chunk, sl_overlap], outputs=[tb_chunked_text, download_link])
+demo.launch(debug=True, share=False)