Spaces:

HF-test-lab
/

bulk_embeddings

Runtime error

App Files Files Community

nbroad committed on Jul 22, 2023

Commit

f7ff38b

1 Parent(s): 2492d76

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -18

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
-from utils import load_hf_dataset, get_model_and_tokenizer, batch_embed, download_wikipedia
 # TODO: add instructor models
 # "hkunlp/instructor-xl",
@@ -40,23 +40,35 @@ optimization_options = list(opt2desc.values())
-def download(
     ds_name,
     ds_config,
     ds_split,
     num2skip,
     num2embed,
-    progress=gr.Progress(),
 ):
-    if progress is not None:
-        progress(0.5, "Loading dataset...")
-    if ds_name == "wikipedia":
-        ds = download_wikipedia(ds_name, ds_config, num2skip, num2embed)
-    else:
-        ds = load_hf_dataset(ds_name, ds_config, ds_split)
-    return f"Downloaded! It has {len(ds)} docs."
@@ -71,11 +83,10 @@ def embed(
     new_dataset_id,
     num2skip,
     num2embed,
-    progress=gr.Progress(),
 ):
-    if progress is not None:
-        progress(0.5, "Loading dataset...")
-    ds = load_hf_dataset(ds_name, ds_config, ds_split)
     opt_level = desc2opt[opt_desc]
@@ -104,6 +115,9 @@ def embed(
 with gr.Blocks(title="Bulk embeddings") as demo:
     gr.Markdown(
         """
         This Space allows you to embed a large dataset easily. For instance, this can easily create vectors for Wikipedia \
         articles -- taking about __ hours and costing approximately $__.
         This utilizes state-of-the-art open-source embedding models, \
@@ -118,6 +132,7 @@ with gr.Blocks(title="Bulk embeddings") as demo:
           - Text splitting options
           - More control about which rows to embed (skip some, stop early)
           - Dynamic padding
         ## Steps
         1. Upload the dataset to the Hugging Face Hub.
         2. Enter dataset details into the form below.
@@ -125,6 +140,7 @@ with gr.Blocks(title="Bulk embeddings") as demo:
         4. Enter optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
         5. Choose a name for the new dataset.
         6. Hit run!
         ### Note:
         If you have short documents, O3 will be faster than O4. If you have long documents, O4 will be faster than O3. \
             O4 requires the tokenized documents to be padded to max length.
@@ -170,7 +186,7 @@ with gr.Blocks(title="Bulk embeddings") as demo:
         num2skip = gr.Slider(
             value=0,
             minimum=0,
-            maximum=10_000_000,
             step=1,
             label="Number of rows to skip",
         )
@@ -178,14 +194,22 @@ with gr.Blocks(title="Bulk embeddings") as demo:
         num2embed = gr.Slider(
             value=30000,
             minimum=-1,
-            maximum=10_000_000,
             step=1,
             label="Number of rows to embed (-1 = all)",
         )
     with gr.Row():
-        download_btn = gr.Button(value="Download dataset!")
         embed_btn = gr.Button(value="Embed texts!")
         last = gr.Textbox(value="")

 import gradio as gr
+from data import download_dataset, tokenize_dataset, load_tokenized_dataset
+from infer import get_model_and_tokenizer, batch_embed
 # TODO: add instructor models
 # "hkunlp/instructor-xl",
def download_and_tokenize(
    ds_name,
    ds_config,
    column_name,
    ds_split,
    model_choice,
    opt_desc,
    num2skip,
    num2embed,
    progress=gr.Progress(track_tqdm=True),
):
    """Download a dataset, tokenize it, and return a status message.

    Args:
        ds_name: Dataset name; passed to ``download_dataset`` and
            ``tokenize_dataset``.
        ds_config: Dataset config; passed to both helpers.
        column_name: Column passed to ``tokenize_dataset``.
        ds_split: Dataset split; passed to ``download_dataset`` only.
        model_choice: Model selection string. Only its first
            whitespace-separated token is used as the model name.
        opt_desc: Optimization description, looked up in ``desc2opt`` to get
            the optimization level.
        num2skip: Passed unchanged to both helpers.
        num2embed: Passed unchanged to both helpers.
        progress: Not referenced in the body. Presumably declared so Gradio
            can mirror tqdm progress bars in the UI -- confirm against the
            Gradio ``Progress`` documentation.

    Returns:
        The string ``"Downloaded! It has N docs."``.

    Raises:
        IndexError: If ``model_choice`` is empty or only whitespace.
    """
    num_samples = download_dataset(ds_name, ds_config, ds_split, num2skip, num2embed)
    opt_level = desc2opt[opt_desc]
    # Keep only the leading token of the choice; anything after the first
    # whitespace is discarded.
    model_name = model_choice.split()[0]

    tokenize_dataset(
        ds_name=ds_name,
        ds_config=ds_config,
        model_name=model_name,
        opt_level=opt_level,
        column_name=column_name,
        num2skip=num2skip,
        num2embed=num2embed,
    )

    # NOTE(review): the return type of download_dataset is not visible here.
    # The variable name suggests a count, in which case len() would raise
    # TypeError; accept either a ready-made count or a sized object.
    num_docs = num_samples if isinstance(num_samples, int) else len(num_samples)
    return f"Downloaded! It has {num_docs} docs."
     new_dataset_id,
     num2skip,
     num2embed,
+    progress=gr.Progress(track_tqdm=True),
 ):
+    ds = load_tokenized_dataset(ds_name, ds_config, ds_split)
     opt_level = desc2opt[opt_desc]
 with gr.Blocks(title="Bulk embeddings") as demo:
     gr.Markdown(
         """
+        # Bulk Embeddings
         This Space allows you to embed a large dataset easily. For instance, this can easily create vectors for Wikipedia \
         articles -- taking about __ hours and costing approximately $__.
         This utilizes state-of-the-art open-source embedding models, \
           - Text splitting options
           - More control about which rows to embed (skip some, stop early)
           - Dynamic padding
         ## Steps
         1. Upload the dataset to the Hugging Face Hub.
         2. Enter dataset details into the form below.
         4. Enter optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
         5. Choose a name for the new dataset.
         6. Hit run!
         ### Note:
         If you have short documents, O3 will be faster than O4. If you have long documents, O4 will be faster than O3. \
             O4 requires the tokenized documents to be padded to max length.
         num2skip = gr.Slider(
             value=0,
             minimum=0,
+            maximum=100_000_000,
             step=1,
             label="Number of rows to skip",
         )
         num2embed = gr.Slider(
             value=30000,
             minimum=-1,
+            maximum=100_000_000,
             step=1,
             label="Number of rows to embed (-1 = all)",
         )
+        num2upload = gr.Slider(
+            value=10000,
+            minimum=1000,
+            maximum=100000,
+            step=1000,
+            label="Chunk size for uploading",
+        )
     with gr.Row():
+        download_btn = gr.Button(value="Download and tokenize dataset!")
         embed_btn = gr.Button(value="Embed texts!")
         last = gr.Textbox(value="")