add option to download
app.py
CHANGED
```diff
@@ -39,7 +39,23 @@ desc2opt = {v: k for k, v in opt2desc.items()}
 optimization_options = list(opt2desc.values())
 
 
-def embed(
+
+def download(
+    ds_name,
+    ds_config,
+    ds_split,
+    progress=gr.Progress(),
+):
+    if progress is not None:
+        progress(0.5, "Loading dataset...")
+    ds = load_hf_dataset(ds_name, ds_config, ds_split)
+
+    return f"Downloaded! It has {len(ds)} docs."
+
+
+
+
+def embed(
     ds_name,
     ds_config,
     column_name,
@@ -84,14 +100,10 @@ with gr.Blocks(title="Bulk embeddings") as demo:
         """
 This Space allows you to embed a large dataset easily. For instance, this can easily create vectors for Wikipedia \
 articles -- taking about __ hours and costing approximately $__.
-
-
 This utilizes state-of-the-art open-source embedding models, \
 and optimizes them for inference using Hugging Face [optimum](https://github.com/huggingface/optimum). There are various \
 levels of optimizations that can be applied - the quality of the embeddings will degrade as the optimizations increase.
-
 Currently available options: O2/O3/O4 on T4/A10 GPUs using onnx runtime.
-
 Future options:
 - OpenVino for CPU inference
 - TensorRT for GPU inference
@@ -100,22 +112,16 @@ with gr.Blocks(title="Bulk embeddings") as demo:
 - Text splitting options
 - More control about which rows to embed (skip some, stop early)
 - Dynamic padding
-
 ## Steps
-
 1. Upload the dataset to the Hugging Face Hub.
 2. Enter dataset details into the form below.
 3. Choose a model. These are taken from the top of the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
 4. Enter optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
 5. Choose a name for the new dataset.
 6. Hit run!
-
-
 ### Note:
-
 If you have short documents, O3 will be faster than O4. If you have long documents, O4 will be faster than O3. \
 O4 requires the tokenized documents to be padded to max length.
-
 """
     )
 
@@ -172,12 +178,25 @@ with gr.Blocks(title="Bulk embeddings") as demo:
     )
 
     with gr.Row():
-        embed_btn = gr.Button(value="Embed texts!")
+
+        download_btn = gr.Button(value="Download dataset!")
+        embed_btn = gr.Button(value="Embed texts!")
 
     last = gr.Textbox(value="")
 
-    embed_btn.click(
-        fn=embed,
+    download_btn.click(
+        fn=download,
+        inputs=[
+            ds_name,
+            ds_config,
+            column_name,
+            ds_split,
+        ],
+        outputs=last,
+    )
+
+    embed_btn.click(
+        fn=embed,
         inputs=[
             ds_name,
             ds_config,
@@ -194,4 +213,4 @@ with gr.Blocks(title="Bulk embeddings") as demo:
 
 
 if __name__ == "__main__":
-    demo.queue(concurrency_count=20).launch(show_error=True)
+    demo.queue(concurrency_count=20).launch(show_error=True, debug=True)
```
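Note: the new `download` callback relies on a `load_hf_dataset` helper that is defined elsewhere in app.py and does not appear in this diff. A minimal sketch of what that helper might look like, assuming it simply forwards the form values to `datasets.load_dataset` (only the name and call site come from the diff; the body is an assumption):

```python
# Hypothetical sketch of the load_hf_dataset helper called in download().
# The real implementation lives outside this diff; this assumes it wraps
# datasets.load_dataset with the values entered in the form.
from datasets import load_dataset

def load_hf_dataset(ds_name, ds_config, ds_split):
    # Treat an empty config field in the form as "use the default config".
    return load_dataset(ds_name, ds_config or None, split=ds_split)
```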
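For context on the O2/O3/O4 levels mentioned in the description: optimum exposes these as `AutoOptimizationConfig` presets for ONNX Runtime. A minimal sketch of applying one to an embedding model (the model id is illustrative, not necessarily one offered by this Space):

```python
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer
from optimum.onnxruntime.configuration import AutoOptimizationConfig

# Export a sentence-embedding model to ONNX, then apply a graph-optimization
# preset. The model id below is illustrative only.
model = ORTModelForFeatureExtraction.from_pretrained(
    "BAAI/bge-base-en-v1.5", export=True
)
optimizer = ORTOptimizer.from_pretrained(model)
optimizer.optimize(
    save_dir="onnx-optimized",
    optimization_config=AutoOptimizationConfig.O2(),  # or .O3() / .O4()
)
```

O4 additionally switches to fp16 and targets GPU execution, which lines up with the T4/A10 requirement in the description.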