Spaces: Running on Zero

jedick committed · Commit 6f5111d · Parent(s): ff1808d

Use @spaces.GPU(duration=100)

Browse files:
- app.py (+19, -36)
- main.py (+1, -1)
- requirements.txt (+0, -2)
app.py
CHANGED
@@ -242,25 +242,15 @@ def to_workflow(request: gr.Request, *args):
     new_args = args + (request.session_hash,)
     if compute_mode == "local":
         # Call the workflow function with the @spaces.GPU decorator
-
-
-                yield value
-        else:
-            for value in run_workflow_local(*new_args):
-                yield value
+        for value in run_workflow_local(*new_args):
+            yield value
     if compute_mode == "remote":
         for value in run_workflow_remote(*new_args):
             yield value
 
 
-@spaces.GPU(duration=75)
-def run_workflow_local(*args):
-    for value in run_workflow(*args):
-        yield value
-
-
 @spaces.GPU(duration=100)
-def
+def run_workflow_local(*args):
     for value in run_workflow(*args):
         yield value
 
@@ -440,8 +430,7 @@ with gr.Blocks(
     end = None
     info_text = f"""
     **Database:** {len(sources)} emails from {start} to {end}.
-    **Features:** RAG, today's date, hybrid search (dense+sparse), multiple retrievals,
-    thinking output (local), citations output (remote), chat memory.
+    **Features:** RAG, today's date, hybrid search (dense+sparse), multiple retrievals, citations output (remote), chat memory.
     **Tech:** LangChain + Hugging Face + Gradio; ChromaDB and BM25S-based retrievers.<br>
     """
     return info_text
@@ -456,9 +445,9 @@ with gr.Blocks(
         "Who reported installation problems in 2023-2024?",
     ]
 
-    if compute_mode == "remote":
-        questions = [q.replace(" /think", "") for q in questions]
-
+    ## Remove "/think" from questions in remote mode
+    # if compute_mode == "remote":
+    #     questions = [q.replace(" /think", "") for q in questions]
 
     # cf. https://github.com/gradio-app/gradio/pull/8745 for updating examples
     return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
@@ -470,9 +459,6 @@ with gr.Blocks(
         "Discuss pipe operator usage in 2022, 2023, and 2024",
     ]
 
-    if compute_mode == "remote":
-        questions = [q.replace(" /think", "") for q in questions]
-
     return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
 
 def get_multi_turn_questions(compute_mode, as_dataset=True):
@@ -482,9 +468,6 @@ with gr.Blocks(
         "Did the authors you cited report bugs before 2025?",
     ]
 
-    if compute_mode == "remote":
-        questions = [q.replace(" /think", "") for q in questions]
-
     return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
 
     with gr.Row():
@@ -591,18 +574,6 @@ with gr.Blocks(
         generate_thread_id,
         outputs=[thread_id],
         api_name=False,
-    ).then(
-        # Clear the chatbot history
-        clear_component,
-        [chatbot],
-        [chatbot],
-        api_name=False,
-    ).then(
-        # Change the chatbot avatar
-        set_avatar,
-        [compute_mode],
-        [chatbot],
-        api_name=False,
     ).then(
         # Focus textbox by updating the textbox with the current value
         lambda x: gr.update(value=x),
@@ -615,6 +586,18 @@ with gr.Blocks(
         [compute_mode],
         [status],
         api_name=False,
+    ).then(
+        # Clear the chatbot history
+        clear_component,
+        [chatbot],
+        [chatbot],
+        api_name=False,
+    ).then(
+        # Change the chatbot avatar
+        set_avatar,
+        [compute_mode],
+        [chatbot],
+        api_name=False,
     )
 
     input.submit(
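For context, here is a minimal, self-contained sketch of the pattern this commit settles on in to_workflow: a single generator wrapped by the @spaces.GPU decorator (from the spaces package used on ZeroGPU hardware), so that only the "local" compute path reserves a GPU, for at most the stated duration per call. The run_workflow body below is a stand-in; the real one is defined elsewhere in app.py.

import spaces

def run_workflow(*args):
    # Stand-in for the real workflow generator in app.py
    yield "retrieving"
    yield "generating"

@spaces.GPU(duration=100)  # reserve ZeroGPU hardware for up to 100 s per call
def run_workflow_local(*args):
    # Thin wrapper: the decorator attaches the GPU reservation to this generator
    for value in run_workflow(*args):
        yield value

def to_workflow(compute_mode, *args):
    if compute_mode == "local":
        # Only this branch triggers a GPU reservation
        yield from run_workflow_local(*args)

Because the decorator is applied to the thin wrapper rather than to to_workflow itself, the "remote" path never requests ZeroGPU time.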
main.py
CHANGED
@@ -157,7 +157,7 @@ def GetChatModel(compute_mode, ckpt_dir=None):
         # Enable FlashAttention (requires pip install flash-attn)
         # https://huggingface.co/docs/transformers/en/attention_interface
         # https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention
-        attn_implementation="
+        attn_implementation="flash_attention_2",
     )
     # For Flash Attention version of Qwen3
     tokenizer.padding_side = "left"
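As a usage sketch of the argument enabled above (the model id is illustrative, not taken from this repo): FlashAttention-2 is selected at load time via attn_implementation, and requires pip install flash-attn, a supported GPU, and fp16/bf16 weights.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen3-0.6B"  # assumption: any FlashAttention-2-capable model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # FA2 requires half precision
    attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "left"  # as in main.py, for batched Qwen3 generation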
requirements.txt
CHANGED
@@ -11,8 +11,6 @@ flash-attn==2.8.2
 # Gemma 3: transformers>=4.50
 # Qwen3: transformers>=4.51
 # SmolLM3: transformers>=4.53
-# NOTE: Gemma 3 with transformers==4.54.0 gives:
-# ValueError: Max cache length is not consistent across layers
 transformers==4.51.3
 tokenizers==0.21.2
 # Only needed with AutoModelForCausalLM.from_pretrained(device_map="auto")
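The version comments above double as constraints on the pin. A small sketch (hypothetical helper, not part of the commit) that checks the installed transformers against them at runtime:

from importlib.metadata import version
from packaging.version import Version  # packaging is a transformers dependency

minimums = {"Gemma 3": "4.50", "Qwen3": "4.51", "SmolLM3": "4.53"}
installed = Version(version("transformers"))  # 4.51.3 per requirements.txt
for model, minimum in minimums.items():
    status = "OK" if installed >= Version(minimum) else "too old"
    print(f"{model}: needs transformers>={minimum} -> {status}")

With the pin at 4.51.3, this reports Gemma 3 and Qwen3 as satisfied and SmolLM3 as requiring a newer release.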