Spaces: Running on Zero

jedick committed · Commit 6f5111d · Parent(s): ff1808d

Use @spaces.GPU(duration=100)

Browse files:
- app.py (+19, -36)
- main.py (+1, -1)
- requirements.txt (+0, -2)
app.py
CHANGED
@@ -242,25 +242,15 @@ def to_workflow(request: gr.Request, *args):
     new_args = args + (request.session_hash,)
     if compute_mode == "local":
         # Call the workflow function with the @spaces.GPU decorator
-
-
-                yield value
-        else:
-            for value in run_workflow_local(*new_args):
-                yield value
+        for value in run_workflow_local(*new_args):
+            yield value
     if compute_mode == "remote":
         for value in run_workflow_remote(*new_args):
             yield value
 
 
-@spaces.GPU(duration=75)
-def run_workflow_local(*args):
-    for value in run_workflow(*args):
-        yield value
-
-
 @spaces.GPU(duration=100)
-def
+def run_workflow_local(*args):
     for value in run_workflow(*args):
         yield value
 
@@ -440,8 +430,7 @@ with gr.Blocks(
     end = None
     info_text = f"""
     **Database:** {len(sources)} emails from {start} to {end}.
-    **Features:** RAG, today's date, hybrid search (dense+sparse), multiple retrievals,
-    thinking output (local), citations output (remote), chat memory.
+    **Features:** RAG, today's date, hybrid search (dense+sparse), multiple retrievals, citations output (remote), chat memory.
     **Tech:** LangChain + Hugging Face + Gradio; ChromaDB and BM25S-based retrievers.<br>
     """
     return info_text
@@ -456,9 +445,9 @@ with gr.Blocks(
         "Who reported installation problems in 2023-2024?",
     ]
 
-    if compute_mode == "remote":
-        questions = [q.replace(" /think", "") for q in questions]
-
+    ## Remove "/think" from questions in remote mode
+    # if compute_mode == "remote":
+    #     questions = [q.replace(" /think", "") for q in questions]
 
     # cf. https://github.com/gradio-app/gradio/pull/8745 for updating examples
     return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
@@ -470,9 +459,6 @@ with gr.Blocks(
         "Discuss pipe operator usage in 2022, 2023, and 2024",
     ]
 
-    if compute_mode == "remote":
-        questions = [q.replace(" /think", "") for q in questions]
-
     return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
 
 def get_multi_turn_questions(compute_mode, as_dataset=True):
@@ -482,9 +468,6 @@ with gr.Blocks(
         "Did the authors you cited report bugs before 2025?",
     ]
 
-    if compute_mode == "remote":
-        questions = [q.replace(" /think", "") for q in questions]
-
     return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
 
     with gr.Row():
@@ -591,18 +574,6 @@ with gr.Blocks(
         generate_thread_id,
         outputs=[thread_id],
         api_name=False,
-    ).then(
-        # Clear the chatbot history
-        clear_component,
-        [chatbot],
-        [chatbot],
-        api_name=False,
-    ).then(
-        # Change the chatbot avatar
-        set_avatar,
-        [compute_mode],
-        [chatbot],
-        api_name=False,
     ).then(
         # Focus textbox by updating the textbox with the current value
         lambda x: gr.update(value=x),
@@ -615,6 +586,18 @@ with gr.Blocks(
         [compute_mode],
         [status],
         api_name=False,
+    ).then(
+        # Clear the chatbot history
+        clear_component,
+        [chatbot],
+        [chatbot],
+        api_name=False,
+    ).then(
+        # Change the chatbot avatar
+        set_avatar,
+        [compute_mode],
+        [chatbot],
+        api_name=False,
     )
 
     input.submit(
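For context, here is a minimal, self-contained sketch of the pattern this commit settles on in to_workflow: a single generator wrapped by the @spaces.GPU decorator (from the spaces package used on ZeroGPU hardware), so that only the "local" compute path reserves a GPU, for at most the stated duration per call. The run_workflow body below is a stand-in; the real one is defined elsewhere in app.py.

import spaces

def run_workflow(*args):
    # Stand-in for the real workflow generator in app.py
    yield "retrieving"
    yield "generating"

@spaces.GPU(duration=100)  # reserve ZeroGPU hardware for up to 100 s per call
def run_workflow_local(*args):
    # Thin wrapper: the decorator attaches the GPU reservation to this generator
    for value in run_workflow(*args):
        yield value

def to_workflow(compute_mode, *args):
    if compute_mode == "local":
        # Only this branch triggers a GPU reservation
        yield from run_workflow_local(*args)

Because the decorator is applied to the thin wrapper rather than to to_workflow itself, the "remote" path never requests ZeroGPU time.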
main.py
CHANGED
@@ -157,7 +157,7 @@ def GetChatModel(compute_mode, ckpt_dir=None):
         # Enable FlashAttention (requires pip install flash-attn)
         # https://huggingface.co/docs/transformers/en/attention_interface
         # https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention
-        attn_implementation="
+        attn_implementation="flash_attention_2",
     )
     # For Flash Attention version of Qwen3
     tokenizer.padding_side = "left"
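As a usage sketch of the argument enabled above (the model id is illustrative, not taken from this repo): FlashAttention-2 is selected at load time via attn_implementation, and requires pip install flash-attn, a supported GPU, and fp16/bf16 weights.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen3-0.6B"  # assumption: any FlashAttention-2-capable model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # FA2 requires half precision
    attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "left"  # as in main.py, for batched Qwen3 generation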
requirements.txt
CHANGED
@@ -11,8 +11,6 @@ flash-attn==2.8.2
 # Gemma 3: transformers>=4.50
 # Qwen3: transformers>=4.51
 # SmolLM3: transformers>=4.53
-# NOTE: Gemma 3 with transformers==4.54.0 gives:
-# ValueError: Max cache length is not consistent across layers
 transformers==4.51.3
 tokenizers==0.21.2
 # Only needed with AutoModelForCausalLM.from_pretrained(device_map="auto")
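The version comments above double as constraints on the pin. A small sketch (hypothetical helper, not part of the commit) that checks the installed transformers against them at runtime:

from importlib.metadata import version
from packaging.version import Version  # packaging is a transformers dependency

minimums = {"Gemma 3": "4.50", "Qwen3": "4.51", "SmolLM3": "4.53"}
installed = Version(version("transformers"))  # 4.51.3 per requirements.txt
for model, minimum in minimums.items():
    status = "OK" if installed >= Version(minimum) else "too old"
    print(f"{model}: needs transformers>={minimum} -> {status}")

With the pin at 4.51.3, this reports Gemma 3 and Qwen3 as satisfied and SmolLM3 as requiring a newer release.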