Akis Giannoukos committed
Commit · 5731404
1 Parent(s): 17f0761
Added dynamic dtype selection and improved decoding parameters

app.py CHANGED
@@ -76,12 +76,18 @@ def get_textgen_pipeline():
     global _gen_pipe
     if _gen_pipe is None:
         # Use a small default chat model for Spaces CPU; override via LLM_MODEL_ID
+        if torch.cuda.is_available() and hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported():
+            _dtype = torch.bfloat16
+        elif torch.cuda.is_available():
+            _dtype = torch.float16
+        else:
+            _dtype = torch.float32
         _gen_pipe = pipeline(
             task="text-generation",
             model=current_model_id,
             tokenizer=current_model_id,
             device=_hf_device(),
-            torch_dtype=
+            torch_dtype=_dtype,
         )
     return _gen_pipe
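
The dtype branch above can be exercised outside the Space. The sketch below shows the same selection logic as a standalone helper feeding a transformers text-generation pipeline; the helper name and the tiny placeholder model id are illustrative, not part of app.py, which resolves its own current_model_id (overridable via LLM_MODEL_ID).

import torch
from transformers import pipeline

def pick_dtype() -> torch.dtype:
    # Same branch order as the commit: bf16 on GPUs that support it, fp16 on other GPUs, fp32 on CPU.
    if torch.cuda.is_available() and hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    if torch.cuda.is_available():
        return torch.float16
    return torch.float32

# Placeholder model id kept tiny so the example is cheap to run.
pipe = pipeline(
    task="text-generation",
    model="sshleifer/tiny-gpt2",
    device=0 if torch.cuda.is_available() else -1,
    torch_dtype=pick_dtype(),
)
print(pipe.model.dtype)  # torch.float32 on CPU, torch.float16/bfloat16 on GPU
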
@@ -275,6 +281,8 @@ def generate_recording_agent_reply(chat_history: List[Tuple[str, str]]) -> str:
         max_new_tokens=96,
         temperature=0.7,
         do_sample=True,
+        top_p=0.9,
+        top_k=50,
         pad_token_id=tokenizer.eos_token_id,
         return_full_text=False,
     )
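
A quick way to see the effect of the added top_p / top_k arguments is to call a pipeline directly with the same decoding settings. A minimal sketch, assuming a tiny placeholder model rather than the app's configured one:

from transformers import pipeline

pipe = pipeline("text-generation", model="sshleifer/tiny-gpt2")

# top_p=0.9 restricts sampling to the smallest token set whose cumulative probability reaches 0.9;
# top_k=50 additionally caps the candidate set at the 50 most likely tokens.
out = pipe(
    "Hello, thanks for calling.",
    max_new_tokens=32,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    pad_token_id=pipe.tokenizer.eos_token_id,
    return_full_text=False,
)
print(out[0]["generated_text"])
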
@@ -309,11 +317,12 @@ def scoring_agent_infer(chat_history: List[Tuple[str, str]], features: Dict[str,
         {"role": "user", "content": combined_prompt},
     ]
     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Use deterministic decoding to avoid CUDA sampling edge cases on some models
     gen = pipe(
         prompt,
         max_new_tokens=256,
-        temperature=0.
-        do_sample=
+        temperature=0.0,
+        do_sample=False,
         pad_token_id=tokenizer.eos_token_id,
         return_full_text=False,
     )
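
With do_sample=False the scoring call becomes greedy decoding: the highest-probability token is taken at every step, so the same prompt yields the same output across runs, and temperature is not used in this mode (some transformers versions log a warning when it is passed anyway). A minimal sketch with a placeholder model:

from transformers import pipeline

pipe = pipeline("text-generation", model="sshleifer/tiny-gpt2")

# Deterministic decoding: no sampling, so repeated calls return identical text.
out = pipe(
    "Summarize the call in one sentence:",
    max_new_tokens=64,
    do_sample=False,
    pad_token_id=pipe.tokenizer.eos_token_id,
    return_full_text=False,
)
print(out[0]["generated_text"])
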
@@ -438,12 +447,14 @@ def process_turn(
     chat_history: List[Tuple[str, str]],
     threshold: float,
     tts_enabled: bool,
-    finished: bool,
-    turns: int,
+    finished: Optional[bool],
+    turns: Optional[int],
     prev_scores: Dict[str, Any],
     prev_meta: Dict[str, Any],
 ):
     # If already finished, do nothing
+    finished = bool(finished) if finished is not None else False
+    turns = int(turns) if isinstance(turns, int) else 0
     if finished:
         return (
             chat_history,
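
The widened process_turn signature guards against UI state arriving as None on a fresh session. The same coercion can be checked in isolation; the helper name below is illustrative, and the annotations assume Optional and Tuple are imported from typing as in app.py's other hints.

from typing import Optional, Tuple

def coerce_state(finished: Optional[bool], turns: Optional[int]) -> Tuple[bool, int]:
    # Mirror of the commit's normalization: missing flags default to False / 0.
    finished = bool(finished) if finished is not None else False
    turns = int(turns) if isinstance(turns, int) else 0
    return finished, turns

assert coerce_state(None, None) == (False, 0)  # fresh session: nothing set yet
assert coerce_state(True, 3) == (True, 3)      # populated state passes through unchanged
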