SmartHeal committed
Commit 74de941 · verified · 1 Parent(s): 5e405a7

Update src/ai_processor.py

Files changed (1)
  1. src/ai_processor.py +55 -53
src/ai_processor.py CHANGED
@@ -140,53 +140,39 @@ Keep to 220–300 words. Do NOT provide diagnosis. Avoid contraindicated advice.
 
 # ---------- MedGemma-only text generator ----------
 @_SPACES_GPU(enable_queue=True)
-def _build_vlm_pipeline(model_id: str, token: str | None):
+def vlm_generate(prompt, image_pil, model_id="unsloth/medgemma-4b-it-bnb-4bit",
+                 max_new_tokens=256, token=None):
+    """
+    Simple helper: messages-style image+text → text using a 4-bit MedGemma pipeline.
+    - No explicit `device` argument (pipeline will auto-detect).
+    - Uses HF token from arg or HF_TOKEN env.
+    """
     import os, torch
-    from transformers import pipeline
+    from transformers import pipeline, BitsAndBytesConfig
 
-    # don't mask CUDA here
+    # Unmask GPU if it was masked upstream (harmless on CPU too)
     os.environ.pop("CUDA_VISIBLE_DEVICES", None)
 
-    use_cuda = torch.cuda.is_available()
-    kwargs = dict(
-        task="image-text-to-text",
+    hf_token = token or os.getenv("HF_TOKEN")
+    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+
+    # 4-bit quantization config (required by the Unsloth 4-bit model)
+    bnb = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=dtype,
+    )
+
+    pipe = pipeline(
+        "image-text-to-text",
         model=model_id,
+        model_kwargs={"quantization_config": bnb},
+        torch_dtype=dtype,
+        token=hf_token,
         trust_remote_code=True,
-        torch_dtype=(torch.bfloat16 if use_cuda else torch.float32),
-        device=(0 if use_cuda else -1),
     )
 
-    if token:
-        try: kwargs["token"] = token
-        except TypeError: kwargs["use_auth_token"] = token
-
-    # if it's a 4-bit Unsloth build, attach bnb config (GPU required)
-    if "bnb-4bit" in model_id.lower() or "4bit" in model_id.lower():
-        if not use_cuda:
-            raise RuntimeError("CUDA not available for 4-bit quantized model.")
-        from transformers import BitsAndBytesConfig
-        kwargs["model_kwargs"] = {
-            "quantization_config": BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_compute_dtype=torch.bfloat16,
-            )
-        }
-
-    return pipeline(**kwargs)
-
-def _vlm_generate_with_messages(prompt: str,
-                                image_pil,
-                                model_id: str,
-                                max_new_tokens: int,
-                                token: str | None) -> str:
-    # try preferred; on error, fall back to a tiny CPU-friendly VLM
-    try:
-        p = _build_vlm_pipeline(model_id or "unsloth/medgemma-4b-it-bnb-4bit", token)
-    except Exception:
-        p = _build_vlm_pipeline("bczhou/tiny-llava-v1-hf", None)
-
     messages = [{
         "role": "user",
         "content": [
@@ -195,18 +181,31 @@ def _vlm_generate_with_messages(prompt: str,
         ],
     }]
 
-    out = p(text=messages,
-            max_new_tokens=int(max_new_tokens or 256),
-            do_sample=False,
-            temperature=0.2,
-            return_full_text=False)
-
-    # robust extraction
+    out = pipe(
+        text=messages,
+        max_new_tokens=int(max_new_tokens),
+        do_sample=False,
+        temperature=0.2,
+        return_full_text=False,
+    )
     if isinstance(out, list) and out and isinstance(out[0], dict) and "generated_text" in out[0]:
         return (out[0]["generated_text"] or "").strip()
     return (str(out) or "").strip() or "⚠️ Empty response"
 
-def generate_medgemma_report(patient_info, visual_results, guideline_context, image_pil, max_new_tokens=None) -> str:
+
+def generate_medgemma_report(
+    patient_info: str,
+    visual_results: dict,
+    guideline_context: str,
+    image_pil,  # PIL.Image
+    max_new_tokens: int | None = None,
+) -> str:
+    """
+    Build SmartHeal prompt and generate with the Unsloth MedGemma 4-bit VLM.
+    No fallback to any other model.
+    """
+    import os
+
     if os.getenv("SMARTHEAL_ENABLE_VLM", "1") != "1":
         return "⚠️ VLM disabled"
 
@@ -221,15 +220,18 @@ def generate_medgemma_report(patient_info, visual_results, guideline_context, im
         guideline_context=(guideline_context or "")[:900],
     )
     prompt = f"{SMARTHEAL_SYSTEM_PROMPT}\n\n{uprompt}\n\nAnswer:"
+
     model_id = os.getenv("SMARTHEAL_MEDGEMMA_MODEL", "unsloth/medgemma-4b-it-bnb-4bit")
    max_new_tokens = max_new_tokens or int(os.getenv("SMARTHEAL_VLM_MAX_TOKENS", "600"))
 
-    try:
-        return _vlm_generate_with_messages(prompt, image_pil, model_id, max_new_tokens, os.getenv("HF_TOKEN"))
-    except Exception as e:
-        logging.error(f"MedGemma pipeline failed: {e}", exc_info=True)
-        return "⚠️ VLM error"
-
+    # Uses the simple messages-based VLM helper you added earlier (no device param).
+    return vlm_generate(
+        prompt=prompt,
+        image_pil=image_pil,
+        model_id=model_id,
+        max_new_tokens=max_new_tokens,
+        token=os.getenv("HF_TOKEN"),
+    )
 
 
 # ---------- Input-shape helpers (avoid `.as_list()` on strings) ----------
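
For reference, a minimal usage sketch of the new entry point. It assumes the module is importable as `src.ai_processor`, that `HF_TOKEN` grants access to the gated MedGemma weights, and that a CUDA GPU with bitsandbytes is available for the 4-bit model; the image path, patient details, and `visual_results` keys below are illustrative and not part of the commit.

# Minimal usage sketch (illustrative values only; not part of the commit).
import os

from PIL import Image

from src.ai_processor import generate_medgemma_report  # assumed import path

os.environ.setdefault("SMARTHEAL_ENABLE_VLM", "1")        # gate checked by the function
os.environ.setdefault("SMARTHEAL_VLM_MAX_TOKENS", "600")  # default generation budget

image = Image.open("wound_photo.jpg").convert("RGB")      # illustrative image file

report = generate_medgemma_report(
    patient_info="62-year-old with type 2 diabetes, heel pressure ulcer",  # illustrative
    visual_results={"wound_area_cm2": 3.8, "tissue_type": "granulating"},  # illustrative keys
    guideline_context="(retrieved guideline excerpt)",                     # illustrative
    image_pil=image,
)
print(report)

Note that the rewritten path no longer wraps generation in try/except or falls back to a smaller model, so callers that previously relied on the "⚠️ VLM error" string may want to add their own error handling around this call.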