SmartHeal committed on
Commit 5e405a7 · verified · 1 Parent(s): cfd566e

Update src/ai_processor.py

Files changed (1)
  1. src/ai_processor.py +53 -105
src/ai_processor.py CHANGED
@@ -140,116 +140,76 @@ Keep to 220–300 words. Do NOT provide diagnosis. Avoid contraindicated advice.
 
 # ---------- MedGemma-only text generator ----------
 @_SPACES_GPU(enable_queue=True)
-def _medgemma_generate_gpu_with_pipeline(
-    prompt: str,
-    image_pil,                     # PIL.Image (the wound image)
-    model_id: str | None = None,   # e.g. "unsloth/medgemma-4b-it-bnb-4bit"
-    max_new_tokens: int = 256,
-    token: str | None = None,
-) -> str:
-    """
-    Vision LLM via Transformers pipeline using the "messages" format:
-      [{"role":"user","content":[{"type":"image","image": PIL}, {"type":"text","text": "..."}]}]
-    Returns a generated string.
-    """
+def _build_vlm_pipeline(model_id: str, token: str | None):
     import os, torch
     from transformers import pipeline
-    try:
-        from transformers import BitsAndBytesConfig  # only needed for 4-bit
-    except Exception:
-        BitsAndBytesConfig = None
 
-    # <<< START OF FIX >>>
-    # Force CUDA initialization to prevent IndexError in bitsandbytes/triton check.
-    # This ensures the CUDA context is ready before transformers and bnb probe the device.
+    # don't mask CUDA here
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+
     use_cuda = torch.cuda.is_available()
-    if use_cuda:
-        try:
-            torch.tensor([1.0]).cuda()
-        except Exception as e:
-            # If even this fails, CUDA is truly not working.
-            print(f"WARNING: CUDA pre-initialization failed: {e}")
-            use_cuda = False
-    # <<< END OF FIX >>>
+    kwargs = dict(
+        task="image-text-to-text",
+        model=model_id,
+        trust_remote_code=True,
+        torch_dtype=(torch.bfloat16 if use_cuda else torch.float32),
+        device=(0 if use_cuda else -1),
+    )
 
-    hf_token = token or os.getenv("HF_TOKEN")
-    mid = model_id or "unsloth/medgemma-4b-it-bnb-4bit"
+    if token:
+        try: kwargs["token"] = token
+        except TypeError: kwargs["use_auth_token"] = token
+
+    # if it's a 4-bit Unsloth build, attach bnb config (GPU required)
+    if "bnb-4bit" in model_id.lower() or "4bit" in model_id.lower():
+        if not use_cuda:
+            raise RuntimeError("CUDA not available for 4-bit quantized model.")
+        from transformers import BitsAndBytesConfig
+        kwargs["model_kwargs"] = {
+            "quantization_config": BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_compute_dtype=torch.bfloat16,
+            )
+        }
 
-    # device / dtype
-    # use_cuda is already defined above
-    device = 0 if use_cuda else -1
-    dtype = torch.bfloat16 if use_cuda else torch.float32
+    return pipeline(**kwargs)
+
+def _vlm_generate_with_messages(prompt: str,
+                                image_pil,
+                                model_id: str,
+                                max_new_tokens: int,
+                                token: str | None) -> str:
+    # try preferred; on error, fall back to a tiny CPU-friendly VLM
+    try:
+        p = _build_vlm_pipeline(model_id or "unsloth/medgemma-4b-it-bnb-4bit", token)
+    except Exception:
+        p = _build_vlm_pipeline("bczhou/tiny-llava-v1-hf", None)
 
-    # Build messages in the doc format
     messages = [{
         "role": "user",
         "content": [
-            {"type": "image", "image": image_pil},   # local PIL image
+            {"type": "image", "image": image_pil},
             {"type": "text", "text": prompt},
         ],
     }]
 
-    pipe_kwargs = dict(
-        task="image-text-to-text",
-        model=mid,
-        torch_dtype=dtype,
-        device=device,              # GPU=0 or CPU=-1
-        trust_remote_code=True,
-    )
+    out = p(text=messages,
+            max_new_tokens=int(max_new_tokens or 256),
+            do_sample=False,
+            temperature=0.2,
+            return_full_text=False)
 
-    # Pass HF token (newer Transformers uses `token`; older uses `use_auth_token`)
-    if hf_token:
-        try:
-            pipe_kwargs["token"] = hf_token
-        except TypeError:
-            pipe_kwargs["use_auth_token"] = hf_token
-
-    # If this is the 4-bit Unsloth build, attach quantization (requires CUDA + bitsandbytes)
-    if "bnb-4bit" in mid.lower():
-        if not use_cuda or BitsAndBytesConfig is None:
-            raise RuntimeError("Unsloth 4-bit requires CUDA + bitsandbytes; no GPU available.")
-        bnb = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_compute_dtype=torch.bfloat16,
-        )
-        pipe_kwargs["model_kwargs"] = {"quantization_config": bnb}
-
-    # Create pipeline and run with messages
-    p = pipeline(**pipe_kwargs)
-    out = p(
-        text=messages,
-        max_new_tokens=int(max_new_tokens or 256),
-        do_sample=False,
-        temperature=0.2,
-        return_full_text=False,   # we just want the answer
-    )
-
-    # Normalize output to a string
-    if isinstance(out, list):
-        # pipelines often return a list of strings or dicts; handle both
-        first = out[0]
-        text = first.get("generated_text") if isinstance(first, dict) else str(first)
-    else:
-        text = str(out)
+    # robust extraction
+    if isinstance(out, list) and out and isinstance(out[0], dict) and "generated_text" in out[0]:
+        return (out[0]["generated_text"] or "").strip()
+    return (str(out) or "").strip() or "⚠️ Empty response"
 
-    return (text or "").strip() or "⚠️ Empty response"
-
-
-
-
-def generate_medgemma_report(
-    patient_info: str,
-    visual_results: Dict,
-    guideline_context: str,
-    image_pil,                     # keep passing the PIL image
-    max_new_tokens: int | None = None,
-) -> str:
+def generate_medgemma_report(patient_info, visual_results, guideline_context, image_pil, max_new_tokens=None) -> str:
     if os.getenv("SMARTHEAL_ENABLE_VLM", "1") != "1":
         return "⚠️ VLM disabled"
 
-    # Build your prompt as before
     uprompt = SMARTHEAL_USER_PREFIX.format(
         patient_info=patient_info,
         wound_type=visual_results.get("wound_type", "Unknown"),
@@ -261,29 +221,17 @@ def generate_medgemma_report(
         guideline_context=(guideline_context or "")[:900],
     )
     prompt = f"{SMARTHEAL_SYSTEM_PROMPT}\n\n{uprompt}\n\nAnswer:"
-
     model_id = os.getenv("SMARTHEAL_MEDGEMMA_MODEL", "unsloth/medgemma-4b-it-bnb-4bit")
     max_new_tokens = max_new_tokens or int(os.getenv("SMARTHEAL_VLM_MAX_TOKENS", "600"))
 
     try:
-        return _medgemma_generate_gpu_with_pipeline(prompt, image_pil, model_id, max_new_tokens, HF_TOKEN)
+        return _vlm_generate_with_messages(prompt, image_pil, model_id, max_new_tokens, os.getenv("HF_TOKEN"))
     except Exception as e:
-        # Optional: automatic tiny fallback if CUDA/bnb/space issues show up
-        err = str(e)
-        if any(s in err for s in ("No space left", "bitsandbytes", "CUDA", "requires CUDA")):
-            try:
-                return _medgemma_generate_gpu_with_pipeline(
-                    prompt, image_pil,
-                    model_id="bczhou/tiny-llava-v1-hf",   # ~1GB; CPU OK
-                    max_new_tokens=max_new_tokens,
-                    token=HF_TOKEN,
-                )
-            except Exception:
-                pass
         logging.error(f"MedGemma pipeline failed: {e}", exc_info=True)
         return "⚠️ VLM error"
 
 
+
 # ---------- Input-shape helpers (avoid `.as_list()` on strings) ----------
 def _shape_to_hw(shape) -> Tuple[Optional[int], Optional[int]]:
     try:
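
For context, a minimal usage sketch of the refactored entry point. This is not part of the commit: it assumes the module is importable as src.ai_processor and that a sample image exists locally; the patient details, the extra visual_results keys, and the file path are illustrative placeholders.

# Hypothetical driver (not part of this commit): exercises generate_medgemma_report()
# after the refactor. Import path, image path, and sample values are assumptions.
import os
from PIL import Image

from src.ai_processor import generate_medgemma_report  # assumed module path

os.environ.setdefault("SMARTHEAL_ENABLE_VLM", "1")       # "0" would return "⚠️ VLM disabled"
os.environ.setdefault("SMARTHEAL_MEDGEMMA_MODEL", "unsloth/medgemma-4b-it-bnb-4bit")
os.environ.setdefault("SMARTHEAL_VLM_MAX_TOKENS", "600")
# HF_TOKEN is read via os.getenv("HF_TOKEN") for gated/private model access.

image = Image.open("wound_photo.jpg").convert("RGB")     # assumed sample image
visual_results = {"wound_type": "Pressure ulcer"}        # "wound_type" is read here; other keys depend on SMARTHEAL_USER_PREFIX

report = generate_medgemma_report(
    patient_info="72-year-old with diabetes, sacral wound, 3 weeks' duration",
    visual_results=visual_results,
    guideline_context="(retrieved guideline excerpt; truncated to 900 characters)",
    image_pil=image,
)
print(report)  # returns "⚠️ VLM error" if both the 4-bit model and the tiny fallback fail to load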