Spaces:

moazx
/

HBV_AI_Assistant

Running

moazx committed 8 days ago

Commit

9788643

1 Parent(s): 91b29af

Fix JSON parsing error in HBV assessment endpoint and add robust JSON cleaning

- Added `clean_json_string()` helper to escape control characters (e.g., newlines, tabs) in LLM responses.
- Updated parsing logic to strip markdown code blocks and sanitize JSON before loading.
- Improved LLM prompt to prevent literal control characters in JSON output.

- Resolves intermittent “Invalid control character” error in assessment endpoint after Hugging Face deployment.

Files changed (1) hide show

core/hbv_assessment.py +84 -1

core/hbv_assessment.py CHANGED Viewed

@@ -4,6 +4,7 @@ Evaluates patient eligibility for HBV treatment according to SASLT 2021 guidelin
 """
 import logging
 import json
 from typing import Dict, Any
 from .retrievers import hybrid_search
 from .config import get_llm
@@ -11,6 +12,66 @@ from .config import get_llm
 logger = logging.getLogger(__name__)
 def create_patient_query(patient_data: Dict[str, Any]) -> str:
     """
     Create a comprehensive search query based on patient parameters
@@ -198,6 +259,12 @@ You MUST respond with a valid JSON object in this exact format:
   "recommendations": "Comprehensive assessment with inline citations"
 }}
 CRITICAL CITATION REQUIREMENTS:
 1. The "recommendations" field must be a comprehensive narrative that includes:
    - Eligibility determination with rationale
@@ -257,12 +324,28 @@ IMPORTANT:
         # Try to parse JSON from response
         try:
             # Find JSON in response (handle cases where LLM adds extra text)
             json_start = response_text.find('{')
             json_end = response_text.rfind('}') + 1
             if json_start >= 0 and json_end > json_start:
                 json_str = response_text[json_start:json_end]
-                result = json.loads(json_str)
                 logger.info(f"✅ Successfully parsed JSON response")
             else:
                 raise ValueError("No JSON found in response")

 """
 import logging
 import json
+import re
 from typing import Dict, Any
 from .retrievers import hybrid_search
 from .config import get_llm
 logger = logging.getLogger(__name__)
def clean_json_string(json_str: str) -> str:
    """
    Clean a JSON string by properly escaping control characters within string values.
    This handles cases where LLMs generate JSON with literal newlines, tabs, etc.

    Text that is already valid JSON is returned unchanged. Inside string
    values the following repairs are applied:
      - raw control characters (code points below 32) are replaced by their
        JSON escape (``\\n``, ``\\t``, ... or ``\\uXXXX``);
      - a backslash that does not start a legal JSON escape (for example
        ``\\d``, or a backslash followed by a raw newline) is doubled so it
        is read as a literal backslash.
    Characters outside string values (structural whitespace, braces, ...)
    are copied through untouched.

    Note: string boundaries are detected by toggling on every unescaped
    double quote, so unescaped quotes *inside* a value cannot be repaired.

    Args:
        json_str: Raw JSON string that may contain unescaped control characters
    Returns:
        Cleaned JSON string with properly escaped control characters
    """
    # Control characters that have a dedicated two-character JSON escape.
    short_escapes = {
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
        '\b': '\\b',
        '\f': '\\f',
    }
    # Characters allowed to follow a backslash inside a JSON string.
    valid_escape_chars = '"\\/bfnrtu'

    def escape_control(char: str) -> str:
        # Fall back to the generic \uXXXX form for control characters
        # without a short escape.
        return short_escapes.get(char) or f'\\u{ord(char):04x}'

    result = []
    in_string = False
    escape_next = False
    for char in json_str:
        if escape_next:
            # The previous character was a backslash that has already been
            # appended to the output.
            escape_next = False
            if in_string and char not in valid_escape_chars:
                # Not a legal escape sequence: double the pending backslash
                # so it becomes a literal one, then emit the character
                # (escaped if it is a raw control character).
                result.append('\\')
                result.append(escape_control(char) if ord(char) < 32 else char)
            else:
                result.append(char)
            continue
        if char == '\\':
            result.append(char)
            escape_next = True
            continue
        if char == '"':
            # An unescaped quote opens or closes a string value.
            in_string = not in_string
            result.append(char)
            continue
        if in_string and ord(char) < 32:
            result.append(escape_control(char))
        else:
            # Outside strings, newlines/tabs are legal JSON whitespace.
            result.append(char)
    return ''.join(result)
 def create_patient_query(patient_data: Dict[str, Any]) -> str:
     """
     Create a comprehensive search query based on patient parameters
   "recommendations": "Comprehensive assessment with inline citations"
 }}
+IMPORTANT JSON FORMATTING:
+- Return ONLY valid JSON without markdown code blocks
+- Use spaces instead of literal newlines within the "recommendations" string
+- Separate paragraphs with double spaces or use \\n for line breaks
+- Do NOT include literal newline characters in the JSON string values
 CRITICAL CITATION REQUIREMENTS:
 1. The "recommendations" field must be a comprehensive narrative that includes:
    - Eligibility determination with rationale
         # Try to parse JSON from response
         try:
+            # Remove markdown code blocks if present
+            if '```json' in response_text:
+                json_start = response_text.find('```json') + 7
+                json_end = response_text.find('```', json_start)
+                response_text = response_text[json_start:json_end].strip()
+            elif '```' in response_text:
+                json_start = response_text.find('```') + 3
+                json_end = response_text.find('```', json_start)
+                response_text = response_text[json_start:json_end].strip()
             # Find JSON in response (handle cases where LLM adds extra text)
             json_start = response_text.find('{')
             json_end = response_text.rfind('}') + 1
             if json_start >= 0 and json_end > json_start:
                 json_str = response_text[json_start:json_end]
+                # Clean the JSON string to escape control characters within string values
+                cleaned_json_str = clean_json_string(json_str)
+                logger.debug(f"Cleaned JSON string (first 500 chars): {cleaned_json_str[:500]}")
+                # Parse the cleaned JSON
+                result = json.loads(cleaned_json_str)
                 logger.info(f"✅ Successfully parsed JSON response")
             else:
                 raise ValueError("No JSON found in response")