moazx committed on
Commit
9788643
·
1 Parent(s): 91b29af

Fix JSON parsing error in HBV assessment endpoint and add robust JSON cleaning

Browse files

- Added `clean_json_string()` helper to escape control characters (e.g., newlines, tabs) in LLM responses.
- Updated parsing logic to strip markdown code blocks and sanitize JSON before loading.
- Improved LLM prompt to prevent literal control characters in JSON output.

- Resolves intermittent “Invalid control character” error in assessment endpoint after Hugging Face deployment.

Files changed (1) hide show
  1. core/hbv_assessment.py +84 -1
core/hbv_assessment.py CHANGED
@@ -4,6 +4,7 @@ Evaluates patient eligibility for HBV treatment according to SASLT 2021 guidelin
4
  """
5
  import logging
6
  import json
 
7
  from typing import Dict, Any
8
  from .retrievers import hybrid_search
9
  from .config import get_llm
@@ -11,6 +12,66 @@ from .config import get_llm
11
  logger = logging.getLogger(__name__)
12
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def create_patient_query(patient_data: Dict[str, Any]) -> str:
15
  """
16
  Create a comprehensive search query based on patient parameters
@@ -198,6 +259,12 @@ You MUST respond with a valid JSON object in this exact format:
198
  "recommendations": "Comprehensive assessment with inline citations"
199
  }}
200
 
 
 
 
 
 
 
201
  CRITICAL CITATION REQUIREMENTS:
202
  1. The "recommendations" field must be a comprehensive narrative that includes:
203
  - Eligibility determination with rationale
@@ -257,12 +324,28 @@ IMPORTANT:
257
 
258
  # Try to parse JSON from response
259
  try:
 
 
 
 
 
 
 
 
 
 
260
  # Find JSON in response (handle cases where LLM adds extra text)
261
  json_start = response_text.find('{')
262
  json_end = response_text.rfind('}') + 1
263
  if json_start >= 0 and json_end > json_start:
264
  json_str = response_text[json_start:json_end]
265
- result = json.loads(json_str)
 
 
 
 
 
 
266
  logger.info(f"✅ Successfully parsed JSON response")
267
  else:
268
  raise ValueError("No JSON found in response")
 
4
  """
5
  import logging
6
  import json
7
+ import re
8
  from typing import Dict, Any
9
  from .retrievers import hybrid_search
10
  from .config import get_llm
 
12
  logger = logging.getLogger(__name__)
13
 
14
 
15
def clean_json_string(json_str: str) -> str:
    """
    Escape raw control characters that appear inside JSON string values.

    LLM responses sometimes contain literal newlines, tabs, etc. inside
    quoted values, which ``json.loads`` rejects with "Invalid control
    character". This scans the text once, tracks whether the current
    position is inside a double-quoted string, and rewrites only the
    control characters found there. Whitespace between tokens (outside
    strings) is left untouched because it is valid JSON.

    Args:
        json_str: Raw JSON text that may contain unescaped control characters.

    Returns:
        The same text with every control character (code point < 0x20)
        inside a string value replaced by its JSON escape sequence.
    """
    # Short escapes defined by JSON; any other control character falls back
    # to the generic \uXXXX form.
    short_escapes = {
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
        '\b': '\\b',
        '\f': '\\f',
    }

    def escape_control(char: str) -> str:
        return short_escapes.get(char) or f'\\u{ord(char):04x}'

    result = []
    in_string = False
    escape_next = False

    for char in json_str:
        if escape_next:
            # Character following a backslash inside a string.
            escape_next = False
            if ord(char) < 32:
                # A backslash followed by a raw control character is not a
                # valid JSON escape. Keep the backslash as a literal one
                # (doubling it) and escape the control character itself.
                result.append('\\')
                result.append(escape_control(char))
            else:
                result.append(char)
            continue

        if in_string:
            if char == '\\':
                # Backslashes only start an escape sequence inside strings;
                # tracking them outside would desynchronise the quote state.
                result.append(char)
                escape_next = True
            elif char == '"':
                in_string = False
                result.append(char)
            elif ord(char) < 32:
                result.append(escape_control(char))
            else:
                result.append(char)
            continue

        # Outside any string: copy structure verbatim, only watch for an
        # opening quote.
        if char == '"':
            in_string = True
        result.append(char)

    return ''.join(result)
73
+
74
+
75
  def create_patient_query(patient_data: Dict[str, Any]) -> str:
76
  """
77
  Create a comprehensive search query based on patient parameters
 
259
  "recommendations": "Comprehensive assessment with inline citations"
260
  }}
261
 
262
+ IMPORTANT JSON FORMATTING:
263
+ - Return ONLY valid JSON without markdown code blocks
264
+ - Use spaces instead of literal newlines within the "recommendations" string
265
+ - Separate paragraphs with double spaces or use \\n for line breaks
266
+ - Do NOT include literal newline characters in the JSON string values
267
+
268
  CRITICAL CITATION REQUIREMENTS:
269
  1. The "recommendations" field must be a comprehensive narrative that includes:
270
  - Eligibility determination with rationale
 
324
 
325
  # Try to parse JSON from response
326
  try:
327
+ # Remove markdown code blocks if present
328
+ if '```json' in response_text:
329
+ json_start = response_text.find('```json') + 7
330
+ json_end = response_text.find('```', json_start)
331
+ response_text = response_text[json_start:json_end].strip()
332
+ elif '```' in response_text:
333
+ json_start = response_text.find('```') + 3
334
+ json_end = response_text.find('```', json_start)
335
+ response_text = response_text[json_start:json_end].strip()
336
+
337
  # Find JSON in response (handle cases where LLM adds extra text)
338
  json_start = response_text.find('{')
339
  json_end = response_text.rfind('}') + 1
340
  if json_start >= 0 and json_end > json_start:
341
  json_str = response_text[json_start:json_end]
342
+
343
+ # Clean the JSON string to escape control characters within string values
344
+ cleaned_json_str = clean_json_string(json_str)
345
+ logger.debug(f"Cleaned JSON string (first 500 chars): {cleaned_json_str[:500]}")
346
+
347
+ # Parse the cleaned JSON
348
+ result = json.loads(cleaned_json_str)
349
  logger.info(f"✅ Successfully parsed JSON response")
350
  else:
351
  raise ValueError("No JSON found in response")