# Repository path: backend_chatbot/app/services/context_builder.py
# Last commit 4654f22 by helal94hb1: "fix: changing max token length"
# app/services/context_builder.py
import logging
from typing import List, Dict, Tuple # Added Tuple
from app.core import state
logger = logging.getLogger(__name__)

# Default budget, in characters, for the assembled context string
# (the separators inserted between chunks count against it).
DEFAULT_MAX_CONTEXT_CHARS = 512000


def build_context_from_ids(
    top_chunk_ids: List[str],
    max_context_chars: int = DEFAULT_MAX_CONTEXT_CHARS
) -> Tuple[str, List[str]]:
    """Build a context string from the chunk contents stored in ``state``.

    Chunk IDs are processed in the order given. Each ID is converted to
    ``str`` and looked up in ``state.chunk_content_map``; the contents found
    are joined with a ``"\\n\\n---\\n\\n"`` separator. IDs that are missing
    from the map, or whose content is empty, are skipped with a warning.
    Building stops at the first chunk that would push the context (including
    separators) past ``max_context_chars``; later chunks are not tried.

    Args:
        top_chunk_ids: Chunk IDs to include, in the order they should appear.
        max_context_chars: Maximum length, in characters, of the returned
            context string.

    Returns:
        A ``(context, used_ids)`` tuple: the concatenated context string and
        the list of chunk IDs (as strings) whose content was actually
        included. ``("", [])`` is returned when the ID list is empty, the
        content map is empty or not loaded, or no chunk could be added.
    """
    if not top_chunk_ids:
        logger.warning("build_context_from_ids called with empty chunk ID list.")
        return "", []

    if not state.chunk_content_map:
        logger.error("Chunk content map is not loaded in state. Cannot build context.")
        return "", []

    context_parts: List[str] = []
    used_ids: List[str] = []  # IDs whose content made it into the context
    current_length = 0
    separator = "\n\n---\n\n"
    separator_len = len(separator)

    logger.info(
        "Building context from %s top chunk IDs (max chars: %s)...",
        len(top_chunk_ids),
        max_context_chars,
    )

    for position, raw_chunk_id in enumerate(top_chunk_ids, start=1):
        # The map is looked up with string keys, so normalise the ID first.
        chunk_id = str(raw_chunk_id)
        content = state.chunk_content_map.get(chunk_id)

        if content is None:
            logger.warning("Content not found in state map for chunk ID: %s", chunk_id)
            continue
        if not content:
            # The ID exists but carries no text: skip it, and do not report
            # it as "not found", which would point at the wrong problem.
            logger.warning("Skipping chunk ID %s: content in state map is empty.", chunk_id)
            continue

        content_len = len(content)
        # A separator is only needed once at least one chunk is already present.
        potential_added_len = content_len + (separator_len if context_parts else 0)

        if current_length + potential_added_len > max_context_chars:
            logger.warning(
                "Stopping context building: Chunk %s (ID: %s...) with length %s "
                "would exceed max chars (%s). Current length: %s.",
                position,
                chunk_id[:10],
                content_len,
                max_context_chars,
                current_length,
            )
            break

        context_parts.append(content)
        used_ids.append(chunk_id)
        current_length += potential_added_len
        logger.debug(
            " Added chunk %s (ID: %s...): Length=%s, Total Context Chars=%s",
            position,
            chunk_id[:20],
            content_len,
            current_length,
        )

    if not context_parts:
        logger.warning("No content could be added to the context.")
        return "", []

    final_context = separator.join(context_parts)
    logger.info(
        "Final context built. Length: %s chars, Chunks used: %s/%s",
        len(final_context),
        len(used_ids),
        len(top_chunk_ids),
    )
    return final_context, used_ids