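# NOTE(review): the next three lines look like repository-viewer residue
# (author avatar text, commit message, commit hash), not source code.
# mtyrrell's picture
# ingestor context handling
# efd3c5f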
import re
from typing import List, Dict, Any, Union
import ast
from langchain_core.messages import SystemMessage, HumanMessage
# ---------------------------------------------------------------------
# Core Processing Functions
# ---------------------------------------------------------------------
def _parse_citations(response: str) -> List[int]:
    """Return the distinct ``[N]`` citation numbers found in *response*.

    Only purely numeric bracketed markers such as ``[3]`` are recognised.
    The result is de-duplicated and sorted in ascending order.
    """
    unique_numbers = {int(digits) for digits in re.findall(r'\[(\d+)\]', response)}
    return sorted(unique_numbers)
def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
    """Return copies of the results that the response actually cited.

    Citation numbers are 1-based positions into *processed_results*; numbers
    outside that range are ignored. Each returned dict is a shallow copy of
    the matching result with the citation number stored under
    ``'_citation_number'``, so the input dicts are not modified.
    """
    if not cited_numbers:
        return []

    available = len(processed_results)
    selected = []
    for number in cited_numbers:
        # Skip citations that do not point at an existing result.
        if not 1 <= number <= available:
            continue
        entry = processed_results[number - 1].copy()
        entry['_citation_number'] = number
        selected.append(entry)
    return selected
# Trailing "References" / "Sources" / "Bibliography" sections, with or without
# a markdown heading marker. DOTALL lets ``.*$`` swallow everything from the
# section header to the end of the text.
_REFERENCE_SECTION_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE | re.DOTALL)
    for pattern in (
        r'\n\s*#+\s*References?\s*:?.*$',
        r'\n\s*#+\s*Sources?\s*:?.*$',
        r'\n\s*#+\s*Bibliography\s*:?.*$',
        r'\n\s*References?\s*:.*$',
        r'\n\s*Sources?\s*:.*$',
        r'\n\s*Bibliography\s*:.*$',
    )
)

# Citation rewrites, applied in order; every match is replaced by ``[X]`` where
# X is the captured document number.
_CITATION_REWRITES = (
    # (Document X), (Document X, Page Y), (Document X, Page Y, Year Z) -> [X]
    re.compile(
        r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)',
        re.IGNORECASE,
    ),
    # [Document X], [Document X, Page Y, Year Z], [Document X: filename, ...] -> [X]
    re.compile(r'\[Document\s+(\d+)[^\]]*\]', re.IGNORECASE),
    # [X.Y.Z] style (section numbers) -> [X]
    re.compile(r'\[(\d+)\.[\d\.]+\]'),
    # Bare "Document X, Page Y, Year Z" -> [X]. The lookahead also accepts the
    # end of the text and closing punctuation so a citation that finishes a
    # sentence or a parenthesis is normalised too. Because the lookahead
    # consumes nothing, "Document X states ..." becomes "[X] states ...".
    re.compile(
        r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=[\s,.;!?)]|$)',
        re.IGNORECASE,
    ),
    # Doubled brackets produced by the rewrites above: [[X]] -> [X]
    re.compile(r'\[\[(\d+)\]\]'),
)


def clean_citations(response: str) -> str:
    """Normalize all citation formats to ``[x]`` and remove unwanted sections.

    Steps, in order:

    1. Drop a trailing References/Sources/Bibliography section (everything
       from the section header to the end of the text).
    2. Rewrite the ``Document X`` citation variants (parenthesised, bracketed,
       bare) and ``[X.Y.Z]`` section-style markers to ``[X]``.
    3. Collapse every run of whitespace into a single space and strip the
       result.

    Args:
        response: Raw text whose citations should be normalised.

    Returns:
        The cleaned text on a single line.
    """
    for section_pattern in _REFERENCE_SECTION_PATTERNS:
        response = section_pattern.sub('', response)

    for citation_pattern in _CITATION_REWRITES:
        response = citation_pattern.sub(r'[\1]', response)

    # NOTE: ``\s+`` matches newlines as well as spaces, so paragraph and list
    # breaks in the response are flattened here. Kept as-is because callers
    # may rely on the single-line output.
    response = re.sub(r'\s+', ' ', response)
    return response.strip()
def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
    """Process context and return the formatted context string and processed results.

    Args:
        context: Either a ready-made context string, or a list of retrieval
            results. Each list item is a dict, or a string holding a Python
            literal that evaluates to one.

    Returns:
        A ``(formatted_context, processed_results)`` tuple. For a string
        context the string is returned unchanged with an empty result list.
        For a list, each result is reduced to a dict with the keys ``answer``,
        ``filename``, ``page``, ``year``, ``source`` and ``document_id``, and
        the formatted context numbers the answers ``[1]``, ``[2]``, ... in
        input order.

    Raises:
        ValueError: If the context is an empty list, a blank string, neither a
            string nor a list, or contains an item that is not dict-like.
        SyntaxError: Propagated from ``ast.literal_eval`` when a string item is
            not valid Python literal syntax (malformed literals raise
            ``ValueError``).
    """
    if isinstance(context, str):
        if not context.strip():
            raise ValueError("Context cannot be empty")
        return context, []

    if not isinstance(context, list):
        raise ValueError("Context must be either a string or list of retrieval results")
    if not context:
        raise ValueError("No retrieval results provided")

    processed_results = []
    for position, result in enumerate(context, 1):
        if isinstance(result, str):
            # literal_eval only evaluates Python literals; it does not execute code.
            result = ast.literal_eval(result)
        if not callable(getattr(result, 'get', None)):
            raise ValueError(
                f"Retrieval result {position} must be a dict, got {type(result).__name__}"
            )

        # Retrieved documents nest their metadata under 'answer_metadata';
        # ingested files keep it at the top level. Use the top level when the
        # nested metadata is missing, empty, or holds only placeholder values.
        metadata = result.get('answer_metadata', {})
        if not metadata or all(v is None or v == 'Unknown' for v in metadata.values()):
            fields = result
        else:
            fields = metadata

        processed_results.append({
            # Same 'answer' -> 'content' fallback for both result shapes.
            'answer': result.get('answer', result.get('content', '')),
            'filename': fields.get('filename', 'Unknown'),
            'page': fields.get('page', 'Unknown'),
            'year': fields.get('year', 'Unknown'),
            'source': fields.get('source', 'Unknown'),
            'document_id': fields.get('_id', fields.get('document_id', 'Unknown')),
        })

    # Context entries are labelled only with [1], [2], [3], ...
    formatted_context = "\n".join(
        f"[{number}]\n{doc['answer']}\n"
        for number, doc in enumerate(processed_results, 1)
    )
    return formatted_context, processed_results
def _build_messages(system_prompt: str, question: str, context: str) -> list:
    """Assemble the two-message prompt for the LLM call.

    The system prompt is passed through unchanged as the system message; the
    human message carries the context followed by the user question, each
    under its own ``###`` heading.
    """
    human_text = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
    system_message = SystemMessage(content=system_prompt)
    human_message = HumanMessage(content=human_text)
    return [system_message, human_message]
def _create_sources_list(cited_sources: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Create the sources list for the ChatUI format.

    Each cited source becomes ``{"link": ..., "title": ...}``. The link is
    ``doc://<filename>``; the title is the filename followed by
    ``Page <page>`` and ``(<year>)`` when those values are not ``'Unknown'``,
    joined with ``" - "``.
    """
    def _to_entry(doc: Dict[str, Any]) -> Dict[str, str]:
        name = doc.get('filename', 'Unknown')
        page_no = doc.get('page', 'Unknown')
        pub_year = doc.get('year', 'Unknown')

        labels = [name]
        if page_no != 'Unknown':
            labels.append(f"Page {page_no}")
        if pub_year != 'Unknown':
            labels.append(f"({pub_year})")

        return {"link": f"doc://{name}", "title": " - ".join(labels)}

    return [_to_entry(doc) for doc in cited_sources]