eudr_chabo_generator

Running on CPU Upgrade

File size: 7,140 Bytes

import re
from typing import List, Dict, Any, Union
import ast
from langchain_core.messages import SystemMessage, HumanMessage


# ---------------------------------------------------------------------
# Core Processing Functions
# ---------------------------------------------------------------------
def _parse_citations(response: str) -> List[int]:
    """Parse citation numbers from response text"""
    citation_pattern = r'\[(\d+)\]'
    matches = re.findall(citation_pattern, response)
    citation_numbers = sorted(list(set(int(match) for match in matches)))
    
    return citation_numbers

def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
    """Extract sources that were cited in the response"""
    if not cited_numbers:
        return []
    
    cited_sources = []
    for citation_num in cited_numbers:
        source_index = citation_num - 1
        
        if 0 <= source_index < len(processed_results):
            source = processed_results[source_index].copy()  # Make copy to avoid modifying original
            source['_citation_number'] = citation_num  # Preserve original citation number
            cited_sources.append(source)
    
    return cited_sources

def clean_citations(response: str) -> str:
    """Normalize all citation formats to [x] and remove unwanted sections.

    Args:
        response: Raw answer text that may contain "Document N" style
            citations and a trailing References/Sources/Bibliography section.

    Returns:
        The text with such a section removed, citations rewritten to
        ``[N]``, runs of spaces/tabs collapsed to one space and surrounding
        whitespace stripped. Line breaks are preserved (at most one blank
        line in a row) so paragraphs and lists keep their structure.
    """
    # Remove References/Sources/Bibliography sections. With DOTALL, `.*$`
    # runs to the end of the text, so everything after the heading goes too.
    ref_patterns = (
        r'\n\s*#+\s*References?\s*:?.*$',
        r'\n\s*#+\s*Sources?\s*:?.*$',
        r'\n\s*#+\s*Bibliography\s*:?.*$',
        r'\n\s*References?\s*:.*$',
        r'\n\s*Sources?\s*:.*$',
        r'\n\s*Bibliography\s*:.*$',
    )
    for pattern in ref_patterns:
        response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)

    # Ordered rewrite rules; group 1 is always the document number.
    # Delimited forms run first so the bare "Document N" rule only sees
    # whatever is left over.
    citation_rules = (
        # (Document X) / (Document X, Page Y, Year Z) -> [X]
        (r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)', re.IGNORECASE),
        # [Document X ...] -> [X]; also covers [Document X: filename, Page Y, Year Z]
        (r'\[Document\s+(\d+)[^\]]*\]', re.IGNORECASE),
        # [X.Y.Z] style (section numbers) -> [X]
        (r'\[(\d+)\.[\d\.]+\]', 0),
        # Bare "Document X, Page Y, Year Z" -> [X]. The lookahead keeps the
        # following word/punctuation ("Document 3 states" -> "[3] states")
        # and also accepts end of text and closing punctuation.
        (r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.;:)]|$)', re.IGNORECASE),
        # Double citations [[1]] -> [1]
        (r'\[\[(\d+)\]\]', 0),
    )
    for pattern, flags in citation_rules:
        response = re.sub(pattern, r'[\1]', response, flags=flags)

    # Tidy whitespace without flattening the text onto one line.
    response = re.sub(r'[^\S\n]+', ' ', response)   # any whitespace run except newlines
    response = re.sub(r' ?\n ?', '\n', response)    # no stray spaces at line edges
    response = re.sub(r'\n{3,}', '\n\n', response)  # at most one blank line

    return response.strip()

def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
    """Process context and return formatted context string and processed results"""
    processed_results = []
    
    if isinstance(context, list):
        if not context:
            raise ValueError("No retrieval results provided")
        
        # Extract relevant fields from retrieval results
        for result in context:
            if isinstance(result, str):
                result = ast.literal_eval(result)
            
            # Handle both ingested files (metadata at top level) and retrieved documents (metadata in answer_metadata)
            # Check if metadata is nested in 'answer_metadata' (retrieved documents)
            metadata = result.get('answer_metadata', {})
            
            # If answer_metadata is empty or missing, check top level (ingested files)
            if not metadata or all(v is None or v == 'Unknown' for v in metadata.values()):
                # For ingested files, metadata is at the top level
                doc_info = {
                    'answer': result.get('answer', result.get('content', '')),
                    'filename': result.get('filename', 'Unknown'),
                    'page': result.get('page', 'Unknown'),
                    'year': result.get('year', 'Unknown'),
                    'source': result.get('source', 'Unknown'),
                    'document_id': result.get('_id', result.get('document_id', 'Unknown'))
                }
            else:
                # For retrieved documents, use nested metadata
                doc_info = {
                    'answer': result.get('answer', ''),
                    'filename': metadata.get('filename', 'Unknown'),
                    'page': metadata.get('page', 'Unknown'),
                    'year': metadata.get('year', 'Unknown'),
                    'source': metadata.get('source', 'Unknown'),
                    'document_id': metadata.get('_id', 'Unknown')
                }
            
            processed_results.append(doc_info)
        
        # Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]
        context_parts = []
        for i, result in enumerate(processed_results, 1):
            # Simple format: [1], [2], etc.
            context_parts.append(f"[{i}]\n{result['answer']}\n")
        
        formatted_context = "\n".join(context_parts)
        
    elif isinstance(context, str):
        if not context.strip():
            raise ValueError("Context cannot be empty")
        formatted_context = context
    else:
        raise ValueError("Context must be either a string or list of retrieval results")
    
    return formatted_context, processed_results

def _build_messages(system_prompt: str, question: str, context: str) -> list:
    """Assemble the two-message list for the LLM call.

    The system prompt is used as-is for the system message; the human
    message carries the context followed by the user question, each under
    its own ``###`` heading.
    """
    system_message = SystemMessage(content=system_prompt)
    human_message = HumanMessage(
        content=f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
    )
    return [system_message, human_message]

def _create_sources_list(cited_sources: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Create sources list for ChatUI format"""
    sources = []
    for result in cited_sources:
        filename = result.get('filename', 'Unknown')
        page = result.get('page', 'Unknown')
        year = result.get('year', 'Unknown')
        
        link = f"doc://{filename}"
        title_parts = [filename]
        if page != 'Unknown':
            title_parts.append(f"Page {page}")
        if year != 'Unknown':
            title_parts.append(f"({year})")
        
        sources.append({"link": link, "title": " - ".join(title_parts)})
    
    return sources