eudr_chabo_generator

Running on CPU Upgrade

App Files Files Community

eudr_chabo_generator / utils /sources.py

mtyrrell

ingestor context handling

efd3c5f about 1 month ago

raw

history blame contribute delete

7.14 kB

	import re
	from typing import List, Dict, Any, Union
	import ast
	from langchain_core.messages import SystemMessage, HumanMessage


	# ---------------------------------------------------------------------
	# Core Processing Functions
	# ---------------------------------------------------------------------
	def _parse_citations(response: str) -> List[int]:
	    """Return the distinct citation numbers found in ``response``.

	    A citation is a run of digits wrapped directly in square brackets,
	    e.g. ``[3]``. Brackets holding anything else (``[a]``, ``[ 4 ]``) are
	    ignored.

	    Args:
	        response: Text to scan for ``[n]`` markers.

	    Returns:
	        The cited numbers as integers, de-duplicated and sorted ascending.
	        An empty list when no marker is present.
	    """
	    citation_pattern = r'\[(\d+)\]'
	    # The set comprehension de-duplicates; sorted() accepts any iterable,
	    # so no intermediate list is needed.
	    return sorted({int(match) for match in re.findall(citation_pattern, response)})

	def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
	    """Return the entries of ``processed_results`` referenced by ``cited_numbers``.

	    Args:
	        processed_results: Source records; citation number ``n`` refers to
	            the record at index ``n - 1``.
	        cited_numbers: Citation numbers to look up.

	    Returns:
	        Shallow copies of the referenced records, in the order of
	        ``cited_numbers``, each with an added ``'_citation_number'`` key
	        holding the number it was cited by. Numbers that do not map to an
	        existing record are skipped silently. Empty list when
	        ``cited_numbers`` is empty.
	    """
	    if not cited_numbers:
	        return []

	    cited_sources = []
	    for citation_num in cited_numbers:
	        source_index = citation_num - 1  # citation numbers are 1-based
	        if not 0 <= source_index < len(processed_results):
	            continue

	        # Copy so the caller's record is not mutated by the extra key
	        source = processed_results[source_index].copy()
	        source['_citation_number'] = citation_num
	        cited_sources.append(source)

	    return cited_sources

	def clean_citations(response: str) -> str:
	    """Normalize citation formats to ``[x]`` and drop trailing reference sections.

	    Processing order:
	      1. Cut everything from a "References", "Sources" or "Bibliography"
	         heading to the end of the text. A heading is either a markdown
	         ``#`` heading (colon optional) or the bare word followed by a
	         colon; in both cases it must be preceded by a newline.
	      2. Rewrite "Document N" citations -- parenthesized, bracketed or bare,
	         with optional page/year details -- as ``[N]``.
	      3. Reduce section-style ``[X.Y.Z]`` to ``[X]`` and ``[[N]]`` to ``[N]``.
	      4. Collapse every whitespace run to a single space and strip the ends.

	    Args:
	        response: Raw response text.

	    Returns:
	        The cleaned text. Because of step 4 the result contains no newlines.
	    """
	    # Remove References/Sources/Bibliography sections
	    ref_patterns = (
	        r'\n\s*#+\s*References?\s*:?.*$',
	        r'\n\s*#+\s*Sources?\s*:?.*$',
	        r'\n\s*#+\s*Bibliography\s*:?.*$',
	        r'\n\s*References?\s*:.*$',
	        r'\n\s*Sources?\s*:.*$',
	        r'\n\s*Bibliography\s*:.*$',
	    )
	    # DOTALL lets ".*$" run across newlines, so the whole tail is removed
	    section_flags = re.IGNORECASE | re.DOTALL
	    for pattern in ref_patterns:
	        response = re.sub(pattern, '', response, flags=section_flags)

	    # Fix (Document X, Page Y, Year Z) -> [X]
	    # The parentheses are escaped literals; page and year parts are optional.
	    response = re.sub(
	        r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)',
	        r'[\1]',
	        response,
	        flags=re.IGNORECASE,
	    )

	    # Fix [Document X, Page Y, Year Z] -> [X]
	    response = re.sub(
	        r'\[Document\s+(\d+)(?:[^\]]*)\]',
	        r'[\1]',
	        response,
	        flags=re.IGNORECASE,
	    )

	    # Fix [Document X: filename, Page Y, Year Z] -> [X]
	    # (the previous pattern already matches this form; kept as a safeguard)
	    response = re.sub(
	        r'\[Document\s+(\d+):[^\]]+\]',
	        r'[\1]',
	        response,
	        flags=re.IGNORECASE,
	    )

	    # Fix [X.Y.Z] style (section numbers) -> [X]
	    response = re.sub(r'\[(\d+)\.[\d\.]+\]', r'[\1]', response)

	    # Fix (Document X) -> [X]
	    # (also covered by the first parenthesized pattern; kept as a safeguard)
	    response = re.sub(
	        r'\(Document\s+(\d+)\)',
	        r'[\1]',
	        response,
	        flags=re.IGNORECASE,
	    )

	    # Fix "Document X, Page Y, Year Z" (no brackets) -> [X]
	    # The lookahead requires whitespace, a comma or a period right after the
	    # citation, so "Document 12abc" is left untouched.
	    response = re.sub(
	        r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])',
	        r'[\1]',
	        response,
	        flags=re.IGNORECASE,
	    )

	    # Fix "Document X states/says/mentions" -> [X]
	    # The bare-citation rewrite above already converts any "Document X"
	    # followed by whitespace, so this only acts if that step is changed.
	    response = re.sub(
	        r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)',
	        r'[\1]',
	        response,
	        flags=re.IGNORECASE,
	    )

	    # Clean up any double citations [[1]] -> [1]
	    response = re.sub(r'\[\[(\d+)\]\]', r'[\1]', response)

	    # Clean up multiple spaces
	    # NOTE(review): \s+ also matches newlines, so paragraph and list breaks
	    # are flattened into single spaces -- confirm this is intended.
	    response = re.sub(r'\s+', ' ', response)

	    return response.strip()

	def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
	    """Process context and return the formatted context string and processed results.

	    Args:
	        context: Either a ready-made context string, or a list of retrieval
	            results. Each list item is a dict, or a string holding a Python
	            literal that ``ast.literal_eval`` turns into one.

	    Returns:
	        A ``(formatted_context, processed_results)`` tuple.

	        For a list input, ``processed_results`` holds one dict per item with
	        the keys ``answer``, ``filename``, ``page``, ``year``, ``source``
	        and ``document_id``, and ``formatted_context`` numbers the answers
	        ``[1]``, ``[2]``, ... in input order.

	        For a string input, the string is returned unchanged together with
	        an empty list.

	    Raises:
	        ValueError: If ``context`` is an empty list, a blank string, or
	            neither a string nor a list. Errors raised by
	            ``ast.literal_eval`` for malformed string items propagate
	            unchanged.
	    """
	    processed_results = []

	    if isinstance(context, list):
	        if not context:
	            raise ValueError("No retrieval results provided")

	        # Extract relevant fields from retrieval results
	        for result in context:
	            if isinstance(result, str):
	                # literal_eval only evaluates Python literals, not arbitrary code
	                result = ast.literal_eval(result)

	            # Retrieved documents nest their metadata under 'answer_metadata';
	            # ingested files keep it at the top level of the record.
	            metadata = result.get('answer_metadata', {})
	            metadata_is_blank = not metadata or all(
	                v is None or v == 'Unknown' for v in metadata.values()
	            )

	            if metadata_is_blank:
	                # Ingested file: read fields from the top level
	                doc_info = {
	                    'answer': result.get('answer', result.get('content', '')),
	                    'filename': result.get('filename', 'Unknown'),
	                    'page': result.get('page', 'Unknown'),
	                    'year': result.get('year', 'Unknown'),
	                    'source': result.get('source', 'Unknown'),
	                    'document_id': result.get('_id', result.get('document_id', 'Unknown')),
	                }
	            else:
	                # Retrieved document: read fields from the nested metadata
	                doc_info = {
	                    'answer': result.get('answer', ''),
	                    'filename': metadata.get('filename', 'Unknown'),
	                    'page': metadata.get('page', 'Unknown'),
	                    'year': metadata.get('year', 'Unknown'),
	                    'source': metadata.get('source', 'Unknown'),
	                    'document_id': metadata.get('_id', 'Unknown'),
	                }

	            processed_results.append(doc_info)

	        # Context string uses only the bare markers [1], [2], [3], ...
	        context_parts = [
	            f"[{i}]\n{doc['answer']}\n"
	            for i, doc in enumerate(processed_results, 1)
	        ]
	        formatted_context = "\n".join(context_parts)

	    elif isinstance(context, str):
	        if not context.strip():
	            raise ValueError("Context cannot be empty")
	        formatted_context = context
	    else:
	        raise ValueError("Context must be either a string or list of retrieval results")

	    return formatted_context, processed_results

	def _build_messages(system_prompt: str, question: str, context: str) -> list:
	    """Build the message list for the LLM call.

	    Args:
	        system_prompt: Text used verbatim as the system message content.
	        question: The user's question.
	        context: Context text placed ahead of the question.

	    Returns:
	        A two-element list: a ``SystemMessage`` holding ``system_prompt``,
	        then a ``HumanMessage`` whose content is the context under a
	        ``### CONTEXT`` header followed by the question under a
	        ``### USER QUESTION`` header.
	    """
	    user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
	    return [SystemMessage(content=system_prompt), HumanMessage(content=user_content)]

	def _create_sources_list(cited_sources: List[Dict[str, Any]]) -> List[Dict[str, str]]:
	    """Create the sources list for ChatUI format.

	    Args:
	        cited_sources: Source records. The keys ``filename``, ``page`` and
	            ``year`` are read; a missing key, ``None`` or ``'Unknown'`` all
	            count as "not available".

	    Returns:
	        One ``{"link": ..., "title": ...}`` dict per record, in input order.
	        The link is ``doc://<filename>``. The title is the filename,
	        followed by ``Page <page>`` and ``(<year>)`` when those values are
	        available, joined with ``" - "``.
	    """
	    unavailable = (None, 'Unknown')

	    sources = []
	    for result in cited_sources:
	        filename = result.get('filename', 'Unknown')
	        if filename is None:
	            # A None filename would otherwise make str.join raise TypeError
	            filename = 'Unknown'
	        page = result.get('page', 'Unknown')
	        year = result.get('year', 'Unknown')

	        # str() keeps join() safe for non-string filenames
	        title_parts = [str(filename)]
	        if page not in unavailable:
	            title_parts.append(f"Page {page}")
	        if year not in unavailable:
	            title_parts.append(f"({year})")

	        sources.append({"link": f"doc://{filename}", "title": " - ".join(title_parts)})

	    return sources