import gradio as gr
import os
import hashlib
import logging
from datetime import datetime
import re
from pathlib import Path
# Document processing imports
import PyPDF2
from docx import Document as DocxDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Local imports
from .utils import getconfig
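# params.cfg holds the runtime settings read via getconfig. A minimal sketch of the
# expected file, inferred from the config reads in clean_and_chunk_text below
# (the values shown are just that function's fallback defaults, not a confirmed config):
#
#   [chunking]
#   chunk_size = 700
#   chunk_overlap = 50
#   separators = \n\n,\n,. ,! ,? , ,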
config = getconfig("params.cfg")
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Text extraction helpers
def extract_text_from_pdf_bytes(file_content: bytes) -> tuple[str, dict]:
    """Extract text from PDF bytes (in memory)"""
    try:
        from io import BytesIO
        pdf_reader = PyPDF2.PdfReader(BytesIO(file_content))
        text = ""
        metadata = {"total_pages": len(pdf_reader.pages)}
        for page_num, page in enumerate(pdf_reader.pages):
            page_text = page.extract_text()
            text += f"\n--- Page {page_num + 1} ---\n{page_text}"
        return text, metadata
    except Exception as e:
        logger.error(f"PDF extraction error: {str(e)}")
        raise Exception(f"Failed to extract text from PDF: {str(e)}")
def extract_text_from_docx_bytes(file_content: bytes) -> tuple[str, dict]:
    """Extract text from DOCX bytes (in memory)"""
    try:
        from io import BytesIO
        doc = DocxDocument(BytesIO(file_content))
        text = ""
        metadata = {"total_paragraphs": 0}
        for paragraph in doc.paragraphs:
            # Keep and count only non-empty paragraphs
            if paragraph.text.strip():
                text += f"{paragraph.text}\n"
                metadata["total_paragraphs"] += 1
        return text, metadata
    except Exception as e:
        logger.error(f"DOCX extraction error: {str(e)}")
        raise Exception(f"Failed to extract text from DOCX: {str(e)}")
def clean_and_chunk_text(text: str) -> str:
    """Clean text and split into chunks, returning formatted context"""
    # Basic text cleaning
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Get chunking parameters from config
    chunk_size = config.getint('chunking', 'chunk_size', fallback=700)
    chunk_overlap = config.getint('chunking', 'chunk_overlap', fallback=50)
    separators_str = config.get('chunking', 'separators', fallback='\n\n,\n,. ,! ,? , ,')
    separators = [s.strip() for s in separators_str.split(',')]
    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=separators,
        is_separator_regex=False
    )
    chunks = text_splitter.split_text(text)
    # Label each chunk and join into a single context string
    context_parts = []
    for i, chunk_text in enumerate(chunks):
        context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
    return "\n\n".join(context_parts)
def ingest(file):
    """Main ingestion function - processes file and returns context directly"""
    if file is None:
        return "No file uploaded"
    try:
        with open(file.name, 'rb') as f:
            file_content = f.read()
        filename = os.path.basename(file.name)
        # Extract text based on file type (in memory)
        file_extension = os.path.splitext(filename)[1].lower()
        if file_extension == '.pdf':
            text, extraction_metadata = extract_text_from_pdf_bytes(file_content)
        elif file_extension == '.docx':
            text, extraction_metadata = extract_text_from_docx_bytes(file_content)
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")
        # Clean and chunk text
        context = clean_and_chunk_text(text)
        logger.info(f"Successfully processed document {filename}: {len(text)} characters")
        return context
    except Exception as e:
        logger.error(f"Document processing failed: {str(e)}")
        raise Exception(f"Processing failed: {str(e)}")
if __name__ == "__main__":
    ui = gr.Interface(
        fn=ingest,
        inputs=gr.File(
            label="Document Upload",
            file_types=[".pdf", ".docx"]
        ),
        outputs=gr.Textbox(
            label="Processed Context",
            lines=15,
            show_copy_button=True
        ),
        title="ChatFed Ingestion Module",
        description="Processes PDF or DOCX files and returns chunked text context. Intended for use in RAG pipelines as an MCP server with other ChatFed modules (i.e. context supplied to the generation service).",
        api_name="ingest"
    )
    ui.launch(
        server_name="0.0.0.0",
        server_port=7860,
        # mcp_server=True,
        show_error=True
    )
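# A minimal client-side sketch (an assumption, not part of this module): with the app
# running locally on port 7860, another ChatFed service could fetch the chunked context
# over the Gradio API. The gradio_client package, the local URL, and the sample file
# path below are illustrative; only api_name="/ingest" follows from the Interface above.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://localhost:7860")
#   context = client.predict(handle_file("example.pdf"), api_name="/ingest")
#   print(context[:500])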