import gradio as gr
import os
import hashlib
import logging
from datetime import datetime
import re
from pathlib import Path
# Document processing imports
import PyPDF2
from docx import Document as DocxDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Local imports
from .utils import getconfig
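# params.cfg holds the runtime settings read via getconfig. A minimal sketch of the
# expected file, inferred from the config reads in clean_and_chunk_text below
# (the values shown are just that function's fallback defaults, not a confirmed config):
#
#   [chunking]
#   chunk_size = 700
#   chunk_overlap = 50
#   separators = \n\n,\n,. ,! ,? , ,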
config = getconfig("params.cfg")
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Text extraction helpers
def extract_text_from_pdf_bytes(file_content: bytes) -> tuple[str, dict]:
    """Extract text from PDF bytes (in memory)"""
    try:
        from io import BytesIO
        pdf_reader = PyPDF2.PdfReader(BytesIO(file_content))
        text = ""
        metadata = {"total_pages": len(pdf_reader.pages)}
        for page_num, page in enumerate(pdf_reader.pages):
            page_text = page.extract_text()
            text += f"\n--- Page {page_num + 1} ---\n{page_text}"
        return text, metadata
    except Exception as e:
        logger.error(f"PDF extraction error: {str(e)}")
        raise Exception(f"Failed to extract text from PDF: {str(e)}")
def extract_text_from_docx_bytes(file_content: bytes) -> tuple[str, dict]:
    """Extract text from DOCX bytes (in memory)"""
    try:
        from io import BytesIO
        doc = DocxDocument(BytesIO(file_content))
        text = ""
        metadata = {"total_paragraphs": 0}
        for paragraph in doc.paragraphs:
            # Keep and count only non-empty paragraphs
            if paragraph.text.strip():
                text += f"{paragraph.text}\n"
                metadata["total_paragraphs"] += 1
        return text, metadata
    except Exception as e:
        logger.error(f"DOCX extraction error: {str(e)}")
        raise Exception(f"Failed to extract text from DOCX: {str(e)}")
def clean_and_chunk_text(text: str) -> str:
    """Clean text and split into chunks, returning formatted context"""
    # Basic text cleaning
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Get chunking parameters from config
    chunk_size = config.getint('chunking', 'chunk_size', fallback=700)
    chunk_overlap = config.getint('chunking', 'chunk_overlap', fallback=50)
    separators_str = config.get('chunking', 'separators', fallback='\n\n,\n,. ,! ,? , ,')
    separators = [s.strip() for s in separators_str.split(',')]
    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=separators,
        is_separator_regex=False
    )
    chunks = text_splitter.split_text(text)
    # Label each chunk and join into a single context string
    context_parts = []
    for i, chunk_text in enumerate(chunks):
        context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
    return "\n\n".join(context_parts)
def ingest(file):
    """Main ingestion function - processes file and returns context directly"""
    if file is None:
        return "No file uploaded"
    try:
        with open(file.name, 'rb') as f:
            file_content = f.read()
        filename = os.path.basename(file.name)
        # Extract text based on file type (in memory)
        file_extension = os.path.splitext(filename)[1].lower()
        if file_extension == '.pdf':
            text, extraction_metadata = extract_text_from_pdf_bytes(file_content)
        elif file_extension == '.docx':
            text, extraction_metadata = extract_text_from_docx_bytes(file_content)
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")
        # Clean and chunk text
        context = clean_and_chunk_text(text)
        logger.info(f"Successfully processed document {filename}: {len(text)} characters")
        return context
    except Exception as e:
        logger.error(f"Document processing failed: {str(e)}")
        raise Exception(f"Processing failed: {str(e)}")
if __name__ == "__main__":
    ui = gr.Interface(
        fn=ingest,
        inputs=gr.File(
            label="Document Upload",
            file_types=[".pdf", ".docx"]
        ),
        outputs=gr.Textbox(
            label="Processed Context",
            lines=15,
            show_copy_button=True
        ),
        title="ChatFed Ingestion Module",
        description="Processes PDF or DOCX files and returns chunked text context. Intended for use in RAG pipelines as an MCP server with other ChatFed modules (i.e. context supplied to the generation service).",
        api_name="ingest"
    )
    ui.launch(
        server_name="0.0.0.0",
        server_port=7860,
        # mcp_server=True,
        show_error=True
    )
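# A minimal client-side sketch (an assumption, not part of this module): with the app
# running locally on port 7860, another ChatFed service could fetch the chunked context
# over the Gradio API. The gradio_client package, the local URL, and the sample file
# path below are illustrative; only api_name="/ingest" follows from the Interface above.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://localhost:7860")
#   context = client.predict(handle_file("example.pdf"), api_name="/ingest")
#   print(context[:500])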