import gradio as gr
import os
import logging
import re
from io import BytesIO

# Document processing imports
import PyPDF2
from docx import Document as DocxDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Local imports
from .utils import getconfig

config = getconfig("params.cfg")

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Text extraction helpers
def extract_text_from_pdf_bytes(file_content: bytes) -> tuple[str, dict]:
    """Extract text from PDF bytes (in memory)."""
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(file_content))
        text = ""
        metadata = {"total_pages": len(pdf_reader.pages)}
        for page_num, page in enumerate(pdf_reader.pages):
            page_text = page.extract_text()
            text += f"\n--- Page {page_num + 1} ---\n{page_text}"
        return text, metadata
    except Exception as e:
        logger.error(f"PDF extraction error: {str(e)}")
        raise RuntimeError(f"Failed to extract text from PDF: {str(e)}") from e

def extract_text_from_docx_bytes(file_content: bytes) -> tuple[str, dict]:
    """Extract text from DOCX bytes (in memory)."""
    try:
        doc = DocxDocument(BytesIO(file_content))
        text = ""
        metadata = {"total_paragraphs": 0}
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                text += f"{paragraph.text}\n"
                metadata["total_paragraphs"] += 1
        return text, metadata
    except Exception as e:
        logger.error(f"DOCX extraction error: {str(e)}")
        raise RuntimeError(f"Failed to extract text from DOCX: {str(e)}") from e

def clean_and_chunk_text(text: str) -> str:
    """Clean text and split into chunks, returning formatted context."""
    # Basic text cleaning: collapse runs of blank lines and runs of
    # spaces/tabs, but keep newlines so the '\n'-based separators
    # below still have something to match
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = text.strip()

    # Get chunking parameters from config
    chunk_size = config.getint('chunking', 'chunk_size', fallback=700)
    chunk_overlap = config.getint('chunking', 'chunk_overlap', fallback=50)
    separators_str = config.get('chunking', 'separators', fallback='\n\n,\n,. ,! ,? , ,')
    # Do not strip() the entries: whitespace in separators like '. ' and
    # '\n\n' is significant, and stripping them down to '' would make the
    # splitter fall back to character-level splitting
    separators = separators_str.split(',')

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=separators,
        is_separator_regex=False
    )
    chunks = text_splitter.split_text(text)

    # Label each chunk so downstream consumers can reference it
    context_parts = []
    for i, chunk_text in enumerate(chunks):
        context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
    return "\n\n".join(context_parts)

def ingest(file):
    """Main ingestion function - processes the file and returns context directly."""
    if file is None:
        # Return a single string to match the single Textbox output below
        return "No file uploaded"
    try:
        with open(file.name, 'rb') as f:
            file_content = f.read()
        filename = os.path.basename(file.name)

        # Extract text based on file type (in memory)
        file_extension = os.path.splitext(filename)[1].lower()
        if file_extension == '.pdf':
            text, extraction_metadata = extract_text_from_pdf_bytes(file_content)
        elif file_extension == '.docx':
            text, extraction_metadata = extract_text_from_docx_bytes(file_content)
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")

        # Clean and chunk text
        context = clean_and_chunk_text(text)
        logger.info(f"Successfully processed document {filename}: {len(text)} characters")
        return context
    except Exception as e:
        logger.error(f"Document processing failed: {str(e)}")
        raise RuntimeError(f"Processing failed: {str(e)}") from e

if __name__ == "__main__":
    ui = gr.Interface(
        fn=ingest,
        inputs=gr.File(
            label="Document Upload",
            file_types=[".pdf", ".docx"]
        ),
        outputs=gr.Textbox(
            label="Processed Context",
            lines=15,
            show_copy_button=True
        ),
        title="ChatFed Ingestion Module",
        description="Processes PDF or DOCX files and returns chunked text context. Intended for use in RAG pipelines as an MCP server with other ChatFed modules (i.e. context supplied to the generation service).",
        api_name="ingest"
    )
    ui.launch(
        server_name="0.0.0.0",
        server_port=7860,
        # mcp_server=True,
        show_error=True
    )
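
# --- Usage sketch (illustrative, not part of the module) ---
# Assumes the app is running locally on port 7860 and the gradio_client
# package is installed; the endpoint name comes from api_name="ingest"
# above. Recent gradio_client versions wrap file inputs with handle_file():
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://localhost:7860/")
#   context = client.predict(handle_file("sample.pdf"), api_name="/ingest")
#   print(context[:500])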