"""
Document processing utility module.
"""

import os
import re
import csv
import io
import logging
from typing import List, Dict, Any, Optional, Tuple, Union

import numpy as np

logger = logging.getLogger("DocProcessor")
if not logger.hasHandlers():
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

class DocumentProcessor:
    """Document processing utility class."""

    @staticmethod
    def split_text(
        text: str,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        separator: str = "\n"
    ) -> List[str]:
        """
        Split text into smaller chunks.

        Args:
            text: Text to split
            chunk_size: Maximum number of characters per chunk
            chunk_overlap: Number of characters to overlap between chunks
            separator: Separator used for splitting

        Returns:
            List of text chunks
        """
        if not text or chunk_size <= 0:
            return []

        parts = text.split(separator)
        chunks = []
        current_chunk = []
        current_size = 0

        for part in parts:
            part_size = len(part)

            # Flush the current chunk when adding this part (plus separators)
            # would exceed chunk_size.
            if current_size + part_size + len(current_chunk) > chunk_size and current_chunk:
                chunks.append(separator.join(current_chunk))

                # Carry the tail of the finished chunk over as overlap.
                overlap_tokens = []
                overlap_size = 0
                for token in reversed(current_chunk):
                    if overlap_size + len(token) <= chunk_overlap:
                        overlap_tokens.insert(0, token)
                        overlap_size += len(token) + 1
                    else:
                        break

                current_chunk = overlap_tokens
                current_size = overlap_size - len(current_chunk)

            current_chunk.append(part)
            current_size += part_size

        if current_chunk:
            chunks.append(separator.join(current_chunk))

        return chunks

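    # A minimal, illustrative sketch of the chunking behaviour above (the input
    # string and sizes are made-up values, not part of this module):
    #
    #   DocumentProcessor.split_text("line1\nline2\nline3", chunk_size=12, chunk_overlap=6)
    #   # -> ["line1\nline2", "line2\nline3"]  (the second chunk re-uses "line2" as overlap)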

    @staticmethod
    def clean_text(text: str, remove_urls: bool = True, remove_extra_whitespace: bool = True) -> str:
        """
        Clean text.

        Args:
            text: Text to clean
            remove_urls: Whether to remove URLs
            remove_extra_whitespace: Whether to collapse extra whitespace

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        if remove_urls:
            text = re.sub(r'https?://\S+|www\.\S+', '', text)

        # Strip HTML tags.
        text = re.sub(r'<.*?>', '', text)

        if remove_extra_whitespace:
            text = re.sub(r'\s+', ' ', text).strip()

        return text

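    # Illustrative behaviour (the sample string is an assumption for this sketch):
    #
    #   DocumentProcessor.clean_text("See <b>docs</b> at https://example.com   now")
    #   # -> "See docs at now"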

    @staticmethod
    def text_to_documents(
        text: str,
        metadata: Optional[Dict[str, Any]] = None,
        chunk_size: int = 512,
        chunk_overlap: int = 50
    ) -> List[Dict[str, Any]]:
        """
        Convert text into a list of document objects.

        Args:
            text: Text to convert
            metadata: Metadata to attach to each document
            chunk_size: Maximum number of characters per chunk
            chunk_overlap: Number of characters to overlap between chunks

        Returns:
            List of document objects
        """
        if not text:
            return []

        clean = DocumentProcessor.clean_text(text)

        chunks = DocumentProcessor.split_text(
            clean,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        documents = []
        for i, chunk in enumerate(chunks):
            doc = {
                "text": chunk,
                "index": i,
                "chunk_count": len(chunks)
            }

            if metadata:
                doc.update(metadata)

            documents.append(doc)

        return documents

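    # Shape of the returned entries, shown with assumed metadata (illustrative only):
    #
    #   DocumentProcessor.text_to_documents("...", metadata={"source": "a.txt"}, chunk_size=256)
    #   # -> [{"text": "...", "index": 0, "chunk_count": N, "source": "a.txt"}, ...]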

    @staticmethod
    def load_documents_from_directory(
        directory: str,
        extensions: List[str] = [".txt", ".md", ".csv"],
        recursive: bool = True,
        chunk_size: int = 512,
        chunk_overlap: int = 50
    ) -> List[Dict[str, Any]]:
        """
        Load and process documents from a directory.

        Args:
            directory: Path of the directory to load from
            extensions: List of file extensions to process
            recursive: Whether to search subdirectories
            chunk_size: Maximum number of characters per chunk
            chunk_overlap: Number of characters to overlap between chunks

        Returns:
            List of document objects
        """
        if not os.path.isdir(directory):
            logger.error(f"Directory not found: {directory}")
            return []

        documents = []

        for root, dirs, files in os.walk(directory):
            if not recursive and root != directory:
                continue

            for file in files:
                _, ext = os.path.splitext(file)
                if ext.lower() not in extensions:
                    continue

                file_path = os.path.join(root, file)
                rel_path = os.path.relpath(file_path, directory)

                try:
                    logger.info(f"Loading file: {rel_path}")

                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            content = f.read()
                    except UnicodeDecodeError:
                        # Fall back to CP949 for legacy Korean encodings.
                        logger.info(f"UTF-8 decoding failed, retrying with CP949: {rel_path}")
                        with open(file_path, 'r', encoding='cp949') as f:
                            content = f.read()

                    metadata = {
                        "source": rel_path,
                        "filename": file,
                        "filetype": ext.lower()[1:],
                        "filepath": file_path
                    }

                    if ext.lower() == '.csv':
                        logger.info(f"CSV file detected, splitting by row: {rel_path}")
                        file_docs = DocumentProcessor.csv_to_documents(content, metadata)
                    else:
                        file_docs = DocumentProcessor.text_to_documents(
                            content,
                            metadata=metadata,
                            chunk_size=chunk_size,
                            chunk_overlap=chunk_overlap
                        )

                    documents.extend(file_docs)
                    logger.info(f"Extracted {len(file_docs)} chunks: {rel_path}")

                except Exception as e:
                    logger.error(f"Error while processing file '{rel_path}': {e}")
                    continue

        logger.info(f"Loaded {len(documents)} document chunks in total.")
        return documents

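    # Illustrative call (the path and extension filter are assumptions for this sketch):
    #
    #   docs = DocumentProcessor.load_documents_from_directory("./data", extensions=[".md", ".csv"])
    #   # docs is a flat list of chunk/row dictionaries carrying the metadata added above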

    @staticmethod
    def prepare_rag_context(results: List[Dict[str, Any]], field: str = "text") -> List[str]:
        """
        Extract context texts for RAG from search results.

        Args:
            results: List of search results
            field: Name of the field that holds the text content

        Returns:
            List of context texts
        """
        context = []

        for result in results:
            if field in result:
                context.append(result[field])

        return context

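    # Typical follow-up (the prompt wording here is an assumption, not part of this module):
    #
    #   context = DocumentProcessor.prepare_rag_context(search_results)
    #   prompt = "Context:\n" + "\n\n".join(context) + "\n\nQuestion: ..."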

    @staticmethod
    def csv_to_documents(content: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Split CSV content by row and turn each row into a separate document.

        Args:
            content: Contents of the CSV file
            metadata: Base metadata

        Returns:
            List of document objects (one per row)
        """
        documents = []

        try:
            # Check whether the content parses as standard comma-separated CSV.
            try:
                csv_reader = csv.reader(io.StringIO(content))
                rows = list(csv_reader)
                if len(rows) > 0 and len(rows[0]) > 1:
                    logger.info(f"Processing CSV file with comma delimiter: {metadata.get('source', 'unknown')}")
                    has_valid_format = True
                else:
                    has_valid_format = False
            except Exception:
                has_valid_format = False

            if not has_valid_format:
                # Whitespace-delimited fallback: each data line is expected to start
                # with an 'IT' id followed by query type, question, answer, reference.
                logger.warning(f"CSV file is not standard comma-separated; falling back to whitespace delimiter: {metadata.get('source', 'unknown')}")
                lines = content.strip().split('\n')

                for i, line in enumerate(lines):
                    if not line.strip().startswith('IT'):
                        continue

                    parts = line.split(maxsplit=4)

                    if len(parts) < 5:
                        logger.warning(f"Row {i + 1} has too few fields: {line[:50]}...")
                        continue

                    doc_id = parts[0].strip()
                    query_type = parts[1].strip()
                    question = parts[2].strip()
                    answer = parts[3].strip()
                    reference = parts[4].strip() if len(parts) > 4 else ""

                    text = f"ID: {doc_id}\n"
                    text += f"Query type: {query_type}\n"
                    text += f"Question: {question}\n"
                    text += f"Answer: {answer}\n"
                    if reference:
                        text += f"Reference/Context: {reference}"

                    doc_metadata = metadata.copy()
                    doc_metadata.update({
                        "row": i,
                        "query_type": query_type,
                        "question": question,
                        "answer": answer,
                        "reference": reference
                    })

                    document = {
                        "text": text,
                        "id": doc_id,
                        **doc_metadata
                    }

                    documents.append(document)
                    logger.debug(f"Processed IT document: {doc_id} - {question[:30]}...")

                logger.info(f"Converted {len(documents)} rows from whitespace-delimited CSV file '{metadata.get('source', 'unknown')}' into documents.")
                return documents

            if not rows:
                logger.warning(f"CSV file contains no data: {metadata.get('source', 'unknown')}")
                return []

            headers = rows[0]
            logger.debug(f"CSV headers: {headers}")

            for i, row in enumerate(rows[1:], 1):
                # Pad short rows so every header has a value.
                while len(row) < len(headers):
                    row.append("")

                row_data = {headers[j]: value for j, value in enumerate(row) if j < len(headers)}

                row_id = row[0] if row and len(row) > 0 else f"row_{i}"

                # Render the row as "header: value" lines, skipping empty cells.
                text_parts = []
                for j, header in enumerate(headers):
                    if j < len(row) and row[j]:
                        text_parts.append(f"{header}: {row[j]}")

                text = "\n".join(text_parts)

                doc_metadata = metadata.copy()
                doc_metadata.update({
                    "row": i,
                    "row_id": row_id,
                    "total_rows": len(rows) - 1,
                    "csv_data": row_data
                })

                document = {
                    "text": text,
                    "id": row_id,
                    **doc_metadata
                }

                documents.append(document)

            logger.info(f"Converted {len(documents)} rows from CSV file '{metadata.get('source', 'unknown')}' into documents.")

        except Exception as e:
            logger.error(f"Error while processing CSV file: {e}")

        return documents
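
# A minimal usage sketch, assuming a local "./docs" directory with .txt/.md/.csv
# files (the path and the pretend retrieval step are illustrative assumptions,
# not part of this module):
if __name__ == "__main__":
    docs = DocumentProcessor.load_documents_from_directory(
        "./docs",
        chunk_size=512,
        chunk_overlap=50,
    )
    # Treat the first few chunks as if a retriever had returned them, then build
    # a RAG-style context block from their "text" fields.
    context = DocumentProcessor.prepare_rag_context(docs[:3])
    print(f"Loaded {len(docs)} chunks")
    print("\n\n".join(context))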