"""Helpers for converting documents to Markdown and picking top_k / top_n sizes."""

import logging
import os
from pathlib import Path
from typing import Any

from markitdown import MarkItDown


def setup_logging() -> logging.Logger:
    """Configure logging and return this module's logger.

    Calls ``logging.basicConfig`` at INFO level with two handlers: a file
    handler writing to ``app.log`` (UTF-8) and a stream handler.

    Returns:
        The logger named after this module (``__name__``).
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler("app.log", encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )
    return logging.getLogger(__name__)


# Logging is configured as a side effect of importing this module, so the
# helpers below can log without any further setup.
logger = setup_logging()


def extract_filename(filepath: Path) -> str:
    """Return the final path component of *filepath* without its extension.

    Only the last extension is removed (``os.path.splitext`` semantics), so
    ``archive.tar.gz`` yields ``archive.tar``.

    Args:
        filepath: The complete path to the file.

    Returns:
        The filename without its directory part and without its extension.
    """
    logger.info("Extracting filename from %s", filepath)
    base_name = os.path.basename(filepath)
    stem, _extension = os.path.splitext(base_name)
    return stem


def convert_document_to_markdown(filepath: Path) -> str:
    """Convert a document to markdown using MarkItDown.

    Args:
        filepath: The path to the document file.

    Returns:
        The ``markdown`` attribute of the MarkItDown conversion result.
    """
    logger.info("Converting document to markdown: %s", filepath)
    # Plugins are disabled; pass enable_plugins=True to turn them on if needed.
    converter = MarkItDown(enable_plugins=False)
    conversion = converter.convert(filepath)
    return conversion.markdown


def save_to_markdown(text: Any, path: Path) -> str:
    """Write *text* to a ``.md`` file named after *path*.

    Only the filename stem of *path* is used: the output is written to
    ``<stem>.md`` relative to the current working directory, and any
    directory component or extension of *path* is ignored. An existing file
    with that name is overwritten.

    Args:
        text: The markdown content to save. It is passed straight to the
            text-mode ``write`` call, so it must be a ``str``.
        path: A path whose filename (without extension) names the output
            file.

    Returns:
        The relative path of the written file (``"<stem>.md"``) as a string.
    """
    stem = extract_filename(path)
    output_path = f'{stem}.md'
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(text)
    logger.info("Markdown file saved successfully at %s", output_path)
    return output_path


def determine_top_k(num_chunks: int) -> int:
    """Determine the top_k value based on the number of chunks.

    Args:
        num_chunks: The total number of chunks.

    Returns:
        ``num_chunks`` itself when it is 5 or fewer, otherwise 5.
    """
    top_k = min(num_chunks, 5)
    logger.info(
        "Determined top_k: %s based on num_chunks: %s", top_k, num_chunks
    )
    return top_k


def determine_reranking_top_n(top_k: int) -> int:
    """Determine the top_n value for reranking based on top_k.

    The value of ``top_k`` is doubled first. When the doubled value is 5 or
    less (i.e. ``top_k`` <= 2) the result is half of it, rounded, plus one;
    otherwise the result is fixed at 6.

    Args:
        top_k: The number of top results to consider.

    Returns:
        The top_n value to use for reranking.
    """
    doubled_top_k = top_k * 2
    if doubled_top_k <= 5:
        top_n = round(doubled_top_k / 2) + 1
    else:
        top_n = 6
    logger.info("Determined top_n: %s based on top_k: %s", top_n, top_k)
    return top_n