Spaces:
Runtime error
Runtime error
| import logging | |
| import zipfile | |
| from pathlib import Path | |
| logger = logging.getLogger(__name__) | |
class DocxToXml:
    """Read the raw ``word/document.xml`` part out of a DOCX (ZIP) archive."""

    def __init__(self, docx_path: str):
        """
        Initialize the converter with the path to a DOCX file.

        Args:
            docx_path (str): Path to the DOCX file.

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``.
        """
        self.docx_path = Path(docx_path)
        if not self.docx_path.exists():
            raise FileNotFoundError(f"File not found: {docx_path}")

    def extract_document_xml(self) -> str:
        """
        Extract the content of ``word/document.xml`` from the DOCX file.

        Returns:
            str: UTF-8 decoded content of the ``word/document.xml`` member.

        Raises:
            ValueError: If the archive has no ``word/document.xml`` member.
            Exception: For any other failure while opening, reading or
                decoding the archive; the underlying error is chained as
                ``__cause__``.
        """
        try:
            with zipfile.ZipFile(self.docx_path) as docx_zip:
                # ZipFile.read raises KeyError when the member is absent.
                xml_content = docx_zip.read('word/document.xml')
                return xml_content.decode('utf-8')
        except KeyError as err:
            raise ValueError("document.xml not found in the DOCX file") from err
        except Exception as err:
            # Exception type and message are kept as-is for existing callers;
            # explicit chaining preserves the root cause for debugging.
            raise Exception(f"Error extracting XML: {err}") from err

    @staticmethod
    def convert_file(docx_path: str) -> str:
        """
        Convert a DOCX file to its ``document.xml`` content in one call.

        Declared as a static method so it can be called on the class or on
        an instance alike.

        Args:
            docx_path (str): Path to the DOCX file.

        Returns:
            str: Content of the ``word/document.xml`` member.

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``.
            ValueError: If the archive has no ``word/document.xml`` member.
            Exception: For any other extraction failure.
        """
        return DocxToXml(docx_path).extract_document_xml()