Spaces:
Sleeping
Sleeping
| """ | |
| Script to convert all the pdf documents to markdown format in azure. | |
| """ | |
| import logging | |
| import time | |
| from pathlib import Path | |
| import os | |
| import yaml | |
| from azureml.fsspec import AzureMachineLearningFileSystem | |
| import shutil | |
| from concurrent.futures import ThreadPoolExecutor | |
| from docling_core.types.doc import ImageRefMode, PictureItem, TableItem | |
| from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend | |
| from docling.datamodel.base_models import ConversionStatus, InputFormat | |
| from docling.datamodel.document import ConversionResult | |
| from docling.datamodel.settings import settings | |
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| from docling_core.types.doc import ImageRefMode | |
| from huggingface_hub import snapshot_download | |
| from docling.datamodel.settings import settings | |
| from docling.datamodel.pipeline_options import ( | |
| AcceleratorDevice, | |
| AcceleratorOptions, | |
| PdfPipelineOptions, | |
| TesseractCliOcrOptions, | |
| TableFormerMode, | |
| ) | |
| from indexing import document_indexing | |
| from docling_utils import save_json | |
# Root logger setup: INFO level, each record prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
class Docling_Coversion:
    """Convert PDFs with Docling and publish the extracted artefacts to Azure.

    A single ``DocumentConverter`` is configured in ``__init__`` (OCR, table
    structure recognition, page and picture image generation) and reused for
    every file handed to ``document_conversion`` / ``save_document``.
    """

    def __init__(self, image_scale=1.0):
        """Build the Docling converter used by this instance.

        Args:
            image_scale: Forwarded to ``PdfPipelineOptions.images_scale``.
        """
        logging.info("Initializing Docling_Coversion with image_scale=%s", image_scale)
        accelerator_options = AcceleratorOptions(
            num_threads=8, device=AcceleratorDevice.CUDA
        )

        # Turn on inline debug visualizations.
        # NOTE: ``settings`` is the object imported from
        # ``docling.datamodel.settings``, so these flags are shared by every
        # user of that object, not scoped to this instance.
        settings.debug.visualize_layout = True
        settings.debug.visualize_ocr = True
        settings.debug.visualize_tables = True
        settings.debug.visualize_cells = True

        pipeline_options = PdfPipelineOptions(
            do_ocr=True,
            do_table_structure=True,
            images_scale=image_scale,
            generate_page_images=True,
            generate_picture_images=True,
            accelerator_options=accelerator_options,
            ocr_options=TesseractCliOcrOptions(force_full_page_ocr=True),
        )
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=DoclingParseV4DocumentBackend,
                )
            }
        )
        logging.info("Docling_Coversion initialized successfully.")

    def document_conversion(self, file_path):
        """Convert a file and return the document object."""
        logging.info("Starting document conversion for file: %s", file_path)
        return self.converter.convert(Path(file_path)).document

    def save_document(self, file_path, output_dir, azure_fs):
        """Convert a file, write its artefacts to ``output_dir`` and upload them.

        Steps: convert the PDF, build a ``document_indexing`` helper for it,
        dump tables / images / text / chunks as JSON via ``save_json``, save
        the document as markdown with referenced images, then upload
        ``output_dir`` recursively to ``converted_docs_json/<file stem>``.

        Any exception is logged with its traceback and swallowed so a batch
        caller can carry on with the next file. ``output_dir`` is removed
        afterwards whether or not processing succeeded; otherwise artefacts
        from a failed run would be uploaded together with the next document
        that reuses the same directory.

        Args:
            file_path: Local path of the PDF to convert (``str`` or ``Path``).
            output_dir: Local scratch directory (``str`` or ``Path``).
            azure_fs: Filesystem object whose ``upload(lpath=, rpath=,
                recursive=)`` method performs the upload.

        Returns:
            None.
        """
        input_path = Path(file_path)
        # Normalise once so both the path join and the cleanup work for str input.
        output_dir = Path(output_dir)
        logging.info("Processing file: %s", file_path)
        try:
            result = self.converter.convert(input_path)
            doc_name = input_path.stem
            temp_md_file_path = output_dir / f"{doc_name}-with-images.md"

            docling_document_class = document_indexing(
                result,
                "ibm-granite/granite-embedding-125m-english",
                speciality=input_path.parent.name,
                file_name=doc_name,
            )
            tables_doc = docling_document_class.extract_tables()
            images_doc = docling_document_class.extract_images()
            text_doc = docling_document_class.extract_all_text()
            chunks_doc = docling_document_class.create_chunks()

            # Save the extracted data as JSON
            save_json(file_path=output_dir, category="tables", data=tables_doc)
            save_json(file_path=output_dir, category="images", data=images_doc)
            save_json(file_path=output_dir, category="text", data=text_doc)
            save_json(file_path=output_dir, category="chunks", data=chunks_doc)
            logging.info("Saved extracted data as JSON files.")

            # Save locally first
            result.document.save_as_markdown(
                temp_md_file_path, image_mode=ImageRefMode.REFERENCED
            )
            logging.info("Saved locally: %s", temp_md_file_path)

            # Upload to Azure
            azure_output_path = f"converted_docs_json/{doc_name}"
            azure_fs.upload(lpath=str(output_dir), rpath=azure_output_path, recursive=True)
            logging.info("Uploaded to Azure: %s", azure_output_path)
        except Exception as e:
            # Best-effort per file: keep the batch alive but record the traceback.
            logging.exception("Error processing file %s: %s", file_path, e)
        finally:
            self._remove_output_dir(output_dir)

    @staticmethod
    def _remove_output_dir(output_dir):
        """Delete ``output_dir`` if it is a directory; log instead of raising."""
        if not output_dir.is_dir():
            return
        try:
            shutil.rmtree(output_dir)
        except OSError as e:
            logging.error("Could not delete local directory %s: %s", output_dir, e)
        else:
            logging.info("Deleted local directory: %s", output_dir)
def main(source_dir: str):
    """Convert every PDF found under ``source_dir`` and upload the results.

    Each ``*.pdf`` matched by a recursive glob on the Azure ML filesystem is
    downloaded to ``./local_pdfs``, handed to
    ``Docling_Coversion.save_document`` with ``./temp`` as scratch directory,
    and the local copy is deleted afterwards. A file is skipped when
    ``converted_docs_json/<file stem>`` already exists on the filesystem.

    Args:
        source_dir: URI passed to ``AzureMachineLearningFileSystem``.
    """
    logging.info("Starting main function with source_dir: %s", source_dir)

    # Local staging area for downloaded PDFs.
    local_pdf_dir = Path("./local_pdfs")
    local_pdf_dir.mkdir(parents=True, exist_ok=True)
    logging.info("Local PDF directory created: %s", local_pdf_dir)

    # Scratch directory for conversion output; re-created per file because
    # save_document removes it after uploading.
    output_dir = Path("./temp")

    fs = AzureMachineLearningFileSystem(source_dir)
    all_pdf_files = fs.glob('**/*.pdf')
    logging.info("Found %d PDF files in source directory.", len(all_pdf_files))

    converter = Docling_Coversion(image_scale=2)

    for file_path in all_pdf_files:
        remote_path = Path(file_path)
        file_name = remote_path.name
        local_pdf_path = local_pdf_dir / file_name
        azure_output_path = f"converted_docs_json/{remote_path.stem}"

        # Check if the file already exists in Azure
        if fs.exists(azure_output_path):
            logging.info("Skipping %s, already processed.", file_name)
            continue

        # Only create the scratch directory for files that are really processed.
        output_dir.mkdir(parents=True, exist_ok=True)
        logging.info("Temporary output directory created: %s", output_dir)

        try:
            # Stream the download instead of loading the whole PDF into memory.
            logging.info("Downloading file: %s", file_name)
            with fs.open(file_path, "rb") as remote_file, \
                    open(local_pdf_path, "wb") as local_file:
                shutil.copyfileobj(remote_file, local_file)
            logging.info("File saved locally: %s", local_pdf_path)

            # Process the local PDF file
            logging.info("Processing: %s", file_name)
            converter.save_document(local_pdf_path, output_dir, fs)
        finally:
            # Remove the local copy even when download or processing raised,
            # so partial files do not accumulate in the staging directory.
            if local_pdf_path.exists():
                local_pdf_path.unlink()
                logging.info("Deleted local PDF: %s", local_pdf_path)

    logging.info("Processing completed for all files.")
if __name__ == "__main__":
    # Datastore folder that holds the PDFs to convert.
    pdf_source_uri = 'azureml://subscriptions/485363cd-687d-4adb-a30b-35108c11d682/resourcegroups/medbot/workspaces/karthik/datastores/workspaceartifactstore/paths/UI/2025-04-11_075006_UTC/PdfFiles/'

    logging.info("Script started.")
    main(source_dir=pdf_source_uri)
    logging.info("Script finished.")