Spaces:

davanstrien
/

deepseek-ocr

Runtime error

davanstrien HF Staff committed on Oct 21

Commit

b7b3c0d

1 Parent(s): 0a5527f

Switch to batch processing pattern from official run_dpsk_ocr_eval_batch.py

- Use LLM class instead of AsyncLLMEngine (fixes segfault)
- Use ThreadPoolExecutor for parallel image preprocessing
- Single llm.generate() call for true batch processing
- Added max_num_seqs and num_workers parameters
- Mirrors official DeepSeek batch processing script

Files changed (1) hide show

process_dataset.py +59 -58

process_dataset.py CHANGED Viewed

@@ -1,16 +1,15 @@
 #!/usr/bin/env python3
 """
 DeepSeek-OCR Dataset Processing
-Minimal adaptation of official run_dpsk_ocr_image.py for dataset processing
 """
 import argparse
-import asyncio
 import json
 import os
 import sys
-import time
 from datetime import datetime
 import torch
 if torch.version.cuda == '11.8':
@@ -18,13 +17,12 @@ if torch.version.cuda == '11.8':
 os.environ['VLLM_USE_V1'] = '0'
-from vllm import AsyncLLMEngine, SamplingParams
-from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.model_executor.models.registry import ModelRegistry
 from PIL import Image, ImageOps
 from tqdm.auto import tqdm
 from datasets import load_dataset
-from huggingface_hub import DatasetCard, login
 # Import DeepSeek-OCR modules (unchanged from original)
 from deepseek_ocr import DeepseekOCRForCausalLM
@@ -44,27 +42,19 @@ def check_cuda():
     print(f"Using GPU: {torch.cuda.get_device_name(0)}")
-async def process_single_image(engine, sampling_params, image_features, prompt):
-    """Process a single image through the engine (unchanged from original)"""
-    request_id = f"request-{int(time.time() * 1000000)}"
-    if image_features and '<image>' in prompt:
-        request = {
-            "prompt": prompt,
-            "multi_modal_data": {"image": image_features}
-        }
-    else:
-        request = {"prompt": prompt}
-    final_output = ""
-    async for request_output in engine.generate(request, sampling_params, request_id):
-        if request_output.outputs:
-            final_output = request_output.outputs[0].text
-    return final_output.strip()
-async def main_async(args):
     """Main processing function"""
     check_cuda()
@@ -95,23 +85,24 @@ async def main_async(args):
         dataset = dataset.select(range(min(args.max_samples, len(dataset))))
         print(f"Processing {len(dataset)} samples")
-    # Initialize vLLM engine (UNCHANGED from original)
     print("Initializing vLLM engine...")
-    engine_args = AsyncEngineArgs(
         model=MODEL_PATH,
         hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
         block_size=256,
-        max_model_len=args.max_model_len,
         enforce_eager=False,
         trust_remote_code=True,
         tensor_parallel_size=1,
         gpu_memory_utilization=args.gpu_memory_utilization,
     )
-    engine = AsyncLLMEngine.from_engine_args(engine_args)
-    # Sampling params (UNCHANGED from original)
     logits_processors = [NoRepeatNGramLogitsProcessor(
-        ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822}
     )]
     sampling_params = SamplingParams(
@@ -121,37 +112,44 @@ async def main_async(args):
         skip_special_tokens=False,
     )
-    # Process images
-    print(f"Processing {len(dataset)} images...")
-    all_markdown = []
-    processor = DeepseekOCRProcessor()
-    for idx in tqdm(range(len(dataset)), desc="OCR processing"):
         try:
-            # Load image
             image = dataset[idx][args.image_column]
             if not isinstance(image, Image.Image):
                 image = Image.open(image) if isinstance(image, str) else image
             image = ImageOps.exif_transpose(image.convert('RGB'))
-            # Preprocess image (UNCHANGED from original)
-            if '<image>' in PROMPT:
-                image_features = processor.tokenize_with_images(
-                    images=[image], bos=True, eos=True, cropping=CROP_MODE
-                )
-            else:
-                image_features = ''
-            # Process
-            result = await process_single_image(
-                engine, sampling_params, image_features, PROMPT
-            )
-            all_markdown.append(result)
         except Exception as e:
-            print(f"Error processing image {idx}: {e}")
-            all_markdown.append("[OCR FAILED]")
     # Add markdown column
     print("Adding markdown column...")
@@ -177,8 +175,9 @@ async def main_async(args):
         "max_tokens": args.max_tokens,
         "max_model_len": args.max_model_len,
         "gpu_memory_utilization": args.gpu_memory_utilization,
         "script": "process_dataset.py",
-        "implementation": "vllm-async (official deepseek code)",
     }
     existing_info.append(new_info)
@@ -207,8 +206,10 @@ if __name__ == "__main__":
     parser.add_argument("--max-model-len", type=int, default=8192)
     parser.add_argument("--max-tokens", type=int, default=8192)
     parser.add_argument("--gpu-memory-utilization", type=float, default=0.75)
     parser.add_argument("--hf-token", help="HF API token")
     parser.add_argument("--private", action="store_true", help="Make output private")
     args = parser.parse_args()
-    asyncio.run(main_async(args))

 #!/usr/bin/env python3
 """
 DeepSeek-OCR Dataset Processing
+Minimal adaptation of official run_dpsk_ocr_eval_batch.py for dataset processing
 """
 import argparse
 import json
 import os
 import sys
 from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor
 import torch
 if torch.version.cuda == '11.8':
 os.environ['VLLM_USE_V1'] = '0'
+from vllm import LLM, SamplingParams
 from vllm.model_executor.models.registry import ModelRegistry
 from PIL import Image, ImageOps
 from tqdm.auto import tqdm
 from datasets import load_dataset
+from huggingface_hub import login
 # Import DeepSeek-OCR modules (unchanged from original)
 from deepseek_ocr import DeepseekOCRForCausalLM
     print(f"Using GPU: {torch.cuda.get_device_name(0)}")
+def process_single_image(image):
+    """Preprocess single image (unchanged from official batch script)"""
+    prompt_in = PROMPT
+    cache_item = {
+        "prompt": prompt_in,
+        "multi_modal_data": {"image": DeepseekOCRProcessor().tokenize_with_images(
+            images=[image], bos=True, eos=True, cropping=CROP_MODE
+        )},
+    }
+    return cache_item
+def main(args):
     """Main processing function"""
     check_cuda()
         dataset = dataset.select(range(min(args.max_samples, len(dataset))))
         print(f"Processing {len(dataset)} samples")
+    # Initialize vLLM engine (UNCHANGED from official batch script)
     print("Initializing vLLM engine...")
+    llm = LLM(
         model=MODEL_PATH,
         hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
         block_size=256,
         enforce_eager=False,
         trust_remote_code=True,
+        max_model_len=args.max_model_len,
+        swap_space=0,
+        max_num_seqs=args.max_num_seqs,
         tensor_parallel_size=1,
         gpu_memory_utilization=args.gpu_memory_utilization,
     )
+    # Sampling params (UNCHANGED from official batch script)
     logits_processors = [NoRepeatNGramLogitsProcessor(
+        ngram_size=40, window_size=90, whitelist_token_ids={128821, 128822}
     )]
     sampling_params = SamplingParams(
         skip_special_tokens=False,
     )
+    # Load and preprocess images
+    print(f"Loading images from dataset...")
+    images = []
+    for idx in range(len(dataset)):
         try:
             image = dataset[idx][args.image_column]
             if not isinstance(image, Image.Image):
                 image = Image.open(image) if isinstance(image, str) else image
             image = ImageOps.exif_transpose(image.convert('RGB'))
+            images.append(image)
         except Exception as e:
+            print(f"Error loading image {idx}: {e}")
+            images.append(None)
+    # Preprocess images in parallel (UNCHANGED from official batch script)
+    print(f"Preprocessing images...")
+    with ThreadPoolExecutor(max_workers=args.num_workers) as executor:
+        batch_inputs = list(tqdm(
+            executor.map(lambda img: process_single_image(img) if img else None, images),
+            total=len(images),
+            desc="Pre-processing images"
+        ))
+    # Filter out None entries and track their indices
+    valid_indices = [i for i, inp in enumerate(batch_inputs) if inp is not None]
+    valid_batch_inputs = [inp for inp in batch_inputs if inp is not None]
+    # Batch inference (UNCHANGED from official batch script)
+    print(f"Running batch inference on {len(valid_batch_inputs)} images...")
+    outputs_list = llm.generate(
+        valid_batch_inputs,
+        sampling_params=sampling_params
+    )
+    # Extract results
+    all_markdown = ["[OCR FAILED]"] * len(dataset)
+    for idx, output in zip(valid_indices, outputs_list):
+        all_markdown[idx] = output.outputs[0].text.strip()
     # Add markdown column
     print("Adding markdown column...")
         "max_tokens": args.max_tokens,
         "max_model_len": args.max_model_len,
         "gpu_memory_utilization": args.gpu_memory_utilization,
+        "max_num_seqs": args.max_num_seqs,
         "script": "process_dataset.py",
+        "implementation": "vllm-batch (official deepseek batch code)",
     }
     existing_info.append(new_info)
     parser.add_argument("--max-model-len", type=int, default=8192)
     parser.add_argument("--max-tokens", type=int, default=8192)
     parser.add_argument("--gpu-memory-utilization", type=float, default=0.75)
+    parser.add_argument("--max-num-seqs", type=int, default=100, help="Max concurrent sequences")
+    parser.add_argument("--num-workers", type=int, default=64, help="Image preprocessing workers")
     parser.add_argument("--hf-token", help="HF API token")
     parser.add_argument("--private", action="store_true", help="Make output private")
     args = parser.parse_args()
+    main(args)