Spaces:

davanstrien
/

deepseek-ocr

Runtime error

davanstrien HF Staff committed on 18 days ago

Commit

0a5527f

1 Parent(s): 40f1c08

Restart with minimal changes to official DeepSeek code

- Created process_dataset.py based on run_dpsk_ocr_image.py
- Using original config.py with tokenizer initialization
- Removed custom main.py that was causing import issues
- Minimal changes: only dataset loading/processing added
- Dockerfile updated to use process_dataset.py

Files changed (6) hide show

Dockerfile +1 -1
README.md +9 -23
config.py +28 -37
config_template.py +0 -42
main.py +0 -535
process_dataset.py +214 -0

Dockerfile CHANGED Viewed

@@ -48,4 +48,4 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 # Default command (can be overridden by HF Jobs)
-CMD ["python", "main.py", "--help"]

 COPY . .
 # Default command (can be overridden by HF Jobs)
+CMD ["python", "process_dataset.py", "--help"]

README.md CHANGED Viewed

@@ -27,7 +27,7 @@ Process any image dataset without needing your own GPU:
 hf jobs run --flavor l4x1 \
     --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
-    python main.py \
     input-dataset \
     output-dataset
@@ -35,11 +35,10 @@ hf jobs run --flavor l4x1 \
 hf jobs run --flavor l4x1 \
     --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
-    python main.py \
     your-input-dataset \
     your-output-dataset \
-    --max-samples 10 \
-    --resolution-mode tiny
 ```
 That's it! The script will:
@@ -84,31 +83,19 @@ That's it! The script will:
 # Default (Gundam mode)
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
-    python main.py \
     my-images-dataset \
     ocr-results
 ```
-### High Quality Mode
-```bash
-hf jobs run --flavor l40sx1 --secrets HF_TOKEN \
-    hf.co/spaces/davanstrien/deepseek-ocr \
-    python main.py \
-    documents-dataset \
-    extracted-text \
-    --resolution-mode large
-```
 ### Fast Processing for Testing
 ```bash
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
-    python main.py \
     large-dataset \
     test-output \
-    --resolution-mode tiny \
     --max-samples 100
 ```
@@ -117,7 +104,7 @@ hf jobs run --flavor l4x1 --secrets HF_TOKEN \
 ```bash
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
-    python main.py \
     ordered-dataset \
     random-sample \
     --max-samples 50 \
@@ -130,11 +117,10 @@ hf jobs run --flavor l4x1 --secrets HF_TOKEN \
 ```bash
 hf jobs run --flavor a10g-large --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
-    python main.py \
     davanstrien/ufo-ColPali \
     ufo-ocr \
-    --image-column image \
-    --resolution-mode gundam
 ```
 ### Private Output Dataset
@@ -142,7 +128,7 @@ hf jobs run --flavor a10g-large --secrets HF_TOKEN \
 ```bash
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
-    python main.py \
     private-input \
     private-output \
     --private

 hf jobs run --flavor l4x1 \
     --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
+    python process_dataset.py \
     input-dataset \
     output-dataset
 hf jobs run --flavor l4x1 \
     --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
+    python process_dataset.py \
     your-input-dataset \
     your-output-dataset \
+    --max-samples 10
 ```
 That's it! The script will:
 # Default (Gundam mode)
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
+    python process_dataset.py \
     my-images-dataset \
     ocr-results
 ```
 ### Fast Processing for Testing
 ```bash
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
+    python process_dataset.py \
     large-dataset \
     test-output \
     --max-samples 100
 ```
 ```bash
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
+    python process_dataset.py \
     ordered-dataset \
     random-sample \
     --max-samples 50 \
 ```bash
 hf jobs run --flavor a10g-large --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
+    python process_dataset.py \
     davanstrien/ufo-ColPali \
     ufo-ocr \
+    --image-column image
 ```
 ### Private Output Dataset
 ```bash
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
     hf.co/spaces/davanstrien/deepseek-ocr \
+    python process_dataset.py \
     private-input \
     private-output \
     --private

config.py CHANGED Viewed

@@ -1,51 +1,42 @@
-# Configuration for DeepSeek-OCR
-# These will be set programmatically by main.py based on command-line arguments
-# Resolution settings (set by resolution mode)
 BASE_SIZE = 1024
 IMAGE_SIZE = 640
 CROP_MODE = True
-# Processing settings
-MIN_CROPS = 2
-MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.
-MAX_CONCURRENCY = 100  # If you have limited GPU memory, lower the concurrency count.
-NUM_WORKERS = 64  # image pre-process (resize/padding) workers
 PRINT_NUM_VIS_TOKENS = False
 SKIP_REPEAT = True
-# Model settings
-MODEL_PATH = 'deepseek-ai/DeepSeek-OCR'
-# Paths (not used in Space version)
-INPUT_PATH = ''
 OUTPUT_PATH = ''
-# Default prompt
 PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
-# Tokenizer - initialized at import time for vLLM compatibility
-from transformers import AutoTokenizer
-TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
-def set_resolution_mode(mode: str):
-    """Update global config based on resolution mode."""
-    global BASE_SIZE, IMAGE_SIZE, CROP_MODE
-    modes = {
-        "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
-        "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
-        "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
-        "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
-        "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
-    }
-    if mode not in modes:
-        raise ValueError(f"Unknown resolution mode: {mode}. Choose from {list(modes.keys())}")
-    BASE_SIZE = modes[mode]["base_size"]
-    IMAGE_SIZE = modes[mode]["image_size"]
-    CROP_MODE = modes[mode]["crop_mode"]
-    return BASE_SIZE, IMAGE_SIZE, CROP_MODE

+# TODO: change modes
+# Tiny: base_size = 512, image_size = 512, crop_mode = False
+# Small: base_size = 640, image_size = 640, crop_mode = False
+# Base: base_size = 1024, image_size = 1024, crop_mode = False
+# Large: base_size = 1280, image_size = 1280, crop_mode = False
+# Gundam: base_size = 1024, image_size = 640, crop_mode = True
 BASE_SIZE = 1024
 IMAGE_SIZE = 640
 CROP_MODE = True
+MIN_CROPS= 2
+MAX_CROPS= 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.
+MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
+NUM_WORKERS = 64 # image pre-process (resize/padding) workers
 PRINT_NUM_VIS_TOKENS = False
 SKIP_REPEAT = True
+MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
+# TODO: change INPUT_PATH
+# .pdf: run_dpsk_ocr_pdf.py;
+# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
+# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
+INPUT_PATH = ''
 OUTPUT_PATH = ''
 PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
+# PROMPT = '<image>\nFree OCR.'
+# TODO commonly used prompts
+# document: <image>\n<|grounding|>Convert the document to markdown.
+# other image: <image>\n<|grounding|>OCR this image.
+# without layouts: <image>\nFree OCR.
+# figures in document: <image>\nParse the figure.
+# general: <image>\nDescribe this image in detail.
+# rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
+# '先天下之忧而忧'
+# .......
+from transformers import AutoTokenizer
+TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

config_template.py DELETED Viewed

@@ -1,42 +0,0 @@
-# TODO: change modes
-# Tiny: base_size = 512, image_size = 512, crop_mode = False
-# Small: base_size = 640, image_size = 640, crop_mode = False
-# Base: base_size = 1024, image_size = 1024, crop_mode = False
-# Large: base_size = 1280, image_size = 1280, crop_mode = False
-# Gundam: base_size = 1024, image_size = 640, crop_mode = True
-BASE_SIZE = 1024
-IMAGE_SIZE = 640
-CROP_MODE = True
-MIN_CROPS= 2
-MAX_CROPS= 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.
-MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
-NUM_WORKERS = 64 # image pre-process (resize/padding) workers
-PRINT_NUM_VIS_TOKENS = False
-SKIP_REPEAT = True
-MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
-# TODO: change INPUT_PATH
-# .pdf: run_dpsk_ocr_pdf.py;
-# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
-# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
-INPUT_PATH = ''
-OUTPUT_PATH = ''
-PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
-# PROMPT = '<image>\nFree OCR.'
-# TODO commonly used prompts
-# document: <image>\n<|grounding|>Convert the document to markdown.
-# other image: <image>\n<|grounding|>OCR this image.
-# without layouts: <image>\nFree OCR.
-# figures in document: <image>\nParse the figure.
-# general: <image>\nDescribe this image in detail.
-# rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
-# '先天下之忧而忧'
-# .......
-from transformers import AutoTokenizer
-TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

main.py DELETED Viewed

@@ -1,535 +0,0 @@
-#!/usr/bin/env python3
-"""
-DeepSeek-OCR Dataset Processing with vLLM
-This script processes image datasets through DeepSeek-OCR using vLLM for efficient batch processing.
-"""
-import argparse
-import asyncio
-import json
-import logging
-import os
-import sys
-import time
-from datetime import datetime
-from typing import List
-import torch
-from datasets import load_dataset
-from huggingface_hub import DatasetCard, login
-from PIL import Image, ImageOps
-from tqdm.auto import tqdm
-from vllm import AsyncLLMEngine, SamplingParams
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.model_executor.models.registry import ModelRegistry
-# Import DeepSeek-OCR modules
-import config
-from deepseek_ocr import DeepseekOCRForCausalLM
-from process.image_process import DeepseekOCRProcessor
-from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Resolution mode presets
-RESOLUTION_MODES = {
-    "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
-    "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
-    "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
-    "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
-    "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
-}
-def check_cuda_availability():
-    """Check if CUDA is available and exit if not."""
-    if not torch.cuda.is_available():
-        logger.error("CUDA is not available. This script requires a GPU.")
-        logger.error("Please run on a machine with a CUDA-capable GPU.")
-        sys.exit(1)
-    else:
-        logger.info(f"CUDA is available. GPU: {torch.cuda.get_device_name(0)}")
-def setup_config(resolution_mode: str):
-    """Set up global config based on resolution mode."""
-    if resolution_mode not in RESOLUTION_MODES:
-        raise ValueError(
-            f"Invalid resolution mode: {resolution_mode}. "
-            f"Choose from {list(RESOLUTION_MODES.keys())}"
-        )
-    mode_config = RESOLUTION_MODES[resolution_mode]
-    config.BASE_SIZE = mode_config["base_size"]
-    config.IMAGE_SIZE = mode_config["image_size"]
-    config.CROP_MODE = mode_config["crop_mode"]
-    logger.info(
-        f"Resolution mode: {resolution_mode} "
-        f"(BASE_SIZE={config.BASE_SIZE}, IMAGE_SIZE={config.IMAGE_SIZE}, "
-        f"CROP_MODE={config.CROP_MODE})"
-    )
-async def process_images_async(
-    images: List[Image.Image],
-    engine: AsyncLLMEngine,
-    processor: DeepseekOCRProcessor,
-    sampling_params: SamplingParams,
-    prompt: str,
-) -> List[str]:
-    """Process a batch of images asynchronously with vLLM."""
-    results = []
-    for image in images:
-        # Preprocess image
-        image = image.convert("RGB")
-        image_features = processor.tokenize_with_images(
-            images=[image], bos=True, eos=True, cropping=config.CROP_MODE
-        )
-        # Generate async
-        request_id = f"request-{int(time.time() * 1000)}"
-        request = {"prompt": prompt, "multi_modal_data": {"image": image_features}}
-        output_text = ""
-        async for request_output in engine.generate(request, sampling_params, request_id):
-            if request_output.outputs:
-                output_text = request_output.outputs[0].text
-        results.append(output_text.strip())
-    return results
-def create_dataset_card(
-    source_dataset: str,
-    model: str,
-    num_samples: int,
-    processing_time: str,
-    resolution_mode: str,
-    base_size: int,
-    image_size: int,
-    crop_mode: bool,
-    max_model_len: int,
-    max_tokens: int,
-    gpu_memory_utilization: float,
-    image_column: str = "image",
-    split: str = "train",
-) -> str:
-    """Create a dataset card documenting the OCR process."""
-    return f"""---
-tags:
-- ocr
-- document-processing
-- deepseek
-- deepseek-ocr
-- markdown
-- vllm
-- generated
----
-# Document OCR using DeepSeek-OCR (vLLM)
-This dataset contains markdown-formatted OCR results from images in [{source_dataset}](https://huggingface.co/datasets/{source_dataset}) using DeepSeek-OCR with vLLM.
-## Processing Details
-- **Source Dataset**: [{source_dataset}](https://huggingface.co/datasets/{source_dataset})
-- **Model**: [{model}](https://huggingface.co/{model})
-- **Number of Samples**: {num_samples:,}
-- **Processing Time**: {processing_time}
-- **Processing Date**: {datetime.now().strftime("%Y-%m-%d %H:%M UTC")}
-### Configuration
-- **Image Column**: `{image_column}`
-- **Output Column**: `markdown`
-- **Dataset Split**: `{split}`
-- **Resolution Mode**: {resolution_mode}
-- **Base Size**: {base_size}
-- **Image Size**: {image_size}
-- **Crop Mode**: {crop_mode}
-- **Max Model Length**: {max_model_len:,} tokens
-- **Max Output Tokens**: {max_tokens:,}
-- **GPU Memory Utilization**: {gpu_memory_utilization:.1%}
-- **Implementation**: vLLM AsyncEngine (batch processing)
-## Model Information
-DeepSeek-OCR is a state-of-the-art document OCR model that excels at:
-- 📐 **LaTeX equations** - Mathematical formulas preserved in LaTeX format
-- 📊 **Tables** - Extracted and formatted as HTML/markdown
-- 📝 **Document structure** - Headers, lists, and formatting maintained
-- 🖼️ **Image grounding** - Spatial layout and bounding box information
-- 🔍 **Complex layouts** - Multi-column and hierarchical structures
-- 🌍 **Multilingual** - Supports multiple languages
-### Resolution Modes
-- **Tiny** (512×512): Fast processing, 64 vision tokens
-- **Small** (640×640): Balanced speed/quality, 100 vision tokens
-- **Base** (1024×1024): High quality, 256 vision tokens
-- **Large** (1280×1280): Maximum quality, 400 vision tokens
-- **Gundam** (dynamic): Adaptive multi-tile processing for large documents
-## Dataset Structure
-The dataset contains all original columns plus:
-- `markdown`: The extracted text in markdown format with preserved structure
-- `inference_info`: JSON list tracking all OCR models applied to this dataset
-## Usage
-```python
-from datasets import load_dataset
-# Load the dataset
-dataset = load_dataset("{{{{output_dataset_id}}}}", split="{split}")
-# Access the markdown text
-for example in dataset:
-    print(example["markdown"])
-    break
-```
-## Reproduction
-This dataset was generated using the DeepSeek-OCR vLLM Space:
-```bash
-hf jobs run --flavor l4x1 \\
-    --secrets HF_TOKEN \\
-    hf.co/spaces/davanstrien/deepseek-ocr \\
-    python main.py \\
-    --input-dataset {source_dataset} \\
-    --output-dataset <output-dataset> \\
-    --resolution-mode {resolution_mode} \\
-    --image-column {image_column}
-```
-## Performance
-- **Processing Speed**: ~{num_samples / (float(processing_time.split()[0]) * 60) if processing_time.split()[0].replace('.','').isdigit() else 'N/A':.1f} images/second
-- **Processing Method**: Async batch processing with vLLM (optimized for throughput)
-Generated with 🤖 [DeepSeek-OCR Space](https://huggingface.co/spaces/davanstrien/deepseek-ocr)
-"""
-async def main_async(
-    input_dataset: str,
-    output_dataset: str,
-    image_column: str = "image",
-    model: str = "deepseek-ai/DeepSeek-OCR",
-    resolution_mode: str = "gundam",
-    max_model_len: int = 8192,
-    max_tokens: int = 8192,
-    gpu_memory_utilization: float = 0.75,
-    prompt: str = "<image>\n<|grounding|>Convert the document to markdown.",
-    hf_token: str = None,
-    split: str = "train",
-    max_samples: int = None,
-    private: bool = False,
-    shuffle: bool = False,
-    seed: int = 42,
-):
-    """Process images from HF dataset through DeepSeek-OCR model with vLLM."""
-    # Check CUDA availability
-    check_cuda_availability()
-    # Track processing start time
-    start_time = datetime.now()
-    # Enable HF_TRANSFER for faster downloads
-    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-    # Login to HF if token provided
-    HF_TOKEN = hf_token or os.environ.get("HF_TOKEN")
-    if HF_TOKEN:
-        login(token=HF_TOKEN)
-    # Set up config for resolution mode
-    setup_config(resolution_mode)
-    # Set model and prompt (tokenizer already initialized in config.py)
-    config.MODEL_PATH = model
-    config.PROMPT = prompt
-    # Load dataset
-    logger.info(f"Loading dataset: {input_dataset}")
-    dataset = load_dataset(input_dataset, split=split)
-    # Validate image column
-    if image_column not in dataset.column_names:
-        raise ValueError(
-            f"Column '{image_column}' not found. Available: {dataset.column_names}"
-        )
-    # Shuffle if requested
-    if shuffle:
-        logger.info(f"Shuffling dataset with seed {seed}")
-        dataset = dataset.shuffle(seed=seed)
-    # Limit samples if requested
-    if max_samples:
-        dataset = dataset.select(range(min(max_samples, len(dataset))))
-        logger.info(f"Limited to {len(dataset)} samples")
-    # Register custom model
-    logger.info("Registering custom DeepSeek-OCR model...")
-    ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
-    # Initialize vLLM AsyncEngine
-    logger.info(f"Initializing vLLM AsyncEngine with model: {model}")
-    logger.info("This may take a few minutes on first run...")
-    engine_args = AsyncEngineArgs(
-        model=model,
-        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
-        block_size=256,
-        max_model_len=max_model_len,
-        enforce_eager=False,
-        trust_remote_code=True,
-        tensor_parallel_size=1,
-        gpu_memory_utilization=gpu_memory_utilization,
-    )
-    engine = AsyncLLMEngine.from_engine_args(engine_args)
-    # Set up sampling params
-    logits_processors = [
-        NoRepeatNGramLogitsProcessor(
-            ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822}
-        )
-    ]
-    sampling_params = SamplingParams(
-        temperature=0.0,
-        max_tokens=max_tokens,
-        logits_processors=logits_processors,
-        skip_special_tokens=False,
-    )
-    # Initialize processor
-    processor = DeepseekOCRProcessor()
-    logger.info(f"Processing {len(dataset)} images with vLLM AsyncEngine")
-    # Process images one at a time (async but sequential for simplicity)
-    all_markdown = []
-    for idx in tqdm(range(len(dataset)), desc="DeepSeek-OCR processing"):
-        image = dataset[idx][image_column]
-        # Convert to PIL if needed
-        if not isinstance(image, Image.Image):
-            image = Image.open(image) if isinstance(image, str) else image
-        try:
-            image = ImageOps.exif_transpose(image.convert("RGB"))
-            # Process single image
-            results = await process_images_async(
-                [image], engine, processor, sampling_params, prompt
-            )
-            all_markdown.append(results[0])
-        except Exception as e:
-            logger.error(f"Error processing image {idx}: {e}")
-            all_markdown.append("[OCR FAILED]")
-    # Calculate processing time
-    processing_duration = datetime.now() - start_time
-    processing_time_str = f"{processing_duration.total_seconds() / 60:.1f} min"
-    # Add markdown column to dataset
-    logger.info("Adding markdown column to dataset")
-    dataset = dataset.add_column("markdown", all_markdown)
-    # Handle inference_info tracking
-    logger.info("Updating inference_info...")
-    if "inference_info" in dataset.column_names:
-        try:
-            existing_info = json.loads(dataset[0]["inference_info"])
-            if not isinstance(existing_info, list):
-                existing_info = [existing_info]
-        except (json.JSONDecodeError, TypeError):
-            existing_info = []
-        dataset = dataset.remove_columns(["inference_info"])
-    else:
-        existing_info = []
-    # Add new inference info
-    new_info = {
-        "column_name": "markdown",
-        "model_id": model,
-        "processing_date": datetime.now().isoformat(),
-        "resolution_mode": resolution_mode,
-        "base_size": config.BASE_SIZE,
-        "image_size": config.IMAGE_SIZE,
-        "crop_mode": config.CROP_MODE,
-        "prompt": prompt,
-        "max_tokens": max_tokens,
-        "gpu_memory_utilization": gpu_memory_utilization,
-        "max_model_len": max_model_len,
-        "script": "main.py",
-        "script_version": "1.0.0",
-        "space_url": "https://huggingface.co/spaces/davanstrien/deepseek-ocr",
-        "implementation": "vllm-async (optimized)",
-    }
-    existing_info.append(new_info)
-    # Add updated inference_info column
-    info_json = json.dumps(existing_info, ensure_ascii=False)
-    dataset = dataset.add_column("inference_info", [info_json] * len(dataset))
-    # Push to hub
-    logger.info(f"Pushing to {output_dataset}")
-    dataset.push_to_hub(output_dataset, private=private, token=HF_TOKEN)
-    # Create and push dataset card
-    logger.info("Creating dataset card...")
-    card_content = create_dataset_card(
-        source_dataset=input_dataset,
-        model=model,
-        num_samples=len(dataset),
-        processing_time=processing_time_str,
-        resolution_mode=resolution_mode,
-        base_size=config.BASE_SIZE,
-        image_size=config.IMAGE_SIZE,
-        crop_mode=config.CROP_MODE,
-        max_model_len=max_model_len,
-        max_tokens=max_tokens,
-        gpu_memory_utilization=gpu_memory_utilization,
-        image_column=image_column,
-        split=split,
-    )
-    card = DatasetCard(card_content)
-    card.push_to_hub(output_dataset, token=HF_TOKEN)
-    logger.info("✅ Dataset card created and pushed!")
-    logger.info("✅ OCR conversion complete!")
-    logger.info(f"Dataset available at: https://huggingface.co/datasets/{output_dataset}")
-    logger.info(f"Processing time: {processing_time_str}")
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="OCR images to markdown using DeepSeek-OCR (vLLM AsyncEngine)",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Resolution Modes:
-  tiny      512×512 pixels, fast processing (64 vision tokens)
-  small     640×640 pixels, balanced (100 vision tokens)
-  base      1024×1024 pixels, high quality (256 vision tokens)
-  large     1280×1280 pixels, maximum quality (400 vision tokens)
-  gundam    Dynamic multi-tile processing (adaptive)
-Examples:
-  # Basic usage with default Gundam mode
-  python main.py input-dataset output-dataset
-  # High quality processing
-  python main.py input-dataset output-dataset --resolution-mode large
-  # Fast processing for testing
-  python main.py input-dataset output-dataset --resolution-mode tiny --max-samples 100
-  # With HF Jobs
-  hf jobs run --flavor l4x1 --secrets HF_TOKEN \\
-      hf.co/spaces/davanstrien/deepseek-ocr \\
-      python main.py input-dataset output-dataset --resolution-mode gundam
-        """,
-    )
-    parser.add_argument("input_dataset", help="Input dataset ID from Hugging Face Hub")
-    parser.add_argument("output_dataset", help="Output dataset ID for Hugging Face Hub")
-    parser.add_argument(
-        "--image-column",
-        default="image",
-        help="Column containing images (default: image)",
-    )
-    parser.add_argument(
-        "--model",
-        default="deepseek-ai/DeepSeek-OCR",
-        help="Model to use (default: deepseek-ai/DeepSeek-OCR)",
-    )
-    parser.add_argument(
-        "--resolution-mode",
-        default="gundam",
-        choices=list(RESOLUTION_MODES.keys()),
-        help="Resolution mode preset (default: gundam)",
-    )
-    parser.add_argument(
-        "--max-model-len",
-        type=int,
-        default=8192,
-        help="Maximum model context length (default: 8192)",
-    )
-    parser.add_argument(
-        "--max-tokens",
-        type=int,
-        default=8192,
-        help="Maximum tokens to generate (default: 8192)",
-    )
-    parser.add_argument(
-        "--gpu-memory-utilization",
-        type=float,
-        default=0.75,
-        help="GPU memory utilization (default: 0.75)",
-    )
-    parser.add_argument(
-        "--prompt",
-        default="<image>\n<|grounding|>Convert the document to markdown.",
-        help="Prompt for OCR (default: grounding markdown conversion)",
-    )
-    parser.add_argument("--hf-token", help="Hugging Face API token")
-    parser.add_argument(
-        "--split", default="train", help="Dataset split to use (default: train)"
-    )
-    parser.add_argument(
-        "--max-samples",
-        type=int,
-        help="Maximum number of samples to process (for testing)",
-    )
-    parser.add_argument(
-        "--private", action="store_true", help="Make output dataset private"
-    )
-    parser.add_argument(
-        "--shuffle",
-        action="store_true",
-        help="Shuffle the dataset before processing (useful for random sampling)",
-    )
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="Random seed for shuffling (default: 42)",
-    )
-    args = parser.parse_args()
-    # Run async main
-    asyncio.run(
-        main_async(
-            input_dataset=args.input_dataset,
-            output_dataset=args.output_dataset,
-            image_column=args.image_column,
-            model=args.model,
-            resolution_mode=args.resolution_mode,
-            max_model_len=args.max_model_len,
-            max_tokens=args.max_tokens,
-            gpu_memory_utilization=args.gpu_memory_utilization,
-            prompt=args.prompt,
-            hf_token=args.hf_token,
-            split=args.split,
-            max_samples=args.max_samples,
-            private=args.private,
-            shuffle=args.shuffle,
-            seed=args.seed,
-        )
-    )

process_dataset.py ADDED Viewed

	@@ -0,0 +1,214 @@

+#!/usr/bin/env python3
+"""
+DeepSeek-OCR Dataset Processing
+Minimal adaptation of official run_dpsk_ocr_image.py for dataset processing
+"""
+import argparse
+import asyncio
+import json
+import os
+import sys
+import time
+from datetime import datetime
+import torch
+if torch.version.cuda == '11.8':
+    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
+os.environ['VLLM_USE_V1'] = '0'
+from vllm import AsyncLLMEngine, SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.model_executor.models.registry import ModelRegistry
+from PIL import Image, ImageOps
+from tqdm.auto import tqdm
+from datasets import load_dataset
+from huggingface_hub import DatasetCard, login
+# Import DeepSeek-OCR modules (unchanged from original)
+from deepseek_ocr import DeepseekOCRForCausalLM
+from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
+from process.image_process import DeepseekOCRProcessor
+from config import MODEL_PATH, PROMPT, CROP_MODE
+# Register custom model (unchanged from original)
+ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
+def check_cuda():
+    """Report the GPU in use, or exit with status 1 when CUDA is unavailable."""
+    cuda_ready = torch.cuda.is_available()
+    if cuda_ready:
+        # Device index 0 is the one whose name is announced.
+        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
+        return
+    # No CUDA device: tell the user and stop the process.
+    print("ERROR: CUDA is not available. This script requires a GPU.")
+    sys.exit(1)
+async def process_single_image(engine, sampling_params, image_features, prompt):
+    """Send one prompt (optionally with image features) to the engine and return its text.
+
+    The stream returned by ``engine.generate`` is consumed to the end. The text
+    of the first candidate in the last update that carried any outputs is
+    returned with surrounding whitespace stripped; an empty string is returned
+    when no update carried outputs.
+    """
+    payload = {"prompt": prompt}
+    # Image features are attached only when they are truthy and the prompt
+    # contains the '<image>' placeholder; otherwise the request is text-only.
+    if image_features and '<image>' in prompt:
+        payload["multi_modal_data"] = {"image": image_features}
+    # The request id is derived from the current time in microseconds.
+    stamp = int(time.time() * 1000000)
+    stream = engine.generate(payload, sampling_params, f"request-{stamp}")
+    latest_text = ""
+    async for update in stream:
+        candidates = update.outputs
+        if candidates:
+            # Later updates overwrite earlier ones, so the last one wins.
+            latest_text = candidates[0].text
+    return latest_text.strip()
+async def main_async(args):
+    """Run DeepSeek-OCR over one split of a Hub dataset and push the result.
+
+    Steps, in order: verify CUDA, optionally log in to the Hub, load
+    ``args.input_dataset``, optionally shuffle and truncate it, build the vLLM
+    async engine, OCR the images one at a time, add the ``markdown`` and
+    ``inference_info`` columns, and push the dataset to ``args.output_dataset``.
+
+    Args:
+        args: Parsed command-line namespace. Attributes read here are
+            ``hf_token``, ``input_dataset``, ``split``, ``image_column``,
+            ``shuffle``, ``seed``, ``max_samples``, ``max_model_len``,
+            ``gpu_memory_utilization``, ``max_tokens``, ``output_dataset``
+            and ``private``.
+
+    The process exits with status 1 when CUDA is unavailable or when the image
+    column is missing. A failure on an individual image is printed and
+    recorded as "[OCR FAILED]" instead of aborting the run.
+    """
+    check_cuda()
+    # Enable HF_TRANSFER
+    # NOTE(review): huggingface_hub is imported before this line runs and may
+    # already have read this flag at import time -- confirm it still takes effect.
+    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+    # Login to HF if token provided (the --hf-token flag wins over the env var)
+    HF_TOKEN = args.hf_token or os.environ.get("HF_TOKEN")
+    if HF_TOKEN:
+        login(token=HF_TOKEN)
+    # Load dataset
+    print(f"Loading dataset: {args.input_dataset}")
+    dataset = load_dataset(args.input_dataset, split=args.split)
+    if args.image_column not in dataset.column_names:
+        print(f"ERROR: Column '{args.image_column}' not found")
+        print(f"Available columns: {dataset.column_names}")
+        sys.exit(1)
+    # Shuffle if requested
+    if args.shuffle:
+        print(f"Shuffling with seed {args.seed}")
+        dataset = dataset.shuffle(seed=args.seed)
+    # Limit samples if requested (None and 0 are both falsy, so both mean "no limit")
+    if args.max_samples:
+        dataset = dataset.select(range(min(args.max_samples, len(dataset))))
+        print(f"Processing {len(dataset)} samples")
+    # Initialize vLLM engine (UNCHANGED from original)
+    print("Initializing vLLM engine...")
+    engine_args = AsyncEngineArgs(
+        model=MODEL_PATH,
+        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
+        block_size=256,
+        max_model_len=args.max_model_len,
+        enforce_eager=False,
+        trust_remote_code=True,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+    )
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
+    # Sampling params (UNCHANGED from original)
+    logits_processors = [NoRepeatNGramLogitsProcessor(
+        ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822}
+    )]
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        max_tokens=args.max_tokens,
+        logits_processors=logits_processors,
+        skip_special_tokens=False,
+    )
+    # Process images sequentially; each request is awaited before the next starts
+    print(f"Processing {len(dataset)} images...")
+    all_markdown = []
+    processor = DeepseekOCRProcessor()
+    for idx in tqdm(range(len(dataset)), desc="OCR processing"):
+        try:
+            # Load image; a string value is opened as a file path
+            image = dataset[idx][args.image_column]
+            if not isinstance(image, Image.Image):
+                image = Image.open(image) if isinstance(image, str) else image
+            image = ImageOps.exif_transpose(image.convert('RGB'))
+            # Preprocess image (UNCHANGED from original)
+            if '<image>' in PROMPT:
+                image_features = processor.tokenize_with_images(
+                    images=[image], bos=True, eos=True, cropping=CROP_MODE
+                )
+            else:
+                image_features = ''
+            # Process
+            result = await process_single_image(
+                engine, sampling_params, image_features, PROMPT
+            )
+            all_markdown.append(result)
+        except Exception as e:
+            # Deliberate best-effort: keep one output row per input row so the
+            # new column stays aligned with the dataset.
+            print(f"Error processing image {idx}: {e}")
+            all_markdown.append("[OCR FAILED]")
+    # Add markdown column
+    print("Adding markdown column...")
+    dataset = dataset.add_column("markdown", all_markdown)
+    # Handle inference_info: extend the history found in the first row, if any
+    if "inference_info" in dataset.column_names:
+        try:
+            existing_info = json.loads(dataset[0]["inference_info"])
+            if not isinstance(existing_info, list):
+                existing_info = [existing_info]
+        except (ValueError, TypeError, IndexError):
+            # Malformed JSON (ValueError), a non-string value (TypeError) or an
+            # empty dataset (IndexError): start a fresh history. Narrowed from a
+            # bare except so KeyboardInterrupt/SystemExit are no longer swallowed.
+            existing_info = []
+        dataset = dataset.remove_columns(["inference_info"])
+    else:
+        existing_info = []
+    new_info = {
+        "column_name": "markdown",
+        "model_id": MODEL_PATH,
+        "processing_date": datetime.now().isoformat(),
+        "prompt": PROMPT,
+        "max_tokens": args.max_tokens,
+        "max_model_len": args.max_model_len,
+        "gpu_memory_utilization": args.gpu_memory_utilization,
+        "script": "process_dataset.py",
+        "implementation": "vllm-async (official deepseek code)",
+    }
+    existing_info.append(new_info)
+    # The same JSON string is stored on every row
+    info_json = json.dumps(existing_info, ensure_ascii=False)
+    dataset = dataset.add_column("inference_info", [info_json] * len(dataset))
+    # Push to hub
+    print(f"Pushing to {args.output_dataset}")
+    dataset.push_to_hub(args.output_dataset, private=args.private, token=HF_TOKEN)
+    print("✅ Complete!")
+    print(f"Dataset: https://huggingface.co/datasets/{args.output_dataset}")
+if __name__ == "__main__":
+    # Command-line entry point: parse the arguments and run the async pipeline.
+    parser = argparse.ArgumentParser(
+        description="Process images through DeepSeek-OCR"
+    )
+    parser.add_argument("input_dataset", help="Input dataset ID")
+    parser.add_argument("output_dataset", help="Output dataset ID")
+    parser.add_argument("--image-column", default="image", help="Image column name")
+    parser.add_argument("--split", default="train", help="Dataset split")
+    parser.add_argument("--max-samples", type=int, help="Limit number of samples")
+    parser.add_argument("--shuffle", action="store_true", help="Shuffle dataset")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    # Help text added for the engine options so that `--help` (the Dockerfile's
+    # default command) describes every flag.
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=8192,
+        help="Maximum model context length (default: 8192)",
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=8192,
+        help="Maximum tokens to generate (default: 8192)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.75,
+        help="GPU memory utilization (default: 0.75)",
+    )
+    parser.add_argument("--hf-token", help="HF API token")
+    parser.add_argument("--private", action="store_true", help="Make output private")
+    args = parser.parse_args()
+    asyncio.run(main_async(args))