Spaces:
Runtime error
Runtime error
Commit
·
0a5527f
1
Parent(s):
40f1c08
Restart with minimal changes to official DeepSeek code
Browse files
- Created process_dataset.py based on run_dpsk_ocr_image.py
- Using original config.py with tokenizer initialization
- Removed custom main.py that was causing import issues
- Minimal changes: only dataset loading/processing added
- Dockerfile updated to use process_dataset.py
- Dockerfile +1 -1
- README.md +9 -23
- config.py +28 -37
- config_template.py +0 -42
- main.py +0 -535
- process_dataset.py +214 -0
Dockerfile
CHANGED
|
@@ -48,4 +48,4 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
| 48 |
COPY . .
|
| 49 |
|
| 50 |
# Default command (can be overridden by HF Jobs)
|
| 51 |
-
CMD ["python", "main.py", "--help"]
|
|
|
|
| 48 |
COPY . .
|
| 49 |
|
| 50 |
# Default command (can be overridden by HF Jobs)
|
| 51 |
+
CMD ["python", "process_dataset.py", "--help"]
|
README.md
CHANGED
|
@@ -27,7 +27,7 @@ Process any image dataset without needing your own GPU:
|
|
| 27 |
hf jobs run --flavor l4x1 \
|
| 28 |
--secrets HF_TOKEN \
|
| 29 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 30 |
-
python main.py \
|
| 31 |
input-dataset \
|
| 32 |
output-dataset
|
| 33 |
|
|
@@ -35,11 +35,10 @@ hf jobs run --flavor l4x1 \
|
|
| 35 |
hf jobs run --flavor l4x1 \
|
| 36 |
--secrets HF_TOKEN \
|
| 37 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 38 |
-
python main.py \
|
| 39 |
your-input-dataset \
|
| 40 |
your-output-dataset \
|
| 41 |
-
--max-samples 10 \
|
| 42 |
-
--resolution-mode tiny
|
| 43 |
```
|
| 44 |
|
| 45 |
That's it! The script will:
|
|
@@ -84,31 +83,19 @@ That's it! The script will:
|
|
| 84 |
# Default (Gundam mode)
|
| 85 |
hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
| 86 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 87 |
-
python main.py \
|
| 88 |
my-images-dataset \
|
| 89 |
ocr-results
|
| 90 |
```
|
| 91 |
|
| 92 |
-
### High Quality Mode
|
| 93 |
-
|
| 94 |
-
```bash
|
| 95 |
-
hf jobs run --flavor l40sx1 --secrets HF_TOKEN \
|
| 96 |
-
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 97 |
-
python main.py \
|
| 98 |
-
documents-dataset \
|
| 99 |
-
extracted-text \
|
| 100 |
-
--resolution-mode large
|
| 101 |
-
```
|
| 102 |
-
|
| 103 |
### Fast Processing for Testing
|
| 104 |
|
| 105 |
```bash
|
| 106 |
hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
| 107 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 108 |
-
python main.py \
|
| 109 |
large-dataset \
|
| 110 |
test-output \
|
| 111 |
-
--resolution-mode tiny \
|
| 112 |
--max-samples 100
|
| 113 |
```
|
| 114 |
|
|
@@ -117,7 +104,7 @@ hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
|
| 117 |
```bash
|
| 118 |
hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
| 119 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 120 |
-
python main.py \
|
| 121 |
ordered-dataset \
|
| 122 |
random-sample \
|
| 123 |
--max-samples 50 \
|
|
@@ -130,11 +117,10 @@ hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
|
| 130 |
```bash
|
| 131 |
hf jobs run --flavor a10g-large --secrets HF_TOKEN \
|
| 132 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 133 |
-
python main.py \
|
| 134 |
davanstrien/ufo-ColPali \
|
| 135 |
ufo-ocr \
|
| 136 |
-
--image-column image \
|
| 137 |
-
--resolution-mode gundam
|
| 138 |
```
|
| 139 |
|
| 140 |
### Private Output Dataset
|
|
@@ -142,7 +128,7 @@ hf jobs run --flavor a10g-large --secrets HF_TOKEN \
|
|
| 142 |
```bash
|
| 143 |
hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
| 144 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 145 |
-
python main.py \
|
| 146 |
private-input \
|
| 147 |
private-output \
|
| 148 |
--private
|
|
|
|
| 27 |
hf jobs run --flavor l4x1 \
|
| 28 |
--secrets HF_TOKEN \
|
| 29 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 30 |
+
python process_dataset.py \
|
| 31 |
input-dataset \
|
| 32 |
output-dataset
|
| 33 |
|
|
|
|
| 35 |
hf jobs run --flavor l4x1 \
|
| 36 |
--secrets HF_TOKEN \
|
| 37 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 38 |
+
python process_dataset.py \
|
| 39 |
your-input-dataset \
|
| 40 |
your-output-dataset \
|
| 41 |
+
--max-samples 10
|
|
|
|
| 42 |
```
|
| 43 |
|
| 44 |
That's it! The script will:
|
|
|
|
| 83 |
# Default (Gundam mode)
|
| 84 |
hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
| 85 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 86 |
+
python process_dataset.py \
|
| 87 |
my-images-dataset \
|
| 88 |
ocr-results
|
| 89 |
```
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
### Fast Processing for Testing
|
| 92 |
|
| 93 |
```bash
|
| 94 |
hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
| 95 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 96 |
+
python process_dataset.py \
|
| 97 |
large-dataset \
|
| 98 |
test-output \
|
|
|
|
| 99 |
--max-samples 100
|
| 100 |
```
|
| 101 |
|
|
|
|
| 104 |
```bash
|
| 105 |
hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
| 106 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 107 |
+
python process_dataset.py \
|
| 108 |
ordered-dataset \
|
| 109 |
random-sample \
|
| 110 |
--max-samples 50 \
|
|
|
|
| 117 |
```bash
|
| 118 |
hf jobs run --flavor a10g-large --secrets HF_TOKEN \
|
| 119 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 120 |
+
python process_dataset.py \
|
| 121 |
davanstrien/ufo-ColPali \
|
| 122 |
ufo-ocr \
|
| 123 |
+
--image-column image
|
|
|
|
| 124 |
```
|
| 125 |
|
| 126 |
### Private Output Dataset
|
|
|
|
| 128 |
```bash
|
| 129 |
hf jobs run --flavor l4x1 --secrets HF_TOKEN \
|
| 130 |
hf.co/spaces/davanstrien/deepseek-ocr \
|
| 131 |
+
python process_dataset.py \
|
| 132 |
private-input \
|
| 133 |
private-output \
|
| 134 |
--private
|
config.py
CHANGED
|
@@ -1,51 +1,42 @@
|
|
| 1 |
-
#
|
| 2 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
# Resolution settings (set by resolution mode)
|
| 5 |
BASE_SIZE = 1024
|
| 6 |
IMAGE_SIZE = 640
|
| 7 |
CROP_MODE = True
|
| 8 |
-
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
|
| 13 |
-
NUM_WORKERS = 64 # image pre-process (resize/padding) workers
|
| 14 |
PRINT_NUM_VIS_TOKENS = False
|
| 15 |
SKIP_REPEAT = True
|
|
|
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
INPUT_PATH = ''
|
| 22 |
OUTPUT_PATH = ''
|
| 23 |
|
| 24 |
-
# Default prompt
|
| 25 |
PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
# Tokenizer - initialized at import time for vLLM compatibility
|
| 28 |
-
from transformers import AutoTokenizer
|
| 29 |
-
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def set_resolution_mode(mode: str):
|
| 33 |
-
"""Update global config based on resolution mode."""
|
| 34 |
-
global BASE_SIZE, IMAGE_SIZE, CROP_MODE
|
| 35 |
|
| 36 |
-
|
| 37 |
-
"tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
|
| 38 |
-
"small": {"base_size": 640, "image_size": 640, "crop_mode": False},
|
| 39 |
-
"base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
|
| 40 |
-
"large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
|
| 41 |
-
"gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
|
| 42 |
-
}
|
| 43 |
-
|
| 44 |
-
if mode not in modes:
|
| 45 |
-
raise ValueError(f"Unknown resolution mode: {mode}. Choose from {list(modes.keys())}")
|
| 46 |
-
|
| 47 |
-
BASE_SIZE = modes[mode]["base_size"]
|
| 48 |
-
IMAGE_SIZE = modes[mode]["image_size"]
|
| 49 |
-
CROP_MODE = modes[mode]["crop_mode"]
|
| 50 |
|
| 51 |
-
|
|
|
|
| 1 |
+
# TODO: change modes
|
| 2 |
+
# Tiny: base_size = 512, image_size = 512, crop_mode = False
|
| 3 |
+
# Small: base_size = 640, image_size = 640, crop_mode = False
|
| 4 |
+
# Base: base_size = 1024, image_size = 1024, crop_mode = False
|
| 5 |
+
# Large: base_size = 1280, image_size = 1280, crop_mode = False
|
| 6 |
+
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
|
| 7 |
|
|
|
|
| 8 |
BASE_SIZE = 1024
|
| 9 |
IMAGE_SIZE = 640
|
| 10 |
CROP_MODE = True
|
| 11 |
+
MIN_CROPS= 2
|
| 12 |
+
MAX_CROPS= 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.
|
| 13 |
+
MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
|
| 14 |
+
NUM_WORKERS = 64 # image pre-process (resize/padding) workers
|
|
|
|
|
|
|
| 15 |
PRINT_NUM_VIS_TOKENS = False
|
| 16 |
SKIP_REPEAT = True
|
| 17 |
+
MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
|
| 18 |
|
| 19 |
+
# TODO: change INPUT_PATH
|
| 20 |
+
# .pdf: run_dpsk_ocr_pdf.py;
|
| 21 |
+
# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
|
| 22 |
+
# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
|
| 23 |
|
| 24 |
+
INPUT_PATH = ''
|
|
|
|
| 25 |
OUTPUT_PATH = ''
|
| 26 |
|
|
|
|
| 27 |
PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
|
| 28 |
+
# PROMPT = '<image>\nFree OCR.'
|
| 29 |
+
# TODO commonly used prompts
|
| 30 |
+
# document: <image>\n<|grounding|>Convert the document to markdown.
|
| 31 |
+
# other image: <image>\n<|grounding|>OCR this image.
|
| 32 |
+
# without layouts: <image>\nFree OCR.
|
| 33 |
+
# figures in document: <image>\nParse the figure.
|
| 34 |
+
# general: <image>\nDescribe this image in detail.
|
| 35 |
+
# rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
|
| 36 |
+
# '先天下之忧而忧'
|
| 37 |
+
# .......
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
+
from transformers import AutoTokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
|
config_template.py
DELETED
|
@@ -1,42 +0,0 @@
|
|
| 1 |
-
# TODO: change modes
|
| 2 |
-
# Tiny: base_size = 512, image_size = 512, crop_mode = False
|
| 3 |
-
# Small: base_size = 640, image_size = 640, crop_mode = False
|
| 4 |
-
# Base: base_size = 1024, image_size = 1024, crop_mode = False
|
| 5 |
-
# Large: base_size = 1280, image_size = 1280, crop_mode = False
|
| 6 |
-
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
|
| 7 |
-
|
| 8 |
-
BASE_SIZE = 1024
|
| 9 |
-
IMAGE_SIZE = 640
|
| 10 |
-
CROP_MODE = True
|
| 11 |
-
MIN_CROPS= 2
|
| 12 |
-
MAX_CROPS= 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.
|
| 13 |
-
MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
|
| 14 |
-
NUM_WORKERS = 64 # image pre-process (resize/padding) workers
|
| 15 |
-
PRINT_NUM_VIS_TOKENS = False
|
| 16 |
-
SKIP_REPEAT = True
|
| 17 |
-
MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
|
| 18 |
-
|
| 19 |
-
# TODO: change INPUT_PATH
|
| 20 |
-
# .pdf: run_dpsk_ocr_pdf.py;
|
| 21 |
-
# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
|
| 22 |
-
# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
|
| 23 |
-
|
| 24 |
-
INPUT_PATH = ''
|
| 25 |
-
OUTPUT_PATH = ''
|
| 26 |
-
|
| 27 |
-
PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
|
| 28 |
-
# PROMPT = '<image>\nFree OCR.'
|
| 29 |
-
# TODO commonly used prompts
|
| 30 |
-
# document: <image>\n<|grounding|>Convert the document to markdown.
|
| 31 |
-
# other image: <image>\n<|grounding|>OCR this image.
|
| 32 |
-
# without layouts: <image>\nFree OCR.
|
| 33 |
-
# figures in document: <image>\nParse the figure.
|
| 34 |
-
# general: <image>\nDescribe this image in detail.
|
| 35 |
-
# rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
|
| 36 |
-
# '先天下之忧而忧'
|
| 37 |
-
# .......
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
from transformers import AutoTokenizer
|
| 41 |
-
|
| 42 |
-
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
main.py
DELETED
|
@@ -1,535 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
DeepSeek-OCR Dataset Processing with vLLM
|
| 4 |
-
|
| 5 |
-
This script processes image datasets through DeepSeek-OCR using vLLM for efficient batch processing.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import argparse
|
| 9 |
-
import asyncio
|
| 10 |
-
import json
|
| 11 |
-
import logging
|
| 12 |
-
import os
|
| 13 |
-
import sys
|
| 14 |
-
import time
|
| 15 |
-
from datetime import datetime
|
| 16 |
-
from typing import List
|
| 17 |
-
|
| 18 |
-
import torch
|
| 19 |
-
from datasets import load_dataset
|
| 20 |
-
from huggingface_hub import DatasetCard, login
|
| 21 |
-
from PIL import Image, ImageOps
|
| 22 |
-
from tqdm.auto import tqdm
|
| 23 |
-
from vllm import AsyncLLMEngine, SamplingParams
|
| 24 |
-
from vllm.engine.arg_utils import AsyncEngineArgs
|
| 25 |
-
from vllm.model_executor.models.registry import ModelRegistry
|
| 26 |
-
|
| 27 |
-
# Import DeepSeek-OCR modules
|
| 28 |
-
import config
|
| 29 |
-
from deepseek_ocr import DeepseekOCRForCausalLM
|
| 30 |
-
from process.image_process import DeepseekOCRProcessor
|
| 31 |
-
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
|
| 32 |
-
|
| 33 |
-
logging.basicConfig(level=logging.INFO)
|
| 34 |
-
logger = logging.getLogger(__name__)
|
| 35 |
-
|
| 36 |
-
# Resolution mode presets
|
| 37 |
-
RESOLUTION_MODES = {
|
| 38 |
-
"tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
|
| 39 |
-
"small": {"base_size": 640, "image_size": 640, "crop_mode": False},
|
| 40 |
-
"base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
|
| 41 |
-
"large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
|
| 42 |
-
"gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
|
| 43 |
-
}
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
def check_cuda_availability():
|
| 47 |
-
"""Check if CUDA is available and exit if not."""
|
| 48 |
-
if not torch.cuda.is_available():
|
| 49 |
-
logger.error("CUDA is not available. This script requires a GPU.")
|
| 50 |
-
logger.error("Please run on a machine with a CUDA-capable GPU.")
|
| 51 |
-
sys.exit(1)
|
| 52 |
-
else:
|
| 53 |
-
logger.info(f"CUDA is available. GPU: {torch.cuda.get_device_name(0)}")
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def setup_config(resolution_mode: str):
|
| 57 |
-
"""Set up global config based on resolution mode."""
|
| 58 |
-
if resolution_mode not in RESOLUTION_MODES:
|
| 59 |
-
raise ValueError(
|
| 60 |
-
f"Invalid resolution mode: {resolution_mode}. "
|
| 61 |
-
f"Choose from {list(RESOLUTION_MODES.keys())}"
|
| 62 |
-
)
|
| 63 |
-
|
| 64 |
-
mode_config = RESOLUTION_MODES[resolution_mode]
|
| 65 |
-
config.BASE_SIZE = mode_config["base_size"]
|
| 66 |
-
config.IMAGE_SIZE = mode_config["image_size"]
|
| 67 |
-
config.CROP_MODE = mode_config["crop_mode"]
|
| 68 |
-
|
| 69 |
-
logger.info(
|
| 70 |
-
f"Resolution mode: {resolution_mode} "
|
| 71 |
-
f"(BASE_SIZE={config.BASE_SIZE}, IMAGE_SIZE={config.IMAGE_SIZE}, "
|
| 72 |
-
f"CROP_MODE={config.CROP_MODE})"
|
| 73 |
-
)
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
async def process_images_async(
|
| 77 |
-
images: List[Image.Image],
|
| 78 |
-
engine: AsyncLLMEngine,
|
| 79 |
-
processor: DeepseekOCRProcessor,
|
| 80 |
-
sampling_params: SamplingParams,
|
| 81 |
-
prompt: str,
|
| 82 |
-
) -> List[str]:
|
| 83 |
-
"""Process a batch of images asynchronously with vLLM."""
|
| 84 |
-
results = []
|
| 85 |
-
|
| 86 |
-
for image in images:
|
| 87 |
-
# Preprocess image
|
| 88 |
-
image = image.convert("RGB")
|
| 89 |
-
image_features = processor.tokenize_with_images(
|
| 90 |
-
images=[image], bos=True, eos=True, cropping=config.CROP_MODE
|
| 91 |
-
)
|
| 92 |
-
|
| 93 |
-
# Generate async
|
| 94 |
-
request_id = f"request-{int(time.time() * 1000)}"
|
| 95 |
-
request = {"prompt": prompt, "multi_modal_data": {"image": image_features}}
|
| 96 |
-
|
| 97 |
-
output_text = ""
|
| 98 |
-
async for request_output in engine.generate(request, sampling_params, request_id):
|
| 99 |
-
if request_output.outputs:
|
| 100 |
-
output_text = request_output.outputs[0].text
|
| 101 |
-
|
| 102 |
-
results.append(output_text.strip())
|
| 103 |
-
|
| 104 |
-
return results
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
def create_dataset_card(
|
| 108 |
-
source_dataset: str,
|
| 109 |
-
model: str,
|
| 110 |
-
num_samples: int,
|
| 111 |
-
processing_time: str,
|
| 112 |
-
resolution_mode: str,
|
| 113 |
-
base_size: int,
|
| 114 |
-
image_size: int,
|
| 115 |
-
crop_mode: bool,
|
| 116 |
-
max_model_len: int,
|
| 117 |
-
max_tokens: int,
|
| 118 |
-
gpu_memory_utilization: float,
|
| 119 |
-
image_column: str = "image",
|
| 120 |
-
split: str = "train",
|
| 121 |
-
) -> str:
|
| 122 |
-
"""Create a dataset card documenting the OCR process."""
|
| 123 |
-
return f"""---
|
| 124 |
-
tags:
|
| 125 |
-
- ocr
|
| 126 |
-
- document-processing
|
| 127 |
-
- deepseek
|
| 128 |
-
- deepseek-ocr
|
| 129 |
-
- markdown
|
| 130 |
-
- vllm
|
| 131 |
-
- generated
|
| 132 |
-
---
|
| 133 |
-
|
| 134 |
-
# Document OCR using DeepSeek-OCR (vLLM)
|
| 135 |
-
|
| 136 |
-
This dataset contains markdown-formatted OCR results from images in [{source_dataset}](https://huggingface.co/datasets/{source_dataset}) using DeepSeek-OCR with vLLM.
|
| 137 |
-
|
| 138 |
-
## Processing Details
|
| 139 |
-
|
| 140 |
-
- **Source Dataset**: [{source_dataset}](https://huggingface.co/datasets/{source_dataset})
|
| 141 |
-
- **Model**: [{model}](https://huggingface.co/{model})
|
| 142 |
-
- **Number of Samples**: {num_samples:,}
|
| 143 |
-
- **Processing Time**: {processing_time}
|
| 144 |
-
- **Processing Date**: {datetime.now().strftime("%Y-%m-%d %H:%M UTC")}
|
| 145 |
-
|
| 146 |
-
### Configuration
|
| 147 |
-
|
| 148 |
-
- **Image Column**: `{image_column}`
|
| 149 |
-
- **Output Column**: `markdown`
|
| 150 |
-
- **Dataset Split**: `{split}`
|
| 151 |
-
- **Resolution Mode**: {resolution_mode}
|
| 152 |
-
- **Base Size**: {base_size}
|
| 153 |
-
- **Image Size**: {image_size}
|
| 154 |
-
- **Crop Mode**: {crop_mode}
|
| 155 |
-
- **Max Model Length**: {max_model_len:,} tokens
|
| 156 |
-
- **Max Output Tokens**: {max_tokens:,}
|
| 157 |
-
- **GPU Memory Utilization**: {gpu_memory_utilization:.1%}
|
| 158 |
-
- **Implementation**: vLLM AsyncEngine (batch processing)
|
| 159 |
-
|
| 160 |
-
## Model Information
|
| 161 |
-
|
| 162 |
-
DeepSeek-OCR is a state-of-the-art document OCR model that excels at:
|
| 163 |
-
- 📐 **LaTeX equations** - Mathematical formulas preserved in LaTeX format
|
| 164 |
-
- 📊 **Tables** - Extracted and formatted as HTML/markdown
|
| 165 |
-
- 📝 **Document structure** - Headers, lists, and formatting maintained
|
| 166 |
-
- 🖼️ **Image grounding** - Spatial layout and bounding box information
|
| 167 |
-
- 🔍 **Complex layouts** - Multi-column and hierarchical structures
|
| 168 |
-
- 🌍 **Multilingual** - Supports multiple languages
|
| 169 |
-
|
| 170 |
-
### Resolution Modes
|
| 171 |
-
|
| 172 |
-
- **Tiny** (512×512): Fast processing, 64 vision tokens
|
| 173 |
-
- **Small** (640×640): Balanced speed/quality, 100 vision tokens
|
| 174 |
-
- **Base** (1024×1024): High quality, 256 vision tokens
|
| 175 |
-
- **Large** (1280×1280): Maximum quality, 400 vision tokens
|
| 176 |
-
- **Gundam** (dynamic): Adaptive multi-tile processing for large documents
|
| 177 |
-
|
| 178 |
-
## Dataset Structure
|
| 179 |
-
|
| 180 |
-
The dataset contains all original columns plus:
|
| 181 |
-
- `markdown`: The extracted text in markdown format with preserved structure
|
| 182 |
-
- `inference_info`: JSON list tracking all OCR models applied to this dataset
|
| 183 |
-
|
| 184 |
-
## Usage
|
| 185 |
-
|
| 186 |
-
```python
|
| 187 |
-
from datasets import load_dataset
|
| 188 |
-
|
| 189 |
-
# Load the dataset
|
| 190 |
-
dataset = load_dataset("{{{{output_dataset_id}}}}", split="{split}")
|
| 191 |
-
|
| 192 |
-
# Access the markdown text
|
| 193 |
-
for example in dataset:
|
| 194 |
-
print(example["markdown"])
|
| 195 |
-
break
|
| 196 |
-
```
|
| 197 |
-
|
| 198 |
-
## Reproduction
|
| 199 |
-
|
| 200 |
-
This dataset was generated using the DeepSeek-OCR vLLM Space:
|
| 201 |
-
|
| 202 |
-
```bash
|
| 203 |
-
hf jobs run --flavor l4x1 \\
|
| 204 |
-
--secrets HF_TOKEN \\
|
| 205 |
-
hf.co/spaces/davanstrien/deepseek-ocr \\
|
| 206 |
-
python main.py \\
|
| 207 |
-
--input-dataset {source_dataset} \\
|
| 208 |
-
--output-dataset <output-dataset> \\
|
| 209 |
-
--resolution-mode {resolution_mode} \\
|
| 210 |
-
--image-column {image_column}
|
| 211 |
-
```
|
| 212 |
-
|
| 213 |
-
## Performance
|
| 214 |
-
|
| 215 |
-
- **Processing Speed**: ~{num_samples / (float(processing_time.split()[0]) * 60) if processing_time.split()[0].replace('.','').isdigit() else 'N/A':.1f} images/second
|
| 216 |
-
- **Processing Method**: Async batch processing with vLLM (optimized for throughput)
|
| 217 |
-
|
| 218 |
-
Generated with 🤖 [DeepSeek-OCR Space](https://huggingface.co/spaces/davanstrien/deepseek-ocr)
|
| 219 |
-
"""
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
async def main_async(
|
| 223 |
-
input_dataset: str,
|
| 224 |
-
output_dataset: str,
|
| 225 |
-
image_column: str = "image",
|
| 226 |
-
model: str = "deepseek-ai/DeepSeek-OCR",
|
| 227 |
-
resolution_mode: str = "gundam",
|
| 228 |
-
max_model_len: int = 8192,
|
| 229 |
-
max_tokens: int = 8192,
|
| 230 |
-
gpu_memory_utilization: float = 0.75,
|
| 231 |
-
prompt: str = "<image>\n<|grounding|>Convert the document to markdown.",
|
| 232 |
-
hf_token: str = None,
|
| 233 |
-
split: str = "train",
|
| 234 |
-
max_samples: int = None,
|
| 235 |
-
private: bool = False,
|
| 236 |
-
shuffle: bool = False,
|
| 237 |
-
seed: int = 42,
|
| 238 |
-
):
|
| 239 |
-
"""Process images from HF dataset through DeepSeek-OCR model with vLLM."""
|
| 240 |
-
|
| 241 |
-
# Check CUDA availability
|
| 242 |
-
check_cuda_availability()
|
| 243 |
-
|
| 244 |
-
# Track processing start time
|
| 245 |
-
start_time = datetime.now()
|
| 246 |
-
|
| 247 |
-
# Enable HF_TRANSFER for faster downloads
|
| 248 |
-
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
| 249 |
-
|
| 250 |
-
# Login to HF if token provided
|
| 251 |
-
HF_TOKEN = hf_token or os.environ.get("HF_TOKEN")
|
| 252 |
-
if HF_TOKEN:
|
| 253 |
-
login(token=HF_TOKEN)
|
| 254 |
-
|
| 255 |
-
# Set up config for resolution mode
|
| 256 |
-
setup_config(resolution_mode)
|
| 257 |
-
|
| 258 |
-
# Set model and prompt (tokenizer already initialized in config.py)
|
| 259 |
-
config.MODEL_PATH = model
|
| 260 |
-
config.PROMPT = prompt
|
| 261 |
-
|
| 262 |
-
# Load dataset
|
| 263 |
-
logger.info(f"Loading dataset: {input_dataset}")
|
| 264 |
-
dataset = load_dataset(input_dataset, split=split)
|
| 265 |
-
|
| 266 |
-
# Validate image column
|
| 267 |
-
if image_column not in dataset.column_names:
|
| 268 |
-
raise ValueError(
|
| 269 |
-
f"Column '{image_column}' not found. Available: {dataset.column_names}"
|
| 270 |
-
)
|
| 271 |
-
|
| 272 |
-
# Shuffle if requested
|
| 273 |
-
if shuffle:
|
| 274 |
-
logger.info(f"Shuffling dataset with seed {seed}")
|
| 275 |
-
dataset = dataset.shuffle(seed=seed)
|
| 276 |
-
|
| 277 |
-
# Limit samples if requested
|
| 278 |
-
if max_samples:
|
| 279 |
-
dataset = dataset.select(range(min(max_samples, len(dataset))))
|
| 280 |
-
logger.info(f"Limited to {len(dataset)} samples")
|
| 281 |
-
|
| 282 |
-
# Register custom model
|
| 283 |
-
logger.info("Registering custom DeepSeek-OCR model...")
|
| 284 |
-
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
|
| 285 |
-
|
| 286 |
-
# Initialize vLLM AsyncEngine
|
| 287 |
-
logger.info(f"Initializing vLLM AsyncEngine with model: {model}")
|
| 288 |
-
logger.info("This may take a few minutes on first run...")
|
| 289 |
-
|
| 290 |
-
engine_args = AsyncEngineArgs(
|
| 291 |
-
model=model,
|
| 292 |
-
hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
|
| 293 |
-
block_size=256,
|
| 294 |
-
max_model_len=max_model_len,
|
| 295 |
-
enforce_eager=False,
|
| 296 |
-
trust_remote_code=True,
|
| 297 |
-
tensor_parallel_size=1,
|
| 298 |
-
gpu_memory_utilization=gpu_memory_utilization,
|
| 299 |
-
)
|
| 300 |
-
engine = AsyncLLMEngine.from_engine_args(engine_args)
|
| 301 |
-
|
| 302 |
-
# Set up sampling params
|
| 303 |
-
logits_processors = [
|
| 304 |
-
NoRepeatNGramLogitsProcessor(
|
| 305 |
-
ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822}
|
| 306 |
-
)
|
| 307 |
-
]
|
| 308 |
-
|
| 309 |
-
sampling_params = SamplingParams(
|
| 310 |
-
temperature=0.0,
|
| 311 |
-
max_tokens=max_tokens,
|
| 312 |
-
logits_processors=logits_processors,
|
| 313 |
-
skip_special_tokens=False,
|
| 314 |
-
)
|
| 315 |
-
|
| 316 |
-
# Initialize processor
|
| 317 |
-
processor = DeepseekOCRProcessor()
|
| 318 |
-
|
| 319 |
-
logger.info(f"Processing {len(dataset)} images with vLLM AsyncEngine")
|
| 320 |
-
|
| 321 |
-
# Process images one at a time (async but sequential for simplicity)
|
| 322 |
-
all_markdown = []
|
| 323 |
-
for idx in tqdm(range(len(dataset)), desc="DeepSeek-OCR processing"):
|
| 324 |
-
image = dataset[idx][image_column]
|
| 325 |
-
|
| 326 |
-
# Convert to PIL if needed
|
| 327 |
-
if not isinstance(image, Image.Image):
|
| 328 |
-
image = Image.open(image) if isinstance(image, str) else image
|
| 329 |
-
|
| 330 |
-
try:
|
| 331 |
-
image = ImageOps.exif_transpose(image.convert("RGB"))
|
| 332 |
-
|
| 333 |
-
# Process single image
|
| 334 |
-
results = await process_images_async(
|
| 335 |
-
[image], engine, processor, sampling_params, prompt
|
| 336 |
-
)
|
| 337 |
-
all_markdown.append(results[0])
|
| 338 |
-
|
| 339 |
-
except Exception as e:
|
| 340 |
-
logger.error(f"Error processing image {idx}: {e}")
|
| 341 |
-
all_markdown.append("[OCR FAILED]")
|
| 342 |
-
|
| 343 |
-
# Calculate processing time
|
| 344 |
-
processing_duration = datetime.now() - start_time
|
| 345 |
-
processing_time_str = f"{processing_duration.total_seconds() / 60:.1f} min"
|
| 346 |
-
|
| 347 |
-
# Add markdown column to dataset
|
| 348 |
-
logger.info("Adding markdown column to dataset")
|
| 349 |
-
dataset = dataset.add_column("markdown", all_markdown)
|
| 350 |
-
|
| 351 |
-
# Handle inference_info tracking
|
| 352 |
-
logger.info("Updating inference_info...")
|
| 353 |
-
|
| 354 |
-
if "inference_info" in dataset.column_names:
|
| 355 |
-
try:
|
| 356 |
-
existing_info = json.loads(dataset[0]["inference_info"])
|
| 357 |
-
if not isinstance(existing_info, list):
|
| 358 |
-
existing_info = [existing_info]
|
| 359 |
-
except (json.JSONDecodeError, TypeError):
|
| 360 |
-
existing_info = []
|
| 361 |
-
dataset = dataset.remove_columns(["inference_info"])
|
| 362 |
-
else:
|
| 363 |
-
existing_info = []
|
| 364 |
-
|
| 365 |
-
# Add new inference info
|
| 366 |
-
new_info = {
|
| 367 |
-
"column_name": "markdown",
|
| 368 |
-
"model_id": model,
|
| 369 |
-
"processing_date": datetime.now().isoformat(),
|
| 370 |
-
"resolution_mode": resolution_mode,
|
| 371 |
-
"base_size": config.BASE_SIZE,
|
| 372 |
-
"image_size": config.IMAGE_SIZE,
|
| 373 |
-
"crop_mode": config.CROP_MODE,
|
| 374 |
-
"prompt": prompt,
|
| 375 |
-
"max_tokens": max_tokens,
|
| 376 |
-
"gpu_memory_utilization": gpu_memory_utilization,
|
| 377 |
-
"max_model_len": max_model_len,
|
| 378 |
-
"script": "main.py",
|
| 379 |
-
"script_version": "1.0.0",
|
| 380 |
-
"space_url": "https://huggingface.co/spaces/davanstrien/deepseek-ocr",
|
| 381 |
-
"implementation": "vllm-async (optimized)",
|
| 382 |
-
}
|
| 383 |
-
existing_info.append(new_info)
|
| 384 |
-
|
| 385 |
-
# Add updated inference_info column
|
| 386 |
-
info_json = json.dumps(existing_info, ensure_ascii=False)
|
| 387 |
-
dataset = dataset.add_column("inference_info", [info_json] * len(dataset))
|
| 388 |
-
|
| 389 |
-
# Push to hub
|
| 390 |
-
logger.info(f"Pushing to {output_dataset}")
|
| 391 |
-
dataset.push_to_hub(output_dataset, private=private, token=HF_TOKEN)
|
| 392 |
-
|
| 393 |
-
# Create and push dataset card
|
| 394 |
-
logger.info("Creating dataset card...")
|
| 395 |
-
card_content = create_dataset_card(
|
| 396 |
-
source_dataset=input_dataset,
|
| 397 |
-
model=model,
|
| 398 |
-
num_samples=len(dataset),
|
| 399 |
-
processing_time=processing_time_str,
|
| 400 |
-
resolution_mode=resolution_mode,
|
| 401 |
-
base_size=config.BASE_SIZE,
|
| 402 |
-
image_size=config.IMAGE_SIZE,
|
| 403 |
-
crop_mode=config.CROP_MODE,
|
| 404 |
-
max_model_len=max_model_len,
|
| 405 |
-
max_tokens=max_tokens,
|
| 406 |
-
gpu_memory_utilization=gpu_memory_utilization,
|
| 407 |
-
image_column=image_column,
|
| 408 |
-
split=split,
|
| 409 |
-
)
|
| 410 |
-
|
| 411 |
-
card = DatasetCard(card_content)
|
| 412 |
-
card.push_to_hub(output_dataset, token=HF_TOKEN)
|
| 413 |
-
logger.info("✅ Dataset card created and pushed!")
|
| 414 |
-
|
| 415 |
-
logger.info("✅ OCR conversion complete!")
|
| 416 |
-
logger.info(f"Dataset available at: https://huggingface.co/datasets/{output_dataset}")
|
| 417 |
-
logger.info(f"Processing time: {processing_time_str}")
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
if __name__ == "__main__":
|
| 421 |
-
parser = argparse.ArgumentParser(
|
| 422 |
-
description="OCR images to markdown using DeepSeek-OCR (vLLM AsyncEngine)",
|
| 423 |
-
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 424 |
-
epilog="""
|
| 425 |
-
Resolution Modes:
|
| 426 |
-
tiny 512×512 pixels, fast processing (64 vision tokens)
|
| 427 |
-
small 640×640 pixels, balanced (100 vision tokens)
|
| 428 |
-
base 1024×1024 pixels, high quality (256 vision tokens)
|
| 429 |
-
large 1280×1280 pixels, maximum quality (400 vision tokens)
|
| 430 |
-
gundam Dynamic multi-tile processing (adaptive)
|
| 431 |
-
|
| 432 |
-
Examples:
|
| 433 |
-
# Basic usage with default Gundam mode
|
| 434 |
-
python main.py input-dataset output-dataset
|
| 435 |
-
|
| 436 |
-
# High quality processing
|
| 437 |
-
python main.py input-dataset output-dataset --resolution-mode large
|
| 438 |
-
|
| 439 |
-
# Fast processing for testing
|
| 440 |
-
python main.py input-dataset output-dataset --resolution-mode tiny --max-samples 100
|
| 441 |
-
|
| 442 |
-
# With HF Jobs
|
| 443 |
-
hf jobs run --flavor l4x1 --secrets HF_TOKEN \\
|
| 444 |
-
hf.co/spaces/davanstrien/deepseek-ocr \\
|
| 445 |
-
python main.py input-dataset output-dataset --resolution-mode gundam
|
| 446 |
-
""",
|
| 447 |
-
)
|
| 448 |
-
|
| 449 |
-
parser.add_argument("input_dataset", help="Input dataset ID from Hugging Face Hub")
|
| 450 |
-
parser.add_argument("output_dataset", help="Output dataset ID for Hugging Face Hub")
|
| 451 |
-
parser.add_argument(
|
| 452 |
-
"--image-column",
|
| 453 |
-
default="image",
|
| 454 |
-
help="Column containing images (default: image)",
|
| 455 |
-
)
|
| 456 |
-
parser.add_argument(
|
| 457 |
-
"--model",
|
| 458 |
-
default="deepseek-ai/DeepSeek-OCR",
|
| 459 |
-
help="Model to use (default: deepseek-ai/DeepSeek-OCR)",
|
| 460 |
-
)
|
| 461 |
-
parser.add_argument(
|
| 462 |
-
"--resolution-mode",
|
| 463 |
-
default="gundam",
|
| 464 |
-
choices=list(RESOLUTION_MODES.keys()),
|
| 465 |
-
help="Resolution mode preset (default: gundam)",
|
| 466 |
-
)
|
| 467 |
-
parser.add_argument(
|
| 468 |
-
"--max-model-len",
|
| 469 |
-
type=int,
|
| 470 |
-
default=8192,
|
| 471 |
-
help="Maximum model context length (default: 8192)",
|
| 472 |
-
)
|
| 473 |
-
parser.add_argument(
|
| 474 |
-
"--max-tokens",
|
| 475 |
-
type=int,
|
| 476 |
-
default=8192,
|
| 477 |
-
help="Maximum tokens to generate (default: 8192)",
|
| 478 |
-
)
|
| 479 |
-
parser.add_argument(
|
| 480 |
-
"--gpu-memory-utilization",
|
| 481 |
-
type=float,
|
| 482 |
-
default=0.75,
|
| 483 |
-
help="GPU memory utilization (default: 0.75)",
|
| 484 |
-
)
|
| 485 |
-
parser.add_argument(
|
| 486 |
-
"--prompt",
|
| 487 |
-
default="<image>\n<|grounding|>Convert the document to markdown.",
|
| 488 |
-
help="Prompt for OCR (default: grounding markdown conversion)",
|
| 489 |
-
)
|
| 490 |
-
parser.add_argument("--hf-token", help="Hugging Face API token")
|
| 491 |
-
parser.add_argument(
|
| 492 |
-
"--split", default="train", help="Dataset split to use (default: train)"
|
| 493 |
-
)
|
| 494 |
-
parser.add_argument(
|
| 495 |
-
"--max-samples",
|
| 496 |
-
type=int,
|
| 497 |
-
help="Maximum number of samples to process (for testing)",
|
| 498 |
-
)
|
| 499 |
-
parser.add_argument(
|
| 500 |
-
"--private", action="store_true", help="Make output dataset private"
|
| 501 |
-
)
|
| 502 |
-
parser.add_argument(
|
| 503 |
-
"--shuffle",
|
| 504 |
-
action="store_true",
|
| 505 |
-
help="Shuffle the dataset before processing (useful for random sampling)",
|
| 506 |
-
)
|
| 507 |
-
parser.add_argument(
|
| 508 |
-
"--seed",
|
| 509 |
-
type=int,
|
| 510 |
-
default=42,
|
| 511 |
-
help="Random seed for shuffling (default: 42)",
|
| 512 |
-
)
|
| 513 |
-
|
| 514 |
-
args = parser.parse_args()
|
| 515 |
-
|
| 516 |
-
# Run async main
|
| 517 |
-
asyncio.run(
|
| 518 |
-
main_async(
|
| 519 |
-
input_dataset=args.input_dataset,
|
| 520 |
-
output_dataset=args.output_dataset,
|
| 521 |
-
image_column=args.image_column,
|
| 522 |
-
model=args.model,
|
| 523 |
-
resolution_mode=args.resolution_mode,
|
| 524 |
-
max_model_len=args.max_model_len,
|
| 525 |
-
max_tokens=args.max_tokens,
|
| 526 |
-
gpu_memory_utilization=args.gpu_memory_utilization,
|
| 527 |
-
prompt=args.prompt,
|
| 528 |
-
hf_token=args.hf_token,
|
| 529 |
-
split=args.split,
|
| 530 |
-
max_samples=args.max_samples,
|
| 531 |
-
private=args.private,
|
| 532 |
-
shuffle=args.shuffle,
|
| 533 |
-
seed=args.seed,
|
| 534 |
-
)
|
| 535 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
process_dataset.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
DeepSeek-OCR Dataset Processing
|
| 4 |
+
Minimal adaptation of official run_dpsk_ocr_image.py for dataset processing
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import asyncio
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import sys
|
| 12 |
+
import time
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
|
| 15 |
+
import torch
|
| 16 |
+
if torch.version.cuda == '11.8':
|
| 17 |
+
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
|
| 18 |
+
|
| 19 |
+
os.environ['VLLM_USE_V1'] = '0'
|
| 20 |
+
|
| 21 |
+
from vllm import AsyncLLMEngine, SamplingParams
|
| 22 |
+
from vllm.engine.arg_utils import AsyncEngineArgs
|
| 23 |
+
from vllm.model_executor.models.registry import ModelRegistry
|
| 24 |
+
from PIL import Image, ImageOps
|
| 25 |
+
from tqdm.auto import tqdm
|
| 26 |
+
from datasets import load_dataset
|
| 27 |
+
from huggingface_hub import DatasetCard, login
|
| 28 |
+
|
| 29 |
+
# Import DeepSeek-OCR modules (unchanged from original)
|
| 30 |
+
from deepseek_ocr import DeepseekOCRForCausalLM
|
| 31 |
+
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
|
| 32 |
+
from process.image_process import DeepseekOCRProcessor
|
| 33 |
+
from config import MODEL_PATH, PROMPT, CROP_MODE
|
| 34 |
+
|
| 35 |
+
# Register custom model (unchanged from original)
|
| 36 |
+
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def check_cuda():
    """Abort the script unless PyTorch can see a CUDA device.

    On success, prints the name of CUDA device 0 to stdout. When
    ``torch.cuda.is_available()`` is false, an error message is written to
    stderr and the process exits with status 1.

    Raises:
        SystemExit: With exit code 1 when CUDA is not available.
    """
    if not torch.cuda.is_available():
        # Diagnostics go to stderr so they are not mixed into regular output.
        print(
            "ERROR: CUDA is not available. This script requires a GPU.",
            file=sys.stderr,
        )
        sys.exit(1)
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
async def process_single_image(engine, sampling_params, image_features, prompt):
    """Run one prompt, optionally with image features, through the engine.

    Args:
        engine: Object whose ``generate(request, sampling_params, request_id)``
            returns an async iterator of outputs; each output exposes an
            ``outputs`` sequence whose first item has a ``text`` attribute.
        sampling_params: Passed through to ``engine.generate`` unchanged.
        image_features: Attached to the request as
            ``multi_modal_data["image"]`` when truthy and the prompt contains
            the ``<image>`` placeholder; otherwise ignored.
        prompt: Prompt text sent to the engine.

    Returns:
        The text of the last streamed output that had a non-empty ``outputs``
        list, stripped of surrounding whitespace; ``""`` if there was none.
    """
    # Local import keeps this block self-contained.
    import uuid

    # A random UUID cannot repeat the way a microsecond timestamp can when
    # two requests are created close together.
    request_id = f"request-{uuid.uuid4().hex}"

    if image_features and '<image>' in prompt:
        request = {
            "prompt": prompt,
            "multi_modal_data": {"image": image_features},
        }
    else:
        # Text-only request: no image placeholder or no features to attach.
        request = {"prompt": prompt}

    final_output = ""
    # Each streamed item replaces the previous text, so only the last
    # non-empty one is kept.
    async for request_output in engine.generate(request, sampling_params, request_id):
        if request_output.outputs:
            final_output = request_output.outputs[0].text

    return final_output.strip()
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
async def main_async(args):
    """Run OCR over an image dataset and push the result to the Hub.

    Steps: verify CUDA, log in to the Hub when a token is available, load
    ``args.input_dataset`` (split ``args.split``), optionally shuffle and
    truncate it, run every image through the vLLM engine one at a time,
    append ``markdown`` and ``inference_info`` columns, and push the dataset
    to ``args.output_dataset``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``hf_token``, ``input_dataset``, ``output_dataset``, ``split``,
            ``image_column``, ``shuffle``, ``seed``, ``max_samples``,
            ``max_model_len``, ``max_tokens``, ``gpu_memory_utilization``
            and ``private``.

    Raises:
        SystemExit: With code 1 when CUDA is unavailable or the image column
            is missing from the dataset.
    """
    check_cuda()

    # Enable HF_TRANSFER
    # NOTE(review): huggingface_hub is already imported at this point;
    # confirm the flag is still honoured when it is set this late.
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

    # Login to HF if token provided (CLI flag wins over the environment)
    hf_token = args.hf_token or os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    # Load dataset
    print(f"Loading dataset: {args.input_dataset}")
    dataset = load_dataset(args.input_dataset, split=args.split)

    if args.image_column not in dataset.column_names:
        print(f"ERROR: Column '{args.image_column}' not found")
        print(f"Available columns: {dataset.column_names}")
        sys.exit(1)

    # Shuffle if requested
    if args.shuffle:
        print(f"Shuffling with seed {args.seed}")
        dataset = dataset.shuffle(seed=args.seed)

    # Limit samples if requested
    if args.max_samples:
        dataset = dataset.select(range(min(args.max_samples, len(dataset))))
        print(f"Processing {len(dataset)} samples")

    # Initialize vLLM engine (UNCHANGED from original)
    print("Initializing vLLM engine...")
    engine_args = AsyncEngineArgs(
        model=MODEL_PATH,
        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
        block_size=256,
        max_model_len=args.max_model_len,
        enforce_eager=False,
        trust_remote_code=True,
        tensor_parallel_size=1,
        gpu_memory_utilization=args.gpu_memory_utilization,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    # Sampling params (UNCHANGED from original)
    logits_processors = [NoRepeatNGramLogitsProcessor(
        ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822}
    )]

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=args.max_tokens,
        logits_processors=logits_processors,
        skip_special_tokens=False,
    )

    # Process images
    print(f"Processing {len(dataset)} images...")
    all_markdown = []
    processor = DeepseekOCRProcessor()

    for idx in tqdm(range(len(dataset)), desc="OCR processing"):
        # Best-effort per image: a failure is logged and recorded as a
        # placeholder so the output column stays aligned with the rows.
        try:
            # Load image
            image = dataset[idx][args.image_column]
            if not isinstance(image, Image.Image):
                # String values are opened as paths; anything else is
                # passed on unchanged.
                image = Image.open(image) if isinstance(image, str) else image

            image = ImageOps.exif_transpose(image.convert('RGB'))

            # Preprocess image (UNCHANGED from original)
            if '<image>' in PROMPT:
                image_features = processor.tokenize_with_images(
                    images=[image], bos=True, eos=True, cropping=CROP_MODE
                )
            else:
                image_features = ''

            # Process
            result = await process_single_image(
                engine, sampling_params, image_features, PROMPT
            )
            all_markdown.append(result)

        except Exception as e:
            print(f"Error processing image {idx}: {e}")
            all_markdown.append("[OCR FAILED]")

    # Add markdown column
    print("Adding markdown column...")
    dataset = dataset.add_column("markdown", all_markdown)

    # Handle inference_info: keep any history stored by earlier runs and
    # append this run's entry to it.
    if "inference_info" in dataset.column_names:
        try:
            existing_info = json.loads(dataset[0]["inference_info"])
            if not isinstance(existing_info, list):
                existing_info = [existing_info]
        except (IndexError, TypeError, ValueError):
            # Empty dataset, non-string value, or invalid JSON: start a new
            # history. Narrower than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit still propagate.
            existing_info = []
        dataset = dataset.remove_columns(["inference_info"])
    else:
        existing_info = []

    new_info = {
        "column_name": "markdown",
        "model_id": MODEL_PATH,
        "processing_date": datetime.now().isoformat(),
        "prompt": PROMPT,
        "max_tokens": args.max_tokens,
        "max_model_len": args.max_model_len,
        "gpu_memory_utilization": args.gpu_memory_utilization,
        "script": "process_dataset.py",
        "implementation": "vllm-async (official deepseek code)",
    }
    existing_info.append(new_info)

    # The same JSON string is stored on every row.
    info_json = json.dumps(existing_info, ensure_ascii=False)
    dataset = dataset.add_column("inference_info", [info_json] * len(dataset))

    # Push to hub
    print(f"Pushing to {args.output_dataset}")
    dataset.push_to_hub(args.output_dataset, private=args.private, token=hf_token)

    print("✅ Complete!")
    print(f"Dataset: https://huggingface.co/datasets/{args.output_dataset}")
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the async pipeline.
    parser = argparse.ArgumentParser(
        description="Process images through DeepSeek-OCR"
    )
    parser.add_argument("input_dataset", help="Input dataset ID")
    parser.add_argument("output_dataset", help="Output dataset ID")
    parser.add_argument("--image-column", default="image", help="Image column name")
    parser.add_argument("--split", default="train", help="Dataset split")
    parser.add_argument("--max-samples", type=int, help="Limit number of samples")
    parser.add_argument("--shuffle", action="store_true", help="Shuffle dataset")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument(
        "--max-model-len",
        type=int,
        default=8192,
        help="Maximum model context length (default: 8192)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=8192,
        help="Maximum tokens to generate (default: 8192)",
    )
    parser.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.75,
        help="GPU memory utilization (default: 0.75)",
    )
    parser.add_argument("--hf-token", help="HF API token")
    parser.add_argument("--private", action="store_true", help="Make output private")

    args = parser.parse_args()
    asyncio.run(main_async(args))
|