davanstrien (HF Staff) committed
Commit 0a5527f · Parent: 40f1c08

Restart with minimal changes to official DeepSeek code


- Created process_dataset.py based on run_dpsk_ocr_image.py
- Using original config.py with tokenizer initialization
- Removed custom main.py that was causing import issues
- Minimal changes: only dataset loading/processing added
- Dockerfile updated to use process_dataset.py

Files changed (6)
  1. Dockerfile +1 -1
  2. README.md +9 -23
  3. config.py +28 -37
  4. config_template.py +0 -42
  5. main.py +0 -535
  6. process_dataset.py +214 -0
Dockerfile CHANGED

@@ -48,4 +48,4 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY . .

 # Default command (can be overridden by HF Jobs)
-CMD ["python", "main.py", "--help"]
+CMD ["python", "process_dataset.py", "--help"]
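Since the CMD is only a default, the same image can run the new entrypoint with real arguments; a minimal local sketch, assuming the image is built and tagged as `deepseek-ocr` (the tag and dataset IDs are placeholders):

```bash
# Build the Space image locally and override the default CMD at run time
docker build -t deepseek-ocr .
docker run --gpus all -e HF_TOKEN="$HF_TOKEN" deepseek-ocr \
  python process_dataset.py input-dataset output-dataset
```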
README.md CHANGED

@@ -27,7 +27,7 @@ Process any image dataset without needing your own GPU:
 hf jobs run --flavor l4x1 \
   --secrets HF_TOKEN \
   hf.co/spaces/davanstrien/deepseek-ocr \
-  python main.py \
+  python process_dataset.py \
   input-dataset \
   output-dataset

@@ -35,11 +35,10 @@ hf jobs run --flavor l4x1 \
 hf jobs run --flavor l4x1 \
   --secrets HF_TOKEN \
   hf.co/spaces/davanstrien/deepseek-ocr \
-  python main.py \
+  python process_dataset.py \
   your-input-dataset \
   your-output-dataset \
-  --max-samples 10 \
-  --resolution-mode tiny
+  --max-samples 10
 ```

 That's it! The script will:
@@ -84,31 +83,19 @@ That's it! The script will:
 # Default (Gundam mode)
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
   hf.co/spaces/davanstrien/deepseek-ocr \
-  python main.py \
+  python process_dataset.py \
   my-images-dataset \
   ocr-results
 ```

-### High Quality Mode
-
-```bash
-hf jobs run --flavor l40sx1 --secrets HF_TOKEN \
-  hf.co/spaces/davanstrien/deepseek-ocr \
-  python main.py \
-  documents-dataset \
-  extracted-text \
-  --resolution-mode large
-```
-
 ### Fast Processing for Testing

 ```bash
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
   hf.co/spaces/davanstrien/deepseek-ocr \
-  python main.py \
+  python process_dataset.py \
   large-dataset \
   test-output \
-  --resolution-mode tiny \
   --max-samples 100
 ```

@@ -117,7 +104,7 @@ hf jobs run --flavor l4x1 --secrets HF_TOKEN \
 ```bash
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
   hf.co/spaces/davanstrien/deepseek-ocr \
-  python main.py \
+  python process_dataset.py \
   ordered-dataset \
   random-sample \
   --max-samples 50 \
@@ -130,11 +117,10 @@ hf jobs run --flavor l4x1 --secrets HF_TOKEN \
 ```bash
 hf jobs run --flavor a10g-large --secrets HF_TOKEN \
   hf.co/spaces/davanstrien/deepseek-ocr \
-  python main.py \
+  python process_dataset.py \
   davanstrien/ufo-ColPali \
   ufo-ocr \
-  --image-column image \
-  --resolution-mode gundam
+  --image-column image
 ```

 ### Private Output Dataset
@@ -142,7 +128,7 @@ hf jobs run --flavor a10g-large --secrets HF_TOKEN \
 ```bash
 hf jobs run --flavor l4x1 --secrets HF_TOKEN \
   hf.co/spaces/davanstrien/deepseek-ocr \
-  python main.py \
+  python process_dataset.py \
   private-input \
   private-output \
   --private
config.py CHANGED

@@ -1,51 +1,42 @@
-# Configuration for DeepSeek-OCR
-# These will be set programmatically by main.py based on command-line arguments
+# TODO: change modes
+# Tiny: base_size = 512, image_size = 512, crop_mode = False
+# Small: base_size = 640, image_size = 640, crop_mode = False
+# Base: base_size = 1024, image_size = 1024, crop_mode = False
+# Large: base_size = 1280, image_size = 1280, crop_mode = False
+# Gundam: base_size = 1024, image_size = 640, crop_mode = True

-# Resolution settings (set by resolution mode)
 BASE_SIZE = 1024
 IMAGE_SIZE = 640
 CROP_MODE = True
-
-# Processing settings
-MIN_CROPS = 2
-MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.
-MAX_CONCURRENCY = 100  # If you have limited GPU memory, lower the concurrency count.
-NUM_WORKERS = 64  # image pre-process (resize/padding) workers
+MIN_CROPS = 2
+MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.
+MAX_CONCURRENCY = 100  # If you have limited GPU memory, lower the concurrency count.
+NUM_WORKERS = 64  # image pre-process (resize/padding) workers
 PRINT_NUM_VIS_TOKENS = False
 SKIP_REPEAT = True
+MODEL_PATH = 'deepseek-ai/DeepSeek-OCR'  # change to your model path

-# Model settings
-MODEL_PATH = 'deepseek-ai/DeepSeek-OCR'
+# TODO: change INPUT_PATH
+# .pdf: run_dpsk_ocr_pdf.py;
+# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
+# Omnidocbench images path: run_dpsk_ocr_eval_batch.py

-# Paths (not used in Space version)
-INPUT_PATH = ''
+INPUT_PATH = ''
 OUTPUT_PATH = ''

-# Default prompt
 PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
+# PROMPT = '<image>\nFree OCR.'
+# TODO commonly used prompts
+# document: <image>\n<|grounding|>Convert the document to markdown.
+# other image: <image>\n<|grounding|>OCR this image.
+# without layouts: <image>\nFree OCR.
+# figures in document: <image>\nParse the figure.
+# general: <image>\nDescribe this image in detail.
+# rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
+# '先天下之忧而忧'
+# .......

-# Tokenizer - initialized at import time for vLLM compatibility
-from transformers import AutoTokenizer
-TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
-
-
-def set_resolution_mode(mode: str):
-    """Update global config based on resolution mode."""
-    global BASE_SIZE, IMAGE_SIZE, CROP_MODE
-
-    modes = {
-        "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
-        "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
-        "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
-        "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
-        "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
-    }
-
-    if mode not in modes:
-        raise ValueError(f"Unknown resolution mode: {mode}. Choose from {list(modes.keys())}")
-
-    BASE_SIZE = modes[mode]["base_size"]
-    IMAGE_SIZE = modes[mode]["image_size"]
-    CROP_MODE = modes[mode]["crop_mode"]
-
-    return BASE_SIZE, IMAGE_SIZE, CROP_MODE
+
+from transformers import AutoTokenizer
+
+TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
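With `set_resolution_mode()` gone, resolution is now chosen by editing the constants at the top of `config.py` before launch; a minimal sketch using the Tiny values from the mode table above:

```python
# config.py edited for Tiny mode (values taken from the comment table above)
BASE_SIZE = 512
IMAGE_SIZE = 512
CROP_MODE = False  # single tile; no dynamic multi-crop tiling
```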
config_template.py DELETED

@@ -1,42 +0,0 @@
-# TODO: change modes
-# Tiny: base_size = 512, image_size = 512, crop_mode = False
-# Small: base_size = 640, image_size = 640, crop_mode = False
-# Base: base_size = 1024, image_size = 1024, crop_mode = False
-# Large: base_size = 1280, image_size = 1280, crop_mode = False
-# Gundam: base_size = 1024, image_size = 640, crop_mode = True
-
-BASE_SIZE = 1024
-IMAGE_SIZE = 640
-CROP_MODE = True
-MIN_CROPS = 2
-MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.
-MAX_CONCURRENCY = 100  # If you have limited GPU memory, lower the concurrency count.
-NUM_WORKERS = 64  # image pre-process (resize/padding) workers
-PRINT_NUM_VIS_TOKENS = False
-SKIP_REPEAT = True
-MODEL_PATH = 'deepseek-ai/DeepSeek-OCR'  # change to your model path
-
-# TODO: change INPUT_PATH
-# .pdf: run_dpsk_ocr_pdf.py;
-# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
-# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
-
-INPUT_PATH = ''
-OUTPUT_PATH = ''
-
-PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
-# PROMPT = '<image>\nFree OCR.'
-# TODO commonly used prompts
-# document: <image>\n<|grounding|>Convert the document to markdown.
-# other image: <image>\n<|grounding|>OCR this image.
-# without layouts: <image>\nFree OCR.
-# figures in document: <image>\nParse the figure.
-# general: <image>\nDescribe this image in detail.
-# rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
-# '先天下之忧而忧'
-# .......
-
-
-from transformers import AutoTokenizer
-
-TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
main.py DELETED

@@ -1,535 +0,0 @@
-#!/usr/bin/env python3
-"""
-DeepSeek-OCR Dataset Processing with vLLM
-
-This script processes image datasets through DeepSeek-OCR using vLLM for efficient batch processing.
-"""
-
-import argparse
-import asyncio
-import json
-import logging
-import os
-import sys
-import time
-from datetime import datetime
-from typing import List
-
-import torch
-from datasets import load_dataset
-from huggingface_hub import DatasetCard, login
-from PIL import Image, ImageOps
-from tqdm.auto import tqdm
-from vllm import AsyncLLMEngine, SamplingParams
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.model_executor.models.registry import ModelRegistry
-
-# Import DeepSeek-OCR modules
-import config
-from deepseek_ocr import DeepseekOCRForCausalLM
-from process.image_process import DeepseekOCRProcessor
-from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Resolution mode presets
-RESOLUTION_MODES = {
-    "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
-    "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
-    "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
-    "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
-    "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
-}
-
-
-def check_cuda_availability():
-    """Check if CUDA is available and exit if not."""
-    if not torch.cuda.is_available():
-        logger.error("CUDA is not available. This script requires a GPU.")
-        logger.error("Please run on a machine with a CUDA-capable GPU.")
-        sys.exit(1)
-    else:
-        logger.info(f"CUDA is available. GPU: {torch.cuda.get_device_name(0)}")
-
-
-def setup_config(resolution_mode: str):
-    """Set up global config based on resolution mode."""
-    if resolution_mode not in RESOLUTION_MODES:
-        raise ValueError(
-            f"Invalid resolution mode: {resolution_mode}. "
-            f"Choose from {list(RESOLUTION_MODES.keys())}"
-        )
-
-    mode_config = RESOLUTION_MODES[resolution_mode]
-    config.BASE_SIZE = mode_config["base_size"]
-    config.IMAGE_SIZE = mode_config["image_size"]
-    config.CROP_MODE = mode_config["crop_mode"]
-
-    logger.info(
-        f"Resolution mode: {resolution_mode} "
-        f"(BASE_SIZE={config.BASE_SIZE}, IMAGE_SIZE={config.IMAGE_SIZE}, "
-        f"CROP_MODE={config.CROP_MODE})"
-    )
-
-
-async def process_images_async(
-    images: List[Image.Image],
-    engine: AsyncLLMEngine,
-    processor: DeepseekOCRProcessor,
-    sampling_params: SamplingParams,
-    prompt: str,
-) -> List[str]:
-    """Process a batch of images asynchronously with vLLM."""
-    results = []
-
-    for image in images:
-        # Preprocess image
-        image = image.convert("RGB")
-        image_features = processor.tokenize_with_images(
-            images=[image], bos=True, eos=True, cropping=config.CROP_MODE
-        )
-
-        # Generate async
-        request_id = f"request-{int(time.time() * 1000)}"
-        request = {"prompt": prompt, "multi_modal_data": {"image": image_features}}
-
-        output_text = ""
-        async for request_output in engine.generate(request, sampling_params, request_id):
-            if request_output.outputs:
-                output_text = request_output.outputs[0].text
-
-        results.append(output_text.strip())
-
-    return results
-
-
-def create_dataset_card(
-    source_dataset: str,
-    model: str,
-    num_samples: int,
-    processing_time: str,
-    resolution_mode: str,
-    base_size: int,
-    image_size: int,
-    crop_mode: bool,
-    max_model_len: int,
-    max_tokens: int,
-    gpu_memory_utilization: float,
-    image_column: str = "image",
-    split: str = "train",
-) -> str:
-    """Create a dataset card documenting the OCR process."""
-    return f"""---
-tags:
-- ocr
-- document-processing
-- deepseek
-- deepseek-ocr
-- markdown
-- vllm
-- generated
----
-
-# Document OCR using DeepSeek-OCR (vLLM)
-
-This dataset contains markdown-formatted OCR results from images in [{source_dataset}](https://huggingface.co/datasets/{source_dataset}) using DeepSeek-OCR with vLLM.
-
-## Processing Details
-
-- **Source Dataset**: [{source_dataset}](https://huggingface.co/datasets/{source_dataset})
-- **Model**: [{model}](https://huggingface.co/{model})
-- **Number of Samples**: {num_samples:,}
-- **Processing Time**: {processing_time}
-- **Processing Date**: {datetime.now().strftime("%Y-%m-%d %H:%M UTC")}
-
-### Configuration
-
-- **Image Column**: `{image_column}`
-- **Output Column**: `markdown`
-- **Dataset Split**: `{split}`
-- **Resolution Mode**: {resolution_mode}
-- **Base Size**: {base_size}
-- **Image Size**: {image_size}
-- **Crop Mode**: {crop_mode}
-- **Max Model Length**: {max_model_len:,} tokens
-- **Max Output Tokens**: {max_tokens:,}
-- **GPU Memory Utilization**: {gpu_memory_utilization:.1%}
-- **Implementation**: vLLM AsyncEngine (batch processing)
-
-## Model Information
-
-DeepSeek-OCR is a state-of-the-art document OCR model that excels at:
-- 📐 **LaTeX equations** - Mathematical formulas preserved in LaTeX format
-- 📊 **Tables** - Extracted and formatted as HTML/markdown
-- 📝 **Document structure** - Headers, lists, and formatting maintained
-- 🖼️ **Image grounding** - Spatial layout and bounding box information
-- 🔍 **Complex layouts** - Multi-column and hierarchical structures
-- 🌍 **Multilingual** - Supports multiple languages
-
-### Resolution Modes
-
-- **Tiny** (512×512): Fast processing, 64 vision tokens
-- **Small** (640×640): Balanced speed/quality, 100 vision tokens
-- **Base** (1024×1024): High quality, 256 vision tokens
-- **Large** (1280×1280): Maximum quality, 400 vision tokens
-- **Gundam** (dynamic): Adaptive multi-tile processing for large documents
-
-## Dataset Structure
-
-The dataset contains all original columns plus:
-- `markdown`: The extracted text in markdown format with preserved structure
-- `inference_info`: JSON list tracking all OCR models applied to this dataset
-
-## Usage
-
-```python
-from datasets import load_dataset
-
-# Load the dataset
-dataset = load_dataset("{{{{output_dataset_id}}}}", split="{split}")
-
-# Access the markdown text
-for example in dataset:
-    print(example["markdown"])
-    break
-```
-
-## Reproduction
-
-This dataset was generated using the DeepSeek-OCR vLLM Space:
-
-```bash
-hf jobs run --flavor l4x1 \\
-  --secrets HF_TOKEN \\
-  hf.co/spaces/davanstrien/deepseek-ocr \\
-  python main.py \\
-  --input-dataset {source_dataset} \\
-  --output-dataset <output-dataset> \\
-  --resolution-mode {resolution_mode} \\
-  --image-column {image_column}
-```
-
-## Performance
-
-- **Processing Speed**: ~{num_samples / (float(processing_time.split()[0]) * 60) if processing_time.split()[0].replace('.','').isdigit() else 'N/A':.1f} images/second
-- **Processing Method**: Async batch processing with vLLM (optimized for throughput)
-
-Generated with 🤖 [DeepSeek-OCR Space](https://huggingface.co/spaces/davanstrien/deepseek-ocr)
-"""
-
-
-async def main_async(
-    input_dataset: str,
-    output_dataset: str,
-    image_column: str = "image",
-    model: str = "deepseek-ai/DeepSeek-OCR",
-    resolution_mode: str = "gundam",
-    max_model_len: int = 8192,
-    max_tokens: int = 8192,
-    gpu_memory_utilization: float = 0.75,
-    prompt: str = "<image>\n<|grounding|>Convert the document to markdown.",
-    hf_token: str = None,
-    split: str = "train",
-    max_samples: int = None,
-    private: bool = False,
-    shuffle: bool = False,
-    seed: int = 42,
-):
-    """Process images from HF dataset through DeepSeek-OCR model with vLLM."""
-
-    # Check CUDA availability
-    check_cuda_availability()
-
-    # Track processing start time
-    start_time = datetime.now()
-
-    # Enable HF_TRANSFER for faster downloads
-    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-
-    # Login to HF if token provided
-    HF_TOKEN = hf_token or os.environ.get("HF_TOKEN")
-    if HF_TOKEN:
-        login(token=HF_TOKEN)
-
-    # Set up config for resolution mode
-    setup_config(resolution_mode)
-
-    # Set model and prompt (tokenizer already initialized in config.py)
-    config.MODEL_PATH = model
-    config.PROMPT = prompt
-
-    # Load dataset
-    logger.info(f"Loading dataset: {input_dataset}")
-    dataset = load_dataset(input_dataset, split=split)
-
-    # Validate image column
-    if image_column not in dataset.column_names:
-        raise ValueError(
-            f"Column '{image_column}' not found. Available: {dataset.column_names}"
-        )
-
-    # Shuffle if requested
-    if shuffle:
-        logger.info(f"Shuffling dataset with seed {seed}")
-        dataset = dataset.shuffle(seed=seed)
-
-    # Limit samples if requested
-    if max_samples:
-        dataset = dataset.select(range(min(max_samples, len(dataset))))
-        logger.info(f"Limited to {len(dataset)} samples")
-
-    # Register custom model
-    logger.info("Registering custom DeepSeek-OCR model...")
-    ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
-
-    # Initialize vLLM AsyncEngine
-    logger.info(f"Initializing vLLM AsyncEngine with model: {model}")
-    logger.info("This may take a few minutes on first run...")
-
-    engine_args = AsyncEngineArgs(
-        model=model,
-        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
-        block_size=256,
-        max_model_len=max_model_len,
-        enforce_eager=False,
-        trust_remote_code=True,
-        tensor_parallel_size=1,
-        gpu_memory_utilization=gpu_memory_utilization,
-    )
-    engine = AsyncLLMEngine.from_engine_args(engine_args)
-
-    # Set up sampling params
-    logits_processors = [
-        NoRepeatNGramLogitsProcessor(
-            ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822}
-        )
-    ]
-
-    sampling_params = SamplingParams(
-        temperature=0.0,
-        max_tokens=max_tokens,
-        logits_processors=logits_processors,
-        skip_special_tokens=False,
-    )
-
-    # Initialize processor
-    processor = DeepseekOCRProcessor()
-
-    logger.info(f"Processing {len(dataset)} images with vLLM AsyncEngine")
-
-    # Process images one at a time (async but sequential for simplicity)
-    all_markdown = []
-    for idx in tqdm(range(len(dataset)), desc="DeepSeek-OCR processing"):
-        image = dataset[idx][image_column]
-
-        # Convert to PIL if needed
-        if not isinstance(image, Image.Image):
-            image = Image.open(image) if isinstance(image, str) else image
-
-        try:
-            image = ImageOps.exif_transpose(image.convert("RGB"))
-
-            # Process single image
-            results = await process_images_async(
-                [image], engine, processor, sampling_params, prompt
-            )
-            all_markdown.append(results[0])
-
-        except Exception as e:
-            logger.error(f"Error processing image {idx}: {e}")
-            all_markdown.append("[OCR FAILED]")
-
-    # Calculate processing time
-    processing_duration = datetime.now() - start_time
-    processing_time_str = f"{processing_duration.total_seconds() / 60:.1f} min"
-
-    # Add markdown column to dataset
-    logger.info("Adding markdown column to dataset")
-    dataset = dataset.add_column("markdown", all_markdown)
-
-    # Handle inference_info tracking
-    logger.info("Updating inference_info...")
-
-    if "inference_info" in dataset.column_names:
-        try:
-            existing_info = json.loads(dataset[0]["inference_info"])
-            if not isinstance(existing_info, list):
-                existing_info = [existing_info]
-        except (json.JSONDecodeError, TypeError):
-            existing_info = []
-        dataset = dataset.remove_columns(["inference_info"])
-    else:
-        existing_info = []
-
-    # Add new inference info
-    new_info = {
-        "column_name": "markdown",
-        "model_id": model,
-        "processing_date": datetime.now().isoformat(),
-        "resolution_mode": resolution_mode,
-        "base_size": config.BASE_SIZE,
-        "image_size": config.IMAGE_SIZE,
-        "crop_mode": config.CROP_MODE,
-        "prompt": prompt,
-        "max_tokens": max_tokens,
-        "gpu_memory_utilization": gpu_memory_utilization,
-        "max_model_len": max_model_len,
-        "script": "main.py",
-        "script_version": "1.0.0",
-        "space_url": "https://huggingface.co/spaces/davanstrien/deepseek-ocr",
-        "implementation": "vllm-async (optimized)",
-    }
-    existing_info.append(new_info)
-
-    # Add updated inference_info column
-    info_json = json.dumps(existing_info, ensure_ascii=False)
-    dataset = dataset.add_column("inference_info", [info_json] * len(dataset))
-
-    # Push to hub
-    logger.info(f"Pushing to {output_dataset}")
-    dataset.push_to_hub(output_dataset, private=private, token=HF_TOKEN)
-
-    # Create and push dataset card
-    logger.info("Creating dataset card...")
-    card_content = create_dataset_card(
-        source_dataset=input_dataset,
-        model=model,
-        num_samples=len(dataset),
-        processing_time=processing_time_str,
-        resolution_mode=resolution_mode,
-        base_size=config.BASE_SIZE,
-        image_size=config.IMAGE_SIZE,
-        crop_mode=config.CROP_MODE,
-        max_model_len=max_model_len,
-        max_tokens=max_tokens,
-        gpu_memory_utilization=gpu_memory_utilization,
-        image_column=image_column,
-        split=split,
-    )
-
-    card = DatasetCard(card_content)
-    card.push_to_hub(output_dataset, token=HF_TOKEN)
-    logger.info("✅ Dataset card created and pushed!")
-
-    logger.info("✅ OCR conversion complete!")
-    logger.info(f"Dataset available at: https://huggingface.co/datasets/{output_dataset}")
-    logger.info(f"Processing time: {processing_time_str}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="OCR images to markdown using DeepSeek-OCR (vLLM AsyncEngine)",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Resolution Modes:
-  tiny    512×512 pixels, fast processing (64 vision tokens)
-  small   640×640 pixels, balanced (100 vision tokens)
-  base    1024×1024 pixels, high quality (256 vision tokens)
-  large   1280×1280 pixels, maximum quality (400 vision tokens)
-  gundam  Dynamic multi-tile processing (adaptive)
-
-Examples:
-  # Basic usage with default Gundam mode
-  python main.py input-dataset output-dataset
-
-  # High quality processing
-  python main.py input-dataset output-dataset --resolution-mode large
-
-  # Fast processing for testing
-  python main.py input-dataset output-dataset --resolution-mode tiny --max-samples 100
-
-  # With HF Jobs
-  hf jobs run --flavor l4x1 --secrets HF_TOKEN \\
-    hf.co/spaces/davanstrien/deepseek-ocr \\
-    python main.py input-dataset output-dataset --resolution-mode gundam
-""",
-    )
-
-    parser.add_argument("input_dataset", help="Input dataset ID from Hugging Face Hub")
-    parser.add_argument("output_dataset", help="Output dataset ID for Hugging Face Hub")
-    parser.add_argument(
-        "--image-column",
-        default="image",
-        help="Column containing images (default: image)",
-    )
-    parser.add_argument(
-        "--model",
-        default="deepseek-ai/DeepSeek-OCR",
-        help="Model to use (default: deepseek-ai/DeepSeek-OCR)",
-    )
-    parser.add_argument(
-        "--resolution-mode",
-        default="gundam",
-        choices=list(RESOLUTION_MODES.keys()),
-        help="Resolution mode preset (default: gundam)",
-    )
-    parser.add_argument(
-        "--max-model-len",
-        type=int,
-        default=8192,
-        help="Maximum model context length (default: 8192)",
-    )
-    parser.add_argument(
-        "--max-tokens",
-        type=int,
-        default=8192,
-        help="Maximum tokens to generate (default: 8192)",
-    )
-    parser.add_argument(
-        "--gpu-memory-utilization",
-        type=float,
-        default=0.75,
-        help="GPU memory utilization (default: 0.75)",
-    )
-    parser.add_argument(
-        "--prompt",
-        default="<image>\n<|grounding|>Convert the document to markdown.",
-        help="Prompt for OCR (default: grounding markdown conversion)",
-    )
-    parser.add_argument("--hf-token", help="Hugging Face API token")
-    parser.add_argument(
-        "--split", default="train", help="Dataset split to use (default: train)"
-    )
-    parser.add_argument(
-        "--max-samples",
-        type=int,
-        help="Maximum number of samples to process (for testing)",
-    )
-    parser.add_argument(
-        "--private", action="store_true", help="Make output dataset private"
-    )
-    parser.add_argument(
-        "--shuffle",
-        action="store_true",
-        help="Shuffle the dataset before processing (useful for random sampling)",
-    )
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="Random seed for shuffling (default: 42)",
-    )
-
-    args = parser.parse_args()
-
-    # Run async main
-    asyncio.run(
-        main_async(
-            input_dataset=args.input_dataset,
-            output_dataset=args.output_dataset,
-            image_column=args.image_column,
-            model=args.model,
-            resolution_mode=args.resolution_mode,
-            max_model_len=args.max_model_len,
-            max_tokens=args.max_tokens,
-            gpu_memory_utilization=args.gpu_memory_utilization,
-            prompt=args.prompt,
-            hf_token=args.hf_token,
-            split=args.split,
-            max_samples=args.max_samples,
-            private=args.private,
-            shuffle=args.shuffle,
-            seed=args.seed,
-        )
-    )
process_dataset.py ADDED

@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+"""
+DeepSeek-OCR Dataset Processing
+Minimal adaptation of official run_dpsk_ocr_image.py for dataset processing
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import sys
+import time
+from datetime import datetime
+
+import torch
+if torch.version.cuda == '11.8':
+    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
+
+os.environ['VLLM_USE_V1'] = '0'
+
+from vllm import AsyncLLMEngine, SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.model_executor.models.registry import ModelRegistry
+from PIL import Image, ImageOps
+from tqdm.auto import tqdm
+from datasets import load_dataset
+from huggingface_hub import DatasetCard, login
+
+# Import DeepSeek-OCR modules (unchanged from original)
+from deepseek_ocr import DeepseekOCRForCausalLM
+from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
+from process.image_process import DeepseekOCRProcessor
+from config import MODEL_PATH, PROMPT, CROP_MODE
+
+# Register custom model (unchanged from original)
+ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
+
+
+def check_cuda():
+    """Check CUDA availability"""
+    if not torch.cuda.is_available():
+        print("ERROR: CUDA is not available. This script requires a GPU.")
+        sys.exit(1)
+    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
+
+
+async def process_single_image(engine, sampling_params, image_features, prompt):
+    """Process a single image through the engine (unchanged from original)"""
+    request_id = f"request-{int(time.time() * 1000000)}"
+
+    if image_features and '<image>' in prompt:
+        request = {
+            "prompt": prompt,
+            "multi_modal_data": {"image": image_features}
+        }
+    else:
+        request = {"prompt": prompt}
+
+    final_output = ""
+    async for request_output in engine.generate(request, sampling_params, request_id):
+        if request_output.outputs:
+            final_output = request_output.outputs[0].text
+
+    return final_output.strip()
+
+
+async def main_async(args):
+    """Main processing function"""
+    check_cuda()
+
+    # Enable HF_TRANSFER
+    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+    # Login to HF if token provided
+    HF_TOKEN = args.hf_token or os.environ.get("HF_TOKEN")
+    if HF_TOKEN:
+        login(token=HF_TOKEN)
+
+    # Load dataset
+    print(f"Loading dataset: {args.input_dataset}")
+    dataset = load_dataset(args.input_dataset, split=args.split)
+
+    if args.image_column not in dataset.column_names:
+        print(f"ERROR: Column '{args.image_column}' not found")
+        print(f"Available columns: {dataset.column_names}")
+        sys.exit(1)
+
+    # Shuffle if requested
+    if args.shuffle:
+        print(f"Shuffling with seed {args.seed}")
+        dataset = dataset.shuffle(seed=args.seed)
+
+    # Limit samples if requested
+    if args.max_samples:
+        dataset = dataset.select(range(min(args.max_samples, len(dataset))))
+        print(f"Processing {len(dataset)} samples")
+
+    # Initialize vLLM engine (UNCHANGED from original)
+    print("Initializing vLLM engine...")
+    engine_args = AsyncEngineArgs(
+        model=MODEL_PATH,
+        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
+        block_size=256,
+        max_model_len=args.max_model_len,
+        enforce_eager=False,
+        trust_remote_code=True,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+    )
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+    # Sampling params (UNCHANGED from original)
+    logits_processors = [NoRepeatNGramLogitsProcessor(
+        ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822}
+    )]
+
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        max_tokens=args.max_tokens,
+        logits_processors=logits_processors,
+        skip_special_tokens=False,
+    )
+
+    # Process images
+    print(f"Processing {len(dataset)} images...")
+    all_markdown = []
+    processor = DeepseekOCRProcessor()
+
+    for idx in tqdm(range(len(dataset)), desc="OCR processing"):
+        try:
+            # Load image
+            image = dataset[idx][args.image_column]
+            if not isinstance(image, Image.Image):
+                image = Image.open(image) if isinstance(image, str) else image
+
+            image = ImageOps.exif_transpose(image.convert('RGB'))
+
+            # Preprocess image (UNCHANGED from original)
+            if '<image>' in PROMPT:
+                image_features = processor.tokenize_with_images(
+                    images=[image], bos=True, eos=True, cropping=CROP_MODE
+                )
+            else:
+                image_features = ''
+
+            # Process
+            result = await process_single_image(
+                engine, sampling_params, image_features, PROMPT
+            )
+            all_markdown.append(result)
+
+        except Exception as e:
+            print(f"Error processing image {idx}: {e}")
+            all_markdown.append("[OCR FAILED]")
+
+    # Add markdown column
+    print("Adding markdown column...")
+    dataset = dataset.add_column("markdown", all_markdown)
+
+    # Handle inference_info
+    if "inference_info" in dataset.column_names:
+        try:
+            existing_info = json.loads(dataset[0]["inference_info"])
+            if not isinstance(existing_info, list):
+                existing_info = [existing_info]
+        except (json.JSONDecodeError, TypeError):
+            existing_info = []
+        dataset = dataset.remove_columns(["inference_info"])
+    else:
+        existing_info = []
+
+    new_info = {
+        "column_name": "markdown",
+        "model_id": MODEL_PATH,
+        "processing_date": datetime.now().isoformat(),
+        "prompt": PROMPT,
+        "max_tokens": args.max_tokens,
+        "max_model_len": args.max_model_len,
+        "gpu_memory_utilization": args.gpu_memory_utilization,
+        "script": "process_dataset.py",
+        "implementation": "vllm-async (official deepseek code)",
+    }
+    existing_info.append(new_info)
+
+    info_json = json.dumps(existing_info, ensure_ascii=False)
+    dataset = dataset.add_column("inference_info", [info_json] * len(dataset))
+
+    # Push to hub
+    print(f"Pushing to {args.output_dataset}")
+    dataset.push_to_hub(args.output_dataset, private=args.private, token=HF_TOKEN)
+
+    print("✅ Complete!")
+    print(f"Dataset: https://huggingface.co/datasets/{args.output_dataset}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Process images through DeepSeek-OCR"
+    )
+    parser.add_argument("input_dataset", help="Input dataset ID")
+    parser.add_argument("output_dataset", help="Output dataset ID")
+    parser.add_argument("--image-column", default="image", help="Image column name")
+    parser.add_argument("--split", default="train", help="Dataset split")
+    parser.add_argument("--max-samples", type=int, help="Limit number of samples")
+    parser.add_argument("--shuffle", action="store_true", help="Shuffle dataset")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    parser.add_argument("--max-model-len", type=int, default=8192)
+    parser.add_argument("--max-tokens", type=int, default=8192)
+    parser.add_argument("--gpu-memory-utilization", type=float, default=0.75)
+    parser.add_argument("--hf-token", help="HF API token")
+    parser.add_argument("--private", action="store_true", help="Make output private")
+
+    args = parser.parse_args()
+    asyncio.run(main_async(args))
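For a quick local smoke test of the new entrypoint on a GPU machine (a sketch; the dataset IDs are placeholders and the token must have write access to the output repo):

```bash
# Process a handful of samples and push to a private dataset
export HF_TOKEN=...  # write-scoped token
python process_dataset.py \
  your-input-dataset \
  your-output-dataset \
  --max-samples 5 \
  --private
```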