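"""Gradio demo for document OCR / vision-language extraction.

Two models are exposed: nanonets/Nanonets-OCR-s for single page images and
Qwen/Qwen3-VL-30B-A3B-Instruct for a sequence of page-image URLs processed as
video frames. Inference runs on GPU via the @spaces.GPU decorator.
"""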
import gradio as gr
import spaces
from transformers import AutoModelForImageTextToText, AutoProcessor, Qwen3VLMoeForConditionalGeneration
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import subprocess
from datetime import datetime
import numpy as np
import os
import json
import tempfile
import zipfile

def array_to_image_path(image_array):
    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))
    img.thumbnail((1024, 1024))
    
    # Generate a unique filename using timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"image_{timestamp}.png"
    
    # Save the image to the current working directory
    # (note: saved files accumulate; tempfile, imported above, could be used for cleanup)
    img.save(filename)
    
    # Get the full path of the saved image
    full_path = os.path.abspath(filename)
    
    return full_path
    
models = {
    "nanonets/Nanonets-OCR-s": AutoModelForImageTextToText.from_pretrained(
        "nanonets/Nanonets-OCR-s", trust_remote_code=True, dtype="auto"
    ).cuda().eval(),
    "Qwen/Qwen3-VL-30B-A3B-Instruct": Qwen3VLMoeForConditionalGeneration.from_pretrained(
        "Qwen/Qwen3-VL-30B-A3B-Instruct", trust_remote_code=True, dtype="auto", device_map="auto"
    ).cuda().eval()
    
}
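
# Both checkpoints are loaded once at import time and selected by key via the
# "Model" dropdown; the `processors` dict below is keyed the same way.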

processors = {
    
    "nanonets/Nanonets-OCR-s": AutoProcessor.from_pretrained(
        "nanonets/Nanonets-OCR-s", trust_remote_code=True
    ),
    "Qwen/Qwen3-VL-30B-A3B-Instruct": AutoProcessor.from_pretrained(
        "Qwen/Qwen3-VL-30B-A3B-Instruct", trust_remote_code=True
    )
}


DESCRIPTION = "This demo uses [nanonets/Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) and [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)."

# The variables below are defined but not used anywhere else in this script.
kwargs = {}
kwargs['dtype'] = torch.bfloat16

user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

@spaces.GPU
def run_example(image, model_id= "nanonets/Nanonets-OCR-s", prompt= """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""):
    image_path = array_to_image_path(image)
    
    model = models[model_id]
    processor = processors[model_id]
    
    image = Image.fromarray(image).convert("RGB")
    messages = [
    {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image_path,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]
    
    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
    
    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    
    ocr_text = output_text[0]
        
    return ocr_text, ocr_text  

@spaces.GPU
def run_video(image_paths:list, model_id= "Qwen/Qwen3-VL-30B-A3B-Instruct", prompt= """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""):
    # The Gradio textbox passes a single string; split it into a list of frame URLs
    # (assumption: URLs are separated by newlines or commas).
    if isinstance(image_paths, str):
        image_paths = [p.strip() for p in image_paths.replace(',', '\n').splitlines() if p.strip()]
    # Rewrite IIIF-style image URLs to request 400px-wide renditions instead of full resolution.
    image_paths = [p.replace('/full/full/', '/full/400,/') for p in image_paths]
    print('image_paths:', image_paths)
    model = models[model_id]
    processor = processors[model_id]
    
    messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": image_paths,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]
    
    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, fps=1.0
    )

    images, videos, video_kwargs = process_vision_info(messages, image_patch_size=16, return_video_kwargs=True, return_video_metadata=True)

    # split the videos and according metadatas
    if videos is not None:
        videos, video_metadatas = zip(*videos)
        videos, video_metadatas = list(videos), list(video_metadatas)
    else:
        video_metadatas = None

    inputs = processor(text=text, images=images, videos=videos, video_metadata=video_metadatas, return_tensors="pt", do_resize=False, **video_kwargs)
    #image_inputs, video_inputs = process_vision_info(messages)
    inputs = inputs.to("cuda")
    
    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    
    ocr_text = output_text[0]
        
    return ocr_text, ocr_text  


with gr.Blocks() as demo:
    # Add state variables to store OCR results
    ocr_state = gr.State()
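
    # Show the demo description (assumption: DESCRIPTION was meant to be rendered in the UI).
    gr.Markdown(DESCRIPTION)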
    
    with gr.Tab(label="Image Input", elem_classes="tabs"):
        with gr.Row():
            with gr.Column(elem_classes="input-container"):
                input_img = gr.Image(label="Input Picture", elem_classes="gr-image-input")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="nanonets/Nanonets-OCR-s", elem_classes="gr-dropdown")
                prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", elem_classes="gr-textbox")

                submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
            with gr.Column(elem_classes="output-container"):
                output_text = gr.Textbox(label="Output Text", elem_id="output")

        
        
        # Modify the submit button click handler to update state
        submit_btn.click(
            run_example,
            inputs=[input_img, model_selector,prompt],
            outputs=[output_text, ocr_state]  # Add ocr_state to outputs
        )

    with gr.Tab(label="Video Input", elem_classes="tabs"):
        with gr.Row():
            with gr.Column(elem_classes="input-container"):
                input_video = gr.Textbox(label="Input Video (frame image URLs)", placeholder="Frame URLs separated by newlines or commas", elem_classes="gr-video-input")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen3-VL-30B-A3B-Instruct", elem_classes="gr-dropdown")
                prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", elem_classes="gr-textbox")

                submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
            with gr.Column(elem_classes="output-container"):
                output_text = gr.Textbox(label="Output Text", elem_id="output")


            # Modify the submit button click handler to update state
            submit_btn.click(
                run_video,
                inputs=[input_video, model_selector, prompt],
                outputs=[output_text, ocr_state]  # Add ocr_state to outputs
            )

demo.queue(api_open=False)
demo.launch(debug=True)