import gradio as gr
import spaces
from transformers import AutoModelForImageTextToText, AutoProcessor, Qwen3VLMoeForConditionalGeneration
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import subprocess
from datetime import datetime
import numpy as np
import os
import json
import tempfile
import zipfile

def array_to_image_path(image_array):
    # Convert the numpy array to a PIL image and cap its size at 1024x1024
    img = Image.fromarray(np.uint8(image_array))
    img.thumbnail((1024, 1024))

    # Generate a unique filename using a timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"image_{timestamp}.png"

    # Save the image and return its absolute path
    img.save(filename)
    full_path = os.path.abspath(filename)
    return full_path
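
# Load each model and its matching processor once at startup, keyed by model ID,
# so per-request calls only run preprocessing and generation.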
models = {
    "nanonets/Nanonets-OCR-s": AutoModelForImageTextToText.from_pretrained(
        "nanonets/Nanonets-OCR-s", trust_remote_code=True, dtype="auto"
    ).cuda().eval(),
    # device_map="auto" already dispatches the MoE weights to the GPU; moving a
    # dispatched model with .cuda() is not allowed, so only .eval() is called here.
    "Qwen/Qwen3-VL-30B-A3B-Instruct": Qwen3VLMoeForConditionalGeneration.from_pretrained(
        "Qwen/Qwen3-VL-30B-A3B-Instruct", trust_remote_code=True, dtype="auto", device_map="auto"
    ).eval(),
}

processors = {
    "nanonets/Nanonets-OCR-s": AutoProcessor.from_pretrained(
        "nanonets/Nanonets-OCR-s", trust_remote_code=True
    ),
    "Qwen/Qwen3-VL-30B-A3B-Instruct": AutoProcessor.from_pretrained(
        "Qwen/Qwen3-VL-30B-A3B-Instruct", trust_remote_code=True
    ),
}

DESCRIPTION = (
    "This demo runs document OCR with "
    "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) and "
    "[Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)."
)

kwargs = {}
kwargs['dtype'] = torch.bfloat16

user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

# Default OCR prompt shared by the image and video tabs.
DEFAULT_PROMPT = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
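
# Single-image OCR: save the uploaded array to disk, wrap it in a chat message,
# and decode the selected model's answer.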
@spaces.GPU
def run_example(image, model_id="nanonets/Nanonets-OCR-s", prompt=DEFAULT_PROMPT):
    image_path = array_to_image_path(image)

    model = models[model_id]
    processor = processors[model_id]

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image_path,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference: generate, then strip the prompt tokens from the output
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    ocr_text = output_text[0]

    # Returned twice: once for the visible textbox, once for the gr.State component.
    return ocr_text, ocr_text
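
# Multi-page OCR: treat an ordered list of page-image paths/URLs as video frames
# sampled at 1 fps and send them to the model in a single request.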
@spaces.GPU
def run_video(image_paths, model_id="Qwen/Qwen3-VL-30B-A3B-Instruct", prompt=DEFAULT_PROMPT):
    # The Gradio Textbox delivers a single string; assume comma- or newline-separated
    # frame paths/URLs and split it into a list.
    if isinstance(image_paths, str):
        image_paths = [p.strip() for p in image_paths.replace("\n", ",").split(",") if p.strip()]

    # Request 400px-wide renditions from IIIF-style image URLs instead of full size.
    image_paths = [p.replace('/full/full/', '/full/400,/') for p in image_paths]
    print('image_paths:', image_paths)

    model = models[model_id]
    processor = processors[model_id]

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": image_paths,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, fps=1.0
    )
    images, videos, video_kwargs = process_vision_info(
        messages, image_patch_size=16, return_video_kwargs=True, return_video_metadata=True
    )

    # Split the videos and their corresponding metadata
    if videos is not None:
        videos, video_metadatas = zip(*videos)
        videos, video_metadatas = list(videos), list(video_metadatas)
    else:
        video_metadatas = None

    inputs = processor(
        text=text,
        images=images,
        videos=videos,
        video_metadata=video_metadatas,
        return_tensors="pt",
        do_resize=False,
        **video_kwargs,
    )
    inputs = inputs.to("cuda")

    # Inference: generate, then strip the prompt tokens from the output
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    ocr_text = output_text[0]

    return ocr_text, ocr_text
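
# Example (hypothetical IIIF URLs) of calling run_video directly, outside the UI:
#   run_video("https://example.org/iiif/p1/full/full/0/default.jpg,"
#             "https://example.org/iiif/p2/full/full/0/default.jpg")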

with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)

    # State variable that keeps the latest OCR result
    ocr_state = gr.State()
| with gr.Tab(label="Image Input", elem_classes="tabs"): | |
| with gr.Row(): | |
| with gr.Column(elem_classes="input-container"): | |
| input_img = gr.Image(label="Input Picture", elem_classes="gr-image-input") | |
| model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2.5-VL-7B-Instruct", elem_classes="gr-dropdown") | |
| prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", elem_classes="gr-textbox") | |
| submit_btn = gr.Button(value="Submit", elem_classes="submit-btn") | |
| with gr.Column(elem_classes="output-container"): | |
| output_text = gr.Textbox(label="Output Text", elem_id="output") | |
| # Modify the submit button click handler to update state | |
| submit_btn.click( | |
| run_example, | |
| inputs=[input_img, model_selector,prompt], | |
| outputs=[output_text, ocr_state] # Add ocr_state to outputs | |
| ) | |
| with gr.Tab(label="Video Input", elem_classes="tabs"): | |
| with gr.Row(): | |
| with gr.Column(elem_classes="input-container"): | |
| input_video = gr.Textbox(label="Input Video", elem_classes="gr-video-input") | |
| model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2.5-VL-7B-Instruct", elem_classes="gr-dropdown") | |
| prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", elem_classes="gr-textbox") | |
| submit_btn = gr.Button(value="Submit", elem_classes="submit-btn") | |
| with gr.Column(elem_classes="output-container"): | |
| output_text = gr.Textbox(label="Output Text", elem_id="output") | |
| # Modify the submit button click handler to update state | |
| submit_btn.click( | |
| run_video, | |
| inputs=[input_video, model_selector, prompt], | |
| outputs=[output_text, ocr_state] # Add ocr_state to outputs | |
| ) | |

demo.queue(api_open=False)
demo.launch(debug=True)