# commit d34e54d: try adding video_metadatas
import gradio as gr
import spaces
from transformers import AutoModelForImageTextToText, AutoProcessor, Qwen3VLMoeForConditionalGeneration
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import subprocess
from datetime import datetime
import numpy as np
import os
import json
import tempfile
import zipfile
def array_to_image_path(image_array):
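    """Save a numpy image array as a timestamped PNG (downscaled to fit within 1024x1024) and return its absolute path."""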
# Convert numpy array to PIL Image
img = Image.fromarray(np.uint8(image_array))
img.thumbnail((1024, 1024))
# Generate a unique filename using timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"image_{timestamp}.png"
# Save the image
img.save(filename)
# Get the full path of the saved image
full_path = os.path.abspath(filename)
return full_path
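# Load and cache both model checkpoints at import time, keyed by Hugging Face model id.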
models = {
"nanonets/Nanonets-OCR-s": AutoModelForImageTextToText.from_pretrained(
"nanonets/Nanonets-OCR-s", trust_remote_code=True, dtype="auto"
).cuda().eval(),
"Qwen/Qwen3-VL-30B-A3B-Instruct": Qwen3VLMoeForConditionalGeneration.from_pretrained(
"Qwen/Qwen3-VL-30B-A3B-Instruct", trust_remote_code=True, dtype="auto", device_map="auto"
).cuda().eval()
}
processors = {
"nanonets/Nanonets-OCR-s": AutoProcessor.from_pretrained(
"nanonets/Nanonets-OCR-s", trust_remote_code=True
),
"Qwen/Qwen3-VL-30B-A3B-Instruct": AutoProcessor.from_pretrained(
"Qwen/Qwen3-VL-30B-A3B-Instruct", trust_remote_code=True
)
}
DESCRIPTION = "This demo uses[Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)"
kwargs = {}
kwargs['dtype'] = torch.bfloat16
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"
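# Single-image OCR: the default prompt asks for structured output (HTML tables, LaTeX equations, tagged watermarks and page numbers).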
@spaces.GPU
def run_example(image, model_id= "nanonets/Nanonets-OCR-s", prompt= """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""):
image_path = array_to_image_path(image)
model = models[model_id]
processor = processors[model_id]
image = Image.fromarray(image).convert("RGB")
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image_path,
},
{"type": "text", "text": prompt},
],
}
]
# Preparation for inference
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=1024)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
ocr_text = output_text[0]
return ocr_text, ocr_text
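# Multi-image "video" OCR: a list of page-image URLs is passed to Qwen3-VL as video frames.
# process_vision_info is called with return_video_metadata=True (the point of this commit),
# which assumes a qwen-vl-utils version recent enough to support that flag.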
@spaces.GPU
def run_video(image_paths:list, model_id= "Qwen/Qwen3-VL-30B-A3B-Instruct", prompt= """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""):
    # The textbox passes a single string; split it into a list of frame URLs (assumes comma/newline-separated input).
    if isinstance(image_paths, str):
        image_paths = [p.strip() for p in image_paths.replace("\n", ",").split(",") if p.strip()]
    # Request 400px-wide derivatives via the IIIF size parameter ("400,") instead of full-resolution images.
    image_paths = [p.replace('/full/full/', '/full/400,/') for p in image_paths]
    print('image_paths:', image_paths)
model = models[model_id]
processor = processors[model_id]
messages = [
{
"role": "user",
"content": [
{
"type": "video",
"video": image_paths,
},
{"type": "text", "text": prompt},
],
}
]
# Preparation for inference
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True, fps=1.0
)
images, videos, video_kwargs = process_vision_info(messages, image_patch_size=16, return_video_kwargs=True, return_video_metadata=True)
    # process_vision_info returns (video, metadata) tuples; split them into parallel lists
if videos is not None:
videos, video_metadatas = zip(*videos)
videos, video_metadatas = list(videos), list(video_metadatas)
else:
video_metadatas = None
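    # Passing video_metadata lets the Qwen3-VL processor account for per-video timing
    # information when building the prompt (presumably why the metadata is threaded through here).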
inputs = processor(text=text, images=images, videos=videos, video_metadata=video_metadatas, return_tensors="pt", do_resize=False, **video_kwargs)
#image_inputs, video_inputs = process_vision_info(messages)
inputs = inputs.to("cuda")
# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=1024)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
ocr_text = output_text[0]
return ocr_text, ocr_text
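# Gradio UI: one tab for single-image OCR (run_example) and one for frame-URL "video" OCR (run_video).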
with gr.Blocks() as demo:
# Add state variables to store OCR results
ocr_state = gr.State()
with gr.Tab(label="Image Input", elem_classes="tabs"):
with gr.Row():
with gr.Column(elem_classes="input-container"):
input_img = gr.Image(label="Input Picture", elem_classes="gr-image-input")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="nanonets/Nanonets-OCR-s", elem_classes="gr-dropdown")
prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", elem_classes="gr-textbox")
submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
with gr.Column(elem_classes="output-container"):
output_text = gr.Textbox(label="Output Text", elem_id="output")
# Modify the submit button click handler to update state
submit_btn.click(
run_example,
inputs=[input_img, model_selector,prompt],
outputs=[output_text, ocr_state] # Add ocr_state to outputs
)
with gr.Tab(label="Video Input", elem_classes="tabs"):
with gr.Row():
with gr.Column(elem_classes="input-container"):
input_video = gr.Textbox(label="Input Video", elem_classes="gr-video-input")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen3-VL-30B-A3B-Instruct", elem_classes="gr-dropdown")
prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", elem_classes="gr-textbox")
submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
with gr.Column(elem_classes="output-container"):
output_text = gr.Textbox(label="Output Text", elem_id="output")
# Modify the submit button click handler to update state
submit_btn.click(
run_video,
inputs=[input_video, model_selector, prompt],
outputs=[output_text, ocr_state] # Add ocr_state to outputs
)
demo.queue(api_open=False)
demo.launch(debug=True)