Demo2-Multimodal-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 12

Commit

64f9a07

verified ·

1 Parent(s): 5e7d4ee

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -98

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForImageTextToText,
-    Gemma3ForConditionalGeneration  # new Gemma3 model import
 )
 from transformers.image_utils import load_image
 from threading import Thread
@@ -32,10 +31,7 @@ def progress_bar_html(label: str) -> str:
 </style>
     '''
-### Load Models & Processors ###
-# Qwen2VL OCR model (default)
-QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"  # or alternate version
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     QV_MODEL_ID,
@@ -43,105 +39,62 @@ qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# Aya-Vision model (trigger with @aya-vision)
 AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
 aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
 aya_model = AutoModelForImageTextToText.from_pretrained(
     AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
 )
-# Gemma3-4b model (trigger with @gemma3-4b)
-GEMMA3_MODEL_ID = "google/gemma-3-4b-it"
-gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
-    GEMMA3_MODEL_ID, device_map="auto"
-).eval()
-gemma3_processor = AutoProcessor.from_pretrained(GEMMA3_MODEL_ID)
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
-    # Branch: Aya-Vision (trigger with @aya-vision)
     if text.lower().startswith("@aya-vision"):
         text_prompt = text[len("@aya-vision"):].strip()
         if not files:
             yield "Error: Please provide an image for the @aya-vision feature."
             return
-        image = load_image(files[0])
-        yield progress_bar_html("Processing with Aya-Vision-8b")
-        messages = [{
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": text_prompt},
-            ],
-        }]
-        inputs = aya_processor.apply_chat_template(
-            messages,
-            padding=True,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        ).to(aya_model.device)
-        streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = dict(
-            inputs,
-            streamer=streamer,
-            max_new_tokens=1024,
-            do_sample=True,
-            temperature=0.3
-        )
-        thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-        return
-    # Branch: Gemma3-4b (trigger with @gemma3-4b)
-    if text.lower().startswith("@gemma3-4b"):
-        text_prompt = text[len("@gemma3-4b"):].strip()
-        if not files:
-            yield "Error: Please provide an image for the @gemma3-4b feature."
-            return
-        image = load_image(files[0])
-        yield progress_bar_html("Processing with Gemma3-4b")
-        messages = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are a helpful assistant."}]
-            },
-            {
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
-                    {"type": "text", "text": text_prompt}
-                ]
-            }
-        ]
-        inputs = gemma3_processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=True,
-            return_dict=True, return_tensors="pt"
-        ).to(gemma3_model.device, dtype=torch.bfloat16)
-        input_len = inputs["input_ids"].shape[-1]
-        streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512, do_sample=False)
-        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-        return
-    # Default Branch: Qwen2-VL OCR (for text query with optional images)
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
@@ -149,6 +102,7 @@ def model_inference(input_dict, history):
     else:
         images = []
     if text == "" and not images:
         yield "Error: Please input a query and optionally image(s)."
         return
@@ -156,6 +110,7 @@ def model_inference(input_dict, history):
         yield "Error: Please input a text query along with the image(s)."
         return
     messages = [{
         "role": "user",
         "content": [
@@ -174,9 +129,11 @@ def model_inference(input_dict, history):
         padding=True,
     ).to("cuda")
     streamer = TextIteratorStreamer(qwen_processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
     thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
     thread.start()
@@ -188,36 +145,28 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
-# Examples for quick testing.
 examples = [
-    [{"text": "@gemma3-4b Summarize the letter", "files": ["examples/1.png"]}],
-    [{"text": "@gemma3-4b Extract JSON from the image", "files": ["example_images/document.jpg"]}],
-    [{"text": "@gemma3-4b Describe the photo", "files": ["examples/3.png"]}],
     [{"text": "@aya-vision Summarize the full image in detail", "files": ["examples/2.jpg"]}],
     [{"text": "@aya-vision Describe this image.", "files": ["example_images/campeones.jpg"]}],
     [{"text": "@aya-vision What is this UI about?", "files": ["example_images/s2w_example.png"]}],
-    [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "@aya-vision Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
 ]
-# Gradio ChatInterface with a multimodal textbox.
 demo = gr.ChatInterface(
     fn=model_inference,
-    description=(
-        "# **Multimodal OCR & Vision Features**\n\n"
-        "Use the following commands to select a model:\n"
-        "- `@aya-vision` for Aya-Vision-8b\n"
-        "- `@gemma3-4b` for Gemma3-4b\n\n"
-        "Default processing is done with Qwen2VL OCR."
-    ),
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
         file_types=["image"],
         file_count="multiple",
-        placeholder="Enter your text query and attach images if needed. Use @aya-vision or @gemma3-4b to choose a feature."
     ),
     stop_btn="Stop Generation",
     multimodal=True,

     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForImageTextToText,
 )
 from transformers.image_utils import load_image
 from threading import Thread
 </style>
     '''
+QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct" # or use  #prithivMLmods/Qwen2-VL-OCR2-2B-Instruct
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     QV_MODEL_ID,
     torch_dtype=torch.float16
 ).to("cuda").eval()
 AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
 aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
 aya_model = AutoModelForImageTextToText.from_pretrained(
     AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
 )
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
     if text.lower().startswith("@aya-vision"):
+        # Remove the command prefix and trim the prompt.
         text_prompt = text[len("@aya-vision"):].strip()
         if not files:
             yield "Error: Please provide an image for the @aya-vision feature."
             return
+        else:
+            # For simplicity, use the first provided image.
+            image = load_image(files[0])
+            yield progress_bar_html("Processing with Aya-Vision-8b")
+            messages = [{
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
+                    {"type": "text", "text": text_prompt},
+                ],
+            }]
+            inputs = aya_processor.apply_chat_template(
+                messages,
+                padding=True,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt"
+            ).to(aya_model.device)
+            # Set up a streamer for Aya-Vision output
+            streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
+            generation_kwargs = dict(
+                inputs,
+                streamer=streamer,
+                max_new_tokens=1024,
+                do_sample=True,
+                temperature=0.3
+            )
+            thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
+            thread.start()
+            buffer = ""
+            for new_text in streamer:
+                buffer += new_text
+                buffer = buffer.replace("<|im_end|>", "")
+                time.sleep(0.01)
+                yield buffer
+            return
+    # Load images if provided.
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
     else:
         images = []
+    # Validate input: require both text and (optionally) image(s).
     if text == "" and not images:
         yield "Error: Please input a query and optionally image(s)."
         return
         yield "Error: Please input a text query along with the image(s)."
         return
+    # Prepare messages for the Qwen2-VL model.
     messages = [{
         "role": "user",
         "content": [
         padding=True,
     ).to("cuda")
+    # Set up a streamer for real-time output.
     streamer = TextIteratorStreamer(qwen_processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+    # Start generation in a separate thread.
     thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
     thread.start()
         time.sleep(0.01)
         yield buffer
 examples = [
+    [{"text": "@aya-vision Summarize the letter", "files": ["examples/1.png"]}],
+    [{"text": "@aya-vision Extract JSON from the image", "files": ["example_images/document.jpg"]}],
+    [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
+    [{"text": "@aya-vision Describe the photo", "files": ["examples/3.png"]}],
     [{"text": "@aya-vision Summarize the full image in detail", "files": ["examples/2.jpg"]}],
     [{"text": "@aya-vision Describe this image.", "files": ["example_images/campeones.jpg"]}],
     [{"text": "@aya-vision What is this UI about?", "files": ["example_images/s2w_example.png"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "@aya-vision Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
         file_types=["image"],
         file_count="multiple",
+        placeholder="By default, it runs Qwen2VL OCR, Tag @aya-vision for Aya Vision 8B"
     ),
     stop_btn="Stop Generation",
     multimodal=True,