Update app.py
app.py CHANGED
@@ -13,6 +13,12 @@ import traceback
 # Model configuration
 MODEL_ID = "MBZUAI/AIN"
 
+# Image resolution settings for the processor
+# The default range for the number of visual tokens per image in the model is 4-16384
+# These settings balance speed and memory usage
+MIN_PIXELS = 256 * 28 * 28  # Minimum resolution
+MAX_PIXELS = 1280 * 28 * 28  # Maximum resolution
+
 # Global model and processor
 model = None
 processor = None
@@ -60,9 +66,11 @@ def ensure_model_loaded():
             trust_remote_code=True,
         )
 
-        # Load processor
+        # Load processor with resolution settings
         loaded_processor = AutoProcessor.from_pretrained(
             MODEL_ID,
+            min_pixels=MIN_PIXELS,
+            max_pixels=MAX_PIXELS,
             trust_remote_code=True,
         )
 
@@ -78,7 +86,13 @@ def ensure_model_loaded():
 
 
 @spaces.GPU()
-def extract_text_from_image(image: Image.Image, custom_prompt: str = None, max_new_tokens: int = 2048) -> str:
+def extract_text_from_image(
+    image: Image.Image,
+    custom_prompt: str = None,
+    max_new_tokens: int = 2048,
+    min_pixels: int = None,
+    max_pixels: int = None
+) -> str:
     """
     Extract text from image using AIN VLM model.
 
@@ -86,6 +100,8 @@ def extract_text_from_image(image: Image.Image, custom_prompt: str = None, max_n
         image: PIL Image to process
         custom_prompt: Optional custom prompt (uses default OCR prompt if None)
         max_new_tokens: Maximum tokens to generate
+        min_pixels: Minimum image resolution (optional)
+        max_pixels: Maximum image resolution (optional)
 
     Returns:
         Extracted text as string
@@ -100,6 +116,10 @@ def extract_text_from_image(image: Image.Image, custom_prompt: str = None, max_n
     # Use custom prompt or default OCR prompt
     prompt_to_use = custom_prompt if custom_prompt and custom_prompt.strip() else OCR_PROMPT
 
+    # Use custom resolution settings if provided, otherwise use defaults
+    min_pix = min_pixels if min_pixels else MIN_PIXELS
+    max_pix = max_pixels if max_pixels else MAX_PIXELS
+
     # Prepare messages in the format expected by the model
     messages = [
         {
@@ -272,6 +292,18 @@ def create_gradio_interface():
                info="Maximum length of extracted text"
            )
 
+           with gr.Row():
+               min_pixels_input = gr.Number(
+                   value=MIN_PIXELS,
+                   label="Min Pixels",
+                   info="Minimum image resolution"
+               )
+               max_pixels_input = gr.Number(
+                   value=MAX_PIXELS,
+                   label="Max Pixels",
+                   info="Maximum image resolution"
+               )
+
           show_prompt_btn = gr.Button("👁️ Show Default Prompt", size="sm")
 
           # Process button
@@ -326,7 +358,7 @@ def create_gradio_interface():
        )
 
       # Event handlers
-       def process_image_handler(image, custom_prompt_text, max_tokens_value):
+       def process_image_handler(image, custom_prompt_text, max_tokens_value, min_pix, max_pix):
           """Handle image processing."""
           if image is None:
               return "", "⚠️ Please upload an image first."
@@ -336,7 +368,9 @@ def create_gradio_interface():
            extracted_text = extract_text_from_image(
                image,
                custom_prompt=custom_prompt_text,
-               max_new_tokens=int(max_tokens_value)
+               max_new_tokens=int(max_tokens_value),
+               min_pixels=int(min_pix) if min_pix else None,
+               max_pixels=int(max_pix) if max_pix else None
            )
 
            if extracted_text and not extracted_text.startswith("❌"):
@@ -361,7 +395,7 @@ def create_gradio_interface():
       # Wire up events
       process_btn.click(
           process_image_handler,
-          inputs=[image_input, custom_prompt, max_tokens],
+          inputs=[image_input, custom_prompt, max_tokens, min_pixels_input, max_pixels_input],
          outputs=[text_output, status_output]
       )
 
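
Note: the hunk that introduces min_pix and max_pix ends at messages = [, so the diff does not show how the per-call pixel bounds are consumed. Below is a minimal, illustrative sketch of the usual downstream step in Qwen2-VL-style apps (AIN is built on Qwen2-VL). Only image, min_pix, max_pix, prompt_to_use, processor, and model come from the diff; the qwen_vl_utils helper is an assumption about this Space's dependencies, not its confirmed code.

# Hedged sketch: one common way to apply per-call pixel bounds downstream of
# the hunk that stops at `messages = [`. Not this Space's verified code.
from qwen_vl_utils import process_vision_info

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image,          # PIL.Image passed into extract_text_from_image
                "min_pixels": min_pix,   # lower bound on the resized image area
                "max_pixels": max_pix,   # upper bound on the resized image area
            },
            {"type": "text", "text": prompt_to_use},
        ],
    }
]

# process_vision_info resizes each image so its pixel area stays within the bounds
chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[chat_text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

Since each visual token corresponds to a 28x28 pixel patch, MIN_PIXELS = 256 * 28 * 28 and MAX_PIXELS = 1280 * 28 * 28 keep an image at roughly 256 to 1280 visual tokens, a conservative slice of the 4-16384 range cited in the new comment.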