Spaces:

ChaseHan
/

Latex2Layout-Qwen2.5VL

Runtime error

App Files Community

ChaseHan committed on Jul 15

Commit

7ca7548

verified ·

1 Parent(s): ee06521

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -81

app.py CHANGED Viewed

@@ -6,40 +6,35 @@ import json
 import re
 from spaces import GPU
 # --- 1. Configurations and Constants ---
-# Hugging Face model repository
 MODEL_ID = "ChaseHan/Latex2Layout-2000-sync"
-# BUG FIX: Use fixed-size scaling
 TARGET_SIZE = (924, 1204)
-# Visualization style constants
 OUTLINE_WIDTH = 3
-# RGBA colors for layout regions (with transparency)
 LABEL_COLORS = {
-    "title": (255, 82, 82, 90),        # Red
-    "abstract": (46, 204, 113, 90),    # Green
-    "heading": (52, 152, 219, 90),     # Blue
-    "footnote": (241, 196, 15, 90),    # Yellow
     "figure": (155, 89, 182, 90),      # Purple
-    "figure caption": (26, 188, 156, 90),  # Teal
-    "table": (230, 126, 34, 90),       # Orange
-    "table caption": (44, 62, 80, 90), # Dark Blue/Gray
-    "math": (231, 76, 60, 90),         # Pomegranate
     "text": (149, 165, 166, 90),       # Gray
     "other": (127, 140, 141, 90)       # Light Gray
 }
-# Default prompt for layout detection
-PROMPT_GROUNDING = (
-    """<image>Please carefully observe the document and detect the following regions: "title", "abstract", "heading", "footnote", "figure", "figure caption", "table", "table caption", "math", "text". Output each detected region's bbox coordinates in JSON format. The format of the output is:
-    <answer>```json[{"bbox_2d": [x1, y1, x2, y2], "label": "region name", "order": "reading order"}]</answer>```"""
 )
 # --- 2. Load Model and Processor ---
 print("Loading model and processor, this may take a moment...")
 try:
     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -53,50 +48,48 @@ except Exception as e:
     print(f"Error loading model: {e}")
     exit()
 # --- 3. Core Inference and Visualization Function ---
 @GPU
 def analyze_and_visualize_layout(input_image: Image.Image, prompt: str, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
     """
-    Resizes input image, runs model inference, visualizes layout, and returns results.
     """
     if input_image is None:
         return None, "Please upload an image first."
     progress(0, desc="Resizing image...")
-    # BUG FIX: Use fixed-size scaling
     image = input_image.resize(TARGET_SIZE)
-    image = image.convert("RGBA")  # For transparent drawing
     messages = [
         {"role": "user", "content": [
             {"type": "image", "image": image},
-            {"type": "text", "text": prompt}
         ]}
     ]
     progress(0.2, desc="Preparing model inputs...")
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model.device)
     progress(0.5, desc="Generating layout data...")
     with torch.no_grad():
-        generate_kwargs = {
-            "max_new_tokens": 4096,
-            "do_sample": temperature > 0,  # Enable sampling if temperature > 0
-            "temperature": temperature if temperature > 0 else None,
-            "top_p": top_p if temperature > 0 else None,
-        }
-        output_ids = model.generate(**inputs, **generate_kwargs)
     output_text = processor.batch_decode(
         output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
     )[0]
     progress(0.8, desc="Parsing and visualizing results...")
     try:
-        json_match = re.search(r"json(.*?)", output_text, re.DOTALL)
         json_str = json_match.group(1).strip() if json_match else output_text.strip()
         results = json.loads(json_str)
     except (json.JSONDecodeError, AttributeError):
@@ -115,8 +108,7 @@ def analyze_and_visualize_layout(input_image: Image.Image, prompt: str, temperat
         label = item.get("label", "other")
         order = item.get("order", "")
-        if not bbox or len(bbox) != 4:
-            continue
         fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
         solid_color_rgb = fill_color_rgba[:3]
@@ -126,29 +118,25 @@ def analyze_and_visualize_layout(input_image: Image.Image, prompt: str, temperat
         tag_text = f"{order}: {label}"
         tag_bbox = draw.textbbox((0, 0), tag_text, font=font)
         tag_w, tag_h = tag_bbox[2] - tag_bbox[0], tag_bbox[3] - tag_bbox[1]
         tag_bg_box = [bbox[0], bbox[1], bbox[0] + tag_w + 10, bbox[1] + tag_h + 6]
         draw.rectangle(tag_bg_box, fill=solid_color_rgb)
         draw.text((bbox[0] + 5, bbox[1] + 3), tag_text, font=font, fill="white")
     visualized_image = Image.alpha_composite(image, overlay).convert("RGB")
     return visualized_image, output_text
 def clear_outputs():
-    """Clears output fields."""
     return None, None
 # --- 4. Gradio User Interface ---
 with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
     gr.Markdown("# 📄 Academic Paper Layout Detection")
     gr.Markdown(
         "Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model fine-tuned on our Latex2Layout annotated layout dataset to identify layout regions in academic papers. "
         "Upload a document image to begin."
-        # BUG FIX: Updated description to reflect fixed-size scaling
         "\n> **Please note:** All uploaded images are automatically resized to 924x1204 pixels to meet the model's input requirements."
     )
     gr.Markdown("<hr>")
@@ -156,61 +144,58 @@ with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection")
     with gr.Row():
         with gr.Column(scale=4):
             input_image = gr.Image(type="pil", label="Upload Document Image", height=700)
-            with gr.Accordion("Advanced Settings", open=False):
-                prompt_input = gr.Textbox(
-                    value=PROMPT_GROUNDING,
-                    label="Custom Prompt",
-                    lines=5,
-                    info="Edit the prompt sent to the model. Changes may affect output format."
-                )
-                temperature_input = gr.Slider(
-                    minimum=0.0,
-                    maximum=2.0,
-                    value=1.0,
-                    step=0.1,
-                    label="Temperature",
-                    info="Controls randomness (higher = more creative, 0 = deterministic)."
-                )
-                top_p_input = gr.Slider(
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.95,
-                    step=0.05,
-                    label="Top-P",
-                    info="Nucleus sampling: considers the top p% probability mass."
-                )
         with gr.Column(scale=5):
             output_image = gr.Image(type="pil", label="Analyzed Layout", interactive=False, height=700)
     with gr.Row():
-        analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)
     output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)
-    # fix
     gr.Examples(
-        examples=[["1.png"], ["2.png"], ["3.png"], ["4.png"],["5.png"], ["6.png"], ["7.png"], ["8.png"], ["9.png"]],
-        inputs=[input_image, prompt_input, temperature_input, top_p_input],
-        outputs=[output_image, output_text],
-        fn=analyze_and_visualize_layout,
         label="Examples (Click to Run)",
-        cache_examples=True
     )
     gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>")
     # --- Event Handlers ---
     analyze_btn.click(
         fn=analyze_and_visualize_layout,
-        inputs=[input_image, prompt_input, temperature_input, top_p_input],
         outputs=[output_image, output_text]
     )
     input_image.upload(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
     input_image.clear(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
 # --- 5. Launch the Application ---
 if __name__ == "__main__":
     demo.launch()

 import re
 from spaces import GPU
 # --- 1. Configurations and Constants ---
+# Model repository on Hugging Face
 MODEL_ID = "ChaseHan/Latex2Layout-2000-sync"
+# Target image size for model input
 TARGET_SIZE = (924, 1204)
+# Visualization Style Constants
 OUTLINE_WIDTH = 3
+# Color mapping for different layout regions (RGBA for transparency)
 LABEL_COLORS = {
+    "title": (255, 82, 82, 90),          # Red
+    "abstract": (46, 204, 113, 90),     # Green
+    "heading": (52, 152, 219, 90),      # Blue
+    "footnote": (241, 196, 15, 90),     # Yellow
     "figure": (155, 89, 182, 90),      # Purple
+    "figure caption": (26, 188, 156, 90),# Teal
+    "table": (230, 126, 34, 90),        # Orange
+    "table caption": (44, 62, 80, 90),   # Dark Blue/Gray
+    "math": (231, 76, 60, 90),        # Pomegranate
     "text": (149, 165, 166, 90),       # Gray
     "other": (127, 140, 141, 90)       # Light Gray
 }
+# The default prompt sent to the model for layout detection
+DEFAULT_PROMPT = (
+    """<image>Please carefully observe the document and detect the following regions: "title", "abstract", "heading", "footnote", "figure", "figure caption", "table", "table caption", "math", "text". Output each detected region's bbox coordinates in JSON format. The format of the output is: <answer>```json[{"bbox_2d": [x1, y1, x2, y2], "label": "region name", "order": "reading order"}]```</answer>."""
 )
 # --- 2. Load Model and Processor ---
 print("Loading model and processor, this may take a moment...")
 try:
     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     print(f"Error loading model: {e}")
     exit()
 # --- 3. Core Inference and Visualization Function ---
 @GPU
 def analyze_and_visualize_layout(input_image: Image.Image, prompt: str, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
     """
+    Takes an image and model parameters, runs inference, and returns a visualized image and raw text output.
     """
     if input_image is None:
         return None, "Please upload an image first."
     progress(0, desc="Resizing image...")
     image = input_image.resize(TARGET_SIZE)
+    image = image.convert("RGBA")
     messages = [
         {"role": "user", "content": [
             {"type": "image", "image": image},
+            {"type": "text", "text": prompt} # Use the configurable prompt
         ]}
     ]
     progress(0.2, desc="Preparing model inputs...")
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model.device)
     progress(0.5, desc="Generating layout data...")
     with torch.no_grad():
+        # Pass new parameters to the model generation
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=4096,
+            do_sample=True, # Must be True for temperature/top_p to have an effect
+            temperature=temperature,
+            top_p=top_p
+        )
     output_text = processor.batch_decode(
         output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
     )[0]
     progress(0.8, desc="Parsing and visualizing results...")
     try:
+        json_match = re.search(r"```json(.*?)```", output_text, re.DOTALL)
         json_str = json_match.group(1).strip() if json_match else output_text.strip()
         results = json.loads(json_str)
     except (json.JSONDecodeError, AttributeError):
         label = item.get("label", "other")
         order = item.get("order", "")
+        if not bbox or len(bbox) != 4: continue
         fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
         solid_color_rgb = fill_color_rgba[:3]
         tag_text = f"{order}: {label}"
         tag_bbox = draw.textbbox((0, 0), tag_text, font=font)
         tag_w, tag_h = tag_bbox[2] - tag_bbox[0], tag_bbox[3] - tag_bbox[1]
         tag_bg_box = [bbox[0], bbox[1], bbox[0] + tag_w + 10, bbox[1] + tag_h + 6]
         draw.rectangle(tag_bg_box, fill=solid_color_rgb)
         draw.text((bbox[0] + 5, bbox[1] + 3), tag_text, font=font, fill="white")
     visualized_image = Image.alpha_composite(image, overlay).convert("RGB")
     return visualized_image, output_text
 def clear_outputs():
+    """Helper function to clear the output fields."""
     return None, None
 # --- 4. Gradio User Interface ---
 with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
     gr.Markdown("# 📄 Academic Paper Layout Detection")
     gr.Markdown(
         "Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model fine-tuned on our Latex2Layout annotated layout dataset to identify layout regions in academic papers. "
         "Upload a document image to begin."
         "\n> **Please note:** All uploaded images are automatically resized to 924x1204 pixels to meet the model's input requirements."
     )
     gr.Markdown("<hr>")
     with gr.Row():
         with gr.Column(scale=4):
             input_image = gr.Image(type="pil", label="Upload Document Image", height=700)
         with gr.Column(scale=5):
             output_image = gr.Image(type="pil", label="Analyzed Layout", interactive=False, height=700)
     with gr.Row():
+         analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)
+    # --- NEW: Advanced Settings Panel ---
+    with gr.Accordion("Advanced Settings", open=False):
+        prompt_textbox = gr.Textbox(
+            label="Prompt",
+            value=DEFAULT_PROMPT,
+            lines=5,
+            info="The prompt used to instruct the model."
+        )
+        temp_slider = gr.Slider(
+            minimum=0.0,
+            maximum=2.0,
+            step=0.05,
+            value=0.7,
+            label="Temperature",
+            info="Controls randomness. Higher values mean more random outputs."
+        )
+        top_p_slider = gr.Slider(
+            minimum=0.0,
+            maximum=1.0,
+            step=0.05,
+            value=0.9,
+            label="Top-p (Nucleus Sampling)",
+            info="Filters a cumulative probability mass. Lower values are less random."
+        )
     output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)
     gr.Examples(
+        examples=[["page_2.png"], ["page_3.png"], ["page_5.png"], ["page_13.png"]],
+        inputs=[input_image],
         label="Examples (Click to Run)",
+        # Examples now only populate the image input. The user clicks "Analyze" to run with current settings.
     )
     gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>")
     # --- Event Handlers ---
     analyze_btn.click(
         fn=analyze_and_visualize_layout,
+        inputs=[input_image, prompt_textbox, temp_slider, top_p_slider], # Add new inputs
         outputs=[output_image, output_text]
     )
     input_image.upload(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
     input_image.clear(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
 # --- 5. Launch the Application ---
 if __name__ == "__main__":
     demo.launch()