Spaces:

yuhangzang
/

spark

Running on Zero

File size: 14,249 Bytes

import os
import time
import glob
from typing import List

import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

MODEL_ID = os.environ.get("SPARK_MODEL_ID", "internlm/Spark-VL-7B")
DTYPE = torch.bfloat16

_model = None
_processor = None
_attn_impl = None


def _load_model_and_processor():
    global _model, _processor, _attn_impl
    if _model is not None and _processor is not None:
        return _model, _processor

    # Prefer flash-attn if available, otherwise fall back to eager.
    attn_impl = os.environ.get("ATTN_IMPL", "flash_attention_2")
    try:
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            MODEL_ID,
            # `torch_dtype` was deprecated in Transformers; use `dtype` instead.
            dtype=DTYPE,
            attn_implementation=attn_impl,
            device_map="auto",
        )
        _attn_impl = attn_impl
    except (ImportError, ValueError, RuntimeError):
        # Fallback for environments without flash-attn
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            MODEL_ID,
            # Use the new `dtype` kwarg for consistency with deprecations
            dtype=DTYPE,
            attn_implementation="eager",
            device_map="auto",
        )
        _attn_impl = "eager"

    processor = AutoProcessor.from_pretrained(MODEL_ID)

    _model = model
    _processor = processor
    return _model, _processor


# Optionally preload the model at app startup so first click is fast.
# - On ZeroGPU, this will instantiate on CPU (no GPU at startup), so the
#   first generate only needs to move tensors to CUDA.
# - You can disable by setting env `PRELOAD_MODEL=0`.
if os.environ.get("PRELOAD_MODEL", "1") not in ("0", "false", "False"):
    try:
        _load_model_and_processor()
        print(f"[preload] Loaded {MODEL_ID} (attn_impl={_attn_impl})", flush=True)
    except Exception as e:
        # Don't block app if preload fails; fallback to lazy load on first call
        print(f"[preload] Skipped due to: {type(e).__name__}: {e}", flush=True)


def _prepare_inputs(image, prompt):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    chat_text = _processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = _processor(
        text=[chat_text],
        # Pass the single image directly; template contains <image> placeholder
        images=[image] if image is not None else None,
        return_tensors="pt",
    )
    return inputs


def _decode(generated_ids, input_ids):
    # Trim the prompt part before decoding
    trimmed = generated_ids[:, input_ids.shape[1] :]
    out = _processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return out[0].strip() if out else ""


@spaces.GPU(duration=120)
def generate(image, prompt, max_new_tokens, temperature, top_p, top_k):
    if image is None:
        return "Please upload an image."
    prompt = (prompt or "").strip()
    if not prompt:
        return "Please enter a prompt."

    start = time.time()
    model, _ = _load_model_and_processor()

    try:
        # Ensure model resides on GPU during the call
        dev = next(model.parameters()).device
        if dev.type != "cuda":
            model.to("cuda")
            dev = torch.device("cuda")
    except StopIteration:
        dev = torch.device("cuda")

    try:
        inputs = _prepare_inputs(image, prompt)
        inputs = {k: v.to(dev) if hasattr(v, "to") else v for k, v in inputs.items()}

        gen_kwargs = {
            "max_new_tokens": int(max_new_tokens),
            "do_sample": True,
            "temperature": float(temperature),
            "top_p": float(top_p),
            "top_k": int(top_k),
            "use_cache": True,
        }
        with torch.inference_mode():
            out_ids = model.generate(**inputs, **gen_kwargs)
        text = _decode(out_ids, inputs["input_ids"])
        took = time.time() - start
        return f"{text}\n\n[attn={_attn_impl}, time={took:.1f}s]"
    except Exception as e:
        return f"Inference failed: {type(e).__name__}: {e}"
    finally:
        # Release GPU memory cache for ZeroGPU
        try:
            torch.cuda.empty_cache()
        except Exception:
            pass


def build_ui():
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # Spark: Synergistic Policy And Reward Co-Evolving Framework

            <h3 align="center">
              📖<a href="https://arxiv.org/abs/2509.22624">Paper</a> 
            | 🤗<a href="https://huggingface.co/internlm/Spark-VL-7B">Models</a> 
            | 🤗<a href="https://huggingface.co/datasets/internlm/Spark-Data">Datasets</a>
            | 🤗<a href="https://huggingface.co/papers/2509.22624">Daily Paper</a>
            </h3>

            **🌈 Introduction:** We propose SPARK, <strong>a unified framework that integrates policy and reward into a single model for joint and synchronous training</strong>. SPARK can automatically derive reward and reflection data from verifiable reward, enabling <strong>self-learning and self-evolution</strong>.

            **🤗 Models:** We release the checkpoints at [internlm/Spark-VL-7B](https://huggingface.co/internlm/Spark-VL-7B).

            **🤗 Datasets:** Training data is available at [internlm/Spark-Data](https://huggingface.co/datasets/internlm/Spark-Data).

            **💻 Training Code:** The training code and implementation details can be found at [InternLM/Spark](https://github.com/InternLM/Spark).

            ---

            📸 **Upload an image and enter a prompt** or 🖼️ **choose the input from the example gallery** (image + prompt).
            """
        )

        # Build an image+prompt gallery from ./examples
        # Each example is an image file with an optional sidecar .txt containing the prompt.
        # If a .txt is present (same basename), we will display a caption and load the
        # prompt alongside the image when the thumbnail is selected.
        def _gather_examples() -> List[tuple]:
            pairs = []  # (image_path, prompt_text)
            imgs = []
            for ext in ("jpg", "jpeg", "png", "webp"):
                imgs.extend(glob.glob(os.path.join("examples", f"*.{ext}")))
            # Deduplicate while keeping order
            for img_path in list(dict.fromkeys(sorted(imgs))):
                stem, _ = os.path.splitext(img_path)
                prompt_path = stem + ".txt"
                prompt_text = None
                if os.path.exists(prompt_path):
                    try:
                        with open(prompt_path, "r", encoding="utf-8") as fh:
                            prompt_text = fh.read().strip()
                    except Exception:
                        prompt_text = None
                pairs.append((img_path, prompt_text))
            return pairs

        example_pairs = _gather_examples()

        # Load default image if exists
        default_path = os.path.join("examples", "example_0.png")
        default_image = Image.open(default_path) if os.path.exists(default_path) else None

        with gr.Row():
            with gr.Column(scale=1):
                image = gr.Image(type="pil", label="Image", value=default_image)

            with gr.Column(scale=1):
                prompt = gr.Textbox(
                    label="Prompt",
                    value=(
                        "As seen in the diagram, three darts are thrown at nine fixed balloons. "
                        "If a balloon is hit it will burst and the dart continues in the same direction "
                        "it had beforehand. How many balloons will not be hit by a dart?"
                    ),
                    lines=4,
                )
                max_new_tokens = gr.Slider(512, 4096, value=1024, step=8, label="max_new_tokens")
                temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
                top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="top_p")
                top_k = gr.Slider(1, 200, value=50, step=1, label="top_k")
                run = gr.Button("Generate")

        # Clear prompt when image is removed
        image.clear(fn=lambda: "", outputs=prompt)

        # Examples section: table-like layout with image and prompt columns
        gr.Markdown("## Examples")

        # Handler for clicking on example images
        def _on_example_click(img_path, prompt_text):
            try:
                img_val = Image.open(img_path)
            except Exception:
                img_val = None
            return img_val, prompt_text

        # Categorize examples by type
        math_examples = []
        reward_examples = []
        other_examples = []

        for img_path, prompt_text in example_pairs:
            basename = os.path.basename(img_path)
            if basename.startswith("example_0"):
                math_examples.append((img_path, prompt_text))
            elif basename.startswith("example_1"):
                reward_examples.append((img_path, prompt_text))
            else:
                other_examples.append((img_path, prompt_text))

        # Display math reasoning examples
        if math_examples:
            gr.Markdown("### 📐 Math Reasoning Examples")
            for idx, (img_path, prompt_text) in enumerate(math_examples):
                with gr.Row():
                    with gr.Column(scale=1):
                        ex_img = gr.Image(
                            value=img_path,
                            type="filepath",
                            label=f"Math Example {idx}",
                            interactive=False,
                            show_label=True,
                            height=200,
                        )
                        # Wire click event to load the example
                        ex_img.select(
                            fn=lambda ip=img_path, pt=prompt_text: _on_example_click(ip, pt),
                            outputs=[image, prompt],
                        )
                    with gr.Column(scale=3):
                        ex_text = gr.Textbox(
                            value=prompt_text or "",
                            label="Prompt",
                            lines=8,
                            max_lines=8,
                            interactive=False,
                            show_label=True,
                        )

        # Display reward model examples
        if reward_examples:
            gr.Markdown("### 🎯 Reward Model Examples")
            for idx, (img_path, prompt_text) in enumerate(reward_examples):
                with gr.Row():
                    with gr.Column(scale=1):
                        ex_img = gr.Image(
                            value=img_path,
                            type="filepath",
                            label=f"Reward Example {idx}",
                            interactive=False,
                            show_label=True,
                            height=200,
                        )
                        # Wire click event to load the example
                        ex_img.select(
                            fn=lambda ip=img_path, pt=prompt_text: _on_example_click(ip, pt),
                            outputs=[image, prompt],
                        )
                    with gr.Column(scale=3):
                        ex_text = gr.Textbox(
                            value=prompt_text or "",
                            label="Prompt",
                            lines=8,
                            max_lines=8,
                            interactive=False,
                            show_label=True,
                        )

        # Display other examples if any
        if other_examples:
            gr.Markdown("### 📋 Other Examples")
            for idx, (img_path, prompt_text) in enumerate(other_examples):
                with gr.Row():
                    with gr.Column(scale=1):
                        ex_img = gr.Image(
                            value=img_path,
                            type="filepath",
                            label=f"Example {idx}",
                            interactive=False,
                            show_label=True,
                            height=200,
                        )
                        # Wire click event to load the example
                        ex_img.select(
                            fn=lambda ip=img_path, pt=prompt_text: _on_example_click(ip, pt),
                            outputs=[image, prompt],
                        )
                    with gr.Column(scale=3):
                        ex_text = gr.Textbox(
                            value=prompt_text or "",
                            label="Prompt",
                            lines=8,
                            max_lines=8,
                            interactive=False,
                            show_label=True,
                        )

        output = gr.Textbox(label="Model Output", lines=8)

        run.click(
            fn=generate,
            inputs=[image, prompt, max_new_tokens, temperature, top_p, top_k],
            outputs=output,
            show_progress=True,
        )

        # Citation section at the bottom
        gr.Markdown(
            """
            ---
            If you find this project useful, please kindly cite:

            ```bibtex
            @article{liu2025spark,
              title={SPARK: Synergistic Policy And Reward Co-Evolving Framework},
              author={Liu, Ziyu and Zang, Yuhang and Ding, Shengyuan and Cao, Yuhang and Dong, Xiaoyi and Duan, Haodong and Lin, Dahua and Wang, Jiaqi},
              journal={arXiv preprint arXiv:2509.22624},
              year={2025}
            }
            ```
            """
        )

        demo.queue(max_size=10).launch()
    return demo


if __name__ == "__main__":
    build_ui()