import os
import torch
import gradio as gr
from PIL import Image
from typing import List, Dict, Any
from transformers import AutoModel, AutoTokenizer

"""
Gradio app to run MiniCPM-V-4_5 int4 on CPU for image+text chat.

- Requires: pip install transformers accelerate gradio pillow
- Model: openbmb/MiniCPM-V-4_5-int4 (quantized, CPU-friendly)
- This script is self-contained and uses a simple multi-turn chat interface.
"""

MODEL_ID = os.environ.get("MINICPM_MODEL_ID", "openbmb/MiniCPM-V-4_5-int4")

# Global model/tokenizer, loaded once
model = None
tokenizer = None


def load_model():
    global model, tokenizer
    if model is not None and tokenizer is not None:
        return
    # For CPU inference, keep it simple and avoid .cuda() / bfloat16.
    # trust_remote_code is required because MiniCPM implements a custom .chat().
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        attn_implementation="sdpa",   # SDPA is fine on CPU; avoid flash-attn on CPU
        torch_dtype=torch.float32,    # Safer default for CPU
        device_map="cpu",             # Ensure CPU execution
        quantization_config=None,
    )
    model.eval()


def build_messages(history: List[List[str]], image: Image.Image, user_input: str) -> List[Dict[str, Any]]:
    """
    Convert Gradio chat history + current inputs into the message format
    expected by MiniCPM's .chat().

    history: list of (user_text, assistant_text) pairs from gr.Chatbot
             (text-only transcript of previous turns).
    image: PIL.Image or None for the current turn.
    user_input: current user text.

    Returns a msgs list with roles and content arrays [image?, text].
    """
    msgs = []
    # Reconstruct multi-turn context by interleaving user/assistant turns.
    # We assume each past user message and assistant reply is text-only;
    # only the current turn may carry an image.
    for turn in history:
        # Each turn in history is a (user_text, assistant_text) pair from gr.Chatbot
        user_text, assistant_text = turn
        if user_text is not None:
            msgs.append({"role": "user", "content": [user_text]})
        if assistant_text is not None:
            msgs.append({"role": "assistant", "content": [assistant_text]})

    # Append current user turn (with optional image)
    content = []
    if image is not None:
        # Ensure RGB
        if image.mode != "RGB":
            image = image.convert("RGB")
        content.append(image)
    if user_input and user_input.strip():
        content.append(user_input.strip())
    else:
        # Ensure there is at least something in the content
        content.append("")
    msgs.append({"role": "user", "content": content})
    return msgs
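

# For reference, a minimal sketch of the msgs structure that build_messages()
# produces and that MiniCPM's custom .chat() consumes; the image path and texts
# below are placeholders, not files or prompts shipped with this script.
#
#   from PIL import Image
#   example_msgs = [
#       {"role": "user", "content": ["Earlier text-only question."]},
#       {"role": "assistant", "content": ["Earlier text-only answer."]},
#       {"role": "user", "content": [Image.open("example.jpg").convert("RGB"),
#                                    "What is in this picture?"]},
#   ]
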

def respond(user_text: str, image: Image.Image, chat_history: List[List[str]], enable_thinking: bool):
    """
    Inference handler for Gradio. Returns the updated chat history and clears
    the user textbox.
    """
    load_model()

    # Build MiniCPM messages
    msgs = build_messages(chat_history or [], image, user_text)

    # Run model.chat
    with torch.inference_mode():
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking
        )

    # Update the history shown in the Chatbot: append (user_text, answer).
    # If user_text is empty but an image was provided, show a placeholder text.
    shown_user_msg = user_text.strip() if (user_text and user_text.strip()) else "[Image]"
    chat_history = chat_history + [[shown_user_msg, answer]]
    return chat_history, ""


def clear_history():
    return [], None, ""


def demo_app():
    with gr.Blocks(title="MiniCPM-V-4_5-int4 (CPU) - Gradio", theme="soft") as demo:
        gr.Markdown("## MiniCPM-V-4_5-int4 (CPU) Demo\nUpload an image (optional) and ask a question.")
        with gr.Row():
            with gr.Column(scale=3):
                # The handlers above use (user, assistant) pairs, so keep the
                # Chatbot in tuple mode rather than type="messages".
                chatbot = gr.Chatbot(height=420, type="tuples", avatar_images=(None, None))
                with gr.Row():
                    img = gr.Image(type="pil", label="Image (optional)", height=240)
                    user_in = gr.Textbox(
                        label="Your message",
                        placeholder="Ask something about the image or chat without an image...",
                        lines=3
                    )
                with gr.Row():
                    enable_thinking = gr.Checkbox(value=False, label="Enable thinking mode")
                    send_btn = gr.Button("Send", variant="primary")
                    clear_btn = gr.Button("Clear")
            with gr.Column(scale=1):
                gr.Markdown("### Model")
                gr.Markdown(f"- ID: `{MODEL_ID}`\n- Device: CPU\n- Quant: int4")

        # Events
        send_btn.click(
            fn=respond,
            inputs=[user_in, img, chatbot, enable_thinking],
            outputs=[chatbot, user_in]
        )
        user_in.submit(
            fn=respond,
            inputs=[user_in, img, chatbot, enable_thinking],
            outputs=[chatbot, user_in]
        )
        clear_btn.click(
            fn=clear_history,
            inputs=[],
            outputs=[chatbot, img, user_in]
        )
    return demo


if __name__ == "__main__":
    # Make sure we don't accidentally spawn a CUDA context
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
    demo = demo_app()
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
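
# Example invocation (a sketch; assumes the dependencies listed in the module
# docstring are installed and there is enough free RAM for the int4 weights):
#   pip install transformers accelerate gradio pillow
#   python app.py          # "app.py" is a placeholder name for this script
# Then open http://localhost:7860, or the port given by the PORT env var.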