import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

"""
🧮 Root_Math fine-tuned model chat app for Hugging Face Spaces.
Supports both Gradio UI and API access via `/chat`.
"""

# ✅ Load Hugging Face API token securely
# The token is read from the environment so it never has to live in the source.
api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not api_token:
    raise ValueError("❌ ERROR: Hugging Face API token is not set. Please set it as an environment variable.")

# ✅ Define model names
base_model_name = "unsloth/qwen2.5-math-7b-bnb-4bit"
peft_model_name = "Hrushi02/Root_Math"  # <-- model name stays the same

# ✅ Load base model
print("🔄 Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    # `use_auth_token` is deprecated in transformers; `token` is the same
    # keyword the adapter and tokenizer loads below already use.
    token=api_token,
)

# ✅ Load your fine-tuned PEFT adapter
# The adapter weights are applied on top of the base model loaded above.
print("🔄 Loading fine-tuned adapter...")
model = PeftModel.from_pretrained(base_model, peft_model_name, token=api_token)

# ✅ Load tokenizer
# The tokenizer is taken from the base model repository, not the adapter one.
print("🔄 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=api_token)


# ✅ Define the response function
def _history_to_lines(history):
    """Render earlier chat turns as ``User:`` / ``Assistant:`` prompt lines.

    Accepts ``None`` or an empty history, ``(user, bot)`` pairs, and
    ``{"role": ..., "content": ...}`` message dicts. Empty entries are skipped.
    """
    role_labels = {"user": "User", "assistant": "Assistant"}
    lines = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Message-dict history: one speaker per entry; other roles are ignored.
            label = role_labels.get(turn.get("role"))
            content = turn.get("content")
            if label and content:
                lines.append(f"{label}: {content}\n")
            continue
        user_msg, bot_msg = turn
        if user_msg:
            lines.append(f"User: {user_msg}\n")
        if bot_msg:
            lines.append(f"Assistant: {bot_msg}\n")
    return lines


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Generate a reply from the fine-tuned model.

    Args:
        message: The new user message to answer.
        history: Earlier turns of the conversation, or ``None`` / empty when
            there are none. Either ``(user, bot)`` pairs or role/content dicts.
        system_message: Text placed at the very top of the prompt.
        max_tokens: Upper bound on newly generated tokens (coerced to ``int``).
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The generated continuation, stripped of surrounding whitespace.
    """
    full_prompt = (
        system_message
        + "\n\n"
        + "".join(_history_to_lines(history))
        + f"User: {message}\nAssistant:"
    )

    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Sliders / API clients may deliver the value as a float.
            max_new_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        )

    # The output sequence starts with the prompt tokens. Decode only what was
    # generated after them, so an "Assistant:" inside the reply cannot
    # truncate the answer the way splitting the full text on that marker would.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


# ✅ Create Gradio Chat Interface
# `respond` takes (message, history) followed by the four additional inputs in
# the order listed below, so it is passed directly instead of through a
# lambda that only forwards the same arguments.
chat_ui = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful math assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    title="🧮 Root Math Assistant",
    description="A fine-tuned math reasoning model by Hrushi02 using Unsloth + PEFT."
)


# ✅ Add API endpoint `/chat` (for gradio_client access)
def _respond_without_history(message, system_message, max_tokens, temperature, top_p):
    """Answer a single message with no prior conversation.

    The endpoint is stateless: an empty history is passed to `respond`
    explicitly, rather than the `None` an empty `gr.State()` input would
    supply, which `respond` cannot iterate over.
    """
    return respond(message, [], system_message, max_tokens, temperature, top_p)


# NOTE(review): Gradio's own examples give `api_name` without a leading slash,
# and `gr.ChatInterface` may already expose a "chat" endpoint — confirm that
# "/chat" resolves to the route clients are expected to call.
api_chat = gr.Interface(
    fn=_respond_without_history,
    inputs=[
        gr.Textbox(label="Message"),
        gr.Textbox(value="You are a helpful math assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    outputs="text",
    api_name="/chat"
)


# ✅ Combine UI + API
# One tab per interface; the tab names are listed in the same order.
demo = gr.TabbedInterface(
    interface_list=[chat_ui, api_chat],
    tab_names=["Chat", "API"],
)


# ✅ Launch app
# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()