import os
import torch
import gradio as gr
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
# === STEP 1: Authenticate with Hugging Face ===
# Read the HF token from an environment variable (e.g. set HF_TOKEN in your shell or in the
# Space secrets); for security, never hard-code the token in this file.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
# === STEP 2: Load base and adapter models ===
base_model = "meta-llama/Llama-2-7b-chat-hf"
adapter_model = "olacode55/zimble-llama2-finetunedhybride"
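# Note: base_model is kept for reference only; per the app title/description below, the
# fine-tuned weights are already merged into this checkpoint, so it is loaded directly.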
tokenizer = AutoTokenizer.from_pretrained(adapter_model)
# Enable memory-efficient loading if needed
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    adapter_model,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True,
)
# === STEP 3: Define generation function ===
def generate(prompt):
    # Tokenize the prompt and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=250,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )
    # Decode the generated ids back to text (the decoded string includes the original prompt)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# === STEP 4: Launch Gradio app ===
demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(label="Enter your prompt", lines=4, placeholder="Type something..."),
    outputs=gr.Textbox(label="Model output"),
    title="🦙 Zimble LLaMA 2 (Merged)",
    description="Fine-tuned and merged version of LLaMA 2 running on a Hugging Face Space",
)
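# Launch the Gradio server; a Hugging Face Space picks this up automatically,
# and when run locally it serves at http://127.0.0.1:7860 by default.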
demo.launch()