olacode55 committed
Commit 0a12030 · verified · 1 Parent(s): 42a1704

Update app.py

Files changed (1)
  1. app.py +25 -17
app.py CHANGED
@@ -3,7 +3,6 @@ import torch
 import gradio as gr
 from huggingface_hub import login
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from peft import PeftModel
 
 # === STEP 1: Authenticate with Hugging Face ===
 # Make sure you set your HF token as an environment variable or paste it here temporarily
@@ -13,32 +12,41 @@ login(token="hf_" + hf_token)
 
 # === STEP 2: Load base and adapter models ===
 base_model = "meta-llama/Llama-2-7b-chat-hf"
-adapter_model = "olacode55/zimble-llama2"
+merged_model_repo = "zimble-llama2-finetunedhybride"
 
-tokenizer = AutoTokenizer.from_pretrained(base_model)
+tokenizer = AutoTokenizer.from_pretrained(merged_model_repo, use_auth_token=hf_token)
 
-offload_folder = "./offload"  # must exist or be creatable
-os.makedirs(offload_folder, exist_ok=True)
+# Enable memory-efficient loading if needed
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# --- Load model with 8-bit quantization and CPU offload ---
-base = AutoModelForCausalLM.from_pretrained(
-    base_model,
-    load_in_8bit=True,
+model = AutoModelForCausalLM.from_pretrained(
+    merged_model_repo,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
     device_map="auto",
-    offload_folder=offload_folder,
-    llm_int8_enable_fp32_cpu_offload=True,
-    use_auth_token="hf_" + hf_token
+    low_cpu_mem_usage=True,
+    use_auth_token=hf_token
 )
 
-
-model = PeftModel.from_pretrained(base, adapter_model)
-
 # === STEP 3: Define generation function ===
 def generate(prompt):
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7)
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=250,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+        )
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # === STEP 4: Launch Gradio app ===
-demo = gr.Interface(fn=generate, inputs="text", outputs="text", title="Zimble LLaMA 2 Fine-Tuned")
+demo = gr.Interface(
+    fn=generate,
+    inputs=gr.Textbox(label="Enter your prompt", lines=4, placeholder="Type something..."),
+    outputs=gr.Textbox(label="Model output"),
+    title="🦙 Zimble LLaMA 2 (Merged)",
+    description="Fine-tuned and merged version of LLaMA 2 running on Hugging Face Space"
+)
+
 demo.launch()
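
Note on STEP 1: the authentication lines are unchanged and only appear in the hunk header (login(token="hf_" + hf_token)), so the token setup itself is not visible in this diff. A minimal sketch of what that setup could look like, assuming the token is stored as a Space secret exposed through an environment variable named HF_TOKEN (the variable name is an assumption) and saved without the "hf_" prefix, which matches the "hf_" + hf_token concatenation above:

import os
from huggingface_hub import login

# Assumed setup: the Space secret is exposed as HF_TOKEN and holds the token
# without its "hf_" prefix, matching the login(token="hf_" + hf_token) call in app.py.
hf_token = os.environ["HF_TOKEN"]
login(token="hf_" + hf_token)

This follows the comment in STEP 1 about reading the token from the environment rather than pasting it into the file.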