Spaces:

wuhp
/

myr1

Running

App Files Files Community

wuhp committed on Jan 30

Commit

13b1681

verified ·

1 Parent(s): b7d0639

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -55

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import gradio as gr
 import spaces
-from datasets import load_dataset
 import torch
 from transformers import (
     AutoConfig,
     AutoTokenizer,
@@ -9,131 +10,183 @@ from transformers import (
     DataCollatorForLanguageModeling,
     Trainer,
     TrainingArguments,
-    pipeline
 )
 ##############################################################################
-# GLOBALS / ZERO-GPU APPROACH
 ##############################################################################
-# We store a global pipeline after finetuning (if any).
 TEXT_PIPELINE = None
-# We'll train on only 50 examples from WikiText-2 to keep it short.
-NUM_EXAMPLES = 50
-@spaces.GPU(duration=600)  # up to 600 seconds (10 minutes) for mini-finetuning
 def finetune_small_subset():
     """
-    1) Loads 'wuhp/myr1' in 8-bit,
-    2) Takes 50 examples from WikiText-2,
-    3) Finetunes for 1 epoch,
-    4) Saves to 'finetuned_myr1/',
-    5) Reloads the new model into a pipeline for inference.
     """
     # Pre-commit version of this function; lines marked "-" were removed by the commit.
     # Side effect: stores the reloaded text-generation pipeline in the global TEXT_PIPELINE.
     # Returns a status string, which the UI routes to the "Finetune Status" textbox.
-    # 1) Load dataset
     ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
-    # Keep only 50 to fit ephemeral GPU time
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))
-    # 2) Load config, tokenizer, model
     config = AutoConfig.from_pretrained(
-        "wuhp/myr1",
         subfolder="myr1",
         trust_remote_code=True
     )
     tokenizer = AutoTokenizer.from_pretrained(
-        "wuhp/myr1",
         subfolder="myr1",
         trust_remote_code=True
     )
     # NOTE(review): the model is loaded in 8-bit and handed straight to Trainer with no
     # trainable adapters attached -- confirm that this kind of finetuning is supported.
-    # 8-bit loading via bitsandbytes
-    model = AutoModelForCausalLM.from_pretrained(
         "wuhp/myr1",
         subfolder="myr1",
         config=config,
-        load_in_8bit=True,         # <--- 8-bit
-        device_map="auto",         # let HF manage device placement
         trust_remote_code=True
     )
-    # 3) Tokenize
     def tokenize_fn(ex):
         return tokenizer(ex["text"], truncation=True, max_length=512)
     # Tokenize in batches and drop the raw "text" column from the dataset.
     ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
     ds.set_format("torch")
     # mlm=False: collate for causal language modeling rather than masked-LM.
     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-    # 4) TrainingArguments: no fp16 to avoid half-precision gradient issues
     training_args = TrainingArguments(
         output_dir="finetuned_myr1",
         num_train_epochs=1,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=2,
-        logging_steps=10,
-        save_steps=999999,       # skip mid-training saves
         save_total_limit=1,
-        fp16=False,              # <--- disable FP16
     )
-    # 5) Trainer
     trainer = Trainer(
-        model=model,
         args=training_args,
         train_dataset=ds,
         data_collator=collator,
     )
-    # 6) Train
     trainer.train()
-    # 7) Save final model
-    trainer.save_model("finetuned_myr1")
     tokenizer.save_pretrained("finetuned_myr1")
     # NOTE(review): the reload below passes no quantization argument, unlike the 8-bit
     # training load above -- confirm the intended precision for inference.
-    # 8) Reload the newly finetuned model as a pipeline (for inference)
-    finetuned_model = AutoModelForCausalLM.from_pretrained(
-        "finetuned_myr1",
         device_map="auto",
         trust_remote_code=True
     )
     global TEXT_PIPELINE
-    TEXT_PIPELINE = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer)
-    return "Finetuning complete! Model reloaded for inference."
 def ensure_pipeline():
     """
-    If no pipeline yet, load the original model from wuhp/myr1 for inference.
-    (In 8-bit or normal float? We can do normal float here for a simpler approach.)
     """
     # Pre-commit version of this function; lines marked "-" were removed by the commit.
     # NOTE(review): the docstring mentions "normal float", but the load below passes
     # load_in_8bit=True, so the base model is actually loaded in 8-bit.
     # Returns the global TEXT_PIPELINE, building and caching it first when it is None.
     global TEXT_PIPELINE
     if TEXT_PIPELINE is None:
-        tokenizer = AutoTokenizer.from_pretrained(
-            "wuhp/myr1",
-            subfolder="myr1",
-            trust_remote_code=True
         )
-        model = AutoModelForCausalLM.from_pretrained(
             "wuhp/myr1",
             subfolder="myr1",
-            trust_remote_code=True,
-            load_in_8bit=True,   # load in 8-bit also for inference
-            device_map="auto"
         )
-        TEXT_PIPELINE = pipeline("text-generation", model=model, tokenizer=tokenizer)
     return TEXT_PIPELINE
-@spaces.GPU(duration=120)  # up to 120s for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
-    Generates text from either the finetuned pipeline (if it exists) or the base model.
-    Allows user to adjust temperature, top_p, min/max tokens.
     """
     pipe = ensure_pipeline()
     out = pipe(
@@ -149,13 +202,13 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
 # Build Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("## ZeroGPU: Mini-Finetune with 8-bit + Extended Generation")
-    finetune_btn = gr.Button("Finetune on 50 lines of WikiText-2 (up to 10 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
-    gr.Markdown("After finetuning, or even without it, generate text below:")
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")

 import gradio as gr
 import spaces
 import torch
+from datasets import load_dataset
 from transformers import (
     AutoConfig,
     AutoTokenizer,
     DataCollatorForLanguageModeling,
     Trainer,
     TrainingArguments,
+    pipeline,
+    BitsAndBytesConfig,  # for 4-bit config
 )
+# PEFT (LoRA / QLoRA)
+from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 ##############################################################################
+# ZeroGPU + QLoRA Example
 ##############################################################################
 # Cached text-generation pipeline; stays None until finetune_small_subset() or
 # ensure_pipeline() assigns one.
 TEXT_PIPELINE = None
+NUM_EXAMPLES = 50  # We'll train on 50 lines of WikiText-2 for demonstration
+@spaces.GPU(duration=600)  # up to 10 min
 def finetune_small_subset():
     """
     Fine-tune 'wuhp/myr1' with QLoRA on a tiny WikiText-2 subset.

     Steps:
       1) Load the first NUM_EXAMPLES non-empty lines of WikiText-2 (train split).
       2) Load 'wuhp/myr1' in 4-bit NF4 quantization and prepare it for k-bit training.
       3) Attach LoRA adapters and train them for 1 epoch.
       4) Save the LoRA adapter and the tokenizer to 'finetuned_myr1'.
       5) Load a fresh 4-bit base model, apply the saved adapter, and store a
          text-generation pipeline in the global TEXT_PIPELINE.

     Returns:
         A status string for the "Finetune Status" textbox.
     """
     # Local imports: both are needed only by this function.
     import gc
     from peft import PeftModel

     global TEXT_PIPELINE

     model_id = "wuhp/myr1"
     adapter_dir = "finetuned_myr1"
     hub_kwargs = {"subfolder": "myr1", "trust_remote_code": True}

     # --- 1) Load WikiText-2 subset ---
     ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
     # The raw split contains blank separator lines. Drop them before selecting so
     # every one of the NUM_EXAMPLES rows actually carries text to train on.
     ds = ds.filter(lambda ex: bool(ex["text"].strip()))
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

     # --- 2) 4-bit quantization (QLoRA style) ---
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16 if preferred
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",  # "nf4" is standard for QLoRA
     )

     config = AutoConfig.from_pretrained(model_id, **hub_kwargs)
     tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs)
     # The collator pads batches, which requires a pad token; fall back to EOS
     # when the tokenizer does not define one.
     if tokenizer.pad_token is None and tokenizer.eos_token is not None:
         tokenizer.pad_token = tokenizer.eos_token

     base_model = AutoModelForCausalLM.from_pretrained(
         model_id,
         config=config,
         quantization_config=bnb_config,
         device_map="auto",
         **hub_kwargs,
     )
     # Training-only preparation of the quantized model (PEFT helper).
     base_model = prepare_model_for_kbit_training(base_model)

     # --- 3) LoRA config & adapter wrapping ---
     # For LLaMA-like models, "q_proj" and "v_proj" are typical. If the model is
     # different, adjust target_modules accordingly ("c_attn", "W_pack",
     # "query_key_value", etc.).
     lora_config = LoraConfig(
         r=16,
         lora_alpha=32,
         lora_dropout=0.05,
         bias="none",
         target_modules=["q_proj", "v_proj"],
         task_type=TaskType.CAUSAL_LM,
     )
     lora_model = get_peft_model(base_model, lora_config)

     # --- 4) Tokenize dataset ---
     def tokenize_fn(ex):
         return tokenizer(ex["text"], truncation=True, max_length=512)

     ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
     ds.set_format("torch")

     # mlm=False: collate for causal language modeling.
     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

     training_args = TrainingArguments(
         output_dir=adapter_dir,
         num_train_epochs=1,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=2,
         logging_steps=5,
         save_steps=999999,  # effectively disables mid-training checkpoints
         save_total_limit=1,
         fp16=False,  # rely on bnb 4-bit / bfloat16 compute for the base model
     )

     trainer = Trainer(
         model=lora_model,
         args=training_args,
         train_dataset=ds,
         data_collator=collator,
     )

     # --- 5) Train, then save only the LoRA adapter + tokenizer ---
     trainer.train()
     trainer.model.save_pretrained(adapter_dir)
     tokenizer.save_pretrained(adapter_dir)

     # Release the training model before loading a second copy of the base
     # model, so two quantized models are not resident at the same time.
     del trainer, lora_model, base_model
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()

     # --- 6) Reload the base model in 4-bit and apply the saved adapter ---
     # prepare_model_for_kbit_training is intentionally NOT applied here: it is
     # a training-time step and this model is used for inference only.
     inference_base = AutoModelForCausalLM.from_pretrained(
         model_id,
         config=config,
         quantization_config=bnb_config,
         device_map="auto",
         **hub_kwargs,
     )
     # PeftModel.from_pretrained loads the trained adapter weights directly.
     # The previous get_peft_model(...) + load_adapter("finetuned_myr1") sequence
     # first created a fresh untrained adapter and then called load_adapter
     # without its required adapter_name argument.
     inference_model = PeftModel.from_pretrained(inference_base, adapter_dir)
     inference_model.eval()

     # NOTE(review): this global is assigned inside a @spaces.GPU function --
     # confirm that the assignment is visible to later predict() calls on ZeroGPU.
     TEXT_PIPELINE = pipeline("text-generation", model=inference_model, tokenizer=tokenizer)
     return "Finetuning complete (QLoRA + LoRA). Model loaded for inference."
 def ensure_pipeline():
     """
     Return the shared text-generation pipeline, creating it on first use.

     When TEXT_PIPELINE is still None (no finetuning has produced a pipeline),
     the base 'wuhp/myr1' model is loaded in 4-bit with no LoRA adapter,
     wrapped in a pipeline, and cached in TEXT_PIPELINE.
     """
     global TEXT_PIPELINE
     if TEXT_PIPELINE is not None:
         # A pipeline already exists; reuse it.
         return TEXT_PIPELINE

     repo_id = "wuhp/myr1"
     # 4-bit NF4 quantization with double quantization and bfloat16 compute.
     quant_settings = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
     )
     model_config = AutoConfig.from_pretrained(
         repo_id, subfolder="myr1", trust_remote_code=True
     )
     text_tokenizer = AutoTokenizer.from_pretrained(
         repo_id, subfolder="myr1", trust_remote_code=True
     )
     quantized_model = AutoModelForCausalLM.from_pretrained(
         repo_id,
         subfolder="myr1",
         config=model_config,
         quantization_config=quant_settings,
         device_map="auto",
         trust_remote_code=True,
     )
     TEXT_PIPELINE = pipeline(
         "text-generation",
         model=quantized_model,
         tokenizer=text_tokenizer,
     )
     return TEXT_PIPELINE
+@spaces.GPU(duration=120)  # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
+    Generates text from the finetuned (LoRA) model if present, else the base model.
     """
     pipe = ensure_pipeline()
     out = pipe(
 # Build Gradio UI
 with gr.Blocks() as demo:
+    gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1")
+    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on 50 lines of WikiText-2 (up to 10 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
+    gr.Markdown("Then generate text below (or skip finetuning to see base model).")
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")