import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Try to import Unsloth; fall back to plain Transformers + PEFT if it is unavailable
# (Unsloth can raise NotImplementedError at import time on unsupported hardware).
try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except (ImportError, NotImplementedError):
    HAS_UNSLOTH = False

class ModelManager:
    """Singleton that loads the model once and serves all requests."""

    _instance = None

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.tokenizer = self.load_model()

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def load_model(self):
        if HAS_UNSLOTH and self.device != "cpu":
            # GPU via Unsloth + LoRA
            backbone, tokenizer = FastLanguageModel.from_pretrained(
                "unsloth/Llama-3.2-1B-bnb-4bit",
                load_in_4bit=True,
                dtype=torch.float16,
                device_map="auto",
            )
            try:
                model = PeftModel.from_pretrained(
                    backbone,
                    "samith-a/Django-orm-code-gen",
                    torch_dtype=torch.float16,
                    device_map="auto",
                )
                print("Loaded LoRA adapter via Unsloth.")
            except Exception as e:
                print(f"Adapter load failed, using backbone only: {e}")
                model = backbone
            FastLanguageModel.for_inference(model)
            return model, tokenizer

        # --- Fallback: CPU-only via HF Transformers + PEFT ---
        print("Falling back to CPU-only Transformers + PEFT")
        base_name = "unsloth/Llama-3.2-1B"  # non-4bit so it can run on CPU
        tokenizer = AutoTokenizer.from_pretrained(base_name, use_fast=True)
        base = AutoModelForCausalLM.from_pretrained(
            base_name,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
        )
        try:
            model = PeftModel.from_pretrained(
                base,
                "samith-a/Django-orm-code-gen",
                device_map={"": "cpu"},
                torch_dtype=torch.float32,
            )
            print("Loaded LoRA adapter via PEFT.")
        except Exception as e:
            print(f"Adapter load failed, using base model: {e}")
            model = base
        model.eval()
        return model, tokenizer

    def generate(self, instruction: str, input_text: str, max_new_tokens: int = 128) -> str:
        # Alpaca-style prompt template
        alpaca_template = (
            "### Instruction:\n{}\n\n"
            "### Input:\n{}\n\n"
            "### Response:\n"
        )
        prompt = alpaca_template.format(instruction, input_text)
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
        raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Keep only the text that follows the response marker
        return raw.split("### Response:")[-1].strip()
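
# Example call (hypothetical values, for illustration only):
#   ModelManager.get_instance().generate(
#       "Write a Django ORM query for the model below.",
#       "class Book(models.Model):\n    title = models.CharField(max_length=100)",
#       max_new_tokens=128,
#   )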

# Initialize once at import time so the model loads a single time per process
manager = ModelManager.get_instance()


def predict(instruction, context, max_tokens):
    return manager.generate(instruction, context, max_new_tokens=int(max_tokens))


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=2, label="Instruction"),
        gr.Textbox(lines=5, label="Context / Code"),
        gr.Slider(16, 512, step=16, label="Max new tokens", value=128),
    ],
    outputs=gr.Textbox(label="Generated Code"),
    title="Django-ORM Code Generator",
    description="LoRA-finetuned Llama 3.2 for Django ORM code (CPU/GPU fallback).",
)

if __name__ == "__main__":
    # share=True only matters when running locally; Spaces serves the app itself
    demo.launch(share=True)
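
# Likely runtime dependencies (assumed requirements.txt; not taken from the original Space):
#   gradio
#   torch
#   transformers
#   peft
#   bitsandbytes  # needed for the 4-bit GPU path
#   unsloth       # optional; the app falls back to plain Transformers + PEFT without it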