import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Try to import Unsloth; fall back to plain Transformers + PEFT if it is unavailable
# (Unsloth can raise NotImplementedError at import time on unsupported hardware).
try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except (ImportError, NotImplementedError):
    HAS_UNSLOTH = False

class ModelManager:
    """Singleton that loads the model once and serves all requests."""

    _instance = None

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.tokenizer = self.load_model()

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def load_model(self):
        if HAS_UNSLOTH and self.device != "cpu":
            # GPU via Unsloth + LoRA
            backbone, tokenizer = FastLanguageModel.from_pretrained(
                "unsloth/Llama-3.2-1B-bnb-4bit",
                load_in_4bit=True,
                dtype=torch.float16,
                device_map="auto",
            )
            try:
                model = PeftModel.from_pretrained(
                    backbone,
                    "samith-a/Django-orm-code-gen",
                    torch_dtype=torch.float16,
                    device_map="auto",
                )
                print("Loaded LoRA adapter via Unsloth.")
            except Exception as e:
                print(f"Adapter load failed, using backbone only: {e}")
                model = backbone
            FastLanguageModel.for_inference(model)
            return model, tokenizer

        # --- Fallback: CPU-only via HF Transformers + PEFT ---
        print("Falling back to CPU-only Transformers + PEFT")
        base_name = "unsloth/Llama-3.2-1B"  # non-4bit so it can run on CPU
        tokenizer = AutoTokenizer.from_pretrained(base_name, use_fast=True)
        base = AutoModelForCausalLM.from_pretrained(
            base_name,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
        )
        try:
            model = PeftModel.from_pretrained(
                base,
                "samith-a/Django-orm-code-gen",
                device_map={"": "cpu"},
                torch_dtype=torch.float32,
            )
            print("Loaded LoRA adapter via PEFT.")
        except Exception as e:
            print(f"Adapter load failed, using base model: {e}")
            model = base
        model.eval()
        return model, tokenizer

    def generate(self, instruction: str, input_text: str, max_new_tokens: int = 128) -> str:
        # Alpaca-style prompt template
        alpaca_template = (
            "### Instruction:\n{}\n\n"
            "### Input:\n{}\n\n"
            "### Response:\n"
        )
        prompt = alpaca_template.format(instruction, input_text)
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
        raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Keep only the text that follows the response marker
        return raw.split("### Response:")[-1].strip()
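
# Example call (hypothetical values, for illustration only):
#   ModelManager.get_instance().generate(
#       "Write a Django ORM query for the model below.",
#       "class Book(models.Model):\n    title = models.CharField(max_length=100)",
#       max_new_tokens=128,
#   )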

# Initialize once at import time so the model loads a single time per process
manager = ModelManager.get_instance()


def predict(instruction, context, max_tokens):
    return manager.generate(instruction, context, max_new_tokens=int(max_tokens))


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=2, label="Instruction"),
        gr.Textbox(lines=5, label="Context / Code"),
        gr.Slider(16, 512, step=16, label="Max new tokens", value=128),
    ],
    outputs=gr.Textbox(label="Generated Code"),
    title="Django-ORM Code Generator",
    description="LoRA-finetuned Llama 3.2 for Django ORM code (CPU/GPU fallback).",
)

if __name__ == "__main__":
    # share=True only matters when running locally; Spaces serves the app itself
    demo.launch(share=True)
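
# Likely runtime dependencies (assumed requirements.txt; not taken from the original Space):
#   gradio
#   torch
#   transformers
#   peft
#   bitsandbytes  # needed for the 4-bit GPU path
#   unsloth       # optional; the app falls back to plain Transformers + PEFT without it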