```python
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch

app = FastAPI()

# Replace with your actual repos
BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # your adapter repo

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load base model (4-bit quantized checkpoint; float16 for the remaining layers)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, ADAPTER_REPO)

# Text-generation pipeline; the model is already placed on devices by device_map
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

class Request(BaseModel):
    prompt: str
    max_new_tokens: int = 150
    temperature: float = 0.7

# POST endpoint that runs generation on the submitted prompt
@app.post("/generate")
def generate(req: Request):
    output = pipe(
        req.prompt,
        max_new_tokens=req.max_new_tokens,
        temperature=req.temperature,
        do_sample=True,
    )
    return {"response": output[0]["generated_text"]}
```