from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import requests

app = FastAPI()

# GGUF model artifact on Hugging Face and where to cache it locally
MODEL_REPO = "nilbot/gemma-2b-it-Q4_K.gguf"
MODEL_FILE = "gemma-2b-it-Q4_K.gguf"
MODEL_URL = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}"
MODEL_DIR = "./models"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE)

# Optional Hugging Face token (only needed for gated or private repos)
HF_TOKEN = os.getenv("HF_TOKEN")

os.makedirs(MODEL_DIR, exist_ok=True)

# Download the GGUF file on first startup; later startups reuse the cached copy
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")
    # Only send an Authorization header when a token is actually set
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    response = requests.get(MODEL_URL, headers=headers, stream=True)
    if response.status_code != 200:
        raise RuntimeError(f"❌ Failed to download model. Status: {response.status_code}")
    with open(MODEL_PATH, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"✅ Model downloaded to {MODEL_PATH}")
print("🔧 Loading GGUF model...") |
|
|
llm = Llama(model_path=MODEL_PATH, n_ctx=512, n_threads=os.cpu_count()) |
|
|
|
|
|
|
|
|

class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7
@app.post("/prompt") |
|
|
def generate_prompt(req: PromptRequest): |
|
|
output = llm( |
|
|
prompt=req.prompt, |
|
|
max_tokens=req.max_tokens, |
|
|
temperature=req.temperature, |
|
|
stop=["</s>"], |
|
|
) |
|
|
return {"response": output["choices"][0]["text"].strip()} |