File size: 1,500 Bytes
6bf37cd deb83c9 6bf37cd d9d6b2c deb83c9 6bf37cd deb83c9 6bf37cd d9d6b2c 6bf37cd 231cb7b deb83c9 072df7d 6bf37cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import os

from fastapi import FastAPI, HTTPException
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel
# Hugging Face Hub repository and file name of the GGUF model to serve.
REPO_ID = "google/gemma-2b-it-GGUF"
FILENAME = "gemma-2b-it.gguf"
# Access token for gated/private repos; must be set in HF Spaces Secrets.
# May be None — hf_hub_download then attempts an anonymous download.
HF_TOKEN = os.environ.get("HF_TOKEN") # must be set in HF Spaces Secrets
# Local directory and full path where the model file is cached between runs.
MODEL_DIR = "./models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
# Step 1: Auto-download model if not exists.
# Runs once at import time; subsequent restarts reuse the cached file at MODEL_PATH.
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_DIR, exist_ok=True)
    try:
        print("📦 Downloading model from Hugging Face Hub...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            token=HF_TOKEN,
            # Materialize a real file in MODEL_DIR instead of a symlink into the HF cache.
            # NOTE(review): local_dir_use_symlinks is deprecated in recent
            # huggingface_hub releases — confirm against the installed version.
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False
        )
        print("✅ Model downloaded.")
    except Exception as e:
        # Log the failure for the Space's build log, then re-raise so startup
        # fails loudly rather than continuing without a model.
        print(f"❌ Download failed: {e}")
        raise
# Step 2: Load model using llama-cpp-python.
# Loaded once at import time; `llm` is shared by all request handlers.
print("🤖 Loading GGUF model...")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=512,        # context window size in tokens (prompt + generation)
    n_threads=4,      # CPU threads for inference — tuned for a small Spaces CPU
    n_batch=512,      # prompt-processing batch size
    verbose=False     # suppress llama.cpp's per-load diagnostics
)
# Step 3: FastAPI app — the ASGI application object served by uvicorn.
app = FastAPI()
class PromptRequest(BaseModel):
    """Request body for POST /prompt: the raw text prompt to run the model on."""
    prompt: str
@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    """Run the loaded GGUF model on the submitted prompt.

    Args:
        req: Parsed request body containing the raw prompt text.

    Returns:
        dict: ``{"response": <generated text, stripped>}``.

    Raises:
        HTTPException: 400 if the prompt is empty or whitespace-only —
            previously a blank prompt still triggered a full (expensive)
            inference pass and returned junk.
    """
    prompt = req.prompt.strip()
    if not prompt:
        # Fail fast instead of burning CPU on a meaningless generation.
        raise HTTPException(status_code=400, detail="Prompt must not be empty.")
    output = llm(
        prompt,
        max_tokens=512,       # cap generation; n_ctx=512 bounds prompt + output anyway
        temperature=0.6,
        top_p=0.95,
        stop=["<|endoftext|>", "</s>", "```"],
        echo=False,           # return only the completion, not the prompt
    )
    # llama-cpp-python returns an OpenAI-style completion dict.
    result = output["choices"][0]["text"].strip()
    return {"response": result}
|