from fastapi import FastAPI
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os

REPO_ID = "google/gemma-2b-it-GGUF"
FILENAME = "gemma-2b-it.gguf"
HF_TOKEN = os.environ.get("HF_TOKEN")  # must be set in HF Spaces Secrets
MODEL_DIR = "./models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# Step 1: Download the model if it does not already exist
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_DIR, exist_ok=True)
    try:
        print("📦 Downloading model from Hugging Face Hub...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            token=HF_TOKEN,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False,  # deprecated and ignored in recent huggingface_hub releases
        )
        print("✅ Model downloaded.")
    except Exception as e:
        print(f"❌ Download failed: {e}")
        raise

# Step 2: Load the GGUF model with llama-cpp-python
print("🤖 Loading GGUF model...")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=512,      # context window size in tokens
    n_threads=4,    # CPU threads used for inference
    n_batch=512,    # batch size for prompt processing
    verbose=False,
)

# Step 3: FastAPI app
app = FastAPI()


class PromptRequest(BaseModel):
    prompt: str

@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    # Run a plain completion over the raw prompt (no chat template is applied)
    prompt = req.prompt.strip()
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.6,
        top_p=0.95,
        stop=["<|endoftext|>", "</s>", "```"],
        echo=False,
    )
    result = output["choices"][0]["text"].strip()
    return {"response": result}