File size: 1,500 Bytes
6bf37cd
deb83c9
6bf37cd
d9d6b2c
 
deb83c9
6bf37cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
deb83c9
6bf37cd
 
 
 
 
 
 
 
d9d6b2c
 
6bf37cd
 
 
 
231cb7b
deb83c9
072df7d
6bf37cd
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from fastapi import FastAPI
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os

# Model-source and local-cache configuration.
REPO_ID = "google/gemma-2b-it-GGUF"  # HF Hub repo holding the GGUF build of Gemma 2B instruction-tuned
FILENAME = "gemma-2b-it.gguf"  # exact artifact name to fetch from that repo
HF_TOKEN = os.environ.get("HF_TOKEN")  # must be set in HF Spaces Secrets; presumably required because Gemma repos are gated — confirm
MODEL_DIR = "./models"  # local directory the model is downloaded into
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)  # full path checked/loaded below

# Step 1: Auto-download model if not exists
# Downloads FILENAME from REPO_ID into MODEL_DIR on first run; later runs
# skip the download because MODEL_PATH already exists. Failures are logged
# and re-raised so the process fails fast instead of loading a missing file.
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_DIR, exist_ok=True)
    try:
        print("📦 Downloading model from Hugging Face Hub...")
        # NOTE: `local_dir_use_symlinks` is deprecated in huggingface_hub
        # (>=0.23 it is ignored and emits a warning). With `local_dir` set,
        # the file is materialized directly under MODEL_DIR, which is the
        # behavior this code relied on.
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            token=HF_TOKEN,
            local_dir=MODEL_DIR,
        )
        print("✅ Model downloaded.")
    except Exception as e:
        # Top-level boundary: log for the Space's console, then re-raise so
        # startup aborts rather than continuing without a model.
        print(f"❌ Download failed: {e}")
        raise

# Step 2: Load model using llama-cpp-python
print("🤖 Loading GGUF model...")
# Backend settings: 512-token context window, 4 CPU threads, prompt
# batches of 512 tokens, quiet logging.
_LLAMA_SETTINGS = {
    "n_ctx": 512,
    "n_threads": 4,
    "n_batch": 512,
    "verbose": False,
}
llm = Llama(model_path=MODEL_PATH, **_LLAMA_SETTINGS)

# Step 3: FastAPI app
app = FastAPI()

class PromptRequest(BaseModel):
    """JSON request body for POST /prompt: a single free-form text prompt."""
    prompt: str

@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    """Run one completion on the loaded GGUF model.

    Returns ``{"response": <generated text>}``. A blank or
    whitespace-only prompt short-circuits to an empty response instead
    of spending a full 512-token generation on it.
    """
    prompt = req.prompt.strip()
    if not prompt:
        # Robustness: skip inference entirely for empty input.
        return {"response": ""}

    # NOTE(review): the prompt is passed raw. Gemma-it models normally
    # expect the "<start_of_turn>user ... <end_of_turn>" chat template and
    # stop on "<end_of_turn>"; the stop list below looks borrowed from
    # other model families — confirm whether the template should be applied.
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.6,
        top_p=0.95,
        stop=["<|endoftext|>", "</s>", "```"],
        echo=False
    )

    result = output["choices"][0]["text"].strip()
    return {"response": result}