import os
import requests
from llama_cpp import Llama

HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_REPO = "afrideva/TinyMistral-248M-SFT-v4-GGUF"
MODEL_FILENAME = "tinymistral-248m-sft-v4.q8_0.gguf"
MODEL_PATH = f"./models/{MODEL_FILENAME}"

# Download the GGUF file from Hugging Face if it is not already cached locally
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")

    url = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILENAME}"
    # Only send an auth header when a token is set; public repos need none
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

    os.makedirs("./models", exist_ok=True)
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            # Stream to disk in 8 KiB chunks instead of buffering the whole file
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)

    print(f"✅ Model downloaded to {MODEL_PATH}")

# Load the model with llama-cpp-python (CPU-only; small context keeps memory low)
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=256,         # context window size in tokens
    n_threads=2,       # CPU threads used for inference
    n_batch=32,        # prompt tokens evaluated per batch
    n_gpu_layers=0,    # keep every layer on the CPU
    chat_format=None,  # raw completions; no chat template is applied
)

def generate_structure(prompt: str) -> str:
    """Run a raw text completion and return the stripped generated text."""
    output = llm.create_completion(
        prompt=prompt,
        temperature=0.7,
        max_tokens=512,
    )
    return output["choices"][0]["text"].strip()
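
# Example usage: a minimal smoke test; the prompt below is illustrative only.
if __name__ == "__main__":
    sample_prompt = "Write a chapter outline for a short story about a lighthouse keeper:"
    print(generate_structure(sample_prompt))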