"""Download a small GGUF model from Hugging Face (if not cached locally) and
expose a llama-cpp text-completion helper."""

import os

import requests
from llama_cpp import Llama

HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_REPO = "afrideva/TinyMistral-248M-SFT-v4-GGUF"
MODEL_FILENAME = "tinymistral-248m-sft-v4.q2_k.gguf"
MODEL_PATH = f"./models/{MODEL_FILENAME}"

# Manual download with fallback
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")
    url = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILENAME}"
    # Only send the Authorization header when a token is actually configured.
    # The original sent "Bearer None" when HF_TOKEN was unset, which Hugging
    # Face rejects with 401 even for public repositories.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    os.makedirs("./models", exist_ok=True)
    # Stream to a temporary ".part" file and rename atomically on success so
    # an interrupted download never leaves a truncated file at MODEL_PATH —
    # otherwise the existence check above would skip re-downloading and feed
    # a corrupt GGUF to llama-cpp on the next run.
    tmp_path = MODEL_PATH + ".part"
    # timeout guards against a stalled connection hanging the process forever.
    with requests.get(url, headers=headers, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(tmp_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive (empty) chunks
                    f.write(chunk)
    os.replace(tmp_path, MODEL_PATH)
    print(f"✅ Model downloaded to {MODEL_PATH}")

# Load with llama-cpp: CPU-only, small context/batch to fit constrained hosts.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=256,
    n_threads=2,
    n_batch=32,
    n_gpu_layers=0,
    chat_format=None,  # Not using llama-2 format
)


def generate_structure(prompt: str) -> str:
    """Run a plain-text completion and return the stripped generated text.

    Args:
        prompt: Raw prompt string passed directly to the model (no chat
            template is applied — ``chat_format=None`` above).

    Returns:
        The model's completion text with leading/trailing whitespace removed.
    """
    output = llm.create_completion(
        prompt=prompt,
        temperature=0.7,
        max_tokens=512,
    )
    return output["choices"][0]["text"].strip()