"""
Test script for FastVLM-7B model loading and configuration
"""

import asyncio
import sys
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

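# Make the directory containing this script importable, so any local helper
# modules sitting next to it can be found when the script is run directly.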
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

def check_dependencies():
    """Check if all required dependencies are installed"""
    print("Checking dependencies...")

    deps = {
        "torch": None,
        "transformers": None,
        "sentencepiece": None,
        "einops": None,
        "accelerate": None
    }

    for dep in deps:
        try:
            module = __import__(dep)
            deps[dep] = getattr(module, "__version__", "installed")
            print(f"✓ {dep}: {deps[dep]}")
        except ImportError:
            print(f"✗ {dep}: NOT INSTALLED")
            deps[dep] = None

    return all(v is not None for v in deps.values())

def check_hardware():
    """Check hardware capabilities"""
    print("\nHardware check:")

    if torch.cuda.is_available():
        print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
        print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    elif torch.backends.mps.is_available():
        print("✓ Apple Silicon MPS available")

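        # Query total unified memory via sysctl; hw.memsize reports physical RAM in bytes.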
        import subprocess
        result = subprocess.run(['sysctl', 'hw.memsize'], capture_output=True, text=True)
        if result.returncode == 0:
            mem_bytes = int(result.stdout.split()[1])
            print(f"  System Memory: {mem_bytes / 1e9:.2f} GB")
    else:
        print("✓ CPU mode")
        # psutil is not part of check_dependencies(), so guard the import.
        try:
            import psutil
            print(f"  Available Memory: {psutil.virtual_memory().available / 1e9:.2f} GB")
        except ImportError:
            print("  (install psutil to report available memory)")

async def test_fastvlm_loading():
    """Test loading FastVLM-7B model"""
    print("\n" + "=" * 50)
    print("Testing FastVLM-7B Model Loading")
    print("=" * 50)

    model_name = "apple/FastVLM-7B"

    try:
        print(f"\n1. Loading tokenizer from {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=True
        )
        print("   ✓ Tokenizer loaded successfully")
        print(f"   Tokenizer class: {tokenizer.__class__.__name__}")
        print(f"   Vocab size: {tokenizer.vocab_size}")

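        # LLaVA-style VLMs mark where image features are spliced into the prompt
        # with a sentinel token id; -200 is the conventional value. Attach it to
        # the tokenizer if the model's remote code did not already define it.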
        IMAGE_TOKEN_INDEX = -200
        if hasattr(tokenizer, 'IMAGE_TOKEN_INDEX'):
            print(f"   IMAGE_TOKEN_INDEX: {tokenizer.IMAGE_TOKEN_INDEX}")
        else:
            print(f"   Setting IMAGE_TOKEN_INDEX to {IMAGE_TOKEN_INDEX}")
            tokenizer.IMAGE_TOKEN_INDEX = IMAGE_TOKEN_INDEX

        print("\n2. Attempting to load model...")
        print("   Note: Expect ~14GB of memory at float16 (roughly double at float32)")

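        # Prefer CUDA, then Apple's MPS backend; fall back to CPU, where float32
        # is used because half precision on CPU is slow and poorly supported.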
        if torch.cuda.is_available():
            device = "cuda"
            dtype = torch.float16
        elif torch.backends.mps.is_available():
            device = "mps"
            dtype = torch.float16
        else:
            device = "cpu"
            dtype = torch.float32

        print(f"   Device: {device}")
        print(f"   Dtype: {dtype}")

        print("   Loading with low_cpu_mem_usage=True...")

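        # low_cpu_mem_usage=True loads checkpoint shards into the model
        # progressively instead of building a full copy in CPU RAM first,
        # keeping peak memory close to one model's worth of weights.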
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=dtype,
            low_cpu_mem_usage=True
        )

        print("   ✓ Model loaded successfully!")

        total_params = sum(p.numel() for p in model.parameters())
        print(f"   Parameters: {total_params / 1e9:.2f}B")

        print(f"\n3. Moving model to {device}...")
        model = model.to(device)
        model.eval()
        print("   ✓ Model ready for inference")

        print("\n4. Testing generation...")
        test_prompt = "Hello, this is a test of"
        inputs = tokenizer(test_prompt, return_tensors="pt").to(device)

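        # A short text-only generation is enough to confirm the language-model
        # head runs end to end; vision inputs are not exercised here.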
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=10,
                temperature=0.7,
                do_sample=True
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"   Input: {test_prompt}")
        print(f"   Output: {response}")

        print("\n✓ FastVLM-7B is working correctly!")
        return True

    except ImportError as e:
        print(f"\n✗ Import Error: {e}")
        if "trust_remote_code" in str(e):
            print("\nSolution: The model requires trust_remote_code=True")
            print("This is already set in the code, but the model files may need to be re-downloaded.")
        return False

    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            print("\n✗ Out of Memory Error")
            print("\nSolutions:")
            print("1. Use the quantized version:")
            print("   model_name = 'apple/FastVLM-7B-int4'")
            print("2. Use a smaller variant:")
            print("   model_name = 'apple/FastVLM-1.5B'")
            print("3. Enable 8-bit quantization (requires bitsandbytes)")
            print("4. Increase system RAM or use a GPU")
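            # Sketch of option 3, assuming bitsandbytes is installed and a CUDA
            # GPU is available (bitsandbytes 8-bit loading is CUDA-only):
            #   from transformers import BitsAndBytesConfig
            #   model = AutoModelForCausalLM.from_pretrained(
            #       model_name,
            #       trust_remote_code=True,
            #       quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            #   )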
        else:
            print(f"\n✗ Runtime Error: {e}")
        return False

    except Exception as e:
        print(f"\n✗ Error: {e}")
        print(f"   Error type: {type(e).__name__}")
        import traceback
        traceback.print_exc()
        return False


async def test_alternative_models():
    """Test alternative model options if FastVLM-7B fails"""
    print("\n" + "=" * 50)
    print("Alternative Model Options")
    print("=" * 50)

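    # Fallback candidates. Only each model's config is fetched below, which
    # verifies the repo is reachable without downloading any weights.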
    alternatives = [
        ("apple/FastVLM-1.5B", "Smaller FastVLM variant (1.5B params)"),
        ("apple/FastVLM-7B-int4", "Quantized FastVLM for lower memory"),
        ("apple/FastVLM-0.5B", "Smallest FastVLM variant (0.5B params)")
    ]

    for model_name, description in alternatives:
        print(f"\n• {model_name}")
        print(f"  {description}")
        try:
            from transformers import AutoConfig
            AutoConfig.from_pretrained(model_name, trust_remote_code=True)
            print("  ✓ Model available")
        except Exception as e:
            print(f"  ✗ Not accessible: {str(e)[:50]}...")


async def main():
    """Main test function"""
    print("FastVLM-7B Integration Test")
    print("=" * 50)

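    # Bail out early if any required package is missing; the loading test
    # below cannot run without them.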
    if not check_dependencies():
        print("\n✗ Missing dependencies. Please install all requirements.")
        return

    check_hardware()

    success = await test_fastvlm_loading()

    if not success:
        await test_alternative_models()

    print("\n" + "=" * 50)
    print("Recommendations:")
    print("=" * 50)
    print("\n1. If memory is limited, use FastVLM-1.5B or FastVLM-0.5B")
    print("2. For Apple Silicon, ensure you have enough RAM (16GB+ recommended)")
    print("3. Consider using the quantized version (FastVLM-7B-int4)")
    print("4. Make sure transformers >= 4.40.0 is installed")

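# Entry point: asyncio.run drives the async test flow from the command line.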
if __name__ == "__main__":
    asyncio.run(main())