# load_quantized_model.py
import json

import torch
from safetensors.torch import load_file
from optimum.quanto import qint4, quantize, requantize
from transformers import AutoConfig, GenerationConfig

from hunyuan_image_3.hunyuan import HunyuanImage3ForCausalMM


def load_quantized_hi3_m1(model_path):
    """Method 1: instantiate the full architecture on CPU, apply quanto's
    int4 structure, then load the quantized weights directly into it."""
    print(f"Loading model architecture from {model_path} to CPU...")
    Qmodel = HunyuanImage3ForCausalMM.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        device_map=None,
        attn_implementation="sdpa",
        moe_impl="eager",
        moe_drop_tokens=True,
        trust_remote_code=True,
        low_cpu_mem_usage=False,
    )

    print("Applying int4 quantization structure...")
    # Swap weight tensors for quanto qint4 modules so that parameter names
    # line up with the quantized checkpoint loaded below.
    quantize(Qmodel, weights=qint4)

    print("Loading quantized weights...")
    state_dict = load_file(f"{model_path}/model.safetensors")
    # strict=False tolerates keys the quantized checkpoint does not carry;
    # assign=True swaps tensors in place instead of copying element-wise.
    Qmodel.load_state_dict(state_dict, strict=False, assign=True)

    print("Moving quantized model to GPU...")
    Qmodel = Qmodel.to("cuda")

    return Qmodel
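
# Note: method 1 first materializes the full bf16 model on CPU
# (low_cpu_mem_usage=False), so it needs host RAM for the unquantized
# weights; method 2 below builds the model on the meta device and only
# ever allocates the quantized tensors.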


def load_quantized_hi3_m2(model_path):
    """Method 2: build the model on the meta device (no weight allocation),
    then let quanto's requantize() materialize the quantized weights."""
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    state_dict = load_file(f"{model_path}/model.safetensors")
    with open(f"{model_path}/quantization_map.json", "r") as f:
        quantization_map = json.load(f)

    print("Creating meta model and loading quantized weights to CPU...")
    with torch.device("meta"):
        Qmodel = HunyuanImage3ForCausalMM(config)
    Qmodel = Qmodel.to(torch.bfloat16)
    # requantize() rebuilds each module listed in the quantization map and
    # loads its packed weights, materializing tensors on the target device.
    requantize(Qmodel, state_dict, quantization_map, device=torch.device("cpu"))

    # Constructing the model from a config does not attach generation
    # defaults, so load generation_config.json from the checkpoint explicitly.
    generation_config = GenerationConfig.from_pretrained(model_path)
    Qmodel.generation_config = generation_config

    print("Moving quantized model to GPU...")
    Qmodel = Qmodel.to(torch.device("cuda"))

    return Qmodel
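

# Both loaders assume {model_path} already contains model.safetensors and
# quantization_map.json. Below is a minimal sketch of producing those
# artifacts with optimum-quanto's documented quantize/freeze/serialize flow;
# the function name and on-disk layout are assumptions, not part of the
# original script.
def save_quantized_hi3(model, save_path):
    from optimum.quanto import freeze, quantization_map
    from safetensors.torch import save_file

    quantize(model, weights=qint4)  # attach int4 quantizers to the weights
    freeze(model)                   # replace float weights with packed int4
    save_file(model.state_dict(), f"{save_path}/model.safetensors")
    with open(f"{save_path}/quantization_map.json", "w") as f:
        json.dump(quantization_map(model), f)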


# To use these loaders, modify your "app/pipeline.py" script as follows:
#
#     from load_quantized_model import load_quantized_hi3_m1, load_quantized_hi3_m2
#
# and replace:
#     self.model = HunyuanImage3ForCausalMM.from_pretrained(args.model_id, **kwargs)
# with either:
#     self.model = load_quantized_hi3_m1(args.model_id)
# or:
#     self.model = load_quantized_hi3_m2(args.model_id)
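

# Minimal standalone smoke test. The checkpoint directory below is an
# assumption: any directory holding config.json, model.safetensors and
# quantization_map.json produced by the flow above will work.
if __name__ == "__main__":
    model = load_quantized_hi3_m2("./HunyuanImage-3-int4")
    print(f"Loaded {type(model).__name__} on {next(model.parameters()).device}")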