# load_quantized_model.py
import json

import torch
from safetensors.torch import load_file
from optimum.quanto import requantize, quantize, qint4
from hunyuan_image_3.hunyuan import HunyuanImage3ForCausalMM
from transformers import AutoConfig, QuantoConfig
from transformers.generation.utils import GenerationConfig


def load_quantized_hi3_m1(model_path):
    """Method 1: build the bf16 model skeleton on CPU, apply the int4
    quantization structure, then load the pre-quantized weights over it."""
    print(f"Loading model architecture from {model_path} to CPU...")
    Qmodel = HunyuanImage3ForCausalMM.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        device_map=None,
        attn_implementation="sdpa",
        moe_impl="eager",
        moe_drop_tokens=True,
        trust_remote_code=True,
        low_cpu_mem_usage=False,
    )

    print("Applying int4 quantization structure...")
    quantize(Qmodel, weights=qint4)

    print("Loading quantized weights...")
    state_dict = load_file(f"{model_path}/model.safetensors")
    Qmodel.load_state_dict(state_dict, strict=False, assign=True)

    print("Moving quantized model to GPU...")
    Qmodel = Qmodel.to("cuda")
    return Qmodel


def load_quantized_hi3_m2(model_path):
    """Method 2: instantiate the model on the meta device (no memory allocated),
    then let optimum-quanto requantize() materialize it on CPU directly from the
    quantized state dict and its quantization map."""
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    state_dict = load_file(f"{model_path}/model.safetensors")
    with open(f"{model_path}/quantization_map.json", "r") as f:
        quantization_map = json.load(f)

    print("Creating meta model and loading quantized weights on CPU...")
    with torch.device("meta"):
        Qmodel = HunyuanImage3ForCausalMM(config)
    Qmodel = Qmodel.to(torch.bfloat16)
    requantize(Qmodel, state_dict, quantization_map, device=torch.device("cpu"))

    # from_pretrained() is bypassed here, so attach the generation config manually.
    Qmodel.generation_config = GenerationConfig.from_pretrained(model_path)

    print("Moving quantized model to GPU...")
    Qmodel = Qmodel.to(torch.device("cuda"))
    return Qmodel


# Modify your "app/pipeline.py" script as below:
#
#     from load_quantized_model import load_quantized_hi3_m1, load_quantized_hi3_m2
#
# and replace:
#
#     self.model = HunyuanImage3ForCausalMM.from_pretrained(args.model_id, **kwargs)
#
# with:
#
#     self.model = load_quantized_hi3_m1(args.model_id)
#
# or with:
#
#     self.model = load_quantized_hi3_m2(args.model_id)
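

# Both loaders above assume the checkpoint under `model_path` was already serialized
# with optimum-quanto: an int4 state dict in model.safetensors plus, for method 2, a
# quantization_map.json. The helper below is a minimal, untested sketch of that save
# step, following the generic optimum-quanto serialization recipe (quantize + freeze,
# then save the state dict and quantization_map()). The name save_quantized_hi3 and
# the src_model_path/save_path parameters are illustrative, and model-specific edge
# cases (e.g. tied/shared tensors when calling save_file) may need extra handling.
def save_quantized_hi3(src_model_path, save_path):
    import os

    from optimum.quanto import freeze, quantization_map
    from safetensors.torch import save_file

    # Load the original (unquantized) bf16 checkpoint on CPU.
    model = HunyuanImage3ForCausalMM.from_pretrained(
        src_model_path,
        dtype=torch.bfloat16,
        device_map=None,
        attn_implementation="sdpa",
        moe_impl="eager",
        moe_drop_tokens=True,
        trust_remote_code=True,
    )

    # Quantize the weights to int4 and freeze them so the quantized values are stored.
    quantize(model, weights=qint4)
    freeze(model)

    # Serialize the quantized weights and the per-module quantization map.
    os.makedirs(save_path, exist_ok=True)
    save_file(model.state_dict(), f"{save_path}/model.safetensors")
    with open(f"{save_path}/quantization_map.json", "w") as f:
        json.dump(quantization_map(model), f)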