# llm-profiler/config.py
# -*- coding: utf-8 -*-
# Description: GPU, model, parallelism, data, train and inference config definitions.
import json
import math
from dataclasses import dataclass
from enum import Enum
from functools import total_ordering
from typing import Optional

from constants import *


class ActivationRecomputation(Enum):
    NONE = 0
    """No activation recomputation; requires the most amount of memory."""

    SELECTIVE = 1
    """Selectively checkpoints and recomputes only parts of each transformer
    layer that take up a considerable amount of memory but are not
    computationally expensive to recompute, i.e. Q K V matrix multiplies,
    QK^T matrix multiply, softmax, softmax dropout, and attention over V."""

    FULL = 2
    """Full activation recomputation stores the input to EVERY transformer
    layer, which is sharded across the tensor parallel group, thus requiring an
    extra all-gather (ignored for now) per layer and adding communication
    overhead; requires the least amount of memory; requires an extra forward
    pass."""


@total_ordering
class DSZeRO(Enum):
    NONE = 0
    """No DeepSpeed ZeRO; requires the most amount of memory."""

    STAGE_1 = 1
    """ZeRO stage 1 shards the optimizer states across the data parallel
    group."""

    STAGE_2 = 2
    """ZeRO stage 2 shards the optimizer states and gradients across the data
    parallel group."""

    STAGE_3 = 3
    """ZeRO stage 3 shards the optimizer states, gradients, and model weights
    across the data parallel group."""

    def __lt__(self, other):
        # Compare members by their underlying integer stage values;
        # @total_ordering derives the remaining comparison operators.
        if other.__class__ is self.__class__:
            return self.value < other.value
        return NotImplemented

    def __eq__(self, other):
        if isinstance(other, DSZeRO):
            return self.value == other.value
        return NotImplemented

    # Defining __eq__ implicitly sets __hash__ to None; restore Enum's hash
    # so members stay usable as dict keys and set elements.
    __hash__ = Enum.__hash__
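

# Illustrative usage sketch (not exercised by this module): stages compare by
# how aggressively they shard training state.
#
#     >>> DSZeRO.STAGE_1 < DSZeRO.STAGE_3
#     True
#     >>> DSZeRO.STAGE_2 >= DSZeRO.STAGE_1  # derived by @total_ordering
#     True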


@dataclass
class GPUEfficiencyConfig:
    """Efficiency factors in (0, 1] applied to peak hardware numbers."""
    flops_efficiency: float = 1.0
    hbm_memory_efficiency: float = 1.0
    intra_node_memory_efficiency: float = 1.0
    inter_node_memory_efficiency: float = 1.0


@dataclass
class InferenceConfig:
    """Inference configuration dataclass."""
    batch_size_per_gpu: Optional[int] = None  # batch size per GPU
    seq_len: int = 522  # input sequence length
    generate_len: int = 1526  # number of tokens to generate
    context_len: Optional[int] = None  # context length, defaults to seq_len + generate_len
    use_kv_cache: bool = True  # whether to use the key/value cache
    bytes_per_param: int = BYTES_FP16  # bytes per model weight
    layernorm_dtype_bytes: int = BYTES_FP16  # bytes of the layernorm data type
    kv_cache_dtype_bytes: int = BYTES_FP16  # bytes of the key/value cache data type

    def __post_init__(self):
        if self.context_len is None:
            self.context_len = self.seq_len + self.generate_len
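

# Illustrative usage sketch (made-up lengths): context_len defaults to the
# full prompt-plus-generation window.
#
#     >>> InferenceConfig(seq_len=512, generate_len=1536).context_len
#     2048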


@dataclass
class ParallelismConfig:
    """Parallelism configuration: tensor, pipeline, data, and sequence
    parallelism degrees.
    """
    tp_size: int = 1  # tensor parallelism size, Megatron-LM tensor parallelism implementation
    pp_size: int = 1  # pipeline parallelism size, Megatron-LM pipeline parallelism implementation
    dp_size: int = 1  # data parallelism size, DeepSpeed ZeRO parallelism implementation
    sp_size: int = 1  # sequence parallelism size, Megatron-LM sequence parallelism implementation


@dataclass
class ModelConfig:
    num_layers: int  # number of transformer layers (blocks)
    n_head: int  # number of attention heads
    hidden_dim: int  # hidden dimension
    vocab_size: int  # vocabulary size
    num_key_value_heads: Optional[int] = None  # number of key/value heads, defaults to n_head
    max_seq_len: Optional[int] = None  # max sequence length
    ffn_embed_dim: Optional[int] = None  # hidden dimension of FFN, defaults to 4 * hidden_dim
    model_type: Optional[str] = None  # model type as tagged on Hugging Face (e.g., gpt2, opt, llama)
    model_name: Optional[str] = None  # model name as tagged on Hugging Face (e.g., gpt2-xl, opt, llama-13b)

    def __post_init__(self):
        if self.num_key_value_heads is None:  # default to standard multi-head attention
            self.num_key_value_heads = self.n_head
        if self.ffn_embed_dim is None:
            self.ffn_embed_dim = self.hidden_dim * 4
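

# Illustrative usage sketch (made-up llama-13b-like dimensions): unset fields
# are derived in __post_init__.
#
#     >>> cfg = ModelConfig(num_layers=40, n_head=40, hidden_dim=5120, vocab_size=32000)
#     >>> cfg.num_key_value_heads, cfg.ffn_embed_dim
#     (40, 20480)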


@dataclass
class GPUConfig:
    # 1. GPU model and memory capacity
    name: str  # GPU config name
    memory_GPU_in_GB: float  # memory per GPU in GB
    # 2. GPU HBM bandwidth, intra-node bandwidth, and inter-node bandwidth
    hbm_bandwidth_in_GB_per_sec: float  # GPU HBM bandwidth in GB/s
    intra_node_bandwidth_in_GB_per_sec: float  # intra-node GPU bandwidth in GB/s (PCIe/NVLink)
    intra_node_min_message_latency: float  # minimum intra-node message latency in seconds
    inter_node_bandwidth_in_GB_per_sec: float = 200  # inter-node bandwidth in GB/s, assuming Mellanox 200 Gbps HDR InfiniBand
    # 3. Peak Tensor Core throughput at different precisions
    peak_fp32_TFLOPS: Optional[float] = None  # peak Tensor TFLOPS for FP32
    peak_fp16_TFLOPS: Optional[float] = None  # peak Tensor TFLOPS for FP16
    peak_int8_TFLOPS: Optional[float] = None  # peak Tensor TFLOPS for INT8
    peak_int4_TFLOPS: Optional[float] = None  # peak Tensor TFLOPS for INT4
    # Class-level constants (unannotated, so not dataclass fields): default
    # efficiency factors applied to the peak numbers in __post_init__.
    FLOPS_EFFICIENCY = 0.7
    HBM_MEMORY_EFFICIENCY = 0.9

    def __post_init__(self):
        """Object creation of a dataclass starts with __init__() (the
        constructor) and ends with __post_init__() (post-init processing).
        """
        # Derive missing peaks from the FP16 number: FP32 is half, and
        # INT8/INT4 are 2x/4x the FP16 Tensor Core throughput.
        if self.peak_fp32_TFLOPS is None:
            self.peak_fp32_TFLOPS = math.ceil(self.peak_fp16_TFLOPS / 2)
        if self.peak_int8_TFLOPS is None:
            self.peak_int8_TFLOPS = 2 * self.peak_fp16_TFLOPS
        if self.peak_int4_TFLOPS is None:
            self.peak_int4_TFLOPS = 4 * self.peak_fp16_TFLOPS
        # Discount peak compute and bandwidth by the default efficiency factors.
        if self.FLOPS_EFFICIENCY:
            self.peak_fp32_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_fp16_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_int8_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_int4_TFLOPS *= self.FLOPS_EFFICIENCY
        if self.HBM_MEMORY_EFFICIENCY:
            self.hbm_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
            self.intra_node_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
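

# Illustrative usage sketch (V100-PCIe-like numbers, assumed): missing peaks
# are derived from FP16 and then discounted by the default efficiency factors.
#
#     >>> gpu = GPUConfig(name="v100-pcie-32gb", memory_GPU_in_GB=32,
#     ...                 hbm_bandwidth_in_GB_per_sec=900,
#     ...                 intra_node_bandwidth_in_GB_per_sec=32,
#     ...                 intra_node_min_message_latency=8e-06,
#     ...                 peak_fp16_TFLOPS=112)
#     >>> round(gpu.peak_int8_TFLOPS, 1)  # 2 * 112, scaled by 0.7
#     156.8
#     >>> round(gpu.hbm_bandwidth_in_GB_per_sec, 1)  # 900 * 0.9
#     810.0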


class LLMConfigs:
    """Aggregates the model, GPU, parallelism, inference, and efficiency
    configs consumed by the profiler."""

    def __init__(self, gpu_config: GPUConfig,
                 model_config: ModelConfig,
                 parallelism_config: Optional[ParallelismConfig] = None,
                 inference_config: Optional[InferenceConfig] = None,
                 gpu_efficiency_config: Optional[GPUEfficiencyConfig] = None
                 ) -> None:
        self.model_config = model_config
        self.gpu_config = gpu_config
        # Fall back to fresh defaults so one default instance is not shared
        # across LLMConfigs objects.
        self.parallelism_config = parallelism_config or ParallelismConfig()
        self.inference_config = inference_config or InferenceConfig()  # user-specified config
        self.gpu_efficiency_config = gpu_efficiency_config or GPUEfficiencyConfig()  # user-specified config


def get_model_and_gpu_config_by_name(model_name="llama-13b", gpu_name="v100-pcie-32gb") -> tuple:
    """Read the model and GPU configs from the bundled JSON files."""
    config_files = ["configs/model_configs.json", "configs/gpu_configs.json"]
    model_config, gpu_config = None, None

    for config_filename in config_files:
        with open(config_filename, "r") as f:
            config_json = json.load(f)
            if "model" in config_filename:
                assert model_name in config_json, f"model name {model_name} not found in {config_filename}"
                config_dict = config_json[model_name]
                model_config = ModelConfig(**config_dict)
            elif "gpu" in config_filename:
                assert gpu_name in config_json, f"gpu name {gpu_name} not found in {config_filename}"
                config_dict = config_json[gpu_name]
                gpu_config = GPUConfig(**config_dict)
            else:
                assert False, f"unknown config file: {config_filename}"

    return model_config, gpu_config
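

# Illustrative JSON shape (hypothetical, inferred from the dataclass fields):
# each file maps a name to keyword arguments for the matching config class.
#
#     configs/model_configs.json:
#         {"llama-13b": {"num_layers": 40, "n_head": 40,
#                        "hidden_dim": 5120, "vocab_size": 32000}}
#
#     >>> model_config, gpu_config = get_model_and_gpu_config_by_name("llama-13b", "v100-pcie-32gb")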


def get_TFLOPS_per_gpu(gpu_config: GPUConfig, data_type="fp16", flops_efficiency=1.0) -> float:
    """Get the expected TFLOPS per GPU for the specified data type,
    adjusted by flops_efficiency.

    Returns:
        float: TFLOPS per GPU.
    """
    if data_type == "int8":
        gemm_TFLOPS = gpu_config.peak_int8_TFLOPS
    elif data_type == "fp16":
        gemm_TFLOPS = gpu_config.peak_fp16_TFLOPS
    else:
        raise ValueError("data_type must be 'int8' or 'fp16'!")

    return gemm_TFLOPS * flops_efficiency
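

# Illustrative call (reusing the GPUConfig sketch above; note the peaks already
# include the default 0.7 factor from __post_init__):
#
#     >>> round(get_TFLOPS_per_gpu(gpu, data_type="fp16", flops_efficiency=1.0), 1)
#     78.4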


def get_gpu_hbm_bandwidth(gpu_config: GPUConfig, hbm_memory_efficiency=1.0) -> float:
    """Effective GPU HBM bandwidth in GB/s."""
    return gpu_config.hbm_bandwidth_in_GB_per_sec * hbm_memory_efficiency


def get_intra_node_bandwidth(gpu_config: GPUConfig, intra_node_memory_efficiency=1.0) -> float:
    """Effective intra-node (PCIe/NVLink) bandwidth in GB/s."""
    return gpu_config.intra_node_bandwidth_in_GB_per_sec * intra_node_memory_efficiency


def get_inter_node_bandwidth(gpu_config: GPUConfig, inter_node_memory_efficiency=1.0) -> float:
    """Effective inter-node bandwidth in GB/s."""
    return gpu_config.inter_node_bandwidth_in_GB_per_sec * inter_node_memory_efficiency
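

if __name__ == "__main__":
    # Minimal smoke test (assumes the configs/ JSON files ship alongside this
    # module, as get_model_and_gpu_config_by_name expects).
    model_config, gpu_config = get_model_and_gpu_config_by_name()
    llm_configs = LLMConfigs(gpu_config=gpu_config, model_config=model_config)
    print(llm_configs.model_config)
    print(get_TFLOPS_per_gpu(llm_configs.gpu_config))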