Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import io | |
| import logging | |
| from llm_profiler import * | |
| import sys | |
| from contextlib import redirect_stdout | |
# Model presets selectable in the UI dropdown (order is preserved).
model_names = (
    "opt-1.3b opt-6.7b opt-13b opt-66b opt-175b "
    "gpt2 gpt2-medium gpt2-large gpt2-xl "
    "bloom-560m bloom-7b bloom-175b "
    "llama-7b llama-13b llama-30b llama-65b "
    "llama2-13b llama2-70b "
    "internlm-20b baichuan2-13b"
).split()
# GPU presets selectable in the UI dropdown (order is preserved).
gpu_names = (
    "t4-pcie-15gb v100-pcie-32gb v100-sxm-32gb br104p "
    "a100-pcie-40gb a100-sxm-40gb a100-pcie-80gb a100-sxm-80gb "
    "910b-64gb h100-sxm-80gb h100-pcie-80gb "
    "a30-pcie-24gb a30-sxm-24gb a40-pcie-48gb"
).split()
# Log handler that writes formatted records into an in-memory StringIO buffer.
class StringHandler(logging.Handler):
    """Logging handler that accumulates formatted records in memory.

    The backing ``self.stream`` (an ``io.StringIO``) doubles as a redirect
    target for stdout so that both log records and printed output end up in
    the same buffer, retrievable via :meth:`get_value`.
    """

    def __init__(self):
        super().__init__()
        self.stream = io.StringIO()  # in-memory buffer holding all output
        self.setFormatter(logging.Formatter("%(message)s"))

    def emit(self, record):
        """Format *record* and append it to the buffer with a trailing newline."""
        # Per the logging.Handler contract, emit() must never let a
        # formatting error escape; report it through handleError() instead.
        try:
            self.stream.write(self.format(record) + "\n")
        except Exception:
            self.handleError(record)

    def get_value(self):
        """Return everything written to the buffer so far as one string."""
        return self.stream.getvalue()
# Module-level logger wired to the in-memory handler so the web UI
# can display everything that gets logged during a profiling run.
logger = logging.getLogger(__name__)
string_handler = StringHandler()
logger.addHandler(string_handler)
logger.setLevel(logging.INFO)
def gradio_interface(
    model_name="llama2-70b",
    gpu_name: str = "t4-pcie-15gb",
    bytes_per_param: int = BYTES_FP16,
    batch_size_per_gpu: int = 2,
    seq_len: int = 300,
    generate_len: int = 40,
    ds_zero: int = 0,
    dp_size: int = 1,
    tp_size: int = 4,
    pp_size: int = 1,
    sp_size: int = 1,
    use_kv_cache: bool = True,
    layernorm_dtype_bytes: int = BYTES_FP16,
    kv_cache_dtype_bytes: int = BYTES_FP16,
    flops_efficiency: float = FLOPS_EFFICIENCY,
    hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
    intra_node_memory_efficiency: float = INTRA_NODE_MEMORY_EFFICIENCY,
    inter_node_memory_efficiency: float = INTER_NODE_MEMORY_EFFICIENCY,
    mode: str = "inference",
    print_flag: bool = True,
) -> list:
    """Run ``llm_profile_infer`` and capture everything it prints.

    Parameters mirror ``llm_profile_infer`` one-to-one and are passed
    through positionally, in the same order.

    Returns:
        A 2-tuple ``(results, log_output)`` where ``results`` is whatever
        the profiler returns and ``log_output`` is the text captured from
        stdout (and the module logger) during the call.
    """
    # Reset the shared buffer so each run only shows its own output.
    string_handler.stream.seek(0)
    string_handler.stream.truncate()

    # redirect_stdout guarantees sys.stdout is restored even if the
    # profiler raises. (The previous manual sys.stdout swap had no
    # try/finally, so an exception left stdout redirected permanently.)
    with redirect_stdout(string_handler.stream):
        results = llm_profile_infer(
            model_name,
            gpu_name,
            bytes_per_param,
            batch_size_per_gpu,
            seq_len,
            generate_len,
            ds_zero,
            dp_size,
            tp_size,
            pp_size,
            sp_size,
            use_kv_cache,
            layernorm_dtype_bytes,
            kv_cache_dtype_bytes,
            flops_efficiency,
            hbm_memory_efficiency,
            intra_node_memory_efficiency,
            inter_node_memory_efficiency,
            mode,
            print_flag,
        )

    # Return the profiler result plus the captured log text for the UI.
    return results, string_handler.get_value()
# Build the web UI. Input widgets appear in exactly the same order as
# the parameters of gradio_interface, since Gradio maps them positionally.
_inputs = [
    gr.Dropdown(choices=model_names, label="Model Name", value="llama2-70b"),
    gr.Dropdown(choices=gpu_names, label="GPU Name", value="a100-sxm-80gb"),
    gr.Number(label="Bytes per Param", value=BYTES_FP16),
    gr.Number(label="Batch Size per GPU", value=2),
    gr.Number(label="Sequence Length", value=300),
    gr.Number(label="Generate Length", value=40),
    gr.Number(label="DS Zero", value=0),
    gr.Number(label="DP Size", value=1),
    gr.Number(label="TP Size", value=4),
    gr.Number(label="PP Size", value=1),
    gr.Number(label="SP Size", value=1),
    gr.Checkbox(label="Use KV Cache", value=True),
    gr.Number(label="Layernorm dtype Bytes", value=BYTES_FP16),
    gr.Number(label="KV Cache dtype Bytes", value=BYTES_FP16),
    gr.Number(label="FLOPS Efficiency", value=FLOPS_EFFICIENCY),
    gr.Number(label="HBM Memory Efficiency", value=HBM_MEMORY_EFFICIENCY),
    gr.Number(
        label="Intra Node Memory Efficiency", value=INTRA_NODE_MEMORY_EFFICIENCY
    ),
    gr.Number(
        label="Inter Node Memory Efficiency", value=INTER_NODE_MEMORY_EFFICIENCY
    ),
    gr.Radio(choices=["inference", "other_mode"], label="Mode", value="inference"),
    gr.Checkbox(label="Print Flag", value=True),
]
_outputs = [
    gr.Textbox(label="Inference Results"),  # summary returned by the profiler
    gr.Textbox(label="Detailed Analysis"),  # stdout/log text captured per run
]
iface = gr.Interface(
    fn=gradio_interface,
    inputs=_inputs,
    outputs=_outputs,
    title="LLM Profiler",
    description="Input parameters to profile your LLM.",
)
# Launch behind HTTP basic auth; no public share link.
iface.launch(auth=("xtrt-llm", "xtrt-llm"), share=False)