""" |
|
|
Distributed basic functions. |
|
|
""" |
|
|
|
|
|
import os |
|
|
from datetime import timedelta |
|
|
import torch |
|
|
import torch.distributed as dist |
|
|
from torch.nn.parallel import DistributedDataParallel |
|
|
|
|
|
|
|
|
def get_global_rank() -> int: |
|
|
""" |
|
|
Get the global rank, the global index of the GPU. |
|
|
""" |
|
|
return int(os.environ.get("RANK", "0")) |
|
|
|
|
|
|
|
|
def get_local_rank() -> int: |
|
|
""" |
|
|
Get the local rank, the local index of the GPU. |
|
|
""" |
|
|
return int(os.environ.get("LOCAL_RANK", "0")) |
|
|
|
|
|
|
|
|
def get_world_size() -> int: |
|
|
""" |
|
|
Get the world size, the total amount of GPUs. |
|
|
""" |
|
|
return int(os.environ.get("WORLD_SIZE", "1")) |


def get_device() -> torch.device:
    """
    Get the CUDA device assigned to the current rank.
    """
    return torch.device("cuda", get_local_rank())


def barrier_if_distributed(*args, **kwargs):
    """
    Synchronize all processes if running in a distributed context; no-op otherwise.
    """
    if dist.is_initialized():
        return dist.barrier(*args, **kwargs)


def init_torch(cudnn_benchmark=True, timeout=timedelta(seconds=600)):
    """
    Common PyTorch initialization: enable TF32, configure cuDNN benchmarking, bind
    this process to its GPU, and initialize the NCCL process group.
    """
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = cudnn_benchmark
    torch.cuda.set_device(get_local_rank())
    dist.init_process_group(
        backend="nccl",
        rank=get_global_rank(),
        world_size=get_world_size(),
        timeout=timeout,
    )
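

# Note (not part of the original module): init_torch() leaves init_method at its
# default "env://" rendezvous, so MASTER_ADDR and MASTER_PORT must be present in the
# environment alongside RANK/LOCAL_RANK/WORLD_SIZE. A torchrun launch provides all of
# these, e.g. (illustrative command, the script name is a placeholder):
#
#   torchrun --nnodes=1 --nproc_per_node=4 train.py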


def convert_to_ddp(module: torch.nn.Module, **kwargs) -> DistributedDataParallel:
    """
    Wrap a module in DistributedDataParallel, pinned to the current rank's GPU.
    """
    return DistributedDataParallel(
        module=module,
        device_ids=[get_local_rank()],
        output_device=get_local_rank(),
        **kwargs,
    )
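

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the original module. It assumes this file
    # is launched with torchrun on a CUDA machine, e.g.
    #   torchrun --nproc_per_node=<num_gpus> this_file.py
    # and uses a toy linear layer purely for illustration.
    init_torch()
    model = convert_to_ddp(torch.nn.Linear(8, 8).to(get_device()))
    x = torch.randn(4, 8, device=get_device())
    model(x).sum().backward()  # DDP all-reduces gradients across ranks here
    barrier_if_distributed()
    dist.destroy_process_group()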