# --------------------------------------------------------
# Based on BEiT, timm, DINO and DeiT code bases
# https://github.com/microsoft/unilm/tree/master/beit
# https://github.com/rwightman/pytorch-image-models/tree/master/timm
# https://github.com/facebookresearch/deit
# https://github.com/facebookresearch/dino
# --------------------------------------------------------
import os
import pickle
import shutil
import tempfile

import torch
import torch.distributed as dist

def setup_for_distributed(is_master):
    """
    This function disables printing when not in the master process.
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print
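
# Usage sketch (illustrative, not part of the original file): after this call,
# plain print() is silent on non-master ranks, but a message can still be
# forced through on every rank via the `force` keyword handled above.
#
#     setup_for_distributed(is_master=(get_rank() == 0))
#     print("visible only on rank 0")
#     print("visible on every rank", force=True)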

def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)
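
# Usage sketch (illustrative only; `model` and `epoch` are placeholder names,
# not defined in this file): checkpoint from the main process so that multiple
# ranks never write the same file concurrently.
#
#     if is_main_process():
#         print(f"world size: {get_world_size()}")
#     save_on_master({'model': model.state_dict(), 'epoch': epoch}, 'checkpoint.pth')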

def init_distributed_mode(args):
    if args.dist_on_itp:
        # OpenMPI-style launch: rank info comes from the OMPI_* variables.
        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
        args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
        args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
        os.environ['LOCAL_RANK'] = str(args.gpu)
        os.environ['RANK'] = str(args.rank)
        os.environ['WORLD_SIZE'] = str(args.world_size)
        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        # torchrun / torch.distributed.launch sets RANK, WORLD_SIZE and LOCAL_RANK.
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        # SLURM launch: derive the local GPU index from the global process id.
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}, gpu {}'.format(
        args.rank, args.dist_url, args.gpu), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)
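
# Usage sketch (assumed argparse wiring in the style of the DeiT/BEiT repos this
# file is based on; the exact flag names and defaults in the surrounding project
# may differ):
#
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--dist_on_itp', action='store_true')
#     parser.add_argument('--dist_url', default='env://')
#     args = parser.parse_args()
#     init_distributed_mode(args)  # fills args.rank, args.world_size, args.gpu, args.distributed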

# From MMCV
def collect_results_cpu(result_part, size, tmpdir=None):
    """Collect results under cpu mode.

    On cpu mode, this function will save the results on different gpus to
    ``tmpdir`` and collect them by the rank 0 worker.

    Args:
        result_part (list): Result list containing result parts
            to be collected.
        size (int): Size of the results, commonly equal to the length of
            the results.
        tmpdir (str | None): Temporary directory for the collected results to
            be stored in. If set to None, a random temporary directory will be
            created for it.

    Returns:
        list: The collected results.
    """
    rank = get_rank()
    world_size = get_world_size()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            os.makedirs('/tmp/dist_test', exist_ok=True)
            tmpdir = tempfile.mkdtemp(dir='/tmp/dist_test')
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        os.makedirs(tmpdir, exist_ok=True)
    # dump the part result to the dir
    tmp_file = os.path.join(tmpdir, f'part_{rank}.pkl')
    pickle.dump(result_part, open(str(tmp_file), "wb"))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = os.path.join(tmpdir, f'part_{i}.pkl')
            part_result = pickle.load(open(str(part_file), "rb"))
            # When data is severely insufficient, an empty part_result
            # on a certain gpu could make the overall outputs empty.
            if part_result:
                part_list.append(part_result)
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
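
# Usage sketch (illustrative only; `data_loader`, `model`, `dataset`, and
# `evaluate` are placeholders, not names from this file): each rank evaluates
# its shard, then rank 0 gathers the parts in dataset order and drops any
# samples the sampler padded to reach `size`.
#
#     results = []
#     for batch in data_loader:
#         results.extend(model(batch).tolist())
#     all_results = collect_results_cpu(results, size=len(dataset))
#     if is_main_process():
#         evaluate(all_results)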