import os
import shutil
import subprocess


def _archive_stem(archive_path):
    """Return the base name of ``archive_path`` up to its first dot.

    ``normpath`` strips a trailing slash first, so a directory path such as
    ``/data/bundle/`` yields ``bundle`` instead of an empty string.
    """
    return os.path.basename(os.path.normpath(archive_path)).split('.')[0]


def _staging_paths(archive_path):
    """Compute the scratch locations associated with ``archive_path``.

    Returns:
        A ``(untar_path, out_path, marker_path)`` tuple where ``untar_path``
        is ``$TMPDIR``, ``out_path`` is ``$TMPDIR/<stem>`` and ``marker_path``
        is ``$TMPDIR/<stem>_done.txt``.

    Raises:
        KeyError: if the ``TMPDIR`` environment variable is not set.
    """
    untar_path = os.environ["TMPDIR"]
    out_path = '{}/{}'.format(untar_path, _archive_stem(archive_path))
    marker_path = '{}_done.txt'.format(out_path)
    return untar_path, out_path, marker_path


def _extraction_command(archive_path, untar_path):
    """Build the argument list that unpacks ``archive_path`` into ``untar_path``.

    Raises:
        ValueError: if ``archive_path`` has no supported suffix and is not a
            directory.
    """
    if archive_path.endswith('.tar.gz'):
        # pigz decompresses in parallel; fall back to tar's built-in gzip
        # support when it is not installed.
        if shutil.which('pigz'):
            return ['tar', '-I', 'pigz', '-xf', archive_path, '-C', untar_path]
        return ['tar', '-xzf', archive_path, '-C', untar_path]
    if archive_path.endswith('.tar'):
        return ['tar', '-xf', archive_path, '-C', untar_path]
    if archive_path.endswith('.tar.xz'):
        return ['tar', '-xvf', archive_path, '-C', untar_path]
    if archive_path.endswith('.zip'):
        return ['unzip', archive_path, '-d', untar_path]
    if os.path.isdir(archive_path):
        # Directory containing multiple tar files; get_dataset.sh is resolved
        # relative to the current working directory.
        return ['bash', 'get_dataset.sh', '-n', '16', '-d', archive_path]
    raise ValueError('Untaring file selected not valid : {}'.format(archive_path))


def _run_and_mark(cmd, marker_path):
    """Run ``cmd`` and create ``marker_path`` only if it exits successfully.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled literally.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status. The marker is not written in that case, so the next call
            retries the extraction.
    """
    print('Copying data: {}'.format(' '.join(cmd)))
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # stderr is merged into stdout, so this is the complete command output.
    print(result.stdout.decode(errors='replace'))
    result.check_returncode()
    with open(marker_path, mode='a'):
        pass


def prepare_data(root_path, mode='default'):
    """Untar data at the specified root_path. Only useful for euler cluster.

    In ``'euler'`` mode the archive (``.tar.gz``, ``.tar``, ``.tar.xz``,
    ``.zip``, or a directory of archives) is unpacked into ``$TMPDIR`` unless
    the marker file ``$TMPDIR/<stem>_done.txt`` already exists. Any other
    mode is a no-op.

    Args:
        root_path: Path to the archive, or to a directory of archives.
        mode: ``'euler'`` to unpack; anything else does nothing.

    Raises:
        KeyError: if ``TMPDIR`` is not set (euler mode only).
        ValueError: if ``root_path`` is not a supported archive or directory.
        subprocess.CalledProcessError: if the extraction command fails.
    """
    if mode != 'euler':
        return
    untar_path, _, marker_path = _staging_paths(root_path)
    if os.path.isfile(marker_path):
        return
    _run_and_mark(_extraction_command(root_path, untar_path), marker_path)


def dataset_wrapper(base_dataset, mode='default'):
    """Prepare ``base_dataset`` for the given execution mode and return it.

    Args:
        base_dataset: Dataset object. In ``'default'`` mode its
            ``initialize()`` method is called. In ``'euler'`` mode its own
            ``prepare_data_euler()`` method is used when present; otherwise
            the module-level ``prepare_data_euler`` is applied.
        mode: ``'default'`` or ``'euler'``.

    Returns:
        The prepared dataset object.

    Raises:
        ValueError: if ``mode`` is not recognised.
    """
    if mode == 'default':
        base_dataset.initialize()
    elif mode == 'euler':
        if hasattr(base_dataset, 'prepare_data_euler'):
            base_dataset.prepare_data_euler()
        else:
            base_dataset = prepare_data_euler(base_dataset)
    else:
        raise ValueError('Unknown mode {}'.format(mode))
    return base_dataset


def prepare_data_euler(base_dataset):
    """Unpack ``base_dataset.root`` into ``$TMPDIR`` and repoint the dataset.

    The archive is extracted with ``tar xvf`` unless the marker file
    ``$TMPDIR/<stem>_done.txt`` already exists. Afterwards
    ``base_dataset.root`` is set to ``$TMPDIR/<stem>``; this assumes the
    archive unpacks into a directory named after its stem.

    NOTE: ``base_dataset.initialize()`` is not invoked here.

    Args:
        base_dataset: Object whose ``root`` attribute is the tar file path.

    Returns:
        The same ``base_dataset`` object, with ``root`` updated.

    Raises:
        KeyError: if ``TMPDIR`` is not set.
        subprocess.CalledProcessError: if the extraction command fails.
    """
    tar_file_path = base_dataset.root
    print(_archive_stem(tar_file_path))
    untar_path, out_path, marker_path = _staging_paths(tar_file_path)
    if not os.path.isfile(marker_path):
        _run_and_mark(['tar', 'xvf', tar_file_path, '-C', untar_path], marker_path)
    base_dataset.root = out_path
    return base_dataset