Spaces:
Sleeping
Sleeping
File size: 2,890 Bytes
05fb4ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import os
import subprocess
def prepare_data(root_path, mode='default'):
    """Untar data at the specified root_path. Only useful for euler cluster.

    Nothing happens unless ``mode == 'euler'``. In euler mode the archive at
    ``root_path`` is unpacked into the directory named by the ``TMPDIR``
    environment variable. Once the extraction command has succeeded, an empty
    marker file ``<TMPDIR>/<name>_done.txt`` is created so that later calls
    skip the work; ``<name>`` is the last component of ``root_path`` up to
    its first dot.

    Args:
        root_path: Path to a ``.tar.gz``, ``.tar``, ``.tar.xz`` or ``.zip``
            archive, or to a directory, which is passed to
            ``get_dataset.sh``.
        mode: ``'euler'`` to extract; any other value makes this a no-op.

    Raises:
        KeyError: If ``TMPDIR`` is not set (euler mode only).
        ValueError: If ``root_path`` matches none of the supported kinds.
        subprocess.CalledProcessError: If the extraction command exits with
            a non-zero status. The marker file is not written in that case,
            so the next call retries instead of silently skipping.
    """
    if mode != 'euler':
        return

    tar_file_path = root_path
    tar_file_name = os.path.split(tar_file_path)[-1].split('.')[0]
    untar_path = os.environ["TMPDIR"]
    out_path = '{}/{}'.format(untar_path, tar_file_name)
    marker_path = '{}_done.txt'.format(out_path)

    if os.path.isfile(marker_path):
        # A previous call already extracted this archive successfully.
        return

    if tar_file_path.endswith('.tar.gz'):
        cmd = 'tar -I pigz -xf {} -C {}'.format(tar_file_path, untar_path)
    elif tar_file_path.endswith('tar'):
        cmd = 'tar -xf {} -C {}'.format(tar_file_path, untar_path)
    elif tar_file_path.endswith('tar.xz'):
        cmd = 'tar -xvf {} -C {}'.format(tar_file_path, untar_path)
    elif tar_file_path.endswith('.zip'):
        cmd = 'unzip {} -d {}'.format(tar_file_path, untar_path)
    elif os.path.isdir(tar_file_path):
        # directory containing multiple tar files
        cmd = 'bash get_dataset.sh -n 16 -d {}'.format(tar_file_path)
    else:
        raise ValueError('Untaring file selected not valid : {}'.format(root_path))

    print('Copying data: {}'.format(cmd))
    # NOTE: the paths are interpolated unquoted into a shell command line, so
    # they must not contain whitespace or shell metacharacters.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    # stderr is merged into stdout above, so only the first item carries data.
    output, _ = proc.communicate()
    print(output.decode(errors='replace'))

    if proc.returncode != 0:
        # Fail before the marker is created: a marker left behind by a failed
        # extraction would make every later call skip the work.
        raise subprocess.CalledProcessError(proc.returncode, cmd, output=output)

    with open(marker_path, mode='a'):
        pass
def dataset_wrapper(base_dataset, mode='default'):
    """Prepare ``base_dataset`` for the given execution mode and return it.

    * ``'default'``: calls ``base_dataset.initialize()``.
    * ``'euler'``: calls the dataset's own ``prepare_data_euler()`` method
      when it has one, otherwise falls back to the module-level
      ``prepare_data_euler`` function and returns its result.

    Raises:
        Exception: If ``mode`` is neither ``'default'`` nor ``'euler'``.
    """
    if mode == 'default':
        base_dataset.initialize()
        return base_dataset

    if mode != 'euler':
        raise Exception('Unknown mode {}'.format(mode))

    # A dataset may provide its own preparation routine; prefer it if present.
    has_own_routine = hasattr(base_dataset, 'prepare_data_euler')
    if not has_own_routine:
        return prepare_data_euler(base_dataset)

    base_dataset.prepare_data_euler()
    return base_dataset
def prepare_data_euler(base_dataset):
    """Untar ``base_dataset.root`` into ``$TMPDIR`` and repoint the dataset there.

    The archive at ``base_dataset.root`` is extracted with ``tar xvf`` into
    the directory named by the ``TMPDIR`` environment variable. Once the
    extraction has succeeded, an empty marker file
    ``<TMPDIR>/<name>_done.txt`` is created so that later calls skip the
    work; ``<name>`` is the last component of the archive path up to its
    first dot. Finally ``base_dataset.root`` is set to ``<TMPDIR>/<name>``.

    Args:
        base_dataset: Object with a ``root`` attribute holding the archive
            path. It is modified in place.

    Returns:
        The same ``base_dataset`` object, with ``root`` updated.

    Raises:
        KeyError: If ``TMPDIR`` is not set.
        subprocess.CalledProcessError: If ``tar`` exits with a non-zero
            status. Neither the marker file nor ``base_dataset.root`` is
            touched in that case, so the next call retries.
    """
    tar_file_path = base_dataset.root
    tar_file_name = os.path.split(tar_file_path)[-1].split('.')[0]
    print(tar_file_name)
    untar_path = os.environ["TMPDIR"]
    out_path = '{}/{}'.format(untar_path, tar_file_name)
    marker_path = '{}_done.txt'.format(out_path)

    if not os.path.isfile(marker_path):
        cmd = 'tar xvf {} -C {}'.format(tar_file_path, untar_path)
        print('Copying data: {}'.format(cmd))
        # NOTE: the paths are interpolated unquoted into a shell command
        # line, so they must not contain whitespace or shell metacharacters.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
        # stderr is merged into stdout above, so only the first item carries data.
        output, _ = proc.communicate()
        print(output.decode(errors='replace'))

        if proc.returncode != 0:
            # Fail before the marker is created: a marker left behind by a
            # failed extraction would make every later call skip the work.
            raise subprocess.CalledProcessError(proc.returncode, cmd, output=output)

        with open(marker_path, mode='a'):
            pass

    base_dataset.root = out_path
    # NOTE: initialize() is not called here.
    return base_dataset