File size: 2,890 Bytes
05fb4ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import subprocess


def prepare_data(root_path, mode='default'):
    """Unpack the data at ``root_path`` into ``$TMPDIR``. Only useful for the euler cluster.

    Nothing happens unless ``mode == 'euler'``. In euler mode the command is
    chosen from the suffix of ``root_path`` (``.tar.gz``, ``.tar``, ``.tar.xz``,
    ``.zip``) or, if ``root_path`` is a directory, ``get_dataset.sh`` is run on
    it. After the command succeeds an empty marker file
    ``$TMPDIR/<name>_done.txt`` is created, where ``<name>`` is the last path
    component of ``root_path`` up to its first dot. Later calls that find the
    marker return immediately.

    If the command exits with a non-zero status the failure is printed and the
    marker is NOT written, so the next call tries again instead of treating a
    broken extraction as finished.

    Args:
        root_path: Path to an archive or to a directory of archives.
        mode: ``'euler'`` to unpack; any other value makes this a no-op.

    Returns:
        None.

    Raises:
        ValueError: If ``root_path`` is neither a supported archive type nor an
            existing directory.
        KeyError: If ``mode == 'euler'`` and ``TMPDIR`` is not set.
    """
    if mode != 'euler':
        return

    # Local import so the block does not rely on a module-level import of shlex.
    import shlex

    tar_file_path = root_path

    tar_file_name = os.path.split(tar_file_path)[-1].split('.')[0]
    untar_path = os.environ["TMPDIR"]
    out_path = '{}/{}'.format(untar_path, tar_file_name)

    marker_path = '{}_done.txt'.format(out_path)
    if os.path.isfile(marker_path):
        # A previous call already unpacked this data successfully.
        return

    # The command goes through a shell, so quote the paths; otherwise spaces or
    # shell metacharacters in them would split or alter the command.
    src = shlex.quote(tar_file_path)
    dst = shlex.quote(untar_path)

    if tar_file_path.endswith('.tar.gz'):
        # -I pigz makes tar use pigz for decompression.
        cmd = 'tar -I pigz -xf {} -C {}'.format(src, dst)
    elif tar_file_path.endswith('.tar'):
        cmd = 'tar -xf {} -C {}'.format(src, dst)
    elif tar_file_path.endswith('.tar.xz'):
        cmd = 'tar -xvf {} -C {}'.format(src, dst)
    elif tar_file_path.endswith('.zip'):
        cmd = 'unzip {} -d {}'.format(src, dst)
    elif os.path.isdir(tar_file_path):
        # directory containing multiple tar files
        # NOTE(review): get_dataset.sh is resolved relative to the current
        # working directory; its options are not defined in this file.
        cmd = 'bash get_dataset.sh -n 16 -d {}'.format(src)
    else:
        raise ValueError('Untaring file selected not valid : {}'.format(root_path))

    print('Copying data: {}'.format(cmd))
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    # stderr is merged into stdout above, so only the first element carries data.
    output, _ = proc.communicate()
    if output:
        print(output.decode(errors='replace'))

    if proc.returncode != 0:
        print('Copying data failed with exit code {}: {}'.format(proc.returncode, cmd))
        return

    # Record success so that later calls can skip the extraction.
    with open(marker_path, mode='a'):
        pass


def dataset_wrapper(base_dataset, mode='default'):
    """Prepare ``base_dataset`` for the given ``mode`` and return it.

    * ``'default'``: calls ``base_dataset.initialize()``.
    * ``'euler'``: calls the dataset's own ``prepare_data_euler()`` method when
      it has one, otherwise hands the dataset to the module-level
      ``prepare_data_euler`` function and returns that function's result.

    Args:
        base_dataset: Dataset object to prepare.
        mode: ``'default'`` or ``'euler'``.

    Returns:
        The prepared dataset.

    Raises:
        Exception: If ``mode`` is neither ``'default'`` nor ``'euler'``.
    """
    if mode == 'default':
        base_dataset.initialize()
        return base_dataset

    if mode != 'euler':
        raise Exception('Unknown mode {}'.format(mode))

    # Euler mode: a dataset-specific hook takes precedence over the generic one.
    if not hasattr(base_dataset, 'prepare_data_euler'):
        return prepare_data_euler(base_dataset)

    base_dataset.prepare_data_euler()
    return base_dataset


def prepare_data_euler(base_dataset):
    """Untar ``base_dataset.root`` into ``$TMPDIR`` and re-point the dataset at the result.

    ``base_dataset.root`` is treated as the path of a tar archive. It is
    extracted into ``$TMPDIR`` unless the marker file ``$TMPDIR/<name>_done.txt``
    already exists, where ``<name>`` is the archive's file name up to its first
    dot. The marker is written only when ``tar`` exits with status 0; on
    failure the error is printed and no marker is created, so a later call
    retries the extraction.

    ``base_dataset.root`` is then set to ``$TMPDIR/<name>`` in every case. This
    assumes the archive unpacks into a top-level directory with that name.
    ``base_dataset.initialize()`` is not called here.

    Args:
        base_dataset: Object with a ``root`` attribute holding the archive path.

    Returns:
        The same ``base_dataset`` object, with ``root`` updated.

    Raises:
        KeyError: If ``TMPDIR`` is not set.
    """
    # Local import so the block does not rely on a module-level import of shlex.
    import shlex

    tar_file_path = base_dataset.root

    tar_file_name = os.path.split(tar_file_path)[-1].split('.')[0]
    print(tar_file_name)
    untar_path = os.environ["TMPDIR"]
    out_path = '{}/{}'.format(untar_path, tar_file_name)

    marker_path = '{}_done.txt'.format(out_path)
    if not os.path.isfile(marker_path):
        # The command goes through a shell, so quote both paths to survive
        # spaces and shell metacharacters.
        cmd = 'tar xvf {} -C {}'.format(shlex.quote(tar_file_path), shlex.quote(untar_path))
        print('Copying data: {}'.format(cmd))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
        # stderr is merged into stdout, so only the first element carries data.
        output, _ = proc.communicate()
        if output:
            print(output.decode(errors='replace'))

        if proc.returncode == 0:
            # Record success so that later calls can skip the extraction.
            with open(marker_path, mode='a'):
                pass
        else:
            print('Copying data failed with exit code {}: {}'.format(proc.returncode, cmd))

    base_dataset.root = out_path

    return base_dataset