#!/bin/bash
# Launches tuning jobs.
# Modify this file to launch workers with your preferred cloud API.
# The following implementation runs each worker as a subprocess on the local
# machine.
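
# Example invocation (hypothetical values; the script name and the config
# string syntax are assumptions, not prescribed by this file):
#   bash launch_tuning.sh \
#       --job_name=tuning_run_1 \
#       --config="$CONFIG_STRING" \
#       --num_tuners=4 \
#       --num_workers_per_tuner=1 \
#       --hparam_space_type=pg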

MODELS_DIR="/tmp/models"

# Get command line options.
OPTS=$(getopt -n "$0" -o "" --long "job_name:,config:,num_tuners:,num_workers_per_tuner:,num_ps_per_tuner:,max_npe:,num_repetitions:,stop_on_success:,fixed_hparams:,hparam_space_type:" -- "$@")
if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
eval set -- "$OPTS"
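
# At this point getopt has rewritten the arguments into a canonical
# "--flag value" list, and `eval set --` loads that list back into the
# positional parameters ($1, $2, ...) for the parsing loop below.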

JOB_NAME=""  # Name of the process and the logs directory.
CONFIG=""    # Model and environment hparams.

# NUM_TUNERS: Number of tuning jobs to launch. Each tuning job trains one
# hparam combination at a time, so more tuners means more hparam combinations
# tried in parallel.
NUM_TUNERS=1

# NUM_WORKERS_PER_TUNER: Number of workers to launch for each tuning job. If
# using neural networks, each worker will be 1 replica.
NUM_WORKERS_PER_TUNER=1

# NUM_PS_PER_TUNER: Number of parameter servers to launch for each tuning job.
# Only set this if using neural networks. For 1 worker per tuner, no parameter
# servers are needed. For more than 1 worker per tuner, at least 1 parameter
# server per tuner is needed to store the global model for each tuner.
NUM_PS_PER_TUNER=0

# MAX_NPE: Maximum number of programs executed. Training will quit once this
# threshold is reached. If 0, the threshold is infinite.
MAX_NPE=0

NUM_REPETITIONS=25    # How many times to run this experiment.
STOP_ON_SUCCESS=true  # Whether to halt training when a solution is found.

# FIXED_HPARAMS: Hold hparams fixed in the grid search. This reduces the
# search space.
FIXED_HPARAMS=""

# HPARAM_SPACE_TYPE: Specifies the hparam search space. See the
# `define_tuner_hparam_space` functions defined in pg_train.py and ga_train.py.
HPARAM_SPACE_TYPE="pg"
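
# For example (hypothetical values; the valid names and format come from the
# hparam space definitions referenced above): FIXED_HPARAMS="lr=0.0001" with
# HPARAM_SPACE_TYPE="pg", or presumably HPARAM_SPACE_TYPE="ga" for the space
# defined in ga_train.py.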

# Parse options into variables.
while true; do
  case "$1" in
    --job_name ) JOB_NAME="$2"; shift; shift ;;
    --config ) CONFIG="$2"; shift; shift ;;
    --num_tuners ) NUM_TUNERS="$2"; shift; shift ;;
    --num_workers_per_tuner ) NUM_WORKERS_PER_TUNER="$2"; shift; shift ;;
    --num_ps_per_tuner ) NUM_PS_PER_TUNER="$2"; shift; shift ;;
    --max_npe ) MAX_NPE="$2"; shift; shift ;;
    --num_repetitions ) NUM_REPETITIONS="$2"; shift; shift ;;
    --stop_on_success ) STOP_ON_SUCCESS="$2"; shift; shift ;;
    --fixed_hparams ) FIXED_HPARAMS="$2"; shift; shift ;;
    --hparam_space_type ) HPARAM_SPACE_TYPE="$2"; shift; shift ;;
    -- ) shift; break ;;
    * ) break ;;
  esac
done
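
# Optional: fail fast if no job name was given; otherwise logs would land
# directly in $MODELS_DIR.
if [ -z "$JOB_NAME" ]; then
  echo "--job_name is required." >&2
  exit 1
fi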

# Launch jobs.
# TODO: multi-worker RL training
LOGDIR="$MODELS_DIR/$JOB_NAME"
mkdir -p "$LOGDIR"
BIN_DIR="bazel-bin/single_task"
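
# If tune.par is missing, it can presumably be built with something like
# (target name assumed from BIN_DIR above):
#   bazel build -c opt single_task:tune.par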

for ((tuner=0; tuner<NUM_TUNERS; tuner+=1)); do
  for ((i=0; i<NUM_WORKERS_PER_TUNER; i++)); do
    # Expecting tune.par to be built.
    echo "$LOGDIR"
    "$BIN_DIR"/tune.par \
        --alsologtostderr \
        --config="$CONFIG" \
        --logdir="$LOGDIR" \
        --max_npe="$MAX_NPE" \
        --num_repetitions="$NUM_REPETITIONS" \
        --stop_on_success="$STOP_ON_SUCCESS" \
        --summary_tasks=1 \
        --hparam_space="$HPARAM_SPACE_TYPE" \
        --fixed_hparams="$FIXED_HPARAMS" \
        --tuner_id="$tuner" \
        --num_tuners="$NUM_TUNERS" \
        2> "$LOGDIR/tuner_$tuner.task_$i.log" &  # Run as a subprocess.
    echo "Launched tuner $tuner, task $i. Logs: $LOGDIR/tuner_$tuner.task_$i.log"
  done
done

# Use "pidof tune.par" to find jobs.
# Kill with "pkill tune.par".
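# Follow an individual worker's log with, e.g.:
#   tail -f /tmp/models/<job_name>/tuner_0.task_0.log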