Spaces:
Runtime error
Runtime error
| # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import os | |
| import time | |
| from collections import deque, defaultdict | |
| import pickle | |
| import shutil | |
| import numpy as np | |
| import paddle | |
| import paddle.nn.functional as F | |
| from paddleseg.utils import TimeAverager, calculate_eta, resume, logger | |
| from .val import evaluate | |
def visual_in_traning(log_writer, vis_dict, step):
    """
    Write a dict of image tensors to VisualDL.

    Each tensor is reduced to its first channel when its leading dimension
    is neither 1 nor 3, transposed so the leading axis becomes the last one,
    rescaled to the 0-255 range, cast to uint8 and logged under its dict key.

    Args:
        log_writer (LogWriter): The log writer of vdl.
        vis_dict (dict): Dict of tensor. The shape of tensor is (C, H, W).
        step (int): The step value forwarded to ``log_writer.add_image``.
    """
    for key, value in vis_dict.items():
        # Only 1- or 3-channel images are logged as-is; otherwise keep the
        # first channel and restore the leading axis.
        if value.shape[0] not in [1, 3]:
            value = value[0].unsqueeze(0)
        value = paddle.transpose(value, (1, 2, 0))
        min_v = paddle.min(value)
        max_v = paddle.max(value)
        if (min_v > 0) and (max_v < 1):
            # Values strictly inside (0, 1): scale directly.
            value = value * 255
        elif (min_v < 0 and min_v >= -1) and (max_v <= 1):
            # Values in [-1, 1] with a negative minimum: shift, then scale.
            value = (1 + value) / 2 * 255
        elif max_v > min_v:
            # Any other range: min-max normalise.
            value = (value - min_v) / (max_v - min_v) * 255
        else:
            # Constant image: min-max normalisation would divide by zero and
            # produce NaN, so clip to [0, 1] and scale instead.
            value = paddle.clip(value, 0.0, 1.0) * 255

        value = value.astype('uint8').numpy()
        log_writer.add_image(tag=key, img=value, step=step)
def save_best(best_model_dir, metrics_data, iter):
    """
    Record the best metrics and the iteration they were reached at.

    Writes ``best_metrics.txt`` inside ``best_model_dir`` with one
    ``<name> <value>`` line per entry of ``metrics_data``, followed by a
    final ``iter <iter>`` line. An existing file is overwritten.

    Args:
        best_model_dir (str): Directory that receives ``best_metrics.txt``.
        metrics_data (dict): Maps metric name (str) to its value.
        iter (int): The iteration written on the trailing ``iter`` line.
    """
    target_path = os.path.join(best_model_dir, 'best_metrics.txt')
    with open(target_path, 'w') as out_file:
        out_file.writelines(
            name + ' ' + str(score) + '\n'
            for name, score in metrics_data.items())
        out_file.write('iter ' + str(iter) + '\n')
def get_best(best_file, metrics, resume_model=None):
    """
    Get best metrics and iter from file.

    The file is only read when it exists and ``resume_model`` is not None.
    Each non-blank line is expected to be ``<key> <value>``; every key,
    including ``iter``, is stored in the returned dict. Metrics listed in
    ``metrics`` that are not found in the file default to ``np.inf``.

    Args:
        best_file (str): Path of the metrics file, in the ``<key> <value>``
            per-line layout.
        metrics (list): Names of the metrics that must be present in the result.
        resume_model (str, optional): The path of resume model. The file is
            ignored when this is None. Default: None.

    Returns:
        tuple: ``(best_metrics_data, best_iter)``. ``best_iter`` is -1 when
            the file is not read or contains no ``iter`` line.

    Raises:
        ValueError: If a value in the file is not a Python literal or a
            float string.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    best_metrics_data = {}
    best_iter = -1
    if os.path.exists(best_file) and (resume_model is not None):
        with open(best_file, 'r') as f:
            for line in f:
                key, _, value = line.strip().partition(' ')
                value = value.strip()
                if not value:
                    # Blank line or a key without a value: nothing to parse.
                    continue
                # The file is external input, so parse it as a literal
                # instead of eval()-ing it. literal_eval rejects 'inf' and
                # 'nan', which str(float) can produce, hence the fallback.
                try:
                    parsed = ast.literal_eval(value)
                except (ValueError, SyntaxError):
                    parsed = float(value)
                best_metrics_data[key] = parsed
                if key == 'iter':
                    best_iter = parsed

    # Metrics that were not loaded start at +inf.
    for key in metrics:
        best_metrics_data.setdefault(key, np.inf)
    return best_metrics_data, best_iter
def _save_checkpoint(model, optimizer, save_dir, cur_iter, save_models,
                     keep_checkpoint_max):
    """Save model and optimizer state for ``cur_iter`` and drop the oldest snapshot if over the limit."""
    current_save_dir = os.path.join(save_dir, "iter_{}".format(cur_iter))
    os.makedirs(current_save_dir, exist_ok=True)
    paddle.save(model.state_dict(),
                os.path.join(current_save_dir, 'model.pdparams'))
    paddle.save(optimizer.state_dict(),
                os.path.join(current_save_dir, 'model.pdopt'))
    save_models.append(current_save_dir)
    # A non-positive keep_checkpoint_max disables pruning.
    if len(save_models) > keep_checkpoint_max > 0:
        shutil.rmtree(save_models.popleft())


def _log_train_images(log_writer, data, logit_dict, step):
    """Log index 0 of the input image, each ground-truth field and each prediction to VisualDL."""
    vis_dict = {}
    # ground truth
    vis_dict['ground truth/img'] = data['img'][0]
    for key in data['gt_fields']:
        key = key[0]
        vis_dict['/'.join(['ground truth', key])] = data[key][0]
    # predict
    for key, value in logit_dict.items():
        vis_dict['/'.join(['predict', key])] = value[0]
    visual_in_traning(log_writer=log_writer, vis_dict=vis_dict, step=step)


def _build_best_log(best_metrics_data, best_iter):
    """Format the '[EVAL]' summary line describing the best metrics so far."""
    show_list = list(best_metrics_data.items())
    log_str = '[EVAL] The model with the best validation {} ({:.4f}) was saved at iter {}.'.format(
        show_list[0][0], show_list[0][1], best_iter)
    if len(show_list) > 1:
        log_str += " While" + ','.join(
            ' {}: {:.4f}'.format(key, value) for key, value in show_list[1:])
    return log_str


def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          log_image_iters=1000,
          num_workers=0,
          use_vdl=False,
          losses=None,
          keep_checkpoint_max=5,
          eval_begin_iters=None,
          metrics='sad'):
    """
    Launch training.

    Runs the optimisation loop until ``iters`` iterations are reached. On
    rank 0 it logs averaged losses every ``log_iters``, saves a snapshot every
    ``save_interval`` iterations (and at the last one) and, once
    ``eval_begin_iters`` is reached, evaluates on ``val_dataset`` at the same
    points. The model whose first metric is the lowest so far is stored in
    ``<save_dir>/best_model``.

    Args:
        model(nn.Layer): A matting model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer. Although it defaults to None,
            it is used unconditionally, so it must be provided.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        log_image_iters (int, optional): Log image to vdl. Images are only logged on iterations
            that are also a multiple of log_iters. Default: 1000.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict, optional): A dict of loss, refer to the loss function of the model for details.
            Not read by this function. Default: None.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        eval_begin_iters (int): The iters begin evaluation. It will evaluate at iters // 2 if it is None. Default: None.
        metrics(str|list, optional): The metrics to evaluate, it may be the combination of ("sad", "mse", "grad", "conn").
            The first one selects the best model. Anything that is neither str nor list falls back to ['sad'].
    """
    model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    if not os.path.isdir(save_dir):
        # A regular file occupying the path is replaced by a directory.
        if os.path.exists(save_dir):
            os.remove(save_dir)
        # exist_ok tolerates another rank creating the directory first.
        os.makedirs(save_dir, exist_ok=True)

    if nranks > 1:
        # Initialize parallel environment if not done.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
        ddp_model = paddle.DataParallel(model)

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True, )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    if isinstance(metrics, str):
        metrics = [metrics]
    elif not isinstance(metrics, list):
        metrics = ['sad']
    best_metrics_data, best_iter = get_best(
        os.path.join(save_dir, 'best_model', 'best_metrics.txt'),
        metrics,
        resume_model=resume_model)

    if eval_begin_iters is None:
        eval_begin_iters = iters // 2
    # Evaluation uses at most one loader worker, and none when training
    # itself runs without workers.
    eval_num_workers = 1 if num_workers > 0 else 0

    avg_loss = defaultdict(float)
    iters_per_epoch = len(batch_sampler)
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()
    batch_start = time.time()

    cur_iter = start_iter
    try:
        while cur_iter < iters:
            for data in loader:
                cur_iter += 1
                if cur_iter > iters:
                    break
                reader_cost_averager.record(time.time() - batch_start)

                logit_dict, loss_dict = ddp_model(
                    data) if nranks > 1 else model(data)

                loss_dict['all'].backward()

                optimizer.step()
                lr = optimizer.get_lr()
                if isinstance(optimizer._learning_rate,
                              paddle.optimizer.lr.LRScheduler):
                    optimizer._learning_rate.step()
                model.clear_gradients()

                for key, value in loss_dict.items():
                    # reshape(-1) accepts both 0-D and 1-element loss tensors.
                    avg_loss[key] += float(value.numpy().reshape(-1)[0])
                batch_cost_averager.record(
                    time.time() - batch_start, num_samples=batch_size)

                if cur_iter % log_iters == 0 and local_rank == 0:
                    for key, value in avg_loss.items():
                        avg_loss[key] = value / log_iters
                    remain_iters = iters - cur_iter
                    avg_train_batch_cost = batch_cost_averager.get_average()
                    avg_train_reader_cost = reader_cost_averager.get_average()
                    eta = calculate_eta(remain_iters, avg_train_batch_cost)
                    # loss info
                    loss_str = ' ' * 26 + '\t[LOSSES]'
                    for key, value in avg_loss.items():
                        if key != 'all':
                            loss_str = loss_str + ' ' + key + '={:.4f}'.format(
                                value)
                    logger.info(
                        "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.5f}, ips={:.4f} samples/sec | ETA {}\n{}\n"
                        .format((cur_iter - 1) // iters_per_epoch + 1,
                                cur_iter, iters, avg_loss['all'], lr,
                                avg_train_batch_cost, avg_train_reader_cost,
                                batch_cost_averager.get_ips_average(), eta,
                                loss_str))
                    if use_vdl:
                        for key, value in avg_loss.items():
                            log_tag = 'Train/' + key
                            log_writer.add_scalar(log_tag, value, cur_iter)
                        log_writer.add_scalar('Train/lr', lr, cur_iter)
                        log_writer.add_scalar('Train/batch_cost',
                                              avg_train_batch_cost, cur_iter)
                        log_writer.add_scalar('Train/reader_cost',
                                              avg_train_reader_cost, cur_iter)
                        if cur_iter % log_image_iters == 0:
                            _log_train_images(log_writer, data, logit_dict,
                                              cur_iter)

                    # Start a fresh averaging window.
                    for key in avg_loss.keys():
                        avg_loss[key] = 0.
                    reader_cost_averager.reset()
                    batch_cost_averager.reset()

                at_save_point = (cur_iter % save_interval == 0 or
                                 cur_iter == iters) and local_rank == 0

                # save model
                if at_save_point:
                    _save_checkpoint(model, optimizer, save_dir, cur_iter,
                                     save_models, keep_checkpoint_max)

                # eval model, save best model and add evaluation results to vdl
                if at_save_point and (
                        val_dataset is not None
                ) and cur_iter >= eval_begin_iters:
                    metrics_data = evaluate(
                        model,
                        val_dataset,
                        num_workers=eval_num_workers,
                        print_detail=True,
                        save_results=False,
                        metrics=metrics)
                    model.train()

                    # Lower is better for the selecting metric.
                    if metrics_data[metrics[0]] < best_metrics_data[metrics[
                            0]]:
                        best_iter = cur_iter
                        best_metrics_data = metrics_data.copy()
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                        save_best(best_model_dir, best_metrics_data, cur_iter)

                    logger.info(_build_best_log(best_metrics_data, best_iter))

                    if use_vdl:
                        for key, value in metrics_data.items():
                            log_writer.add_scalar('Evaluate/' + key, value,
                                                  cur_iter)

                batch_start = time.time()

        # Sleep for half a second to let dataloader release resources.
        time.sleep(0.5)
    finally:
        # Close the writer even when training is interrupted by an error.
        if use_vdl:
            log_writer.close()