Spaces:

chinmaydan
/

S2SCascadeDemo

Runtime error

App Files Files Community

S2SCascadeDemo / eval.sh

chinmaydan

Initial commit

95a3ca6 about 2 years ago

raw

history blame contribute delete

5.36 kB

	#!/usr/bin/env bash
	#
	# Evaluate model checkpoints on the configured test sets.
	#
	# Usage: eval.sh <test_config> <model_dir> <choice>
	#   test_config  config file handed to scripts/load_config.sh
	#   model_dir    accepted on the command line but not used (see note below)
	#   choice       which checkpoints to evaluate: all|best|last

	# repo_dir: root directory of the project
	repo_dir="$(cd "$(dirname -- "$0")" && pwd)"
	{
	  echo "==== Working directory: ===="
	  echo "${repo_dir}"
	  echo "============================"
	} >&2

	test_config=${1:-}
	# load_config.sh is sourced, so whatever it defines lands in this shell.
	source "${repo_dir}/scripts/load_config.sh" "${test_config}" "${repo_dir}"
	choice=${3:-} # all|best|last

	# The model directory is fixed relative to the repository; the second
	# positional argument has no effect.
	# NOTE(review): confirm whether $2 was meant to override this path.
	model_dir="${repo_dir}/model"
	data_dir="${repo_dir}/data"
	res_path="${model_dir}/results"

	mkdir -p -- "${model_dir}" "${data_dir}" "${res_path}"

	#######################################
	# Stage every configured test set under ${data_dir} and record it.
	#
	# Test sets are described by numbered variables
	#   data_testset_<i>_name / _path / _ref / _direction   (i = 1, 2, ...)
	# and enumeration stops at the first index whose path or direction is
	# empty. Every index uses the same "data_testset_" prefix.
	#
	# Globals:
	#   data_dir (read), data_testset_<i>_* (read)
	#   testsets (written): colon-separated "<direction>/<name>" entries
	#   testset_list (written): the same entries as an array
	#######################################
	collect_testsets() {
	  local idx=1
	  local name_var path_var ref_var direc_var
	  local dataname direction

	  testsets=""
	  while :; do
	    name_var="data_testset_${idx}_name"
	    path_var="data_testset_${idx}_path"
	    ref_var="data_testset_${idx}_ref"
	    direc_var="data_testset_${idx}_direction"
	    [[ -n "${!path_var}" && -n "${!direc_var}" ]] || break

	    dataname=${!name_var}
	    direction=${!direc_var}
	    mkdir -p -- "${data_dir}/${direction}/${dataname}" \
	      "${data_dir}/ref/${direction}/${dataname}"
	    cp -- "${!path_var}"/* "${data_dir}/${direction}/${dataname}/"
	    if [[ -n "${!ref_var}" ]]; then
	      cp -- "${!ref_var}"/* "${data_dir}/ref/${direction}/${dataname}/"
	    else
	      # An empty reference path would otherwise expand to "cp /* ...".
	      echo "${ref_var} is empty; no references copied for ${direction}/${dataname}" >&2
	    fi

	    testsets="${testsets:+${testsets}:}${direction}/${dataname}"
	    idx=$((idx + 1))
	  done

	  IFS=':' read -r -a testset_list <<< "${testsets}"
	}

	collect_testsets


	#######################################
	# Post-process one generation result file and print its BLEU score.
	#
	# Globals:
	#   repo_dir  (read) location of scripts/utils.py
	#   tokenizer (read) a non-empty value is used as the sacrebleu tokenizer
	#             for targets other than zh/ja; it is never modified here
	# Arguments:
	#   $1 - source language code
	#   $2 - target language code
	#   $3 - result file to score
	#   $4 - reference file, passed through to scripts/utils.py
	# Outputs:
	#   third field of the sacrebleu --short line on stdout; diagnostics on
	#   stderr so they cannot be captured as a score
	# Exits (not returns) with 1 on failure; callers run it in $(...).
	#######################################
	bleu () {
	  local src=$1
	  local tgt=$2
	  local res_file=$3
	  local ref_file=$4
	  local tokenizer=${tokenizer:-}
	  local f_dirname input_file output_file lang_token

	  if [[ ! -f "${res_file}" ]]; then
	    echo "${res_file} not exist!" >&2
	    exit 1
	  fi

	  f_dirname=$(dirname -- "${res_file}")
	  # utils.py is presumably what writes hypo.out.nobpe and ref.out next to
	  # the result file; both are consumed below -- TODO confirm.
	  python3 "${repo_dir}/scripts/utils.py" "${res_file}" "${ref_file}" || exit 1
	  input_file="${f_dirname}/hypo.out.nobpe"
	  output_file="${f_dirname}/hypo.out.nobpe.final"
	  lang_token="LANG_TOK_$(printf '%s' "${tgt}" | tr 'a-z' 'A-Z')"

	  case "${tgt}" in
	    zh) tokenizer="zh" ;;
	    ja) tokenizer="ja-mecab" ;;
	  esac
	  [[ -n "${tokenizer}" ]] || tokenizer="none"

	  # French output gets its double quotes converted to guillemets first;
	  # every language then has "<LANG_TOK_XX> " stripped.
	  {
	    if [[ "${tgt}" == "fr" ]]; then
	      sed -Ee 's/"([^"]*)"/« \1 »/g' < "${input_file}"
	    else
	      cat < "${input_file}"
	    fi
	  } | sed -e "s|${lang_token} ||g" > "${output_file}" \
	    || { echo "post-processing ${input_file} FAILED !" >&2; exit 1; }

	  sacrebleu -l "${src}-${tgt}" -tok "${tokenizer}" --short "${f_dirname}/ref.out" \
	    < "${output_file}" | awk '{print $3}'
	}

	# monitor
	# Background watcher: every time a *txt file under ${res_path} is closed
	# after writing, score it with bleu and append a tab-separated line
	# (timestamp, checkpoint, direction/testname, score) to stdout and to
	# ${model_dir}/summary.log.
	# Result files are expected at ${ckptname}/${direction}/${testname}/orig.txt
	(
	  inotifywait -r -m -e close_write "${res_path}" |
	    while read -r path action file; do
	      [[ "${file}" == *txt ]] || continue

	      # path is the watched directory as reported by inotifywait
	      # (".../<ckptname>/<direction>/<testname>/"); peel the components
	      # off from the right.
	      tmp_str="${path%/*}"
	      testname="${tmp_str##*/}"
	      tmp_str="${tmp_str%/*}"
	      direction="${tmp_str##*/}"
	      tmp_str="${tmp_str%/*}"
	      ckptname="${tmp_str##*/}"

	      # direction has the form <src>2<tgt>
	      src_lang="${direction%2*}"
	      tgt_lang="${direction##*2}"

	      res_file="${path}${file}"
	      ref_file="${data_dir}/ref/${direction}/${testname}/dev.${tgt_lang}"
	      bleuscore=$(bleu "${src_lang}" "${tgt_lang}" "${res_file}" "${ref_file}")

	      printf '%s\t%s\t%s\t%s\n' \
	        "$(date "+%Y-%m-%d %H:%M:%S")" \
	        "${ckptname}" \
	        "${direction}/${testname}" \
	        "${bleuscore}" |
	        tee -a "${model_dir}/summary.log" # stdout + summary log
	    done
	) &


	# Build the list of checkpoints to evaluate from ${choice}.
	#   all   every *.pt file in ${model_dir}, newest first (bare file names)
	#   best  checkpoint_best.pt
	#   last  checkpoint_last.pt
	# Any other value aborts the script with status 2.
	case "${choice}" in
	  all)
	    # ls -t gives true modification-time order; the dot in the pattern is
	    # escaped so only names ending in ".pt" are kept.
	    filelist=$(ls -At -- "${model_dir}" | grep '\.pt$' | tr '\n' ' ')
	    ;;
	  best)
	    filelist="${model_dir}/checkpoint_best.pt"
	    ;;
	  last)
	    filelist="${model_dir}/checkpoint_last.pt"
	    ;;
	  *)
	    echo "invalid choice!" >&2
	    exit 2
	    ;;
	esac

	# Number of parallel inference jobs; one when NUM_GPU is unset or empty.
	N=${NUM_GPU:-1}
	#export CUDA_VISIBLE_DEVICES=$(seq -s ',' 0 $(($N - 1)) )


	#######################################
	# Print (do not run) the shell command that decodes one test set with
	# fairseq-generate and keeps only the S-/H-/P-/T- prefixed output lines.
	#
	# Globals (all read-only here):
	#   repo_dir              location of the mcolt user dir
	#   options               extra fairseq-generate flags, inserted verbatim
	#   max_source_positions  defaults to 1024 when empty
	#   max_target_positions  defaults to 1024 when empty
	# Arguments:
	#   $1 - test data directory
	#   $2 - checkpoint path(s) for --path
	#   $3 - value for CUDA_VISIBLE_DEVICES
	#   $4 - file the filtered output is redirected to
	#   $5 - source language code
	#   $6 - target language code
	# Outputs:
	#   a single-line command on stdout, meant to be passed to eval
	#######################################
	infer_test () {
	  local test_path=$1
	  local ckpts=$2
	  local gpu=$3
	  local final_res_file=$4
	  local src=$5
	  local tgt=$6
	  local lang_token cmd

	  lang_token="LANG_TOK_$(printf '%s' "${tgt}" | tr 'a-z' 'A-Z')"

	  cmd="CUDA_VISIBLE_DEVICES=${gpu} fairseq-generate ${test_path}"
	  cmd+=" --user-dir ${repo_dir}/mcolt"
	  cmd+=" -s ${src}"
	  cmd+=" -t ${tgt}"
	  cmd+=" --skip-invalid-size-inputs-valid-test"
	  cmd+=" --path ${ckpts}"
	  cmd+=" --max-tokens 1024"
	  cmd+=" --task translation_w_langtok"
	  cmd+="${options:+ ${options}}"
	  cmd+=" --lang-prefix-tok ${lang_token}"
	  cmd+=" --max-source-positions ${max_source_positions:-1024}"
	  cmd+=" --max-target-positions ${max_target_positions:-1024}"
	  # A real pipe here: the filter must run as its own pipeline stage once
	  # the caller evals the string.
	  cmd+=" --nbest 1 | grep -E '[SHPT]-[0-9]+' > ${final_res_file}"

	  echo "${cmd}"
	}

	export -f infer_test
	i=0
	# Run inference for every (checkpoint, test set) pair, at most N jobs at a
	# time, one job per GPU index 0..N-1. The loop lives in a subshell so that
	# `wait` only blocks on jobs started here.
	(
	  # Without a positive integer N the modulo below fails and jobs would be
	  # launched without any batching; fall back to one job at a time.
	  [[ "${N}" =~ ^[1-9][0-9]*$ ]] || N=1

	  # filelist is a space-separated list; word splitting is intended.
	  for ckpt in ${filelist}; do
	    for testset in "${testset_list[@]}"; do
	      ckptbase=$(basename -- "${ckpt}")
	      ckptname="${ckptbase%.*}"
	      direction="${testset%/*}"
	      testname="${testset##*/}"
	      # direction has the form <src>2<tgt>
	      src_lang="${direction%2*}"
	      tgt_lang="${direction##*2}"

	      # i cycles through 1..N; whenever it wraps, drain the running batch
	      # before starting the next one.
	      ((i = i % N))
	      ((i++ == 0)) && wait
	      test_path="${data_dir}/${testset}"

	      echo "-----> ${ckptname} | ${direction}/${testname} <-----" >&2
	      out_dir="${res_path}/${ckptname}/${direction}/${testname}"
	      mkdir -p -- "${out_dir}"
	      final_res_file="${out_dir}/orig.txt"
	      command=$(infer_test "${test_path}" "${model_dir}/${ckptname}.pt" \
	        "$((i - 1))" "${final_res_file}" "${src_lang}" "${tgt_lang}")
	      echo "${command}"
	      eval "${command}" &
	    done
	  done

	  # Let the final batch finish before the script moves on.
	  wait
	)