#!/bin/bash

# Copyright 2015  University of Illinois (Author: Amit Das)
# Copyright 2012-2015  Brno University of Technology (Author: Karel Vesely)
# Apache 2.0

# This example script trains a Multi-lingual DNN with block-softmax output, using FBANK features.
# The network is trained on multiple languages simultaneously, creating a separate softmax layer
# per language while sharing hidden layers across all languages.
# The script supports arbitrary number of languages.

. ./cmd.sh
. ./path.sh

# Example setup, the options are in 'csl' format, they must have same number of elements,
lang_code_csl="rm,wsj" # One label for each language,
lang_weight_csl="1.0,0.1" # Per-language weights, they scale loss-function and gradient, 1.0 for each language is good,
ali_dir_csl="exp/tri3b_ali,../../wsj/s5/exp/tri4b_ali_si284" # One ali-dir per language,
data_dir_csl="data/train,../../wsj/s5/data/train_si284" # One train-data-dir per language (features will be re-computed),

nnet_type=dnn_small # dnn_small | dnn | bn
stage=0

. utils/parse_options.sh || exit 1;

set -euxo pipefail

# Convert 'csl' to bash array (accept separators ',' ':'),
lang_code=($(echo $lang_code_csl | tr ',:' ' '))
ali_dir=($(echo $ali_dir_csl | tr ',:' ' '))
data_dir=($(echo $data_dir_csl | tr ',:' ' '))

# Make sure we have same number of items in lists,
# (note: all three fields report element COUNTS via ${#...[@]})
! [ ${#lang_code[@]} -eq ${#ali_dir[@]} -a ${#lang_code[@]} -eq ${#data_dir[@]} ] && \
  echo "Non-matching number of 'csl' items: lang_code ${#lang_code[@]}, ali_dir ${#ali_dir[@]}, data_dir ${#data_dir[@]}" && \
  exit 1
num_langs=${#lang_code[@]}

# Check if all the input directories exist,
for i in $(seq 0 $((num_langs-1))); do
  echo "lang = ${lang_code[$i]}, alidir = ${ali_dir[$i]}, datadir = ${data_dir[$i]}"
  [ ! -d ${ali_dir[$i]} ] && echo "Missing ${ali_dir[$i]}" && exit 1
  [ ! -d ${data_dir[$i]} ] && echo "Missing ${data_dir[$i]}" && exit 1
done

# Make the features,
data=data-fbank-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')
data_tr90=$data/combined_tr90
data_cv10=$data/combined_cv10
if [ $stage -le 0 ]; then
  # Make local copy of data-dirs (while adding language-code),
  tr90=""
  cv10=""
  for i in $(seq 0 $((num_langs-1))); do
    code=${lang_code[$i]}
    dir=${data_dir[$i]}
    tgt_dir=$data/${code}_$(basename $dir)
    utils/copy_data_dir.sh --utt-suffix _$code --spk-suffix _$code $dir $tgt_dir
    rm $tgt_dir/{feats,cmvn}.scp || true # remove features,
    # extract features, get cmvn stats,
    steps/make_fbank_pitch.sh --nj 30 --cmd "$train_cmd -tc 10" $tgt_dir{,/log,/data}
    steps/compute_cmvn_stats.sh $tgt_dir{,/log,/data}
    # split lists 90% train / 10% held-out,
    utils/subset_data_dir_tr_cv.sh $tgt_dir ${tgt_dir}_tr90 ${tgt_dir}_cv10
    tr90="$tr90 ${tgt_dir}_tr90"
    cv10="$cv10 ${tgt_dir}_cv10"
  done
  # Merge the datasets,
  utils/combine_data.sh $data_tr90 $tr90
  utils/combine_data.sh $data_cv10 $cv10
  # Validate,
  utils/validate_data_dir.sh $data_tr90
  utils/validate_data_dir.sh $data_cv10
fi

# Extract the tied-state numbers from transition models,
for i in $(seq 0 $((num_langs-1))); do
  ali_dim[i]=$(hmm-info ${ali_dir[i]}/final.mdl | grep pdfs | awk '{ print $NF }')
done
ali_dim_csl=$(echo ${ali_dim[@]} | tr ' ' ',')

# Total number of DNN outputs (sum of all per-language blocks),
# (each input line holds one dimension, so we sum field $1)
output_dim=$(echo ${ali_dim[@]} | tr ' ' '\n' | awk '{ sum += $1; } END{ print sum; }')
echo "Total number of DNN outputs: $output_dim = $(echo ${ali_dim[@]} | sed 's: : + :g')"

# Objective function string (per-language weights are imported from '$lang_weight_csl'),
objective_function="multitask$(echo ${ali_dim[@]} | tr ' ' '\n' | \
  awk -v w=$lang_weight_csl 'BEGIN{ split(w,w_arr,/[,:]/); } { printf(",xent,%d,%s", $1, w_arr[NR]); }')"
echo "Multitask objective function: $objective_function"

# DNN training will be in $dir, the alignments are prepared beforehand,
dir=exp/dnn4g-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')
[ ! -e $dir ] && mkdir -p $dir
echo "$lang_code_csl" >$dir/lang_code_csl
echo "$ali_dir_csl" >$dir/ali_dir_csl
echo "$data_dir_csl" >$dir/data_dir_csl
echo "$ali_dim_csl" >$dir/ali_dim_csl
echo "$objective_function" >$dir/objective_function

# Prepare the merged targets,
if [ $stage -le 1 ]; then
  [ ! -e $dir/ali-post ] && mkdir -p $dir/ali-post
  # re-saving the ali in posterior format, indexed by 'scp',
  for i in $(seq 0 $((num_langs-1))); do
    code=${lang_code[$i]}
    ali=${ali_dir[$i]}
    # utt suffix added by 'awk',
    ali-to-pdf $ali/final.mdl "ark:gunzip -c ${ali}/ali.*.gz |" ark,t:- | \
      awk -v c=$code '{ $1=$1"_"c; print $0; }' | \
      ali-to-post ark:- ark,scp:$dir/ali-post/$code.ark,$dir/ali-post/$code.scp
  done
  # pasting the ali's, adding language-specific offsets to the posteriors,
  featlen="ark:feat-to-len 'scp:cat $data_tr90/feats.scp $data_cv10/feats.scp |' ark,t:- |" # get number of frames for every utterance,
  post_scp_list=$(echo ${lang_code[@]} | tr ' ' '\n' | awk -v d=$dir '{ printf(" scp:%s/ali-post/%s.scp", d, $1); }')
  paste-post --allow-partial=true "$featlen" "${ali_dim_csl}" ${post_scp_list} \
    ark,scp:$dir/ali-post/combined.ark,$dir/ali-post/combined.scp
fi

# Train the system, 1st stage of Stacked-Bottleneck-Network,
if [ $stage -le 2 ]; then
  case $nnet_type in
    bn)
      # Bottleneck network (from which the 80-dim features are extracted),
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --hid-layers 2 --hid-dim 1500 --bn-dim 80 \
          --cmvn-opts "--norm-means=true --norm-vars=false" \
          --feat-type "traps" --splice 5 --traps-dct-basis 6 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    dnn_small)
      # 4 hidden layers, 1024 sigmoid neurons (i.e. train.sh defaults),
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --cmvn-opts "--norm-means=true --norm-vars=true" \
          --delta-opts "--delta-order=2" --splice 5 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    dnn)
      # 6 hidden layers, 2048 simgoid neurons,
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --hid-layers 6 --hid-dim 2048 \
          --cmvn-opts "--norm-means=true --norm-vars=false" \
          --delta-opts "--delta-order=2" --splice 5 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    *)
      echo "Unknown --nnet-type $nnet_type"; exit 1;
      ;;
  esac
fi

exit 0