run_multilingual.sh 6.85 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
#!/bin/bash

# Copyright 2015  University of Illinois (Author: Amit Das)
# Copyright 2012-2015  Brno University of Technology (Author: Karel Vesely)

# Apache 2.0

# This example script trains Multi-lingual DNN with <BlockSoftmax> output, using FBANK features.
# The network is trained on multiple languages simultaneously, creating a separate softmax layer
# per language while sharing hidden layers across all languages.
# The script supports arbitrary number of languages.

# Load the site configuration. '$train_cmd' and '$cuda_cmd', used further below, are not
# defined in this script; presumably they come from cmd.sh -- TODO confirm.
. ./cmd.sh
. ./path.sh

# Example setup, the options are in 'csl' format, they must have same number of elements,
# (the items are separated by ',' ; the code below accepts ':' as a separator too),
lang_code_csl="rm,wsj" # One label for each language,
lang_weight_csl="1.0,0.1" # Per-language weights, they scale loss-function and gradient, 1.0 for each language is good,
ali_dir_csl="exp/tri3b_ali,../../wsj/s5/exp/tri4b_ali_si284" # One ali-dir per language,
data_dir_csl="data/train,../../wsj/s5/data/train_si284" # One train-data-dir per language (features will be re-computed),

# Selects one of the network configurations of the training stage at the end of the script,
nnet_type=dnn_small # dnn_small | dnn | bn

# Steps guarded by 'stage' below: 0 = feature extraction, 1 = merged targets, 2 = training;
# a step runs only if $stage is <= its number,
stage=0
# NOTE(review): presumably this lets '--<name> <value>' on the command line override the
# variables assigned above (the error message at the end refers to '--nnet-type') -- confirm.
. utils/parse_options.sh || exit 1;

# From here on: stop on a failing command (-e), on use of an unset variable (-u) and on a
# failure in any stage of a pipeline (pipefail); print each command before it runs (-x).
set -euxo pipefail

# Print the items of a 'csl' string separated by spaces (accepted separators: ',' and ':').
#   $1 - the 'csl' string
csl_to_list() {
  echo "$1" | tr ',:' ' '
}

# Verify that the per-language arrays lang_code, lang_weight, ali_dir and data_dir
# all hold the same number of items.
# Prints the item counts and returns 1 on a mismatch, returns 0 otherwise.
check_csl_counts() {
  local n=${#lang_code[@]}
  if [ ${#lang_weight[@]} -ne $n ] || [ ${#ali_dir[@]} -ne $n ] || [ ${#data_dir[@]} -ne $n ]; then
    echo "Non-matching number of 'csl' items: lang_code ${#lang_code[@]}, lang_weight ${#lang_weight[@]}, ali_dir ${#ali_dir[@]}, data_dir ${#data_dir[@]}"
    return 1
  fi
  return 0
}

# Convert 'csl' to bash array (accept separators ',' ':'),
# the unquoted $(...) is intentional, word-splitting produces the array items,
lang_code=($(csl_to_list "$lang_code_csl"))
lang_weight=($(csl_to_list "$lang_weight_csl")) # used only for the sanity check below,
ali_dir=($(csl_to_list "$ali_dir_csl"))
data_dir=($(csl_to_list "$data_dir_csl"))

# Make sure we have same number of items in lists; the weights are checked too,
# a missing weight would leave an empty field in the objective-function string,
check_csl_counts || exit 1
num_langs=${#lang_code[@]}

# Report the per-language setup and stop early if any alignment dir or data dir is absent,
for ((i = 0; i < num_langs; i++)); do
  echo "lang = ${lang_code[$i]}, alidir = ${ali_dir[$i]}, datadir = ${data_dir[$i]}"
  if [ ! -d ${ali_dir[$i]} ]; then
    echo "Missing ${ali_dir[$i]}"
    exit 1
  fi
  if [ ! -d ${data_dir[$i]} ]; then
    echo "Missing ${data_dir[$i]}"
    exit 1
  fi
done

# Feature extraction. Everything is written below '$data', the name of this directory
# encodes the number of languages and the language codes,
data=data-fbank-multilingual${num_langs}-${lang_code_csl//,/-}
data_tr90=$data/combined_tr90
data_cv10=$data/combined_cv10
if [ $stage -le 0 ]; then
  # Space-separated lists of the per-language subsets, merged after the loop,
  tr90=""
  cv10=""
  for ((i = 0; i < num_langs; i++)); do
    lang=${lang_code[$i]}
    src=${data_dir[$i]}
    tgt=$data/${lang}_$(basename $src)
    # Local copy of the data-dir, the language code is passed as utterance/speaker suffix,
    utils/copy_data_dir.sh --utt-suffix _$lang --spk-suffix _$lang $src $tgt
    # Remove the feature lists of the copy, it is fine if they are not there,
    rm $tgt/feats.scp $tgt/cmvn.scp || true
    # Extract the features and get the cmvn stats,
    steps/make_fbank_pitch.sh --nj 30 --cmd "$train_cmd -tc 10" $tgt $tgt/log $tgt/data
    steps/compute_cmvn_stats.sh $tgt $tgt/log $tgt/data
    # Split the lists: 90% train / 10% held-out,
    utils/subset_data_dir_tr_cv.sh $tgt ${tgt}_tr90 ${tgt}_cv10
    tr90+=" ${tgt}_tr90"
    cv10+=" ${tgt}_cv10"
  done
  # Merge the per-language subsets (the lists are word-split on purpose),
  utils/combine_data.sh $data_tr90 $tr90
  utils/combine_data.sh $data_cv10 $cv10
  # Validate both merged sets,
  for combined in $data_tr90 $data_cv10; do
    utils/validate_data_dir.sh $combined
  done
fi

# Get the number of tied states (pdfs) of each language from its transition model,
# it is the last field of the 'pdfs' line printed by hmm-info,
for ((i = 0; i < num_langs; i++)); do
  mdl=${ali_dir[i]}/final.mdl
  ali_dim[i]=$(hmm-info $mdl | grep pdfs | awk '{ print $NF }')
done
# The same numbers joined by ',' into a 'csl' string,
ali_dim_csl=$(printf '%s\n' ${ali_dim[@]} | paste -sd ',' -)

# Print the sum of the integers given as arguments (prints 0 when there are none).
# Every number is put on its own line and awk adds up the 1st field of each line;
# the field is referenced as '$1' explicitly, not through an uninitialized awk variable.
sum_dims() {
  printf '%s\n' "$@" | awk '{ sum += $1; } END{ print sum + 0; }'
}

# Total number of DNN outputs (sum of all per-language blocks),
# ${ali_dim[@]} is left unquoted on purpose, each dimension becomes one argument,
output_dim=$(sum_dims ${ali_dim[@]})
echo "Total number of DNN outputs: $output_dim = $(echo ${ali_dim[@]} | sed 's: : + :g')"

# Build the objective function string: 'multitask' followed by one ',xent,<dim>,<weight>'
# group per language. The dims are fed to awk one per line, the N-th line is paired with
# the N-th item of '$lang_weight_csl' (split on ',' or ':'),
objective_function=multitask
objective_function+=$(printf '%s\n' ${ali_dim[@]} |
  awk -v weights="$lang_weight_csl" '
    BEGIN { split(weights, weight_of, /[,:]/); }
    { printf(",xent,%d,%s", $1, weight_of[NR]); }')
echo "Multitask objective function: $objective_function"

# The DNN training runs in $dir, the alignments are prepared beforehand,
dir=exp/dnn4g-multilingual${num_langs}-${lang_code_csl//,/-}
if [ ! -e $dir ]; then
  mkdir -p $dir
fi
# Keep a record of the setup, each variable goes to the file of the same name in $dir,
for name in lang_code_csl ali_dir_csl data_dir_csl ali_dim_csl objective_function; do
  echo "${!name}" >$dir/$name
done

# Prepare the merged training targets,
if [ $stage -le 1 ]; then
  post_dir=$dir/ali-post
  if [ ! -e $post_dir ]; then
    mkdir -p $post_dir
  fi
  # Re-save the alignments of each language as posteriors indexed by an 'scp',
  # and collect the list of these 'scp' files for the merging step,
  post_scp_list=""
  for ((i = 0; i < num_langs; i++)); do
    lang=${lang_code[$i]}
    ali=${ali_dir[$i]}
    # 'awk' appends '_<lang>' to the 1st field of each line (the utterance key),
    ali-to-pdf $ali/final.mdl "ark:gunzip -c ${ali}/ali.*.gz |" ark,t:- \
      | awk -v c=$lang '{ $1=$1"_"c; print $0; }' \
      | ali-to-post ark:- ark,scp:$post_dir/$lang.ark,$post_dir/$lang.scp
    post_scp_list="$post_scp_list scp:$post_dir/$lang.scp"
  done
  # Number of frames of every utterance in the train and held-out sets,
  featlen="ark:feat-to-len 'scp:cat $data_tr90/feats.scp $data_cv10/feats.scp |' ark,t:- |"
  # Paste the per-language posteriors into one target set, 'paste-post' gets the
  # per-language dims; $post_scp_list is word-split on purpose,
  paste-post --allow-partial=true "$featlen" "${ali_dim_csl}" ${post_scp_list} \
    ark,scp:$post_dir/combined.ark,$post_dir/combined.scp
fi

# Train the network with the <BlockSoftmax> output,
if [ $stage -le 2 ]; then
  # Only the topology / feature options differ between the network types,
  # the rest of the 'steps/nnet/train.sh' command line is shared,
  case $nnet_type in
    bn)
      # network with an 80-dim bottleneck and 'traps' input features,
      type_opts=(--hid-layers 2 --hid-dim 1500 --bn-dim 80
        --cmvn-opts "--norm-means=true --norm-vars=false"
        --feat-type "traps" --splice 5 --traps-dct-basis 6)
      ;;
    dnn_small)
      # no topology options given, mean and variance normalization, deltas,
      type_opts=(--cmvn-opts "--norm-means=true --norm-vars=true"
        --delta-opts "--delta-order=2" --splice 5)
      ;;
    dnn)
      # 6 hidden layers of 2048 units, mean normalization only, deltas,
      type_opts=(--hid-layers 6 --hid-dim 2048
        --cmvn-opts "--norm-means=true --norm-vars=false"
        --delta-opts "--delta-order=2" --splice 5)
      ;;
    *)
      echo "Unknown --nnet-type $nnet_type"
      exit 1
      ;;
  esac
  # The targets come from the merged posteriors via '--labels'; 'lang-dummy' and
  # 'ali-dummy' only fill the positional arguments (presumably unused then -- TODO confirm),
  # $cuda_cmd is left unquoted on purpose, it may consist of several words,
  $cuda_cmd $dir/log/train_nnet.log \
    steps/nnet/train.sh --learn-rate 0.008 \
      "${type_opts[@]}" \
      --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
      --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
      --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
      ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
fi

exit 0