Commit 0b216964 authored by vesis84

nnet1: adding multi-lingual NN training,

- supports an arbitrary number of languages,
- uses 'multi-task' training with per-language weights (one softmax per language),
- (the decoding script is not yet merged)
parent 05e3fbd5
#!/bin/bash
# Copyright 2015 University of Illinois (Author: Amit Das)
# Copyright 2012-2015 Brno University of Technology (Author: Karel Vesely)
# Apache 2.0
# This example script trains a multi-lingual DNN with a <BlockSoftmax> output, using FBANK+pitch features.
# The network is trained on multiple languages simultaneously, with a separate softmax layer
# per language, while the hidden layers are shared across all the languages.
# The script supports an arbitrary number of languages.
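#
# Illustration (hypothetical dims): with '--block-softmax-dims=1483,3412' the <BlockSoftmax>
# stacks two softmaxes into a single 4895-dim output layer; each block is normalized
# separately, and during backpropagation only the block of the frame's language gets a
# non-zero gradient (the other blocks have all-zero targets and are skipped).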
. ./cmd.sh
. ./path.sh
# Example setup; the options are in 'csl' (comma-separated list) format and must all have the same number of elements,
lang_code_csl="rm,wsj" # One label per language,
lang_weight_csl="1.0,0.1" # Per-language weights which scale the loss-function and the gradients; 1.0 for each language is a good default,
ali_dir_csl="exp/tri3b_ali,../../wsj/s5/exp/tri4b_ali_si284" # One ali-dir per language,
data_dir_csl="data/train,../../wsj/s5/data/train_si284" # One train-data-dir per language (features will be re-computed),
nnet_type=dnn_small # dnn_small | dnn | bn
stage=0
. utils/parse_options.sh || exit 1;
set -euxo pipefail
# Convert 'csl' strings to bash arrays (both ',' and ':' are accepted as separators),
lang_code=($(echo $lang_code_csl | tr ',:' ' '))
ali_dir=($(echo $ali_dir_csl | tr ',:' ' '))
data_dir=($(echo $data_dir_csl | tr ',:' ' '))
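# (e.g. with the default setup above: lang_code=(rm wsj), so ${lang_code[0]}='rm'
#  and ${#lang_code[@]}=2)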
# Make sure we have the same number of items in the lists,
! [ ${#lang_code[@]} -eq ${#ali_dir[@]} -a ${#lang_code[@]} -eq ${#data_dir[@]} ] && \
  echo "Non-matching number of 'csl' items: lang_code ${#lang_code[@]}, ali_dir ${#ali_dir[@]}, data_dir ${#data_dir[@]}" && \
  exit 1
num_langs=${#lang_code[@]}
# Check if all the input directories exist,
for i in $(seq 0 $[num_langs-1]); do
  echo "lang = ${lang_code[$i]}, alidir = ${ali_dir[$i]}, datadir = ${data_dir[$i]}"
  [ ! -d ${ali_dir[$i]} ] && echo "Missing ${ali_dir[$i]}" && exit 1
  [ ! -d ${data_dir[$i]} ] && echo "Missing ${data_dir[$i]}" && exit 1
done
# Make the features,
data=data-fbank-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')
data_tr90=$data/combined_tr90
data_cv10=$data/combined_cv10
if [ $stage -le 0 ]; then
  # Make local copy of data-dirs (while adding language-code),
  tr90=""
  cv10=""
  for i in $(seq 0 $[num_langs-1]); do
    code=${lang_code[$i]}
    dir=${data_dir[$i]}
    tgt_dir=$data/${code}_$(basename $dir)
    utils/copy_data_dir.sh --utt-suffix _$code --spk-suffix _$code $dir $tgt_dir
    rm $tgt_dir/{feats,cmvn}.scp || true # remove features, they will be re-computed,
    # extract features, get cmvn stats,
    steps/make_fbank_pitch.sh --nj 30 --cmd "$train_cmd -tc 10" $tgt_dir{,/log,/data}
    steps/compute_cmvn_stats.sh $tgt_dir{,/log,/data}
    # split lists 90% train / 10% held-out,
    utils/subset_data_dir_tr_cv.sh $tgt_dir ${tgt_dir}_tr90 ${tgt_dir}_cv10
    tr90="$tr90 ${tgt_dir}_tr90"
    cv10="$cv10 ${tgt_dir}_cv10"
  done
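  # (after the copy, each utterance-id carries the language code as a suffix: a hypothetical
  #  utterance 'uttA' from 'rm' is now 'uttA_rm', keeping the ids unique after merging)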
  # Merge the datasets,
  utils/combine_data.sh $data_tr90 $tr90
  utils/combine_data.sh $data_cv10 $cv10
  # Validate,
  utils/validate_data_dir.sh $data_tr90
  utils/validate_data_dir.sh $data_cv10
fi
# Extract the tied-state numbers from transition models,
for i in $(seq 0 $[num_langs-1]); do
  ali_dim[i]=$(hmm-info ${ali_dir[i]}/final.mdl | grep pdfs | awk '{ print $NF }')
done
ali_dim_csl=$(echo ${ali_dim[@]} | tr ' ' ',')
# Total number of DNN outputs (sum of all per-language blocks),
output_dim=$(echo ${ali_dim[@]} | tr ' ' '\n' | awk '{ sum += $1; } END{ print sum; }')
echo "Total number of DNN outputs: $output_dim = $(echo ${ali_dim[@]} | sed 's: : + :g')"
# Objective function string (per-language weights are imported from '$lang_weight_csl'),
objective_function="multitask$(echo ${ali_dim[@]} | tr ' ' '\n' | \
awk -v w=$lang_weight_csl 'BEGIN{ split(w,w_arr,/[,:]/); } { printf(",xent,%d,%s", $1, w_arr[NR]); }')"
echo "Multitask objective function: $objective_function"
# DNN training will be in $dir, the alignments are prepared beforehand,
dir=exp/dnn4g-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')
[ ! -e $dir ] && mkdir -p $dir
echo "$lang_code_csl" >$dir/lang_code_csl
echo "$ali_dir_csl" >$dir/ali_dir_csl
echo "$data_dir_csl" >$dir/data_dir_csl
echo "$ali_dim_csl" >$dir/ali_dim_csl
echo "$objective_function" >$dir/objective_function
# Prepare the merged targets,
if [ $stage -le 1 ]; then
  [ ! -e $dir/ali-post ] && mkdir -p $dir/ali-post
  # re-saving the ali in posterior format, indexed by 'scp',
  for i in $(seq 0 $[num_langs-1]); do
    code=${lang_code[$i]}
    ali=${ali_dir[$i]}
    # utt suffix added by 'awk',
    ali-to-pdf $ali/final.mdl "ark:gunzip -c ${ali}/ali.*.gz |" ark,t:- | awk -v c=$code '{ $1=$1"_"c; print $0; }' | \
      ali-to-post ark:- ark,scp:$dir/ali-post/$code.ark,$dir/ali-post/$code.scp
  done
  # pasting the ali's, adding language-specific offsets to the posteriors,
  featlen="ark:feat-to-len 'scp:cat $data_tr90/feats.scp $data_cv10/feats.scp |' ark,t:- |" # get number of frames for every utterance,
  post_scp_list=$(echo ${lang_code[@]} | tr ' ' '\n' | awk -v d=$dir '{ printf(" scp:%s/ali-post/%s.scp", d, $1); }')
  paste-post --allow-partial=true "$featlen" "${ali_dim_csl}" ${post_scp_list} \
    ark,scp:$dir/ali-post/combined.ark,$dir/ali-post/combined.scp
fi
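# (in the combined targets, pdf-id 'p' of the i-th language is shifted by the total dim of
#  the preceding blocks: with the hypothetical dims above, pdf-id 7 from 'wsj' becomes
#  target 1483+7=1490 within the 4895-dim output)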
# Train the <BlockSoftmax> system (for the 'bn' type this is the 1st stage of a Stacked-Bottleneck-Network),
if [ $stage -le 2 ]; then
  case $nnet_type in
    bn)
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --hid-layers 2 --hid-dim 1500 --bn-dim 80 \
          --cmvn-opts "--norm-means=true --norm-vars=false" \
          --feat-type "traps" --splice 5 --traps-dct-basis 6 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    dnn_small)
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --cmvn-opts "--norm-means=true --norm-vars=true" \
          --delta-opts "--delta-order=2" --splice 5 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    dnn)
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --hid-layers 6 --hid-dim 2048 \
          --cmvn-opts "--norm-means=true --norm-vars=false" \
          --delta-opts "--delta-order=2" --splice 5 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    *)
      echo "Unknown --nnet-type $nnet_type"; exit 1;
      ;;
  esac
fi
exit 0
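
Usage sketch (hypothetical invocation; the script path is assumed, and the long option names follow from 'utils/parse_options.sh', which maps e.g. '--lang-code-csl' to '$lang_code_csl'):

  local/nnet/run_dnn_multilingual.sh --nnet-type dnn_small \
    --lang-code-csl "rm,wsj" --lang-weight-csl "1.0,0.1" \
    --ali-dir-csl "exp/tri3b_ali,../../wsj/s5/exp/tri4b_ali_si284" \
    --data-dir-csl "data/train,../../wsj/s5/data/train_si284"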
@@ -17,7 +17,7 @@
 # Generated Nnet prototype, to be initialized by 'nnet-initialize'.
-import math, random, sys
+import math, random, sys, re
 ###
 ### Parse options
@@ -79,7 +79,7 @@ assert(num_leaves > 0)
 assert(num_hid_layers >= 0)
 assert(num_hid_neurons > 0)
 if o.block_softmax_dims:
-  assert(sum(map(int, o.block_softmax_dims.split(':'))) == num_leaves)
+  assert(sum(map(int, re.split("[,:]", o.block_softmax_dims))) == num_leaves) # possible separators: ',' ':'
 # Optionaly scale
 def Glorot(dim1, dim2):
@@ -82,7 +82,7 @@ class BlockSoftmax : public Component {
     is >> std::ws; // eat-up whitespace
   }
   // parse dims,
-  if (!kaldi::SplitStringToIntegers(dims_str, ":", false, &block_dims))
+  if (!kaldi::SplitStringToIntegers(dims_str, ",:", false, &block_dims))
     KALDI_ERR << "Invalid block-dims " << dims_str;
   // sanity check
   int32 sum = 0;
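
Both hunks widen the accepted separator set from ':' alone to ',' or ':'. This is what lets the script above pass the comma-separated ${ali_dim_csl} directly, e.g. (hypothetical dims):

  --proto-opts "--block-softmax-dims=1483,3412"   # now equivalent to "--block-softmax-dims=1483:3412"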