Commit a6c5dd43 authored by David Snyder's avatar David Snyder
Browse files

trunk: Replacing extract_ivectors_online.sh with (a modified version of)...

trunk: Replacing extract_ivectors_online.sh with (a modified version of) extract_ivectors_online2.sh and deleting the latter from svn. The new version provides the option of creating fake-speakers (see script). Some example scripts using the online setup were changed to remove calls to extract_ivectors_online2.sh.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4441 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent dc930128
......@@ -61,7 +61,8 @@ if [ $stage -le 3 ]; then
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 60 \
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
fi
......
......@@ -65,7 +65,8 @@ if [ $stage -le 3 ]; then
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 60 \
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
fi
......
......@@ -47,7 +47,8 @@ if [ $stage -le 2 ]; then
fi
if [ $stage -le 3 ]; then
steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 4 \
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
fi
......
......@@ -79,7 +79,7 @@ if [ $stage -le 5 ]; then
# of speakers into "fake-speakers" with about 2 utterances each, by randomly making
# some have 2 and some 3 utterances... this randomness will be different in different
# copies of the data.
steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 30 \
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
--utts-per-spk-max 2.5 \
data/train_perturbed_mfcc exp/nnet2_online/extractor $ivectordir || exit 1;
fi
......
......@@ -109,7 +109,8 @@ fi
# In order to train the combined model, we'll need to dump iVectors.
if [ $stage -le 6 ]; then
steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 10 \
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
--utts-per-spk-max 2 \
data/train $ivector_src exp/nnet2_online_wsj/ivectors || exit 1;
fi
......
......@@ -53,7 +53,8 @@ if [ $stage -le 3 ]; then
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each as one speaker.
steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 30 \
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
--utts-per-spk-max 2 \
data/train_nodup exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_nodup2 || exit 1;
fi
......
......@@ -55,8 +55,9 @@ fi
if [ $stage -le 3 ]; then
# We extract iVectors on all the train_si284 data, which will be what we
# train the system on.
steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 30 \
data/train_si284 exp/nnet2_online/extractor exp/nnet2_online/ivectors2_train_si284 || exit 1;
# Extract online iVectors for train_si284; --utts-per-spk-max 2 pairs utterances
# into "fake speakers" so the network sees iVectors estimated from little data.
# NOTE: the continuation backslash after "--utts-per-spk-max 2" is required;
# without it the data/extractor/output arguments are parsed as a separate
# command and the extractor is invoked with no positional arguments.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
  --utts-per-spk-max 2 \
  data/train_si284 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_si284 || exit 1;
fi
......@@ -80,7 +81,7 @@ if [ $stage -le 4 ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 8 --num-epochs-extra 4 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors2_train_si284 \
--online-ivector-dir exp/nnet2_online/ivectors_train_si284 \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
......
......@@ -45,14 +45,14 @@ if [ $stage -le 1 ]; then
# run at one time.
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G" \
--nj $nj --sub-split 40 --num-threads 6 --parallel-opts "-pe smp 6" \
--online-ivector-dir exp/nnet2_online/ivectors2_train_si284 \
--online-ivector-dir exp/nnet2_online/ivectors_train_si284 \
data/train_si284 data/lang $srcdir ${srcdir}_denlats
fi
if [ $stage -le 2 ]; then
if $use_gpu; then gpu_opt=yes; else gpu_opt=no; fi
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" \
--online-ivector-dir exp/nnet2_online/ivectors2_train_si284 \
--online-ivector-dir exp/nnet2_online/ivectors_train_si284 \
--use-gpu $gpu_opt \
--nj $nj data/train_si284 data/lang ${srcdir} ${srcdir}_ali
fi
......@@ -60,7 +60,7 @@ fi
if [ $stage -le 3 ]; then
if $use_gpu; then
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" --learning-rate 0.00002 \
--online-ivector-dir exp/nnet2_online/ivectors2_train_si284 \
--online-ivector-dir exp/nnet2_online/ivectors_train_si284 \
--num-jobs-nnet 4 --num-threads $num_threads --parallel-opts "$gpu_opts" \
data/train_si284 data/lang \
${srcdir}_ali ${srcdir}_denlats ${srcdir}/final.mdl ${srcdir}_smbr
......@@ -94,7 +94,7 @@ if [ $stage -le 5 ]; then
if $use_gpu; then
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" --learning-rate 0.00002 \
--use-preconditioning true \
--online-ivector-dir exp/nnet2_online/ivectors2_train_si284 \
--online-ivector-dir exp/nnet2_online/ivectors_train_si284 \
--num-jobs-nnet 4 --num-threads $num_threads --parallel-opts "$gpu_opts" \
data/train_si284 data/lang \
${srcdir}_ali ${srcdir}_denlats ${srcdir}/final.mdl ${srcdir}_smbr_precon
......
......@@ -3,14 +3,23 @@
# Copyright 2013 Daniel Povey
# Apache 2.0.
# This script is based on ^/egs/sre08/v1/sid/extract_ivectors.sh. Instead of
# This script extracts iVectors for a set of utterances, given
# features and a trained iVector extractor.
# The script is based on ^/egs/sre08/v1/sid/extract_ivectors.sh. Instead of
# extracting a single iVector per utterance, it extracts one every few frames
# (controlled by the --ivector-period option, e.g. 10, which is to save compute).
# This is used in training (and not-really-online testing) of neural networks
# for online decoding.
# This script extracts iVectors for a set of utterances, given
# features and a trained iVector extractor.
# Rather than treating each utterance separately, it carries forward
# information from one utterance to the next, within the speaker. However,
# take note of the option "utts-per-spk-max", which splits speakers up into
# "fake speakers" with at most two utterances in them. This means that more
# iVectors are estimated starting from an uninformative starting point, than
# if we used the real speaker labels (which may have many utterances each);
# it's a compromise between per-utterance and per-speaker iVector estimation.
# Begin configuration section.
nj=30
......@@ -27,8 +36,15 @@ posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
# used when training the iVector extractor, but more important
# that this match the value used when you do real online decoding
# with the neural nets trained with these iVectors.
compress=true # If true, compress the features on disk (lossy compression, as used
# for feature files.)
utts_per_spk_max=-1 # Maximum utterances per "fake-speaker." With the default
# of -1 no fake-speakers are used. Note: this does not have to
# be an integer; if it's noninteger, it will be rounded in a
# randomized way to one of the two integers it's close to.
# This is useful in the "perturbed-feature" recipe to encourage
# that different perturbed versions of the same speaker get
# split into fake-speakers differently.
compress=true # If true, compress the iVectors stored on disk (it's lossy
# compression, as used for feature matrices).
# End configuration section.
......@@ -44,14 +60,16 @@ if [ $# != 3 ]; then
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-iters <#iters|10> # Number of iterations of E-M"
echo " --nj <n|10> # Number of jobs (also see num-processes and num-threads)"
echo " --num-threads <n|8> # Number of threads for each process"
echo " --stage <stage|0> # To control partial reruns"
echo " --num-gselect <n|5> # Number of Gaussians to select using"
echo " # diagonal model."
echo " --min-post <float;default=0.025> # Pruning threshold for posteriors"
echo " --ivector-period <int;default=10> # How often to extract an iVector (frames)"
echo " --utts-per-spk-max <int;default=-1> # Controls splitting into 'fake speakers'."
echo " # Set to 1 if compatibility with utterance-by-utterance"
echo " # decoding is the only factor, and to larger if you care "
echo " # also about adaptation over several utterances."
exit 1;
fi
......@@ -60,36 +78,82 @@ srcdir=$2
dir=$3
for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
$srcdir/online_cmvn.conf; do
$srcdir/online_cmvn.conf $srcdir/final.mat; do
[ ! -f $f ] && echo "No such file $f" && exit 1;
done
# Set various variables.
mkdir -p $dir/log
mkdir -p $dir/log $dir/conf
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;
echo $ivector_period > $dir/ivector_period || exit 1;
splice_opts=$(cat $srcdir/splice_opts)
# the program ivector-extract-online2 does a bunch of stuff in memory and is
# config-driven... this was easier in this case because the same code is
# involved in online decoding. We need to create a config file for iVector
# extraction.
ieconf=$dir/conf/ivector_extractor.conf
echo -n >$ieconf
cp $srcdir/online_cmvn.conf $dir/conf/ || exit 1;
echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf
for x in $(echo $splice_opts); do echo "$x"; done > $dir/conf/splice.conf
echo "--splice-config=$dir/conf/splice.conf" >>$ieconf
echo "--lda-matrix=$srcdir/final.mat" >>$ieconf
echo "--global-cmvn-stats=$srcdir/global_cmvn.stats" >>$ieconf
echo "--diag-ubm=$srcdir/final.dubm" >>$ieconf
echo "--ivector-extractor=$srcdir/final.ie" >>$ieconf
echo "--num-gselect=$num_gselect" >>$ieconf
echo "--min-post=$min_post" >>$ieconf
echo "--posterior-scale=$posterior_scale" >>$ieconf
echo "--max-remembered-frames=1000" >>$ieconf # the default
ns=$(wc -l <$data/spk2utt)
if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
echo "$0: you seem to have just one speaker in your database. This is probably not a good idea."
echo " see http://kaldi.sourceforge.net/data_prep.html (search for 'bold') for why"
echo " Setting --utts-per-spk-max to 1."
utts_per_spk_max=1
fi
## Set up features. $gmm_feats is the version of the features with online CMVN, that we use
## to get the Gaussian posteriors, $feats is the version of the features with no CMN.
gmm_feats="ark,s,cs:apply-cmvn-online --config=$srcdir/online_cmvn.conf $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
spk2utt=""
if [ "$utts_per_spk_max" != -1 ]; then
mkdir -p $dir/spk2utt_fake
for job in $(seq $nj); do
# create fake spk2utt files with reduced number of utterances per speaker,
# so the network is well adapted to using iVectors from small amounts of
# training data.
# the if (rand() % 2 == 0)
awk -v max=$utts_per_spk_max '{ n=2; count=0;
while(n<=NF) {
int_max=int(max)+ (rand() < (max-int(max))?1:0);
nmax=n+int_max; count++; printf("%s-%06x", $1, count);
for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
<$sdata/$job/spk2utt >$dir/spk2utt_fake/spk2utt.$job
done
spk2utt="ark:$dir/spk2utt_fake/spk2utt.JOB"
else
spk2utt="ark:$sdata/JOB/spk2utt"
fi
for n in $(seq $nj); do
# This will do nothing unless the directory $dir/storage exists;
# it can be used to distribute the data among multiple machines.
utils/create_data_link.pl $dir/ivector_online.$n.ark
done
if [ $stage -le 0 ]; then
echo "$0: extracting iVectors"
$cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm \
"$gmm_feats" ark:- \| scale-post ark:- $posterior_scale ark:- \| \
ivector-extract-online --ivector-period=$ivector_period $srcdir/final.ie "$feats" ark,s,cs:- ark:- \| \
copy-feats --compress=$compress ark:- \
ark,scp,t:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
ivector-extract-online2 --config=$ieconf "$spk2utt" scp:$sdata/JOB/feats.scp ark:- \| \
copy-feats --compress=$compress ark:- \
ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
fi
if [ $stage -le 1 ]; then
......
#!/bin/bash
# Copyright 2013 Daniel Povey
# Apache 2.0.
# This script is as ./extract_ivectors_online.sh but internally it uses a
# different program, with code that corresponds more closely to the real online
# decoding setup. Rather than treating each utterance separately, as
# extract_ivectors_online.sh, it carries forward information from one utterance
# to the next, within the speaker. However, take note of the option
# "utts-per-spk-max", defaulting to 2, which splits speakers up into "fake
# speakers" with at most two utterances in them. This means that more iVectors
# are estimated starting from an uninformative starting point, than if we used
# the real speaker labels (which may have many utterances each); it's a
# compromise between per-utterance and per-speaker iVector estimation.
# This script is based on ^/egs/sre08/v1/sid/extract_ivectors.sh. Instead of
# extracting a single iVector per utterance, it extracts one every few frames
# (controlled by the --ivector-period option, e.g. 10, which is to save compute).
# This is used in training (and not-really-online testing) of neural networks
# for online decoding.
# This script extracts iVectors for a set of utterances, given
# features and a trained iVector extractor.
# Begin configuration section.
nj=30
cmd="run.pl"
stage=0
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
ivector_period=10
posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
# inter-frame correlations. Making this small during iVector
# extraction is equivalent to scaling up the prior, and
# will tend to produce smaller iVectors where data-counts are
# small. It's not so important that this match the value
# used when training the iVector extractor, but more important
# that this match the value used when you do real online decoding
# with the neural nets trained with these iVectors.
utts_per_spk_max=2 # maximum 2 utterances per "fake-speaker." Note: this does
# not have to be an integer; if it's noninteger, it will be
# rounded in a randomized way to one of the two integers it's
# close to. This is useful in the "perturbed-feature" recipe
# to encourage that different perturbed versions of the same
# speaker get split into fake-speakers differently.
compress=true # If true, compress the iVectors stored on disk (it's lossy
# compression, as used for feature matrices).
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: $0 [options] <data> <extractor-dir> <ivector-dir>"
echo " e.g.: $0 data/train exp/nnet2_online/extractor exp/nnet2_online/ivectors_train"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --nj <n|10> # Number of jobs (also see num-processes and num-threads)"
echo " --stage <stage|0> # To control partial reruns"
echo " --num-gselect <n|5> # Number of Gaussians to select using"
echo " # diagonal model."
echo " --min-post <float;default=0.025> # Pruning threshold for posteriors"
echo " --ivector-period <int;default=10> # How often to extract an iVector (frames)"
echo " --utts-per-spk-max <int;default=2> # Controls splitting into 'fake speakers'."
echo " # Set to 1 if compatibility with utterance-by-utterance"
echo " # decoding is the only factor, and to larger if you care "
echo " # also about adaptation over several utterances."
exit 1;
fi
# Positional arguments: <data> <extractor-dir> <output-ivector-dir>.
data=$1
srcdir=$2
dir=$3
# Check that every file the extractor needs is present before starting jobs.
for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
$srcdir/online_cmvn.conf $srcdir/final.mat; do
[ ! -f $f ] && echo "No such file $f" && exit 1;
done
# Set various variables.
mkdir -p $dir/log $dir/conf
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;
echo $ivector_period > $dir/ivector_period || exit 1;
splice_opts=$(cat $srcdir/splice_opts)
# the program ivector-extract-online2 does a bunch of stuff in memory and is
# config-driven... this was easier in this case because the same code is
# involved in online decoding. We need to create a config file for iVector
# extraction.
ieconf=$dir/conf/ivector_extractor.conf
echo -n >$ieconf
cp $srcdir/online_cmvn.conf $dir/conf/ || exit 1;
echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf
# splice_opts is a space-separated option string; write one option per line
# since the config-file format expects that.
for x in $(echo $splice_opts); do echo "$x"; done > $dir/conf/splice.conf
echo "--splice-config=$dir/conf/splice.conf" >>$ieconf
echo "--lda-matrix=$srcdir/final.mat" >>$ieconf
echo "--global-cmvn-stats=$srcdir/global_cmvn.stats" >>$ieconf
echo "--diag-ubm=$srcdir/final.dubm" >>$ieconf
echo "--ivector-extractor=$srcdir/final.ie" >>$ieconf
echo "--num-gselect=$num_gselect" >>$ieconf
echo "--min-post=$min_post" >>$ieconf
echo "--posterior-scale=$posterior_scale" >>$ieconf
echo "--max-remembered-frames=1000" >>$ieconf # the default
# Warn (and force utts-per-spk-max=1) if the whole data directory has a
# single speaker: carrying state across all utterances of one giant
# "speaker" would make the training iVectors unrepresentative.
ns=$(wc -l <$data/spk2utt)
if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 ]; then
echo "$0: you seem to have just one speaker in your database. This is probably not a good idea."
echo " see http://kaldi.sourceforge.net/data_prep.html (search for 'bold') for why"
echo " Setting --utts-per-spk-max to 1."
utts_per_spk_max=1
fi
mkdir -p $dir/spk2utt_fake
for job in $(seq $nj); do
# create fake spk2utt files with reduced number of utterances per speaker,
# so the network is well adapted to using iVectors from small amounts of
# training data.
# The awk program walks each spk2utt line (field 1 = speaker, fields 2..NF =
# utterances) and emits fake speakers named "<spk>-<6-hex-digit counter>".
# When utts_per_spk_max is noninteger, each group size is randomly rounded
# up or down (via rand() against the fractional part) so different copies
# of the data are split differently.
awk -v max=$utts_per_spk_max '{ n=2; count=0;
while(n<=NF) {
int_max=int(max)+ (rand() < (max-int(max))?1:0);
nmax=n+int_max; count++; printf("%s-%06x", $1, count);
for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
<$sdata/$job/spk2utt >$dir/spk2utt_fake/spk2utt.$job
done
for n in $(seq $nj); do
# This will do nothing unless the directory $dir/storage exists;
# it can be used to distribute the data among multiple machines.
utils/create_data_link.pl $dir/ivector_online.$n.ark
done
if [ $stage -le 0 ]; then
echo "$0: extracting iVectors"
# ivector-extract-online2 reads the fake spk2utt so iVector state is carried
# across utterances only within each fake speaker; output is written both as
# an archive and an scp index, one pair per job.
$cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
ivector-extract-online2 --config=$ieconf ark:$dir/spk2utt_fake/spk2utt.JOB scp:$sdata/JOB/feats.scp ark:- \| \
copy-feats --compress=$compress ark:- \
ark,scp,t:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
fi
if [ $stage -le 1 ]; then
echo "$0: combining iVectors across jobs"
for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1;
fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment