Commit 7523184d authored by Ho Yin Chan's avatar Ho Yin Chan
Browse files

trunk:egs/wsj/s5/util merge subset_data_dir_tr_cv_spk.sh and...

trunk:egs/wsj/s5/util merge subset_data_dir_tr_cv_spk.sh and subset_data_dir_tr90_cv10.sh to a single script subset_data_dir_tr_cv.sh

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2787 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 8c6119bd
......@@ -29,7 +29,7 @@ steps/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \
$dir data/train $gmmdir $dir/_log $dir/_data || exit 1
# split the data : 90% train 10% cross-validation (held-out)
utils/subset_data_dir_tr90_cv10.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1
utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1
}
......
#!/bin/bash
# Copyright 2010-2012 Brno University of Technology (Author: Karel Vesely)
# Apache 2.0
# This script splits dataset to two parts :
# 90% training set and 10% held-out set (or cross-validation),
# which will be later on used for neural network training
#
# It is useful if the database is not presplit or where
# we cannot get alignment on dev set
# Split <srcdir> into a training set (about the first 90% of the lines of
# utt2spk) and a cross-validation set (the remaining lines). The cut is moved
# back so that it coincides with a speaker change in utt2spk.
#
# Arguments: $1 - source data dir (must contain utt2spk)
#            $2 - output training data dir
#            $3 - output cross-validation data dir
if [ $# != 3 ]; then
  echo "Usage: $0 <srcdir> <traindir> <crossvaldir>"
  exit 1;
fi

srcdir=$1
trndir=$2
cvdir=$3

if [ ! -f "$srcdir/utt2spk" ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1;
fi

#total number of lines
N=$(wc -l < "$srcdir/utt2spk")
#get line number where 90% of the data lies
N_head=$((N*9/10))
#move the boundary so it is located on speaker change
# 'uniq -f1 -c' skips the first field when comparing adjacent lines and
# prefixes each run with its length. awk accumulates run lengths until the
# next run would cross the limit. 'exit' is used instead of 'nextfile'
# because awks lacking that extension ignore it and keep accumulating later
# runs; 'n + 0' guarantees a number is printed even when nothing fits.
N_head=$(uniq -f1 -c "$srcdir/utt2spk" \
  | awk -v limit="$N_head" '{ if (n + $1 <= limit) { n += $1 } else { exit } } END { print n + 0 }')
if [ "$N_head" -eq 0 ]; then
  echo "$0: training subset of $srcdir would be empty, cannot split on a speaker change"
  exit 1;
fi
#the rest of the data will be that big
N_tail=$((N-N_head))

#now call the subset_data_dir.sh and fix the directories
subset_data_dir.sh --first "$srcdir" "$N_head" "$trndir" || exit 1
subset_data_dir.sh --last "$srcdir" "$N_tail" "$cvdir" || exit 1
......@@ -5,29 +5,45 @@
# Apache 2.0
# This script splits dataset to two parts :
# training set from (100-P)% of speakers and
# held-out set (or cross-validation) from P% of remaining speakers,
# training set from (100-P)% of speakers/utterances and
# held-out set (or cross-validation) from P% of remaining speakers/remaining utterances,
# which will be later on used for neural network training
# The default cross validation percentage portion is 10% (i.e. P=10)
#
# It is useful if you would like to have subset chosen from random speakers order,
# especially for the cases where a dataset contains multiple different corpora,
# There are two options for choosing the held-out (or cross-validation) set:
# --cv-spk-percent P , which gives you a CV set built from a randomly chosen P% of the speakers, or
# --cv-utt-percent P , which gives you a CV set built from the last P% of the utterances in the dataset
#
# If you don't apply the above two options, by default the script will use --cv-utt-percent option,
# and the default cross validation percentage portion is equal to 10% (i.e. P=10)
#
# The --cv-spk-percent option is useful if you would like to have subset chosen from random speakers order,
# especially for the cases where dataset contains multiple different corpora,
# where type of speakers or recording channels may be quite different
# Begin configuration.
cv_spk_percent=10 # default 10% of speakers
cv_spk_percent= # % of speakers is parsed by option
cv_utt_percent=10 # default 10% of total utterances
seed=777 # use seed for speaker shuffling
# End configuration.
echo "$0 $@" # Print the command line for logging
# Succeeds if --cv-spk-percent appears anywhere among the given arguments.
# Every argument is inspected (not just the first) so that the option is
# still recognised when other options precede it on the command line.
_has_cv_spk_option() {
  local arg
  for arg in "$@"; do
    [ "$arg" = "--cv-spk-percent" ] && return 0
  done
  return 1
}

# Choose the split mode before the options themselves are parsed.
uttbase=true # by default, we choose last 10% utterances for CV
if _has_cv_spk_option "$@"; then
  uttbase=false
  spkbase=true
fi
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: $0 [--cv-spk-percent P] <srcdir> <traindir> <crossvaldir>"
echo " --cv-spk-percent P Cross Validation portion of the total speakers, default is 10% (i.e. P=10)"
echo "Usage: $0 [--cv-spk-percent P|--cv-utt-percent P] <srcdir> <traindir> <crossvaldir>"
echo " --cv-spk-percent P Cross Validation portion of the total speakers, recommend value is 10% (i.e. P=10)"
echo " --cv-utt-percent P Cross Validation portion of the total utterances, default is 10% (i.e. P=10)"
echo " "
exit 1;
fi
......@@ -35,6 +51,31 @@ srcdir=$1
trndir=$2
cvdir=$3
## use simple last P% utterance for CV
# Training set: the leading lines of utt2spk up to the last speaker change
# at or before (100-P)% of the data. CV set: all remaining lines.
if $uttbase; then
  if [ ! -f "$srcdir/utt2spk" ]; then
    echo "$0: no such file $srcdir/utt2spk"
    exit 1;
  fi

  #total number of lines
  N=$(wc -l < "$srcdir/utt2spk")
  #get line number where (100-P)% of the data lies
  P_utt=$((N * cv_utt_percent / 100))
  N_head=$((N - P_utt))
  #move the boundary so it is located on speaker change
  # 'uniq -f1 -c' skips the first field when comparing adjacent lines and
  # prefixes each run with its length. awk accumulates run lengths until the
  # next run would cross the limit. 'exit' is used instead of 'nextfile'
  # because awks lacking that extension ignore it and keep accumulating later
  # runs; 'n + 0' guarantees a number is printed even when nothing fits.
  N_head=$(uniq -f1 -c "$srcdir/utt2spk" \
    | awk -v limit="$N_head" '{ if (n + $1 <= limit) { n += $1 } else { exit } } END { print n + 0 }')
  if [ "$N_head" -eq 0 ]; then
    echo "$0: training subset of $srcdir would be empty, cannot split on a speaker change"
    exit 1;
  fi
  #the rest of the data will be that big
  N_tail=$((N-N_head))

  #now call the subset_data_dir.sh and fix the directories
  # Propagate failures explicitly; the trailing 'exit 0' would otherwise
  # report success even when a subset could not be created.
  subset_data_dir.sh --first "$srcdir" "$N_head" "$trndir" || exit 1
  subset_data_dir.sh --last "$srcdir" "$N_tail" "$cvdir" || exit 1

  exit 0;
fi
## use a randomly chosen P% of the speakers for CV
if [ ! -f $srcdir/spk2utt ]; then
echo "$0: no such file $srcdir/spk2utt"
exit 1;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment