Commit 8c6119bd authored by Ho Yin Chan
Browse files

trunk:egs/wsj/utils/subset_data_dir_tr_cv_spk.sh (A data split script for...

trunk:egs/wsj/utils/subset_data_dir_tr_cv_spk.sh (A data split script for cross validation based on randomly chosen speakers) -> modifications for better ordering of options and placement of temporary files (Recommended by Karel Vesely)

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2786 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent dad3b29f
#!/bin/bash
# Copyright 2013 Hong Kong University of Science and Technology (Author: Ricky Chan Ho Yin);
# Karel Vesely, Daniel Povey;
# Brno University of Technology (Author: Karel Vesely);
# Johns Hopkins University (Author: Daniel Povey);
# Apache 2.0
# This script splits a dataset into two parts:
......@@ -10,12 +11,23 @@
# The default cross validation percentage portion is 10% (i.e. P=10)
#
# It is useful if you would like to have subsets chosen from a random ordering of the speakers,
# especially for the cases where a dataset contains multiple different corpora, where
# type of speakers or recording channels may be quite different
# especially for the cases where a dataset contains multiple different corpora,
# where type of speakers or recording channels may be quite different
if [ $# != 3 ] && [ $# != 5 ]; then
echo "Usage: $0 <srcdir> <traindir> <crossvaldir> [--cvportion P]"
echo "--cvportion P Cross Validation portion of the total speakers, default is 10% (i.e. P=10)"
# Begin configuration.
# Defaults for the command-line options. They are assigned before
# parse_options.sh is sourced below (presumably so that --cv-spk-percent and
# --seed can override them -- confirm against parse_options.sh).
cv_spk_percent=10 # default 10% of speakers
seed=777 # use seed for speaker shuffling
# End configuration.
echo "$0 $@" # Print the command line for logging
# Source path.sh from the current directory when it exists.
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;
# Exactly three positional arguments must be present at this point;
# otherwise print the usage text and exit with status 1.
if [ $# != 3 ]; then
echo "Usage: $0 [--cv-spk-percent P] <srcdir> <traindir> <crossvaldir>"
echo " --cv-spk-percent P Cross Validation portion of the total speakers, default is 10% (i.e. P=10)"
exit 1;
fi
......@@ -23,36 +35,29 @@ srcdir=$1
trndir=$2
cvdir=$3
if [ $# == 5 ]; then
if [ $4 = "--cvportion" ]; then
cvportion=$5
else
echo "Usage: $0 <srcdir> <traindir> <crossvaldir> [--cvportion P]"
echo "--cvportion P Cross Validation portion of the total speakers, default is 10% (i.e. P=10)"
exit 1;
fi
else
cvportion=10
fi
# Abort early when the source data directory has no spk2utt file: the speaker
# list that gets shuffled and split is read from it.
# The path is quoted so that a source directory containing whitespace or glob
# characters is tested as a single word instead of being split by the shell.
if [ ! -f "$srcdir/spk2utt" ]; then
  echo "$0: no such file $srcdir/spk2utt"
  exit 1;
fi
#total number of lines
#total, cv, train number of speakers
N=$(cat $srcdir/spk2utt | wc -l)
awk '{print $1}' $srcdir/spk2utt | awk 'BEGIN{srand();}{print rand()"\t"$0}' | sort -k1 -n | cut -f2- > $srcdir/_tmpf_randspk
boundary=$((N*cvportion/100))
tailboundary=$((N-$boundary))
N_spk_cv=$((N * cv_spk_percent / 100))
N_spk_trn=$((N - N_spk_cv))
mkdir -p $cvdir $trndir
head -$boundary $srcdir/_tmpf_randspk > $cvdir/_tmpf_cvspk
tail -$tailboundary $srcdir/_tmpf_randspk > $trndir/_tmpf_trainspk
# Shuffle the speaker list: take the first column of spk2utt (the speaker ids)
# and pass it through shuffle_list.pl with --srand "$seed" (presumably this
# makes the order reproducible for a given seed -- confirm in shuffle_list.pl).
# All expansions are quoted so directories containing spaces are not word-split.
awk '{print $1}' "$srcdir/spk2utt" | shuffle_list.pl --srand "$seed" > "$trndir/_tmpf_randspk"
# Split the shuffled list into train/cv: the first N_spk_cv speakers go to the
# cross-validation list and the last N_spk_trn speakers go to the training list.
head -n "$N_spk_cv" "$trndir/_tmpf_randspk" > "$cvdir/_tmpf_cvspk"
tail -n "$N_spk_trn" "$trndir/_tmpf_randspk" > "$trndir/_tmpf_trainspk"
# Now call subset_data_dir.sh once per speaker list to create the two data
# directories from the source directory.
subset_data_dir.sh --spk-list "$trndir/_tmpf_trainspk" "$srcdir" "$trndir"
subset_data_dir.sh --spk-list "$cvdir/_tmpf_cvspk" "$srcdir" "$cvdir"
rm -f $srcdir/_tmpf_randspk $trndir/_tmpf_trainspk $cvdir/_tmpf_cvspk
#clean-up
rm -f $trndir/_tmpf_randspk $trndir/_tmpf_trainspk $cvdir/_tmpf_cvspk
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment