Commit b0f9cb4b authored by Shinji Watanabe's avatar Shinji Watanabe

trunk: committing example scripts for CHIME 3 Challenge

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4909 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent e4ec3600
This is a Kaldi setup for the 3rd CHiME challenge.
See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information.
Quick instructions:
1) Download Kaldi (SVN revision 4710)
svn co -r 4710 https://svn.code.sf.net/p/kaldi/code/trunk kaldi-trunk-r4710
2) Specify the Kaldi root in path.sh
e.g.,
export KALDI_ROOT=`pwd`/../../..
3) Specify the data path of the CHiME3 corpus in run_init.sh
e.g.,
chime3_data=/local_data/archive/speech-db/original/public/CHiME3/
4) Execute run.sh
5) If you have your own enhanced speech data for the training and test sets, you can evaluate the performance
of the GMM and DNN systems with
local/run_gmm.sh <enhancement method> <enhanced speech directory>
local/run_dnn.sh <enhancement method> <enhanced speech directory>
(see the example after this list). You do not need to execute local/run_init.sh again.
6) You can find the results at
enhan=<enhancement method>
GMM: exp/tri3b_tr05_sr_$enhan/best_wer_$enhan.result
DNN: exp/tri4a_dnn_tr05_sr_${enhan}_smbr_i1lats/best_wer_${enhan}.result
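For example, with a hypothetical enhancement method named "beamform" whose
enhanced wav files are placed under enhan/beamform (both names are only
illustrative, not part of the recipe), the evaluation would be run as
local/run_gmm.sh beamform enhan/beamform
local/run_dnn.sh beamform enhan/beamform
and the results would then appear in the files listed in 6) with enhan=beamform.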
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of CPUs on your machine).
#a) JHU cluster options
#export train_cmd="queue.pl -l arch=*64"
#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"
#export cuda_cmd="..."
#b) BUT cluster options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"
#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"
#c) run it locally...
export train_cmd=run.pl
export decode_cmd=run.pl
export cuda_cmd=run.pl
export mkgraph_cmd=run.pl
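# Usage sketch (the log-file and program names below are only illustrative):
# the recipe scripts expand these variables as, e.g.,
#   $train_cmd JOB=1:4 exp/foo/log/acc.JOB.log some-kaldi-program <args>
# where run.pl/queue.pl run the given command once per JOB index, writing each
# job's output to the corresponding log file.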
beam=18.0 # beam for decoding. Was 13.0 in the scripts.
lattice_beam=10.0 # this has most effect on size of the lattices.
# No non-default options for now.
--window-type=hamming # disable Dan's (povey) window, use the standard Hamming window
--use-energy=false # only fbank outputs
--sample-frequency=16000 # CHiME3 data is sampled at 16kHz
--low-freq=64 # typical setup from Frantisek Grezl
--high-freq=8000
--dither=1
--num-mel-bins=40 # use 40 mel bins
--htk-compat=true # try to make it compatible with HTK
--use-energy=false # only non-default option.
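# Usage sketch (data-directory names are illustrative): these configs are
# typically consumed by the feature-extraction wrappers, e.g.
#   steps/make_fbank.sh --fbank-config conf/fbank.conf --nj 4 data/tr05_bth exp/make_fbank/tr05_bth fbank
#   steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 4 data/tr05_bth exp/make_mfcc/tr05_bth mfcc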
#!/bin/bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME3 baseline
# Shinji Watanabe 02/13/2015
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level CHiME3 directory."
echo "It is assumed that there will be a 'data' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
echo "$0 $@" # Print the command line for logging
eval_flag=false # make it true when the evaluation data are released
audio_dir=$1/data/audio/16kHz/isolated
trans_dir=$1/data/transcriptions
echo "extract 5th channel (CH5.wav, the center bottom edge in the front of the tablet) for noisy data"
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
if $eval_flag; then
list_set="tr05_bth dt05_bth et05_bth"
else
list_set="tr05_bth dt05_bth"
fi
cd $dir
find $audio_dir -name '*CH5.wav' | grep 'tr05_bth' | sort -u > tr05_bth.flist
find $audio_dir -name '*CH5.wav' | grep 'dt05_bth' | sort -u > dt05_bth.flist
if $eval_flag; then
find $audio_dir -name '*CH5.wav' | grep 'et05_bth' | sort -u > et05_bth.flist
fi
# make .dot transcription files from the provided .dot_all annotation files
cp $trans_dir/tr05_bth.dot_all tr05_bth.dot
cp $trans_dir/dt05_bth.dot_all dt05_bth.dot
if $eval_flag; then
cp $trans_dir/et05_bth.dot_all et05_bth.dot
fi
# make a scp file from file list
for x in $list_set; do
cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav//' > ${x}_wav.ids
paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp
done
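# Each ${x}_wav.scp line now maps a CH5 wav basename without the .wav suffix
# (e.g., a hypothetical <speaker>_<utterance>_BTH.CH5) to its absolute path
# under $audio_dir.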
# make transcriptions from the dot files
cat tr05_bth.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5"}'> tr05_bth.ids
cat tr05_bth.dot | sed -e 's/(.*)//' > tr05_bth.txt
paste -d" " tr05_bth.ids tr05_bth.txt | sort -k 1 > tr05_bth.trans1
cat dt05_bth.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5"}'> dt05_bth.ids
cat dt05_bth.dot | sed -e 's/(.*)//' > dt05_bth.txt
paste -d" " dt05_bth.ids dt05_bth.txt | sort -k 1 > dt05_bth.trans1
if $eval_flag; then
cat et05_bth.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5"}'> et05_bth.ids
cat et05_bth.dot | sed -e 's/(.*)//' > et05_bth.txt
paste -d" " et05_bth.ids et05_bth.txt | sort -k 1 > et05_bth.trans1
fi
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in $list_set;do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# Make the utt2spk and spk2utt files.
for x in $list_set; do
cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk
cat ${x}_wav.scp | awk '{print $1}' > $x.utt
paste -d" " $x.utt $x.spk > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
# copying data to data/...
for x in $list_set; do
mkdir -p ../../$x
cp ${x}_wav.scp ../../$x/wav.scp || exit 1;
cp ${x}.txt ../../$x/text || exit 1;
cp ${x}.spk2utt ../../$x/spk2utt || exit 1;
cp ${x}.utt2spk ../../$x/utt2spk || exit 1;
done
echo "Data preparation succeeded"
#!/bin/bash
# Copyright 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
# Apache 2.0.
set -e
if [ $# -ne 2 ]; then
printf "\nUSAGE: %s <training experiment directory> <enhance method>\n\n" `basename $0`
printf "%s exp/tri3b_tr05_sr_noisy noisy\n\n" `basename $0`
exit 1;
fi
echo "$0 $@" # Print the command line for logging
. path.sh
eval_flag=false # make it true when the evaluation data are released
dir=$1
enhan=$2
echo "compute WER for each location"
echo ""
for a in `find $dir/decode_tgpr_5k_dt05_real_$enhan/ | grep "\/wer_" | awk -F'[/]' '{print $NF}' | sort`; do
echo -n "$a "
cat $dir/decode_tgpr_5k_dt05_{real,simu}_$enhan/$a | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}'
done | sort -k 2 | head -n 1 > $dir/log/best_wer_$enhan
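# At this point $dir/log/best_wer_$enhan holds a single line of the form
# "wer_<lm-weight> <overall dt05 (real+simu) WER%>", e.g. (values illustrative)
# "wer_15 17.52", from which the best language-model weight is recovered below.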
lmw=`cut -f 1 -d" " $dir/log/best_wer_$enhan | cut -f 2 -d"_"`
echo "-------------------"
printf "best overall WER %s" `cut -f 2 -d" " $dir/log/best_wer_$enhan`
echo -n "%"
printf " (language model weight = %s)\n" $lmw
echo "-------------------"
for task in simu real; do
rdir=$dir/decode_tgpr_5k_dt05_${task}_$enhan
for a in _BUS _CAF _PED _STR; do
grep $a $rdir/scoring/test_filt.txt \
> $rdir/scoring/test_filt_$a.txt
cat $rdir/scoring/$lmw.tra \
| utils/int2sym.pl -f 2- $rdir/../graph_tgpr_5k/words.txt \
| sed s:\<UNK\>::g \
| compute-wer --text --mode=present ark:$rdir/scoring/test_filt_$a.txt ark,p:- \
1> $rdir/${a}_wer_$lmw 2> /dev/null
done
echo -n "$task WER: `grep WER $rdir/wer_$lmw | cut -f 2 -d" "`% (Average), "
echo -n "`grep WER $rdir/_BUS_wer_$lmw | cut -f 2 -d" "`% (BUS), "
echo -n "`grep WER $rdir/_CAF_wer_$lmw | cut -f 2 -d" "`% (CAFE), "
echo -n "`grep WER $rdir/_PED_wer_$lmw | cut -f 2 -d" "`% (PEDESTRIAN), "
echo -n "`grep WER $rdir/_STR_wer_$lmw | cut -f 2 -d" "`% (STREET)"
echo ""
echo "-------------------"
done
# for spreadsheet cut&paste
for task in simu real; do
rdir=$dir/decode_tgpr_5k_dt05_${task}_$enhan
grep WER $rdir/_BUS_wer_$lmw | cut -f 2 -d" "
grep WER $rdir/_CAF_wer_$lmw | cut -f 2 -d" "
grep WER $rdir/_PED_wer_$lmw | cut -f 2 -d" "
grep WER $rdir/_STR_wer_$lmw | cut -f 2 -d" "
grep WER $rdir/wer_$lmw | cut -f 2 -d" "
done
cut -f 2 -d" " $dir/log/best_wer_$enhan
echo $lmw
#!/bin/bash
# Copyright 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
# Apache 2.0.
set -e
if [ $# -ne 3 ]; then
printf "\nUSAGE: %s <training experiment directory> <enhance method> <graph_dir>\n\n" `basename $0`
printf "%s exp/tri3b_tr05_sr_noisy noisy exp/tri4a_dnn_tr05_sr_noisy/graph_tgpr_5k\n\n" `basename $0`
exit 1;
fi
echo "$0 $@" # Print the command line for logging
. path.sh
eval_flag=false # make it true when the evaluation data are released
dir=$1
enhan=$2
graph_dir=$3
echo "compute WER for each location"
echo ""
# collect scores
for x in `ls $dir | grep decode_tgpr_5k_dt05_real_${enhan} | awk -F'[/]' '{print $NF}' | sort`; do
for y in `find $dir/$x/ | grep "\/wer_" | awk -F'[/]' '{print $NF}' | sort`; do
echo -n "${x}_$y "
cat $dir/$x/$y | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}'
done
done | sort -k 2 | head -n 1 > $dir/log/best_wer_$enhan
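# At this point $dir/log/best_wer_$enhan holds a single line of the form
# "<decode-dir name>_wer_<lm-weight> <dt05_real WER%>", e.g. (illustrative)
# "decode_tgpr_5k_dt05_real_noisy_it4_wer_15 16.80", from which the LM weight
# and the training iteration number are recovered below.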
lmw=`cut -f 1 -d" " $dir/log/best_wer_$enhan | awk -F'[_]' '{print $NF}'`
it=`cut -f 1 -d" " $dir/log/best_wer_$enhan | sed -e 's/^.*it\(.*\)_wer_.*/\1/'`
echo "-------------------"
printf "best overall WER %s" `cut -f 2 -d" " $dir/log/best_wer_$enhan`
echo -n "%"
printf " (language model weight = %s)\n" $lmw
printf " (Number of iterations = %s)\n" $it
echo "-------------------"
for task in simu real; do
rdir=$dir/decode_tgpr_5k_dt05_${task}_${enhan}_it$it
for a in _BUS _CAF _PED _STR; do
grep $a $rdir/scoring/test_filt.txt \
> $rdir/scoring/test_filt_$a.txt
cat $rdir/scoring/$lmw.tra \
| utils/int2sym.pl -f 2- $graph_dir/words.txt \
| sed s:\<UNK\>::g \
| compute-wer --text --mode=present ark:$rdir/scoring/test_filt_$a.txt ark,p:- \
1> $rdir/${a}_wer_$lmw 2> /dev/null
done
echo -n "$task WER: `grep WER $rdir/wer_$lmw | cut -f 2 -d" "`% (Average), "
echo -n "`grep WER $rdir/_BUS_wer_$lmw | cut -f 2 -d" "`% (BUS), "
echo -n "`grep WER $rdir/_CAF_wer_$lmw | cut -f 2 -d" "`% (CAFE), "
echo -n "`grep WER $rdir/_PED_wer_$lmw | cut -f 2 -d" "`% (PEDESTRIAN), "
echo -n "`grep WER $rdir/_STR_wer_$lmw | cut -f 2 -d" "`% (STREET)"
echo ""
echo "-------------------"
done
# for spreadsheet cut&paste
for task in simu real; do
rdir=$dir/decode_tgpr_5k_dt05_${task}_${enhan}_it$it
grep WER $rdir/_BUS_wer_$lmw | cut -f 2 -d" "
grep WER $rdir/_CAF_wer_$lmw | cut -f 2 -d" "
grep WER $rdir/_PED_wer_$lmw | cut -f 2 -d" "
grep WER $rdir/_STR_wer_$lmw | cut -f 2 -d" "
grep WER $rdir/wer_$lmw | cut -f 2 -d" "
done
cut -f 2 -d" " $dir/log/best_wer_$enhan
echo $lmw
#!/bin/bash
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si84, etc.
# Modified from the script for CHiME3 baseline
# Shinji Watanabe 02/13/2015
. ./path.sh || exit 1;
echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir
for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
done
# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.
echo Preparing language models for test
for lm_suffix in tgpr_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at beginning/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done
echo "Succeeded in formatting data."
rm -r $tmpdir
#!/bin/bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME3 baseline
# Shinji Watanabe 02/13/2015
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <original WSJ0 corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level WSJ corpus directory."
echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
wsj0=$1
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# This version for SI-84
cat $wsj0/wsj0/doc/indices/train/tr_s_wv1.ndx \
| $local/cstr_ndx2flist.pl $wsj0 | sort -u > tr05_orig_clean.flist
# Now for the test sets.
# $wsj0/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.
# Nov'92 (330 utts, 5k vocab)
cat $wsj0/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
$local/cstr_ndx2flist.pl $wsj0 | sort > et05_orig_clean.flist
# Note: the case-insensitive grep below matches both lower-case and upper-case
# file names (.wv1 or .WV1). Sometimes this data gets copied from the CDs with
# upcasing; we don't know why (it could be older versions of the disks).
find $wsj0/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dt05_orig_clean.flist
# Finding the transcript files:
find -L $wsj0 -iname '*.dot' > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
# adding suffix to utt_id
# 0 for clean condition
for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp
cat ${x}_sph_tmp.scp | awk '{print $1}' \
| $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
cat ${x}_sph_tmp.scp | awk '{printf("%s %s\n", $1, $2);}' > ${x}_sph.scp
cat ${x}_tmp.trans1 | awk '{printf("%s ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# Create the wav.scp files. (The .wv1 files in the distribution are not really wav; they are sph, so we pipe them through sph2pipe.)
for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \
> ${x}_wav.scp
done
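# Each ${x}_wav.scp entry now pipes the corresponding .wv1 file through
# sph2pipe at feature-extraction time; an illustrative line (hypothetical paths)
# looks like:
#   011c0201 /path/to/sph2pipe_v2.5/sph2pipe -f wav /corpus/wsj0/si_tr_s/011/011c0201.wv1 |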
# Make the utt2spk and spk2utt files.
for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do
cat ${x}_sph.scp | awk '{print $1}' \
| perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
# In case we want to limit the LMs to the most frequent words, copy the LM training word-frequency list.
cp $wsj0/wsj0/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 5k-vocabulary trigram language model without verbalized pronunciations
# (closed vocabulary only). This is used for the 3rd CHiME challenge.
cp $wsj0/wsj0/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+rw $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa
prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
rm -f wsj0-train-spkrinfo.txt
wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
|| ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi
if [ ! -f wsj0-train-spkrinfo.txt ]; then
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
echo "This is possibly omitted from the training disks; couldn't find it."
echo "Everything else may have worked; we just may be missing gender info"
echo "which is only needed for VTLN-related diagnostics anyway."
exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.
cat $wsj0/wsj0/doc/spkrinfo.txt \
./wsj0-train-spkrinfo.txt | \
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
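# spk2gender now maps each 3-character speaker code to its gender, one
# "<speaker-id> <m|f>" pair per line (e.g., a hypothetical "011 f").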
echo "Data preparation succeeded"
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1
# and as a command-line argument it takes the name of the WSJ disk location, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.
$wsj_dir = $ARGV[0];
while(<STDIN>){
if(m/^;/){ next; } # Comment. Ignore it.
else {
m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
$filename = $2; # as a subdirectory of the distributed disk.
if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }