Commit 3f30bd33 authored by Dan Povey

Added TIDIGITS recipe.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1050 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent bb2c64cc
@@ -21,6 +21,11 @@ Note: the easiest examples to work with are rm/s3 and wsj/s3.
get the same data using combinations of other catalog numbers, but this
is the one we used).
tidigits: The TI Digits database, available from the LDC (catalog number LDC93S10).
This is one of the oldest speech databases; it consists of a bunch of speakers
saying digit strings. It's not considered a "real" task any more, but can be useful
for demos, tutorials, and the like.
yesno: This is a simple recipe with some data consisting of a single person
saying the words "yes" and "no", that can be downloaded from the Kaldi website.
It's a very easy task, but useful for checking that the scripts run, or if
The TIDIGITS database consists of men, women, boys and girls reading
digit strings of varying lengths; these are sampled at 20 kHz.
It's available from the LDC as catalog number LDC93S10.
The subdirectory s5 consists of "s5-style" (i.e. new, at the current
time of writing) scripts for training and testing. Note: unlike the
other s5 scripts, we don't include word-boundary information, since it
wouldn't add anything useful.
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of CPUs on your machine).
#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
export train_cmd=run.pl
#export decode_cmd=run.pl
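# Illustrative usage (a sketch of how the steps/ scripts consume these
# variables): a command like
#   $train_cmd JOB=1:4 exp/mono0a/log/acc.JOB.log gmm-acc-stats-ali ...
# runs 4 copies of the job, locally (run.pl) or via qsub (queue.pl), with
# JOB expanded to 1..4 in both the command and the log-file name.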
--use-energy=false   # use cepstral coefficient C0 rather than energy
--sample-frequency=20000  # TIDIGITS is sampled at 20 kHz (Kaldi's default is 16 kHz)
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
[ -f ./path.sh ] && . ./path.sh
cmd=run.pl
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
[ $# -ne 3 ] && \
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
data=$1
lang_or_graph=$2
dir=$3
symtab=$lang_or_graph/words.txt
for f in $symtab $dir/lat.1.gz $data/text; do
[ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
done
mkdir -p $dir/scoring/log
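# The LMWT=9:19 syntax below makes $cmd run the job once for each
# language-model weight from 9 to 19, substituting the value for the
# string LMWT in both the command and the log-file name.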
$cmd LMWT=9:19 $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
# Note the double level of quoting for the sed command below.
$cmd LMWT=9:19 $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
ark:$data/text ark,p:- ">&" $dir/wer_LMWT || exit 1;
exit 0;
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
. ./path.sh # Needed for KALDI_ROOT
if [ $# -ne 1 ]; then
echo "Argument should be the TIDIGITS directory, see ../run.sh for example."
exit 1;
fi
tidigits=$1
tmpdir=`pwd`/data/local/data
mkdir -p $tmpdir
# Note: the .wav files are not in .wav format but "sphere" format (this was
# produced in the days before Windows).
find $tidigits/tidigits/train -name '*.wav' > $tmpdir/train.flist
n=`cat $tmpdir/train.flist | wc -l`
[ $n -eq 8623 ] || echo "Unexpected number of training files: $n (expected 8623)"
find $tidigits/tidigits/test -name '*.wav' > $tmpdir/test.flist
n=`cat $tmpdir/test.flist | wc -l`
[ $n -eq 8700 ] || echo "Unexpected number of test files: $n (expected 8700)"
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
for x in train test; do
# get scp file that has utterance-ids and maps to the sphere file.
cat $tmpdir/$x.flist | perl -ane 'm|/(..)/([1-9zo]+[ab])\.wav| || die "bad line $_"; print "$1_$2 $_"; ' \
| sort > $tmpdir/${x}_sph.scp
# turn it into one that has a valid .wav format in the modern sense (i.e. RIFF format, not sphere).
# This file goes into its final location
mkdir -p data/$x
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < $tmpdir/${x}_sph.scp > data/$x/wav.scp
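# An illustrative wav.scp line (paths are hypothetical):
#   ah_111a /path/to/kaldi/tools/sph2pipe_v2.5/sph2pipe -f wav /path/to/tidigits/train/man/ah/111a.wav |
# the trailing "|" tells Kaldi to read the output of the command rather
# than a regular file.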
# Now get the "text" file that says what the transcription is.
perl -ane 'm/^(.._([1-9zo]+)[ab]) / || die; $text = join(" ", split("", $2)); print "$1 $text\n";' \
  <data/$x/wav.scp >data/$x/text
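# e.g. the (hypothetical) utterance ah_111a would get the line "ah_111a 1 1 1":
# one token per digit of the string embedded in the utterance-id.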
# now get the "utt2spk" file that says, for each utterance, the speaker name.
perl -ane 'm/^((..)_\S+) / || die; print "$1 $2\n"; ' \
<data/$x/wav.scp >data/$x/utt2spk
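# e.g. "ah_111a ah"-- the speaker id is just the first two characters
# of the utterance id.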
# create the file that maps from speaker to utterance-list.
utils/utt2spk_to_spk2utt.pl <data/$x/utt2spk >data/$x/spk2utt
done
echo "Data preparation succeeded"
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This script prepares the lang/ directory.
#
. ./path.sh
# Decided to do this using something like a real lexicon, although we
# could also have used whole-word models.
tmpdir=data/local/dict
lang=data/lang
mkdir -p $tmpdir
cat >$tmpdir/lexicon.txt <<EOF
z z iy r ow
o ow
1 w ah n
2 t uw
3 th r iy
4 f ao r
5 f ay v
6 s ih k s
7 s eh v ah n
8 ey t
9 n ay n
EOF
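# Note: 'z' and 'o' are the two spoken forms of zero ("zero" and "oh"),
# so the vocabulary has eleven words in total.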
# and note, we'll have a silence phone, but it won't appear
# in this form of lexicon as there's no silence word; it's an option
# in the lexicon FST that gets added by the script.
mkdir -p $lang/phones
# symbol-table for words:
cat $tmpdir/lexicon.txt | awk '{print $1}' | awk 'BEGIN {print "<eps> 0"; n=1;} { printf("%s %s\n", $1, n++); }' \
>$lang/words.txt
# list of phones.
cat $tmpdir/lexicon.txt | awk '{for(n=2;n<=NF;n++) seen[$n]=1; } END{print "sil"; for (w in seen) { print w; }}' \
>$tmpdir/phone.list
# symbol-table for phones:
cat $tmpdir/phone.list | awk 'BEGIN {print "<eps> 0"; n=1;} { printf("%s %s\n", $1, n++); }' \
>$lang/phones.txt
p=$lang/phones
echo sil > $p/silence.txt
echo sil > $p/context_indep.txt
echo sil > $p/optional_silence.txt
grep -v -w sil $tmpdir/phone.list > $p/nonsilence.txt
touch $p/disambig.txt # disambiguation-symbols list, will be empty.
touch $p/extra_questions.txt # list of "extra questions"-- empty; we don't
# have things like tone or word-positions or stress markings.
cat $tmpdir/phone.list > $p/sets.txt # list of "phone sets"-- each phone is in its
# own set. Normally, each line would have a bunch of word-position-dependent or
# stress-dependent realizations of the same phone.
for t in silence nonsilence context_indep optional_silence disambig; do
utils/sym2int.pl $lang/phones.txt <$p/$t.txt >$p/$t.int
cat $p/$t.int | awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $p/$t.csl
done
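# The .int files have one integer phone-id per line; the .csl files hold the
# same ids as a single colon-separated list (e.g. silence.csl would contain
# just "1" if sil got symbol 1), which is what some command-line options expect.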
for t in extra_questions sets; do
utils/sym2int.pl $lang/phones.txt <$p/$t.txt >$p/$t.int
done
cat $tmpdir/phone.list | awk '{printf("shared split %s\n", $1);}' >$p/roots.txt
utils/sym2int.pl -f 3- $lang/phones.txt $p/roots.txt >$p/roots.int
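# "shared split" means the HMM states of each phone share a single
# decision-tree root, which the tree-building process is then allowed to split.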
echo z > $lang/oov.txt # we map OOVs to this; there are no OOVs in this setup,
# but the scripts expect this file to exist.
utils/sym2int.pl $lang/words.txt <$lang/oov.txt >$lang/oov.int
# Note: "word_boundary.{txt,int}" will not exist in this setup. This will mean it's
# not very easy to get word alignments, but it simplifies some things.
# Make the FST form of the lexicon (this includes optional silence).
utils/make_lexicon_fst.pl $tmpdir/lexicon.txt 0.5 sil | \
fstcompile --isymbols=$lang/phones.txt --osymbols=$lang/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > $lang/L.fst
# Note: in this setup there are no "disambiguation symbols" because the lexicon
# contains no homophones; and there is no '#0' symbol in the LM because it's
# not a backoff LM, so L_disambig.fst is the same as L.fst.
cp $lang/L.fst $lang/L_disambig.fst
silphonelist=`cat $lang/phones/silence.csl | sed 's/:/ /g'`
nonsilphonelist=`cat $lang/phones/nonsilence.csl | sed 's/:/ /g'`
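# conf/topo.proto is a template: the placeholders NONSILENCEPHONES and
# SILENCEPHONES are replaced by the integer phone-ids extracted above, giving
# each non-silence phone a 3-state left-to-right HMM and silence a 5-state
# model with extra transitions.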
cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \
sed "s:SILENCEPHONES:$silphonelist:" > $lang/topo
# Now we prepare a simple grammar G.fst that's a kind of loop of
# digits (no silence in this, since that's handled in L.fst).
# There are 12 options: the digits 1-9, zero, oh, and end-of-sentence.
penalty=`perl -e '$prob = 1.0/12; print -log($prob); '` # negated log-prob,
# which becomes the cost on the FST.
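# Sanity check: -log(1/12) is about 2.48, so each arc in G.fst below
# carries this cost.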
( for x in z o 1 2 3 4 5 6 7 8 9; do
echo 0 0 $x $x $penalty # format is: from-state to-state input-symbol output-symbol cost
done
echo 0 $penalty # format is: state final-cost
) | fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \
--keep_isymbols=false --keep_osymbols=false >$lang/G.fst
exit 0;
#!/bin/bash
# Note: this TIDIGITS setup has not been tuned at all and has some obvious
# deficiencies; this has been created as a starting point for a tutorial.
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.
tidigits=/export/corpora5/LDC/LDC93S10
#tidigits=/mnt/matylda2/data/TIDIGITS
# The following command prepares the data/{train,dev,test} directories.
local/tidigits_data_prep.sh $tidigits || exit 1;
local/tidigits_prepare_lang.sh || exit 1;
utils/validate_lang.pl data/lang/ # Note: this actually does report errors,
# and exits with status 1, but we've checked them and seen that they
# don't matter (this setup doesn't have any disambiguation symbols,
# and the script doesn't like that).
# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
mfccdir=mfcc
for x in test train; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 \
data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done
utils/subset_data_dir.sh data/train 1000 data/train_1k
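# data/train_1k is a 1000-utterance subset of the training data; the
# monophone system below is bootstrapped on this smaller set for speed.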
# Try adding --boost-silence 1.25 to some of the scripts below (also 1.5, if
# that helps); the effect may not be clear till we test the triphone system.
# See the wsj setup for examples (../../wsj/s5/run.sh).
steps/train_mono.sh --nj 4 --cmd "$train_cmd" \
data/train_1k data/lang exp/mono0a
utils/mkgraph.sh --mono data/lang exp/mono0a exp/mono0a/graph && \
steps/decode.sh --nj 10 --cmd "$decode_cmd" \
exp/mono0a/graph data/test exp/mono0a/decode
steps/align_si.sh --nj 4 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali
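# Train a delta-feature triphone system on the full training set; the numeric
# arguments are the number of decision-tree leaves (300) and the total number
# of Gaussians (3000).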
steps/train_deltas.sh --cmd "$train_cmd" \
300 3000 data/train data/lang exp/mono0a_ali exp/tri1
utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph
steps/decode.sh --nj 10 --cmd "$decode_cmd" \
exp/tri1/graph data/test exp/tri1/decode
# Example of looking at the output.
# utils/int2sym.pl -f 2- data/lang/words.txt exp/tri1/decode/scoring/19.tra | sed "s/ $//" | sort | diff - data/test/text
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep SER $x/wer_* | utils/best_wer.sh; done
#exp/mono0a/decode/wer_17:%SER 3.67 [ 319 / 8700 ]
#exp/tri1/decode/wer_19:%SER 2.64 [ 230 / 8700 ]
../../wsj/s5/steps
\ No newline at end of file
../../wsj/s5/utils
\ No newline at end of file
@@ -18,7 +18,7 @@
# To be run from one directory above this script.
perl -e 'while(<>){
-if (m/WER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool.
+if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool.
elsif (m: (Mean|Sum/Avg|)\s+\|\s+\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|:
&& (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } } # sclite.
if (defined $bestline){ print $bestline; } '
@@ -18,7 +18,7 @@
# makes lexicon FST (no pron-probs involved).
if(@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] lexiconfst.txt"
die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt"
}
$lexfn = shift @ARGV;
@@ -48,7 +48,7 @@ loopscale=0.1
required="$lang/L.fst $lang/G.fst $lang/phones.txt $lang/words.txt $lang/phones/silence.csl $lang/phones/disambig.int $model $tree"
for f in $required; do
-[ ! -s $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1;
+[ ! -f $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1;
done
mkdir -p $lang/tmp
@@ -103,7 +103,8 @@ fi
cp $lang/words.txt $dir/ || exit 1;
mkdir -p $dir/phones
-cp $lang/phones/word_boundary.int $dir/phones/ # might be needed for ctm scoring.
+cp $lang/phones/word_boundary.int $dir/phones/ 2>/dev/null # might be needed for ctm scoring,
+# but ignore the error if it's not there.
cp $lang/phones/silence.csl $dir/phones/ || exit 1;
# to make const fst:
@@ -9,7 +9,6 @@ if(@ARGV != 1) {
}
$lang = shift @ARGV;
$exit = 0;
# Checking phones.txt -------------------------------
print "Checking $lang/phones.txt ...\n";
@@ -59,7 +58,7 @@ foreach(keys %wsymtab) {
if(exists $wsymtab{"#0"}) {
print "--> $lang/words.txt has \"#0\"\n";
print "--> $lang/words.txt is OK\n";
-} else {print "--> ERROR: $lang/words.txt doesn't have \"#0\"\n"; exit 1;}
+} else {print "--> ERROR: $lang/words.txt doesn't have \"#0\"\n"; $exit = 1;}
print "\n";
# Checking phones/* -------------------------------
@@ -103,7 +103,7 @@ fi
cd irstlm
# Applying patch to get -write option of interpolate-lm
# May not work with anything else than revision 398
-patch -N -p0 < ../interpolatedwrite-5.60.02.patch || exit 1;
+patch -N -p0 < ../interpolatedwrite-5.60.02.patch # || exit 1;
# Just using the default aclocal, automake.
# You may have to mess with the version by editing