Commit af15c31b authored by Dan Povey's avatar Dan Povey
Browse files

Getting closer to finishing Swbd setup. Not yet finished.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@615 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 37da9435
#!/bin/bash
#
# To be run from one directory above this script.
# The input is two directory names (possibly the same) containing the
# 2000 Hub5 english evaluation test set and transcripts, which are
# respectively:
# LDC2002S09 LDC2002T43
# e.g. see
#http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2002S09
#http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002T43
#
# Example usage:
# local/eval2000_data_prep.sh /mnt/matylda2/data/HUB5_2000/ /mnt/matylda2/data/HUB5_2000/
# If you just copied the CDs directly, both directories might end with "hub5e_00".
# [note: I'm not sure about this though, I didn't see the original CD's].
# The first directory ($sdir) contains the speech data, and the directory
# $sdir/english/
# should exist.
# The second directory ($tdir) contains the transcripts, and the directory
# $tdir/2000_hub5_eng_eval_tr
# should exist; in particular we need the file
# $tdir/2000_hub5_eng_eval_tr/reference/hub5e00.english.000405.stm
# [just change this script if you don't have this type of structure in
# the way you unpacked it].
# Require exactly two arguments: the speech dir (LDC2002S09) and the
# transcription dir (LDC2002T43); they may be the same directory.
if [ $# -ne 2 ]; then
  echo "Usage: local/eval2000_data_prep.sh <speech-dir> <transcription-dir>"
  echo "e.g. local/eval2000_data_prep.sh /mnt/matylda2/data/HUB5_2000/ /mnt/matylda2/data/HUB5_2000/"
  echo "See comments in the script for more details"
  exit 1
fi

sdir=$1  # speech data; must contain english/
tdir=$2  # transcripts; must contain 2000_hub5_eng_eval_tr/

# Quote the paths in the tests: unquoted they would word-split on spaces.
[ ! -d "$sdir/english" ] && \
  echo "Expecting directory $sdir/english to be present" && exit 1
[ ! -d "$tdir/2000_hub5_eng_eval_tr" ] && \
  echo "Expecting directory $tdir/2000_hub5_eng_eval_tr to be present" && exit 1

dir=data/local/eval2000
mkdir -p "$dir"
# List the .sph files, then derive "<utt-name> <path>" pairs from them.
for x in "$sdir"/english/*.sph; do echo "$x"; done > "$dir/sph.flist"

awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' \
  "$dir/sph.flist" > "$dir/sph_sides.scp"

# Locate sph2pipe relative to the recipe directory; fail early if missing.
# $(...) instead of backticks; quote the -v value so a path with spaces
# does not break the awk invocation.
sph2pipe=$(cd ../../..; echo "$PWD/tools/sph2pipe_v2.5/sph2pipe")
[ ! -f "$sph2pipe" ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1

# One wav.scp entry per conversation side: channel 1 = side A, channel 2 = side B.
awk -v sph2pipe="$sph2pipe" \
  '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
    printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' \
  "$dir/sph_sides.scp" | sort > "$dir/wav_sides.scp"
#cat /mnt/matylda2/data/HUB5_2000/2000_hub5_eng_eval_tr/reference/english/*.txt | \
# awk '/<contraction/{next;} /</{print;}'| head

# Get segments file...
# segments file format is: utt-id side-id start-time end-time, e.g.:
# sw02001-A_000098-001156 sw02001-A 0.98 11.56
pem=$sdir/english/hub5e_00.pem
[ ! -f "$pem" ] && echo "No such file $pem" && exit 1

# pem file has lines like:
# en_4156 A unknown_speaker 301.85 302.48
# Lines starting with ';;' are comments.  Start/end times are scaled to
# centisecond frame indices to form fixed-width utterance ids.
grep -v ';;' "$pem" \
  | awk '{spk=$1"-"$2; utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); print utt,spk,$4,$5;}' \
  | sort > "$dir/segments"
# stm file has lines like:
# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
stm=$tdir/2000_hub5_eng_eval_tr/reference/hub5e00.english.000405.stm
grep -v ';;' "$stm" \
  | awk '{spk=$1"-"$2; utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); printf("%s", utt);
          for(n=7;n<=NF;n++) printf " " $n; print ""; }' \
  | sort > "$dir/text.all"

# We'll use the stm file for sclite scoring. There seem to be various errors
# in the stm file that upset hubscr.pl, and we fix them here.
# BUG FIX: the original pipeline ended in a bare "|" before the redirection,
# which wrote an EMPTY stm file; and the <B_ASIDE>/<E_ASIDE> sed expressions
# were unquoted, so the shell treated "<B_ASIDE" and ">::g" as redirections.
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \
  "$stm" > "$dir/stm"

cp "$tdir/2000_hub5_eng_eval_tr/reference/en20000405_hub5.glm" "$dir/glm"

# Just checking that the segments are the same in pem vs. stm
# (process substitution feeds the two id columns to cmp).
! cmp <(awk '{print $1}' "$dir/text.all") <(awk '{print $1}' "$dir/segments") && \
  echo "Segments from pem file and stm file do not match." && exit 1

grep -v IGNORE_TIME_SEGMENT_ "$dir/text.all" > "$dir/text"
# Create an utt2spk file that assumes each conversation side is
# a separate speaker (field 2 of the segments file is the side id).
awk '{print $1,$2;}' "$dir/segments" > "$dir/utt2spk"
scripts/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt"

# Copy the prepared files into their final location.
dest=data/eval2000
mkdir -p "$dest"
for x in wav_sides.scp segments text utt2spk spk2utt stm glm; do
  cp "$dir/$x" "$dest/$x"
done

echo "Data preparation and formatting completed for Eval 2000"
echo "(but not MFCC extraction)"
#!/bin/awk -f
# Turn an utterance id like sw02001-A_000098-001156 (field 1 of each input
# line) into a segments line:
#   <utt-id> <recording>-<side> <start-seconds> <end-seconds>
# Frame indices (centiseconds) are divided by 100 to get seconds.
{
  utt = $1
  split(utt, parts, "[_-]")
  rec = parts[1]
  ch = parts[2]
  t0 = parts[3]
  t1 = parts[4]
  print utt " " rec "-" ch " " t0 / 100 " " t1 / 100
}
\ No newline at end of file
......@@ -14,7 +14,7 @@
#check existing directories
if [ $# != 1 ]; then
echo "Usage: ./run.sh /path/to/SWBD"
echo "Usage: swbd_p1_data_prep.sh /path/to/SWBD"
exit 1;
fi
......@@ -89,14 +89,9 @@ cat transcripts1.txt | $DIR/local/oov2unk.pl lexicon1.txt " " \
# Now modify both the lexicon and transcripts to
# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
$DIR/local/dct2phones.awk lexicon1.txt | sort | \
perl -ane 's:\r::; print;' | \
awk 'BEGIN{print "<eps> 0"; print "SIL 1"; print "SPN 2"; print "NSN 3"; print "LAU 4"; N=5; }
......@@ -116,7 +111,8 @@ cat lexicon1.txt \
> lexicon2.txt
# Add to the lexicon the silences, noises etc.
(echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU' ) | \
(echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';
echo '<UNK> SPN' ) | \
cat - lexicon2.txt > lexicon3.txt
......@@ -184,26 +180,25 @@ cat lexicon.txt | awk '{print $1}' | sort | uniq | \
awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} END{printf("#0 %d\n", NR+1);} ' \
> words.txt
## (1b) Continue trans preparation
## Convert real OOVs to <SPOKEN_NOISE>
# (1c) Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56
# I) list of all segments
$DIR/local/make_segments.awk train.txt > segments
awk '{ segment=$1; split(segment,S,"[_-]"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100}' <train.txt > segments
awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' train_sph.flist > train_sph.scp
awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' train_sph.flist > train_sph.scp
sph2pipe=`cd ../../../../..; echo $PWD/tools/sph2pipe_v2.5/sph2pipe`
if [ ! -f $sph2pipe ]; then
echo "Could not find the sph2pipe program at $sph2pipe";
exit 1;
fi
cat train_sph.scp | awk '{printf("%s-A '$sph2pipe' -f wav -p -c 1 %s |\n", $1, $2); printf("%s-B '$sph2pipe' -f wav -p -c 2 %s |\n", $1, $2);}' | \
sort > train_wav.scp #side A - channel 1, side B - channel 2
[ ! -f $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;
cat train_sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \
sort > train_wav.scp #side A - channel 1, side B - channel 2
cat segments | awk '{spk=substr($1,4,6); print $1 " " spk}' > train.utt2spk
cat train.utt2spk | sort -k 2 | $DIR/scripts/utt2spk_to_spk2utt.pl > train.spk2utt
echo Switchboard phase 1 data preparation succeeded.
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
#data_list="train test"
data_list="train"
for x in lang lang_test $data_list; do
silprob=0.5
for x in lang lang_test train; do
mkdir -p data/$x
done
arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
# Copy stuff into its final location:
for x in $data_list; do
for x in train; do
cp data/local/$x.spk2utt data/$x/spk2utt || exit 1;
cp data/local/$x.utt2spk data/$x/utt2spk || exit 1;
# Don't call it wav.scp because that's reserved for the wav file
......@@ -26,7 +25,6 @@ for x in $data_list; do
done
cp data/local/words.txt data/lang/words.txt
cp data/local/phones.txt data/lang/phones.txt
......@@ -69,65 +67,96 @@ cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \
ndisambig=`scripts/add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt`
ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST.
scripts/add_disambig.pl data/lang/phones.txt $ndisambig > data/lang_test/phones_disambig.txt
scripts/add_disambig.pl --include-zero data/lang/phones.txt $ndisambig > data/lang_test/phones_disambig.txt
silprob=0.5 # same prob as word
scripts/make_lexicon_fst.pl data/local/lexicon.txt $silprob SIL | \
fstcompile --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > data/lang/L.fst
echo "This script is not finished!"
exit 1;
for x in topo L.fst words.txt phones.txt silphones.csl nonsilphones.csl; do
cp data/lang/$x data/lang_test
done
# test lexicon - not ready yet !!!
#scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt $silprob SIL '#'$ndisambig | \
# fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang/words.txt \
# --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel \
# > data/lang_test/L_disambig.fst
# G is not ready yet !!!
fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang/words.txt --keep_isymbols=false \
--keep_osymbols=false data/local/G.txt > data/lang_test/G.fst
# Create L_align.fst, which is as L.fst but with alignment symbols (#1 and #2 at the
# beginning and end of words, on the input side)... needed to discover the
# word boundaries in alignments, when we need to create ctm-format output.
# Checking that G is stochastic [note, it wouldn't be for an Arpa]
fstisstochastic data/lang_test/G.fst || echo Error: G is not stochastic
cat data/local/lexicon.txt | \
awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' | \
scripts/make_lexicon_fst.pl - $silprob SIL | \
fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > data/lang_test/L_align.fst
# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.
# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
# Make lexicon with disambiguation symbols. We need to
# add self-loops to "pass through" the #0 symbol from the
# backoff language model.
phone_disambig_symbol=`grep \#0 data/lang_test/phones_disambig.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 data/lang_test/words.txt | awk '{print $2}'`
# Checking that disambiguated lexicon times G is determinizable
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
fstdeterminize >/dev/null || echo Error
scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt $silprob SIL '#'$ndisambig | \
fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \
fstarcsort --sort_type=olabel \
> data/lang_test/L_disambig.fst
# Checking that LG is stochastic:
fsttablecompose data/lang/L.fst data/lang_test/G.fst | \
fstisstochastic || echo Error: LG is not stochastic.
# Copy into data/lang/ also, where it will be needed for discriminative training.
cp data/lang_test/L_disambig.fst data/lang/
# Checking that L_disambig.G is stochastic:
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
fstisstochastic || echo Error: LG is not stochastic.
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
scripts/remove_oovs.pl /dev/null | \
scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
echo "First few lines of lexicon FST:"
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head
echo Performing further checks
silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'`
nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'`
cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \
sed "s:SILENCEPHONES:$silphonelist:" > data/lang/topo
# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.
# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
# Checking that disambiguated lexicon times G is determinizable
# Note: we do this with fstdeterminizestar not fstdeterminize, as
# fstdeterminize was taking forever (presumably relates to a bug
# in this version of OpenFst that makes determinization slow for
# some case).
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
fstdeterminizestar >/dev/null || echo Error
# Checking that LG is stochastic:
fsttablecompose data/lang/L.fst data/lang_test/G.fst | \
fstisstochastic || echo Error: LG is not stochastic.
for x in phones.txt words.txt silphones.csl nonsilphones.csl topo; do
cp data/lang/$x data/lang_test/$x || exit 1;
done
echo swbd_p1_format_data succeeded.
......@@ -27,12 +27,22 @@ local/swbd_p1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2
local/swbd_p1_format_data.sh
# Data preparation and formatting for eval2000 (note: the "text" file
# is not very much preprocessed; for actual WER reporting we'll use
# sclite.
local/eval2000_data_prep.sh /mnt/matylda2/data/HUB5_2000/ /mnt/matylda2/data/HUB5_2000/
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
#mfccdir=/mnt/matylda6/ijanda/kaldi_swbd_mfcc
mfccdir=/mnt/matylda6/jhu09/qpovey/kaldi_swbd_mfcc
cmd="queue.pl -q all.q@@blade" # remove the option if no queue.
local/make_mfcc_segs.sh --num-jobs 10 --cmd "$cmd" data/train exp/make_mfcc/train $mfccdir
# after this, the next command will remove the small number of utterances
# that couldn't be extracted for some reason (e.g. too short; no such file).
scripts/fix_data_dir.sh data/train
local/make_mfcc_segs.sh --num-jobs 4 data/eval2000 exp/make_mfcc/eval2000 $mfccdir
# Now-- there are 264k utterances, and we want to start the monophone training
# on relatively short utterances (easier to align), but not only the very shortest
......@@ -93,6 +103,10 @@ steps/align_lda_mllt.sh --num-jobs 30 --cmd "$train_cmd" \
steps/train_lda_mllt_sat.sh --num-jobs 30 --cmd "$train_cmd" \
4000 20000 data/train_100k_nodup data/lang exp/tri3a_ali exp/tri4a
scripts/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
scripts/decode.sh --num-jobs 10 --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri4a/graph \
data/eval2000 exp/tri4a/decode_eval2000
steps/align_lda_mllt_sat.sh --num-jobs 30 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri4a exp/tri4a_ali_all_nodup
......@@ -101,3 +115,22 @@ steps/train_lda_mllt_sat.sh --num-jobs 30 --cmd "$train_cmd" \
4000 150000 data/train_nodup data/lang exp/tri4a_ali_all_nodup exp/tri5a
scripts/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
scripts/decode.sh --num-jobs 10 --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri5a/graph \
data/eval2000 exp/tri5a/decode_eval2000
# Align the 5a system; we'll train an SGMM system on top of
# LDA+MLLT+SAT, and use 5a system for 1st pass.
steps/align_lda_mllt_sat.sh --num-jobs 30 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri5a exp/tri5a_ali_all_nodup
steps/train_ubm_lda_etc.sh --num-jobs 30 --cmd "$train_cmd" \
700 data/train_nodup data/lang exp/tri5a_ali_all_nodup exp/ubm6a
steps/train_sgmm_lda_etc.sh --num-jobs 30 --cmd "$train_cmd" \
4500 40000 41 40 data/train_nodup data/lang exp/tri5a_ali_all_nodup exp/ubm6a/final.ubm exp/sgmm6a
scripts/mkgraph.sh data/lang_test_tgpr exp/sgmm6a exp/sgmm6a/graph_tgpr
# have to match num-jobs with 5a decode.
scripts/decode.sh --num-jobs 10 --cmd "$decode_cmd" steps/decode_sgmm_lda_etc.sh \
exp/sgmm6a/graph_tgpr data/eval2000 exp/sgmm6a/decode_eval2000 exp/tri5a/decode_eval2000
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment