Commit b208abd4 authored by Dan Povey's avatar Dan Povey
Browse files

Minor changes to WSJ scripts, incidental to Switchboard work (minor bug fixes,...

Minor changes to WSJ scripts, incidental to Switchboard work (minor bug fixes, etc.; scripts in shared directory with Swbd that are not called by WSJ)

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@611 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 14958153
......@@ -14,7 +14,7 @@ srcdir=data/local/dict_larger
mkdir -p $dir
export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
( # First make sure the kaldi_lm toolkit is installed.
cd ../../../tools
cd ../../../tools || exit 1;
if [ -d kaldi_lm ]; then
echo Not installing the kaldi_lm toolkit since it is already there.
else
......
......@@ -18,6 +18,8 @@
# To be run from one directory above this script.
perl -e 'while(<>){
if (m/WER (\S+)/ && !defined $bestwer || $bestwer > $1){ $bestwer = $1; $bestline=$_; }}
# Remember the input line with the lowest error rate seen so far.
# Each pattern has exactly one capture group, so the value is always in $1.
if (m/WER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool.
elsif (m/ Mean\s+\|\s+\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|/
&& (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } } # sclite.
if (defined $bestline){ print $bestline; } '
......@@ -19,6 +19,7 @@ orig_args="$*"
# will set nj to #spkrs (if using queue) or 4 (if not), if
# not set by the user.
nj=
lang=
cmd=scripts/run.pl
for x in 1 2; do
if [ $1 == "--num-jobs" ]; then
......@@ -32,11 +33,20 @@ for x in 1 2; do
shift
[ -z "$cmd" ] && echo "Empty argument to --cmd option" && exit 1;
fi
# Optional "-l <lang-dir>": the lang directory, only used for sclite scoring.
if [ "$1" == "-l" ]; then
  shift
  lang=$1
  shift
  # Both paths are regular files (score_lats_ctm.sh checks them with -f),
  # so they must be tested with -f here as well, not -d.
  if [ ! -f "$lang/phones_disambig.txt" ] || [ ! -f "$lang/L_align.fst" ]; then
    echo "Invalid argument to -l option; expected $lang/phones_disambig.txt and $lang/L_align.fst to exist."
    exit 1;
  fi
fi
done
if [ $# -lt 4 ]; then
echo "Usage: scripts/decode.sh [--cmd scripts/queue.sh opts..] [--num-jobs n] <decode_script> <graph-dir> <data-dir> <decode-dir> [extra-args...]"
echo "Usage: scripts/decode.sh [-l lang-dir] [--cmd scripts/queue.sh opts..] [--num-jobs n] <decode_script> <graph-dir> <data-dir> <decode-dir> [extra-args...]"
echo "note: -l option only required if you want to score with sclite (since we need L_align.fst)"
exit 1;
fi
......@@ -90,7 +100,13 @@ wait
[ -f $dir/.error ] && echo "Error in decoding script: command line was decode.sh $orig_args" && exit 1;
if ls $dir/lat.*.gz >&/dev/null; then
scripts/score_lats.sh $dir $graphdir/words.txt $data || exit 1;
if [ -n "$lang" ]; then # sclite scoring: $lang directory supplied only for this reason.
  if [ ! -f "$data/stm" ]; then
    echo "Expected $data/stm to exist (-l option only for sclite scoring)"
    exit 1;
  fi
  # score_lats_ctm.sh takes its arguments in the order
  # <decode-dir> <lang-dir> <data-dir>.
  scripts/score_lats_ctm.sh "$dir" "$lang" "$data" || exit 1;
else
  # No lang directory given: score with the word symbol table from the graph dir.
  scripts/score_lats.sh "$dir" "$graphdir/words.txt" "$data" || exit 1;
fi
elif ls $dir/*.txt >&/dev/null; then
scripts/score_text.sh $dir $data || exit 1;
else
......
#!/bin/bash
# This script makes sure that only the segments present in
# all of "feats.scp", "wav.scp" [if present], segments [if present],
# text, and utt2spk are present in any of them.
# It puts the original contents of data-dir into
# data-dir/.backup
#
# Usage: fix_data_dir.sh data-dir
# Must be run from a directory containing scripts/filter_scp.pl and
# scripts/utt2spk_to_spk2utt.pl (they are invoked by relative path).

if [ $# != 1 ]; then
  echo "Usage: fix_data_dir.sh data-dir"
  exit 1
fi

data=$1

# utt2spk provides the initial utterance list; check for it before
# "mkdir -p" gets a chance to create a mistyped data directory.
if [ ! -f "$data/utt2spk" ]; then
  echo "fix_data_dir.sh: no such file $data/utt2spk"
  exit 1
fi

# Prints the number of lines in file $1, or 0 if the file does not exist.
count_lines() {
  if [ -f "$1" ]; then
    wc -l < "$1"
  else
    echo 0
  fi
}

mkdir -p "$data/.backup"

# Start from the utterance-ids in utt2spk (first field of each line), then
# narrow the list down using each of the other files that is present.
# NOTE(review): filter_scp.pl presumably keeps the lines of its second
# argument whose first field occurs in its first argument -- confirm.
awk '{print $1}' "$data/utt2spk" > "$data/utts"
for x in feats.scp wav.scp text segments; do
  if [ -f "$data/$x" ]; then
    scripts/filter_scp.pl "$data/$x" "$data/utts" > "$data/utts.tmp"
    mv "$data/utts.tmp" "$data/utts"
  fi
done

if [ ! -s "$data/utts" ]; then
  echo "fix_data_dir.sh: no utterances remained: not doing anything."
  rm "$data/utts"
  exit 1;
fi

nutts=$(count_lines "$data/utts")
nfeats=$(count_lines "$data/feats.scp")
ntext=$(count_lines "$data/text")
if [ "$nutts" -ne "$nfeats" ] || [ "$nutts" -ne "$ntext" ]; then
  echo "fix_data_dir.sh: kept $nutts utterances, vs. $nfeats features and $ntext transcriptions."
else
  echo "fix_data_dir.sh: kept all $nutts utterances."
fi

# Move each original file into .backup and write the filtered version
# in its place.
for x in utt2spk feats.scp wav.scp text segments; do
  if [ -f "$data/$x" ]; then
    mv "$data/$x" "$data/.backup/$x"
    scripts/filter_scp.pl "$data/utts" "$data/.backup/$x" > "$data/$x"
  fi
done

# spk2utt is regenerated from the filtered utt2spk.
scripts/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

rm "$data/utts"

echo "fix_data_dir.sh: old files are kept in $data/.backup"
#!/bin/bash
# Writes one ctm file per test set into <decode-dir>/ctm, using the
# alignments (test_*.ali) and transcripts (test_*.tra) found in <decode-dir>
# and the model <src-dir>/final.mdl.
# Reads data/phones_disambig.txt, data/L_align.fst and data/words.txt
# relative to the current directory.
#
# Usage: make_ctms.sh src-dir decode-dir

if [ $# != 2 ]; then
  echo "Usage: make_ctms.sh src-dir decode-dir"
  exit 1;
fi

model=$1/final.mdl
dir=$2

if [ ! -f "$model" ]; then
  echo "No such file $model";
  exit 1;
fi

# Integer ids of the "#1" and "#2" symbols, handed to phones-to-prons as the
# word-begin and word-end markers.  "head -1" keeps only the first match,
# since the pattern "#1" also matches symbols such as "#10" or "#11".
wbegin=$(grep "#1" data/phones_disambig.txt | head -1 | awk '{print $2}')
wend=$(grep "#2" data/phones_disambig.txt | head -1 | awk '{print $2}')
[ -z "$wbegin" ] && echo "Error with word-begin symbol (bad phones_disambig.txt?)" && exit 1
[ -z "$wend" ] && echo "Error with word-end symbol (bad phones_disambig.txt?)" && exit 1

mkdir -p "$dir/ctm"

for test in mar87 oct87 feb89 oct89 feb91 sep92; do
  # The alignment is read twice: once for the phone sequence and once
  # (with --write-lengths) as the second input of prons-to-wordali.
  ali-to-phones "$model" "ark:$dir/test_${test}.ali" ark:- | \
    phones-to-prons data/L_align.fst "$wbegin" "$wend" ark:- "ark:$dir/test_${test}.tra" ark,t:- | \
    prons-to-wordali ark:- \
      "ark:ali-to-phones --write-lengths $model ark:$dir/test_${test}.ali ark:-|" ark,t:- | \
    scripts/wali_to_ctm.sh - data/words.txt > "$dir/ctm/test_${test}.ctm" || exit 1;
done
#!/bin/bash
# Scores the lattices in a decode directory with hubscr.pl (sclite scoring).
# For each inverse acoustic weight from 9 to 16 it takes the best path through
# the lattices, turns it into a ctm file relative to the original waveform
# files, and runs hubscr.pl on it against <data-dir>/stm and <data-dir>/glm.
# Output goes to <decode-dir>/score_<inv_acwt>/.
#
# Arguments: <decode-dir> <lang-dir> <data-dir>

if [ -f ./path.sh ]; then . ./path.sh; fi

if [ $# -ne 3 ]; then
  echo "Usage: scripts/score_lats_ctm.sh <decode-dir> <lang-dir> <data-dir>"
  echo "e.g.: scripts/score_lats_ctm.sh exp/tri5a/decode_eval2000 data/lang_test/ data/eval2000/"
  exit 1;
fi

dir=$1
lang=$2
data=$3
model=$dir/../final.mdl # assume model one level up from decoding dir.
symtab=$lang/words.txt  # word symbol table given to lattice-best-path.

hubscr=../../../tools/sctk-2.4.0/bin/hubscr.pl
# Add the directory containing hubscr.pl to the PATH.
export PATH=$PATH:`dirname $hubscr`

for f in "$hubscr" $data/stm $data/glm $lang/words.txt $lang/phones_disambig.txt \
  $lang/L_align.fst $model $data/segments; do
  [ ! -f $f ] && echo "score_lats_ctm.sh: expecting file $f to exist" && exit 1;
done

# Integer ids of the "#1" and "#2" symbols, used as the word-begin and
# word-end markers by phones-to-prons.
wbegin=`grep "#1" $lang/phones_disambig.txt | head -1 | awk '{print $2}'`
wend=`grep "#2" $lang/phones_disambig.txt | head -1 | awk '{print $2}'`
[ ! -n "$wbegin" ] && echo "Error with word-begin symbol (bad phones_disambig.txt?)" && exit 1
[ ! -n "$wend" ] && echo "Error with word-end symbol (bad phones_disambig.txt?)" && exit 1

# Remove any error marker left over from an earlier run.
rm $dir/.error 2>/dev/null

for group in "9 10 11" "12 13 14" "15 16"; do # do it in batches of up to 3.
  for inv_acwt in $group; do
    (
      mkdir -p $dir/score_${inv_acwt}
      acwt=`perl -e "print (1.0/$inv_acwt);"`
      # Since we'll need the word alignment, get the state-level alignment
      # as well as the word-level one, for each acwt.
      lattice-best-path --acoustic-scale=$acwt --word-symbol-table=$symtab \
        "ark:gunzip -c $dir/lat.*.gz|" "ark,t:|gzip -c >$dir/score_${inv_acwt}/tra.gz" \
        "ark,t:|gzip -c >$dir/score_${inv_acwt}/ali.gz" 2>$dir/score_${inv_acwt}/rescore.log || exit 1;

      name=`basename $data` # e.g. "eval2000"

      # Create the ctm (this pipe first creates a ctm that's relative to the
      # utterance-ids, and then makes it relative to the conversation sides).
      # NOTE(review): the first grep pattern is an ungrouped alternation, so it
      # drops any line containing "[NOISE", "LAUGHTER" or "VOCALIZED-NOISE]";
      # confirm whether "\[(NOISE|LAUGHTER|VOCALIZED-NOISE)\]" was intended.
      ( ali-to-phones $model "ark:gunzip -c $dir/score_${inv_acwt}/ali.gz|" ark:- | \
        phones-to-prons $lang/L_align.fst $wbegin $wend ark:- "ark:gunzip -c $dir/score_${inv_acwt}/tra.gz|" ark,t:- | \
        prons-to-wordali ark:- \
          "ark:ali-to-phones --write-lengths $model 'ark:gunzip -c $dir/score_${inv_acwt}/ali.gz|' ark,t:- |" ark,t:- | \
        scripts/wali_to_ctm.sh - $lang/words.txt $data/segments | grep -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \
        grep -v -E '<UNK>' ) > $dir/score_${inv_acwt}/$name.ctm 2>$dir/score_${inv_acwt}/log || exit 1;

      $hubscr -V -l english -h hub5 -g $data/glm -r $data/stm $dir/score_${inv_acwt}/${name}.ctm \
        >&$dir/score_${inv_acwt}/sclite.log || exit 1
    ) || (echo status is $? && touch $dir/.error) &
  done
  # Wait for this batch of background jobs before starting the next one.
  wait
  [ -f $dir/.error ] && echo "Error in scoring script, look into logs in $dir/score_*/ for more details" \
    && exit 1
done

exit 0
#!/bin/bash
if [ $# != 2 ]; then
echo "Usage: wali_to_ctm.sh word-alignments words-symbol-table > ctm" 1>&2
# Produces ctm files suitable for scoring with NIST's sclite tool.
# Note: with 2 arguments this produces a ctm that's "relative to the utterance-id",
# i.e. it treats the utterance-id as if it's a file.
# If you provide the segments-file,
# which specifies how the utterance-ids relate to the original waveform files,
# it produces output that is relative to the original waveform files.
if [ $# -ne 2 -a $# -ne 3 ]; then
echo "Usage: wali_to_ctm.sh word-alignments words-symbol-table [segments-file] > ctm" 1>&2
exit 1;
fi
......@@ -17,10 +23,31 @@ cat $wali | \
($word,$dur) = split(" ", $a);
$dur *= 0.01;
if ($word != 0) {
print "$utt 1 $word $time $dur $word\n";
print "$utt 1 $time $dur $word\n";
}
$time =$time + $dur;
} ' | scripts/int2sym.pl --field 6 $symtab
} ' | scripts/int2sym.pl --field 5 $symtab | \
( if [ $# -eq 2 ]; then
    # No segments file: pass the utterance-relative ctm through unchanged.
    cat
  else # Convert this ctm to being relative to orig.
       # waveform files.
    segments=$3
    # Stdout of this subshell is the ctm itself, so the error message has to
    # go to stderr or it would end up inside the ctm output.
    [ ! -f "$segments" ] && echo "No such file $segments" 1>&2 && exit 1;
    perl -e '$seg=shift @ARGV; open(S, "<$seg")||die "No such file $seg";
      # Each segments line is read as four fields; the second one is split
      # on "-" into a filename and a side.
      while(<S>){ ($utt,$spk,$begin,$end)=split;
        ($filename,$side) = split("-",$spk); $begin{$utt}=$begin;
        $end{$utt}=$end;$filename{$utt}=$filename; $side{$utt}=$side;
      }
      # Each ctm line on stdin has its start time shifted by the begin time
      # of its utterance, and its first two fields replaced by filename, side.
      while(<STDIN>) {
        ($utt,$one,$time,$dur,$word)=split;
        $filename=$filename{$utt}; $side=$side{$utt};
        $begin=$begin{$utt};
        defined $begin && defined $filename && defined $side|| die "Bad utt $utt: not in segments file";
        $begintime = $time + $begin{$utt};
        print "$filename $side $begintime $dur $word\n";
      } ' "$segments"
  fi )
......@@ -33,10 +33,8 @@ if [ "$1" == "-j" ]; then
numjobs=$1;
jobid=$2;
shift; shift;
if [ $jobid -ge $numjobs ]; then
echo "Invalid job number, $jobid >= $numjobs";
exit 1;
fi
! scripts/get_splits.pl $numjobs | grep -w $jobid >/dev/null && \
echo Invalid job-number $jobid "(num-jobs = $numjobs)" && exit 1;
fi
if [ $# != 3 ]; then
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment