Commit 88634565 authored by Dan Povey
Browse files

trunk: some script improvements and fixes RE cleanup/find_bad_utts.sh and cleanup/debug_lexicon.sh

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4731 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent d8c12c0b
......@@ -6,6 +6,7 @@
# Begin configuration section.
stage=1
remove_stress=false
nj=10 # number of jobs for various decoding-type things that we run.
cmd=run.pl
# End configuration section
......@@ -22,6 +23,9 @@ if [ $# != 5 ]; then
echo " --nj <nj> # number of parallel jobs"
echo " --cmd <cmd> # command to run jobs, e.g. run.pl,queue.pl"
echo " --stage <stage> # use to control partial reruns."
echo " --remove-stress <true|false> # if true, remove stress before printing analysis"
echo " # note: if you change this, you only have to rerun"
echo " # from stage 10."
exit 1;
fi
......@@ -51,7 +55,7 @@ fi
if [ $stage -le 4 ]; then
steps/decode_si.sh --cmd "$cmd" --nj $nj --transform-dir ${src}_ali_$(basename $data) \
--beam 10.0 --lattice-beam 2.0 --max-active 2500 \
--acwt 0.25 --beam 25.0 --lattice-beam 5.0 --max-active 2500 \
$src/graph_phone_bg $data $src/decode_$(basename $data)_phone_bg
fi
......@@ -60,12 +64,14 @@ if [ $stage -le 5 ]; then
fi
if [ $stage -le 6 ]; then
steps/get_ctm.sh $data data/$(basename $lang)_phone_bg $src/decode_$(basename $data)_phone_bg
steps/get_ctm.sh --min-lmwt 3 --max-lmwt 8 \
$data data/$(basename $lang)_phone_bg $src/decode_$(basename $data)_phone_bg
fi
if [ $stage -le 7 ]; then
mkdir -p $dir
cp $src/decode_$(basename $data)_phone_bg/score_10/$(basename $data).ctm $dir/phone.ctm
# lmwt=4 corresponds to the scale we decoded at.
cp $src/decode_$(basename $data)_phone_bg/score_4/$(basename $data).ctm $dir/phone.ctm
cp ${src}_ali_$(basename $data)/ctm $dir/word.ctm
fi
......@@ -117,8 +123,23 @@ if [ $stage -le 9 ]; then
fi
if [ $stage -le 10 ]; then
cat $dir/prons.txt | perl -e '
print ";; <count-of-this-pron> <rank-of-this-pron> CORRECT|INCORRECT <word> <pron>\n";
if $remove_stress; then
perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } ' \
<$srcdict >$dir/lexicon.txt
else
cp $srcdict $dir/lexicon.txt
fi
awk '{count[$2] += $1;} END {for (w in count){print w, count[w];}}' \
<$dir/prons.txt >$dir/counts.txt
cat $dir/prons.txt | \
if $remove_stress; then
perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } '
else
cat
fi | perl -e '
print ";; <count-of-this-pron> <rank-of-this-pron> <frequency-of-this-pron> CORRECT|INCORRECT <word> <pron>\n";
open(D, "<$ARGV[0]") || die "opening dict file $ARGV[0]";
# create a hash of all reference pronunciations, and for each word, record
# a list of the prons, separated by " | ".
......@@ -128,17 +149,38 @@ if [ $stage -le 10 ]; then
if (!defined $prons{$w}) { $prons{$w} = join(" ", @A); }
else { $prons{$w} = $prons{$w} . " | " . join(" ", @A); }
}
open(C, "<$ARGV[1]") || die "opening counts file $ARGV[1];";
while (<C>) { @A = split(" ", $_); $word_count{$A[0]} = $A[1]; }
while (<STDIN>) { @A = split(" ", $_);
$count = shift @A; $word = $A[0];
$wc = ++$wcount{$word}; # 1 if top observed pron of word, 2 if second...
$count = shift @A; $word = $A[0]; $freq = sprintf("%0.2f", $count / $word_count{$word});
$rank = ++$wcount{$word}; # 1 if top observed pron of word, 2 if second...
$str = (defined $is_pron{join(" ", @A)} ? "CORRECT" : "INCORRECT");
shift @A;
print "$count $wc $str $word \"" . join(" ", @A) . "\", ref = \"$prons{$word}\"\n";
} ' $srcdict >$dir/pron_info.txt
print "$count $rank $freq $str $word \"" . join(" ", @A) . "\", ref = \"$prons{$word}\"\n";
} ' $dir/lexicon.txt $dir/counts.txt >$dir/pron_info.txt
grep -v '^;;' $dir/pron_info.txt | \
awk '{ word=$5; count=$1; if (tot[word] == 0) { first_line[word] = $0; }
corr[word] += ($4 == "CORRECT" ? count : 0); tot[word] += count; }
END {for (w in tot) { printf("%s\t%s\t%s\t\t%s\n", tot[w], w, (corr[w]/tot[w]), first_line[w]); }} ' \
| sort -k1 -nr | cat <( echo ';; <total-count-of-word> <word> <correct-proportion> <first-corresponding-line-in-pron_info.txt>') - \
> $dir/word_info.txt
fi
if [ $stage -le 11 ]; then
echo "$0: some of the more interesting stuff in $dir/pron_info.txt follows."
grep -w INCORRECT $dir/pron_info.txt | grep -w 1 | head -n 40
echo "# grep -w INCORRECT $dir/pron_info.txt | grep -w 1 | head -n 20"
grep -w INCORRECT $dir/pron_info.txt | grep -w 1 | head -n 20
echo "$0: here are some other interesting things.."
echo "# grep -w INCORRECT $dir/pron_info.txt | grep -w 1 | awk '\$3 > 0.4 && \$1 > 10' | head -n 20"
grep -w INCORRECT $dir/pron_info.txt | grep -w 1 | awk '$3 > 0.4 && $1 > 10' | head -n 20
echo "$0: here are some high-frequency words whose reference pronunciations rarely show up."
echo "# awk '\$3 < 0.1' $dir/word_info.txt | head -n 20"
awk '$3 < 0.1 || $1 == ";;"' $dir/word_info.txt | head -n 20
fi
......@@ -30,8 +30,8 @@ echo "$0 $@" # Print the command line for logging
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_ali"
echo "usage: $0 <data-dir> <lang-dir> <src-dir> <dir>"
echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_debug"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
......
......@@ -8,6 +8,8 @@
# begin configuration section.
cmd=run.pl
stage=0
min_lmwt=5
max_lmwt=20
use_segments=true # if we have a segments file, use it to convert
# the segments to be relative to the original files.
#end configuration section.
......@@ -57,7 +59,7 @@ if [ $stage -le 0 ]; then
fi
if [ -f $lang/phones/word_boundary.int ]; then
$cmd LMWT=5:20 $dir/scoring/log/get_ctm.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \
mkdir -p $dir/score_LMWT/ '&&' \
lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
......@@ -70,7 +72,7 @@ if [ $stage -le 0 ]; then
exit 1;
fi
$cmd LMWT=5:20 $dir/scoring/log/get_ctm.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \
mkdir -p $dir/score_LMWT/ '&&' \
lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment