Commit 57596e1d authored by Jan Trmal's avatar Jan Trmal
Browse files

(trunk/wsj/s5) Small improvements to the "core" scripts as used during Babel


git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4249 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent d4584aec
#!/bin/bash
# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey)
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey)
# Vimal Manohar
# Apache 2.0
##Changes
# Vimal Manohar (Jan 2014):
# Added options to boost silence probabilities in the model before
# decoding. This can help in favoring the silence phones when
# some silence regions are wrongly decoded as speech phones like glottal stops
# Begin configuration section.
transform_dir=
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
boost_silence=1.0 # Boost silence pdfs in the model by this factor before decoding
silence_phones_list= # List of silence phones that would be boosted before decoding
stage=0
nj=4
cmd=run.pl
......@@ -27,6 +36,8 @@ echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
[ -z $silence_phones_list ] && boost_silence=1.0
if [ $# != 3 ]; then
echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
......@@ -106,10 +117,13 @@ if [ $stage -le 0 ]; then
words="ark:/dev/null"
fi
[ ! -z "$silence_phones_list" ] && \
model="gmm-boost-silence --boost=$boost_silence $silence_phones_list $model - |"
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
gmm-decode-faster$thread_string --max-active=$max_active --beam=$beam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst "$feats" "$words" "$ali" || exit 1;
"$model" $graphdir/HCLG.fst "$feats" "$words" "$ali" || exit 1;
fi
exit 0;
......@@ -20,7 +20,7 @@ gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note:
first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
# the 1st pass of decoding (lattice generation).
max_active=7000
max_mem=50000000
#WARNING: This option has been renamed to lattice_beam (to follow the naming
# used in the other scripts).
lattice_beam=6.0 # Beam we use in lattice generation.
......@@ -131,7 +131,7 @@ if [ $stage -le 2 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode_pass1.JOB.log \
sgmm2-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
--word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $alignment_model \
--word-symbol-table=$graphdir/words.txt --max-mem=$max_mem "$gselect_opt_1stpass" $alignment_model \
$graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
fi
......
......@@ -2,23 +2,7 @@
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
@ARGV != 1 && print STDERR "Usage: summarize_warnings.pl <log-dir>\n" && exit 1;
$dir = $ARGV[0];
! -d $dir && print STDERR "summarize_warnings.pl: no such directory $dir\n" && exit 1;
$dir =~ s:/$::; # Remove trailing slash.
# Group the files into categories where all have the same base-name.
foreach $f (glob ("$dir/*.log")) {
$f_category = $f;
# do next expression twice; s///g doesn't work as they overlap.
$f_category =~ s:\.\d+\.:.*.:;
$f_category =~ s:\.\d+\.:.*.:;
$fmap{$f_category} .= " $f";
}
#scalar(@ARGV) >= 1 && print STDERR "Usage: summarize_warnings.pl <log-dir>\n" && exit 1;
sub split_hundreds { # split list of filenames into groups of 100.
my $names = shift @_;
......@@ -35,7 +19,53 @@ sub split_hundreds { # split list of filenames into groups of 100.
return @ans;
}
foreach $c (keys %fmap) {
# Parse one accounting entry -- the text that follows "# Accounting: " in a
# log file, e.g. "time=12 threads=1" -- into its time and thread count.
#
# Argument: the entry string, made of whitespace-separated key=value fields.
# Returns:  the list ($time, $threads).
# Dies:     if a field matches neither time=<digits> nor threads=<digits>,
#           or if either of the two attributes is missing from the entry.
#
# NOTE: $time and $threads are package variables rather than lexicals, so the
# parsed values stay readable through them after the call.  When the sub is
# called in scalar context the return value is only the thread count (the
# last element of the returned list).
sub parse_accounting_entry {
  my $entry = shift @_;
  my @elems = split " ", $entry;

  # Reset the results so values from a previous call cannot leak through.
  $time = undef;
  $threads = undef;

  foreach my $elem (@elems) {
    # s/// is true only when the pattern matched, so one expression both
    # recognises the field and strips its "key=" prefix.
    if ($elem =~ s/time=(\d+)/$1/) {
      $time = $elem;
    } elsif ($elem =~ s/threads=(\d+)/$1/g) {
      $threads = $elem;
    } else {
      die "Unknown entry \"$elem\" when parsing \"$entry\" \n";
    }
  }

  if (defined($time) and defined($threads)) {
    return ($time, $threads);
  }
  die "The accounting entry \"$entry\" did not contain all necessary attributes";
}
# Collect the *.log files of every directory named on the command line and
# group them in %fmap: the key is the file name with its numeric job field
# replaced by ".*.", the value is a space-separated list of matching files.
foreach my $dir (@ARGV) {
  # A missing directory is reported but is not fatal; the remaining
  # directories are still processed.  (A stray "print $dir" without a
  # terminating semicolon used to precede this test, which turned the test
  # into the argument list of a print to a filehandle named by $dir.)
  if (! -d $dir) {
    print STDERR "summarize_warnings.pl: no such directory $dir\n";
    next;
  }

  $dir =~ s:/$::; # Remove trailing slash.

  # Group the files into categories where all have the same base-name.
  foreach my $f (glob("$dir/*.log")) {
    my $f_category = $f;
    # Replace the first ".<digits>." that is not directly followed by another
    # digit, e.g. "decode.3.log" becomes "decode.*.log".
    $f_category =~ s:\.\d+\.(?!\d+):.*.:;
    $fmap{$f_category} .= " $f";
  }
}
foreach $c (sort (keys %fmap) ) {
$n = 0;
foreach $fgroup (split_hundreds($fmap{$c})) {
$n += `grep -w WARNING $fgroup | wc -l`;
......@@ -44,7 +74,7 @@ foreach $c (keys %fmap) {
print "$n warnings in $c\n"
}
}
foreach $c (keys %fmap) {
foreach $c (sort (keys %fmap)) {
$n = 0;
foreach $fgroup (split_hundreds($fmap{$c})) {
$n += `grep -w ERROR $fgroup | wc -l`;
......@@ -53,3 +83,39 @@ foreach $c (keys %fmap) {
print "$n errors in $c\n"
}
}
# Summarise the "# Accounting: " lines of all log files: one line per
# category in %fmap, followed by a grand total over all categories.
#   total_cpu_time - sum over entries of time * threads
#   clock_time     - the longest single entry of the category
#   total_threads  - sum over entries of threads
my $supertotal_cpu_time = 0.0;
my $supertotal_clock_time = 0.0;
my $supertotal_threads = 0.0;

foreach my $c (sort (keys %fmap)) {
  my $total_cpu_time = 0.0;
  my $total_clock_time = 0.0;
  my $total_threads = 0.0;

  foreach my $fgroup (split_hundreds($fmap{$c})) {
    # Keep only the text after "# Accounting: " of every accounting line.
    my $lines = `grep -a "# Accounting: " $fgroup |sed 's/.* Accounting: *//g'`;

    foreach my $line (split "\n", $lines) {
      # The parentheses are required: without them the assignment binds
      # tighter than the comma and only $threads would be assigned.
      my ($time, $threads) = parse_accounting_entry($line);

      $total_cpu_time += $time * $threads;
      $total_threads += $threads;
      # Track the maximum; adding here would give neither a sum nor a
      # maximum, and the result would depend on the order of the entries.
      if ($time > $total_clock_time) {
        $total_clock_time = $time;
      }
    }
  }

  print "total_cpu_time=$total_cpu_time clock_time=$total_clock_time total_threads=$total_threads group=$c\n";

  $supertotal_cpu_time += $total_cpu_time;
  $supertotal_clock_time += $total_clock_time;
  $supertotal_threads += $total_threads;
}

print "total_cpu_time=$supertotal_cpu_time clock_time=$supertotal_clock_time total_threads=$supertotal_threads group=all\n";
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment