Commit cedab3e9 authored by Dan Povey's avatar Dan Povey
Browse files

Fixes and extensions to scripts; fixed transition-probs not being trained in SGMMs.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@628 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent c6267c08
......@@ -134,11 +134,11 @@ while [ $iter -lt $numiters ]; do
spkvecs_opt="--spk-vecs=ark:$dir/cur.vecs"
fi
if [ $iter -eq 0 ]; then
flags=vwcS
flags=vwcSt
elif [ $[$iter%2] -eq 1 -a $iter -gt 4 ]; then # even iters after 4 (i.e. starting from 6)...
flags=vNwcS
flags=vNwcSt
else
flags=vMwcS
flags=vMwcSt
fi
sgmm-acc-stats $spkvecs_opt $utt2spk_opt --update-flags=$flags "$gselect_opt" --rand-prune=$randprune --binary=false $dir/$iter.mdl "$feats" "ark:ali-to-post ark:$dir/cur.ali ark:-|" $dir/$iter.acc 2> $dir/acc.$iter.log || exit 1;
sgmm-est --update-flags=$flags --split-substates=$numsubstates --write-occs=$dir/$[$iter+1].occs $dir/$iter.mdl $dir/$iter.acc $dir/$[$iter+1].mdl 2> $dir/update.$iter.log || exit 1;
......
# Note: WER is second number from the right.
for x in exp/*/decode_*; do [ -d $x ] && grep Mean $x/score_*/*.sys | scripts/best_wer.sh; done
exp/tri4a/decode_eval2000/score_13/eval2000.ctm.filt.sys: | Mean | 54.4 524.3 | 52.1 33.1 14.8 4.5 52.4 75.3 |
exp/tri5a/decode_eval2000/score_14/eval2000.ctm.filt.sys:| Mean | 54.4 524.3 | 68.6 21.5 9.9 3.3 34.6 67.9 |
# [waiting for fixed SGMM results]
......@@ -108,7 +108,7 @@ scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt $silprob SIL '#'$ndi
# Copy into data/lang/ also, where it will be needed for discriminative training.
cp data/lang_test/L_disambig.fst data/lang/
cp data/lang_test/phones_disambig.txt data/lang/
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
......
......@@ -75,6 +75,7 @@ local/remove_dup_utts.sh 300 data/train data/train_nodup
decode_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
train_cmd="queue.pl -q all.q@@blade -l ram_free=700M,mem_free=700M"
long_cmd="queue.pl -q long.q@@blade -l ram_free=700M,mem_free=700M"
steps/train_mono.sh --num-jobs 10 --cmd "$train_cmd" \
data/train_10k_nodup data/lang exp/mono0a
......@@ -127,7 +128,7 @@ steps/align_lda_mllt_sat.sh --num-jobs 30 --cmd "$train_cmd" \
steps/train_ubm_lda_etc.sh --num-jobs 30 --cmd "$train_cmd" \
700 data/train_nodup data/lang exp/tri5a_ali_all_nodup exp/ubm6a
steps/train_sgmm_lda_etc.sh --num-jobs 30 --cmd "$train_cmd" \
4500 40000 41 40 data/train_nodup data/lang exp/tri5a_ali_all_nodup exp/ubm6a/final.ubm exp/sgmm6a
4500 40000 50 40 data/train_nodup data/lang exp/tri5a_ali_all_nodup exp/ubm6a/final.ubm exp/sgmm6a
scripts/mkgraph.sh data/lang_test exp/sgmm6a exp/sgmm6a/graph
# have to match num-jobs with 5a decode.
scripts/decode.sh -l data/lang_test --num-jobs 30 --cmd "$decode_cmd" steps/decode_sgmm_lda_etc.sh \
......@@ -143,4 +144,23 @@ scripts/decode.sh -l data/lang_test --num-jobs 30 --cmd "$decode_cmd" steps/deco
scripts/decode.sh -l data/lang_test --num-jobs 30 --cmd "$decode_cmd" steps/decode_sgmm_lda_etc_fromlats.sh \
data/lang_test data/eval2000 exp/sgmm6a/decode_eval2000_fromlats exp/tri5a/decode_eval2000
for x in exp/*/decode_*; do [ -d $x ] && grep Mean $x/score_*/*.sys | scripts/best_wer.sh; done
# MMI starting from the system in tri5a.
steps/align_lda_mllt_sat.sh --num-jobs 40 --cmd "$train_cmd" \
data/train data/lang exp/tri5a exp/tri5a_ali
steps/make_denlats_lda_etc.sh --num-jobs 40 --cmd "$long_cmd" \
data/train data/lang exp/tri5a_ali exp/tri5a_denlats
steps/train_lda_etc_mmi.sh --num-jobs 40 --cmd "$train_cmd" \
data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a exp/tri5a_mmi
scripts/decode.sh -l data/lang_test --num-jobs 30 --cmd "$decode_cmd" steps/decode_lda_etc.sh \
exp/tri5a/graph data/test_eval2000 exp/tri5a_mmi/decode_eval2000 exp/tri5a/decode_eval2000
steps/train_lda_etc_mmi.sh --boost 0.1 --num-jobs 40 --cmd "$train_cmd" \
data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a exp/tri5a_mmi_b0.1
scripts/decode.sh -l data/lang_test --num-jobs 30 --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri5a/graph \
data/test_eval2000 exp/tri5a_mmi_b0.1/decode_eval2000 exp/tri5a/decode_eval2000
# getting results (see RESULTS file)
for x in exp/*/decode_*; do [ -d $x ] && grep Mean $x/score_*/*.sys | scripts/best_wer.sh; done
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Adds some specified number of disambig symbols to a symbol table.
# Adds these as #1, #2, etc.
# If the --include-zero option is specified, includes an extra one
# #0.
# An optional leading "--include-zero" asks for an extra "#0" symbol ahead of
# "#1", "#2", ...; the flag is removed from @ARGV once it has been seen.
$include_zero = ($ARGV[0] eq "--include-zero") ? 1 : 0;
shift @ARGV if $include_zero;

@ARGV == 2
  or die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt ";
($input, $nsyms) = @ARGV;

open(F, "<$input") or die "Opening file $input";

# Copy the symbol table to stdout unchanged.  Every line must have exactly two
# whitespace-separated fields; the second field of the most recent line is
# remembered so that the new symbols can be numbered after it.
while (defined($line = <F>)) {
  @fields = split(" ", $line);
  die "Bad line $line" unless @fields == 2;
  $lastsym = $fields[1];
  print $line;
}
die "Empty symbol file?" unless defined($lastsym);

# "#0" (when requested) takes the next value and shifts the base for the rest.
if ($include_zero) {
  $lastsym++;
  print "#0 ", $lastsym, "\n";
}

# "#1" .. "#<num_extra>" follow consecutively.
$n = 1;
while ($n <= $nsyms) {
  print "#$n ", $n + $lastsym, "\n";
  $n++;
}
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
if(@ARGV != 2) {
  die "Usage: add_lex_disambig.pl lexicon.txt lexicon_disambig.txt "
}

$lexfn = shift @ARGV;      # input lexicon: "<word> <phone> <phone> ..." per line
$lexoutfn = shift @ARGV;   # output lexicon, same format plus "#n" symbols

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

# (1) Read in the lexicon, normalizing every run of whitespace to one space.
@L = ( );
while(<L>) {
  @A = split(" ", $_);
  push @L, join(" ", @A);
}
close(L);

# (2) Work out the count of each phone-sequence in the
# lexicon.
foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  $count{join(" ",@A)}++;
}

# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).
# Only proper prefixes are recorded (the full sequence is popped before the
# first store), and the empty prefix is always among them.
foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  while(@A > 0) {
    pop @A;  # Remove last phone
    $issubseq{join(" ",@A)} = 1;
  }
}

# (4) For each entry in the lexicon:
#  if the phone sequence is unique and is not a
#  prefix of another word, no disambig symbol.
#  Else output #1, or #2, #3, ... if the same phone-seq
#  has already been assigned a disambig symbol.
open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";

$max_disambig = 0;
foreach $l (@L) {
  @A = split(" ", $l);
  $word = shift @A;
  $phnseq = join(" ",@A);
  if(!defined $issubseq{$phnseq}
     && $count{$phnseq}==1) {
    ; # Do nothing.
  } else {
    if($phnseq eq "") { # need disambig symbols for the empty string
      # that are not used anywhere else.
      $max_disambig++;
      $reserved{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      # Last number handed out for this phone sequence, if any.
      $curnumber = $disambig_of{$phnseq};
      # This used to read "!defined{$curnumber}", which tests an anonymous
      # hash (always defined) and so never ran; the script only worked
      # because ++ on an undefined value yields 1.
      if(!defined $curnumber) { $curnumber = 0; }
      $curnumber++; # now 1 or 2, ...
      while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols
      if($curnumber > $max_disambig) {
        $max_disambig = $curnumber;
      }
      $disambig_of{$phnseq} = $curnumber;
      $phnseq = $phnseq . " #" . $curnumber;
    }
  }
  print O "$word\t$phnseq\n";
}
# Flush the output lexicon explicitly so that a failed write is reported.
close(O) || die "Error closing lexicon file $lexoutfn\n";

# The highest disambiguation-symbol number used goes to stdout.
print $max_disambig . "\n";
#!/bin/bash
#
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# To be run from one directory above this script.
# Reads scoring lines on stdin and prints the single line with the lowest
# error figure (nothing is printed when no line matches either format).
perl -ne '
  # Kaldi "compute-wer" tool: the figure is the token right after "WER".
  if (/WER (\S+)/ && (!defined $min_wer || $min_wer > $1)) {
    ($min_wer, $min_line) = ($1, $_);
  }
  # sclite: on the "Mean" row, the figure is the fifth token after the
  # second vertical bar.
  elsif (/ Mean\s+\|\s+\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|/
         && (!defined $min_wer || $min_wer > $1)) {
    ($min_wer, $min_line) = ($1, $_);
  }
  END { print $min_line if defined $min_line; }
'
#!/bin/bash
# Creates FSTs needed for large lm decoding
# And run experiments by invoking steps/decode_tri2a_biglm_faster.sh
# To be run from ..
. path.sh
# Absolute input/output locations.  They are quoted everywhere below so that a
# working directory containing whitespace does not split the paths.
data="$(pwd)/data"
graphs="$(pwd)/exp/tri2a"

for lm_suffix in bg tg_pruned; do
  # These are for building HCLG with #0 on output
  fstproject "$data/G_${lm_suffix}.fst" \
    | fstarcsort --sort_type=olabel \
    > "$graphs/Gr_${lm_suffix}.fst"
  echo "Created Gr_${lm_suffix}.fst"

  # These are G^-1 with #0 on input, <eps> on output
  fstmap --map_type=invert < "$data/G_${lm_suffix}.fst" \
    | fstarcsort --sort_type=ilabel \
    > "$graphs/Gm_${lm_suffix}.fst"
  echo "Created Gm_${lm_suffix}.fst"
done

for lm_suffix in tg_pruned tg; do
  # These are G' with <eps> on both input and output
  fstproject --project_output=true "$data/G_${lm_suffix}.fst" \
    | fstarcsort --sort_type=ilabel \
    > "$graphs/Gp_${lm_suffix}.fst"
  echo "Created Gp_${lm_suffix}.fst"
done
# Functions to run one experiment defined by $lm1, $lm2
#######################################
# Runs scripts/mkgraph.sh for the first-pass LM selected by $lm1.
# Globals:   graphs (read), lm1 (read)
# Arguments: none
# Outputs:   "Created exp/graph_tri2a_<lm1>/HCLG.fst" on stdout, only when
#            mkgraph.sh succeeded
# Returns:   0 on success, 1 if mkgraph.sh failed
#######################################
createHCLG() {
  # Quoted so that a $graphs path containing whitespace stays one argument.
  scripts/mkgraph.sh "$graphs/Gr_${lm1}.fst" exp/tri2a/tree exp/tri2a/final.mdl \
    "exp/graph_tri2a_${lm1}" || return 1
  echo "Created exp/graph_tri2a_${lm1}/HCLG.fst"
}
#######################################
# Runs one decoding experiment for the LM pair ($lm1, $lm2) through
# scripts/decode.sh and prints the WER lines found in the result.
# Globals:   graphs (read), lm1 (read), lm2 (read), exp (written: the
#            experiment directory, left set for the caller as before)
# Arguments: none
# Outputs:   "<lm1> <lm2> <exp-dir>" followed by the lines of <exp-dir>/wer
#            that contain "WER"
# Returns:   exit status of the final grep
#######################################
runexp() {
  exp="exp/decode_tri2a_${lm1}_composed_${lm2}_eval92"
  mkdir -p "$exp"
  # Every path is quoted so that whitespace in $graphs does not split it into
  # several arguments.
  scripts/decode.sh "$exp" "exp/graph_tri2a_${lm1}/HCLG.fst" \
    steps/decode_tri2a_composed.sh data/eval_nov92.scp \
    "$graphs/Gm_${lm1}.fst" "$graphs/Gp_${lm2}.fst"
  echo "$lm1 $lm2 $exp"
  grep WER "$exp/wer"
}
# Two experiments, both with the bigram as first LM ($lm1): the second LM
# ($lm2) is the pruned trigram on the first pass and the unpruned trigram on
# the second.  The graph is rebuilt before each run, as before.
for lm2 in tg_pruned tg; do
  lm1=bg
  createHCLG
  runexp
done
# RESULTS
# exp: decode_tri2a_bg_composed_tg_pruned_eval92 %WER 12.92 4.8 RT
# exp: decode_tri2a_bg_composed_tg_eval92 %WER 12.11 4.0 RT
#!/usr/bin/perl -w
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes two arguments, which may be files or "-" for the
# standard input. Both files must have lines with one or more fields,
# interpreted as a map from the first field (a string) to a list of strings.
# if the first file has as one of its lines
# A x y
# and the second has the lines
# x P
# y Q R
# then the output of this program will be
# A P Q R
#
# Note that if x or y did not appear as the first field of file b, we would
# print a warning and omit the whole line rather than map it to the empty
# string.
if(@ARGV < 1 || @ARGV > 2 ) {
  die "Usage: compose_maps.pl map1 [map2] ";
}

$map1 = shift @ARGV;
open(I, "<$map1") || die "Opening first map $map1";

# Load the second map completely: first field -> remaining fields joined by
# single spaces.  A repeated key is reported on stderr and the later line wins.
while(<>) { # <> represents map2.
  @A = split(" ", $_);
  if(@A == 0) { die "compose_maps.pl: invalid line in second map: $_\n"; }
  $key = shift @A;
  if(defined $map2{$key} ) {
    print STDERR "compose_maps.pl: key $key appears twice in second map.\n";
    if ($map2{$key} ne join(" ", @A)) {
      print STDERR " [and it has inconsistent values]\n";
    }
  }
  $map2{$key} = join(" ", @A);
}

# Stream the first map, replacing every value field by its image under the
# second map.  A line is dropped (with one warning per missing key) as soon as
# any of its fields has no entry in the second map.
while(<I>) {
  @A = split(" ", $_);
  # This message used to blame the second map although the line comes from
  # the first one.
  if(@A == 0) { die "compose_maps.pl: invalid line in first map: $_\n"; }
  $key = shift @A;
  $str = "$key ";
  $ok = 1;
  foreach $a (@A) {
    if(!defined $map2{$a}) {
      print STDERR "compose_maps.pl: key $a not defined in second map [skipping the line for $key]\n";
      $ok = 0;
    } else {
      $str = $str . "$map2{$a} ";
    }
  }
  if($ok) {
    print "$str\n";
  }
}
#!/bin/bash
# Converts a text-format rnnlm model ($1) into (a) a vocabulary file ($2) with
# the first and third fields of every vocabulary line, tab-separated, and
# (b) a tagged text model ($3) holding the weight matrices, zero biases, class
# ids and the word list.
export LANG=C
export LC_ALL=C

if [ "$3" == "" ]; then
  echo "usage: $0 rnnlm.model vocab kaldi.model"
  # A usage error must not look like success to the caller.
  exit 1
fi

v=$2
km=$3

# The awk program appends to the vocabulary file, so remove any stale copy.
rm -f -- "$v"

# The vocabulary file name is handed to awk as a variable rather than being
# spliced into the program text, and the model is read through a redirect so
# that an unusual file name cannot be mistaken for awk code or an assignment.
awk -v vocab_file="$v" '
BEGIN{
  hix=0;
}
/vocabulary size:/{
  V1_size=$3;next;
}
/hidden layer size:/{
  h_size=$4;next;
}
/output layer size:/{
  W2_size=$4;next;
}
/Vocabulary:/{
  # One line per word: field 1 is used as the index, field 3 as the word,
  # field 4 as the class.
  for (i=0;i<V1_size;i++) {getline;print $1"\t"$3>>vocab_file;voc[$1]=$3;cl[$1]=$4}; next;
}
/Hidden layer activation:/{
  # Read (and thereby skip) the activations; h[] is not written to the output.
  for (i=0;i<h_size;i++) { getline; h[hix++]=$1; } next;
}
/Weights 0->1:/{
  # Each hidden unit has V1_size input weights followed by h_size recurrent ones.
  for (i=0;i<h_size;i++) for (j=0;j<V1_size+h_size;j++) {
    getline;if (j<V1_size) V1[i,j]=$1;else U1[i,j-V1_size]=$1;
  }; next;
}
/Weights 1->2:/{
  # NOTE(review): j runs over the hidden units, yet the guard compares it with
  # the vocabulary size; weights are dropped if h_size > V1_size -- confirm
  # whether the guard is intended.
  for (i=0;i<W2_size;i++) for (j=0;j<h_size;j++) {
    getline;if (j<V1_size) W2[i,j]=$1;
  }; next;
}
END{
  printf "<rnnlm_v2.0> <v1> [";
  # print V1
  for (j=0;j<V1_size;j++) {
    for (i=0;i<h_size;i++) {
      printf " %s", V1[i,j]
    }
    print ""
  }
  # print U1
  print " ]";printf " <u1> [";
  for (j=0;j<h_size;j++) {
    for (i=0;i<h_size;i++) {
      printf " %s", U1[i,j]
    }
    print ""
  }
  # print b1
  print " ]";printf " <b1> [";
  for (i=0;i<h_size;i++)printf " 0.0000";
  print " ]";printf "<w2> [";
  # print w2
  for (j=0;j<h_size;j++) {
    for (i=0;i<V1_size;i++) {
      printf " %s", W2[i,j]
    }
    print ""
  }
  # print b2
  print " ]";printf " <b2> [";
  for (i=0;i<V1_size;i++)printf " 0.0000";
  print " ]";printf "<cl> [";
  # print cl
  for (j=0;j<h_size;j++) {
    for (i=V1_size;i<W2_size;i++) {
      printf " %s", W2[i,j];
    }
    print ""
  }
  # print cl_b
  print " ]";printf " <cl_b> [";
  for (i=V1_size;i<W2_size;i++) printf " 0.0000";
  print " ] <classes> [ ";
  for (i=0;i<V1_size;i++) printf " %s", cl[i];
  print " ]";printf " <words> ";
  for (i=0;i<V1_size;i++) print voc[i];
}' < "$1" > "$km"
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
orig_args="$*"
. path.sh
# will set nj to #spkrs (if using queue) or 4 (if not), if
# not set by the user.
nj=
lang=
cmd=scripts/run.pl
# Each pass tests the options in the fixed order --num-jobs, --cmd, -l, so
# three passes are needed to accept the three options in any order (with two
# passes "-l X --cmd Y --num-jobs N" left --num-jobs unparsed).
# "$1" is quoted so that running out of arguments is not a test error.
for x in 1 2 3; do
  if [ "$1" == "--num-jobs" ]; then
    shift
    nj=$1
    shift
  fi
  if [ "$1" == "--cmd" ]; then
    shift
    cmd=$1
    shift
    [ -z "$cmd" ] && echo "Empty argument to --cmd option" && exit 1;
  fi
  if [ "$1" == "-l" ]; then
    shift
    lang=$1
    shift
    # Both files must exist in the directory given to -l.
    [ ! -f "$lang/phones_disambig.txt" -o ! -f "$lang/L_align.fst" ] && \
      echo "Invalid argument to -l option; expected $lang/phones_disambig.txt and $lang/L_align.fst to exist." \
      && exit 1;
  fi
done
# Four positional arguments are mandatory; anything after them is passed
# through to the decoding script.
if [ $# -lt 4 ]; then
  echo "Usage: scripts/decode.sh [-l lang-dir] [--cmd scripts/queue.sh opts..] [--num-jobs n] <decode_script> <graph-dir> <data-dir> <decode-dir> [extra-args...]"
  echo "note: -l option only required if you want to score with sclite (since we need L_align.fst)"
  exit 1;
fi

script=$1
graphdir=$2
data=$3
dir=$4

# Make "dir" an absolute pathname: anything not starting with "/" is taken
# relative to the current directory.  Done in the shell, with quoting, so that
# a directory name containing whitespace survives.
case "$dir" in
  /*) ;;
  *) dir="$PWD/$dir" ;;
esac
mkdir -p "$dir" || exit 1

shift 4
# Remaining args will be supplied to decoding script.
extra_args=$*
# The decoding script and the utt2spk file must exist.  $scp is not assigned
# anywhere in the visible part of this script, so it is only checked when
# something else (e.g. path.sh or the environment) has set it.
for file in "$script" ${scp:+"$scp"} "$data/utt2spk"; do
  if [ ! -f "$file" ]; then
    echo "decode.sh: no such file $file"
    exit 1
  fi
done

if [ ! -f "$graphdir/HCLG.fst" ] && [ ! -f "$graphdir/G.fst" ]; then
  # Note: most scripts expect HCLG.fst in graphdir, but the
  # "*_fromlats.sh" script(s) require(s) a "lang" dir in that
  # position
  echo "No such file: $graphdir/HCLG.fst or $graphdir/G.fst"
  exit 1;
fi
# Pick the number of parallel jobs when --num-jobs was not given, then make
# sure the data directory is split into that many parts.
if [ "$nj" == "" ]; then # Figure out num-jobs; user did not specify.
# First whitespace-separated word of $cmd, i.e. the launcher program itself.
cmd1=`echo $cmd | awk '{print $1;}'`
if [ `basename $cmd1` == run.pl ]; then
nj=4
else # running on queue...
# Number of lines printed by utt2spk_to_spk2utt.pl (presumably one line per
# speaker -- confirm against that script).
nj=`scripts/utt2spk_to_spk2utt.pl $data/utt2spk | wc -l`
fi
fi
echo "Decoding with num-jobs = $nj"
# NOTE(review): because the first alternative is "$nj -gt 1", the split is
# re-run for every multi-job decode; the "missing" and "older than feats.scp"
# checks only matter when nj is 1 -- confirm this is intended.
if [[ $nj -gt 1 || ! -d $data/split$nj || $data/split$nj -ot $data/feats.scp ]]; then
scripts/split_data.sh $data $nj
fi
# Launch one decoding job per split in the background, wait for all of them,
# and fail if any job reported an error.
# Clear a marker left by an earlier run; a missing file is not an error.
rm $dir/.error 2>/dev/null
for n in `scripts/get_splits.pl $nj`; do
# $cmd and $extra_args are expanded unquoted on purpose so that a multi-word
# value splits into separate arguments.  The trailing "&" backgrounds the
# whole "job || touch" list, so a failing job leaves $dir/.error behind.
$cmd $dir/part$n.log \
$script -j $nj $n $graphdir $data $dir $extra_args || touch $dir/.error &
done
# Barrier: block until every background job has finished.
wait
[ -f $dir/.error ] && echo "Error in decoding script: command line was decode.sh $orig_args" && exit 1;
if ls $dir/lat.*.gz >&/dev/null; then
if [ -n "$lang" ]; then # sclite scoring: $lang directory supplied only for this reason.
[ ! -f $data/stm ] && \
echo "Expected $data/stm to exist (-l option only for sclite scoring)" && exit 1;
scripts/score_lats_ctm.sh $dir $lang $data || exit 1;
else
scripts/score_lats.sh $dir $graphdir/words.txt $data || exit 1;
fi
elif ls $dir/*.txt >&/dev/null; then
scripts/score_text.sh $dir $data || exit 1;
else
echo "No output found in $dir, not scoring.";