Commit 9475afc2 authored by Dan Povey
Browse files

Committing latest improvements to s3 scripts (and also fix to rm/s3 script)

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@520 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent c3417b09
......@@ -61,6 +61,21 @@ scripts/make_lexicon_fst.pl data/local/lexicon.txt $silprob sil | \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > data/lang/L.fst
# Create L_align.fst, which is as L.fst but with alignment symbols (#1 and #2 at the
# beginning and end of words, on the input side)... useful if we
# ever need to e.g. create ctm's-- these are used to work out the
# word boundaries.
cat data/local/lexicon.txt | \
awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' | \
scripts/make_lexicon_fst.pl - 0.5 sil | \
fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > data/lang_test/L_align.fst
# L_disambig.fst has the disambiguation symbols (c.f. Mohri's papers)
scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt $silprob sil '#'$ndisambig | \
fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang/words.txt \
--keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel \
......
......@@ -40,31 +40,38 @@ open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
sub is_sil {
# Return true if the phone sequence passed in @_ represents silence.
# Two forms are accepted:
#   ( $silphone )                  -- the bare silence phone, or
#   ( "#N", $silphone, "#M" )      -- the silence phone surrounded by
#                                     disambiguation/alignment symbols,
#                                     i.e. "#" followed by digits.
# $silphone is the file-level variable holding the silence phone name.
# Note: both the first AND the last element of the 3-element form must be
# "#<digits>" symbols (previously the first element was tested twice and
# the last element was never checked).
return ( @_ == 1 && $_[0] eq $silphone ||
(@_ == 3 && $_[1] eq $silphone &&
$_[0] =~ m/^\#\d+$/ &&
$_[2] =~ m/^\#\d+$/));
}
if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
$loopstate = 0;
$nexststate = 1; # next unallocated state.
while(<L>) {
@A = split(" ", $_);
$w = shift @A;
if(@A == 0) { # For empty words (<s> and </s>) insert no optional
# silence (not needed as adjacent words supply it)....
# actually we only hit this case for the lexicon without disambig
# symbols but doesn't ever matter as training transcripts don't have <s> or </s>.
print "$loopstate\t$loopstate\t<eps>\t$w\n";
} else {
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if(@A > 0) {
$ns = $nextstate++;
} else {
$ns = $loopstate;
}
print "$s\t$ns\t$p\t$word_or_eps\n";
$word_or_eps = "<eps>";
$s = $ns;
}
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if(@A > 0) {
$ns = $nextstate++;
} else {
$ns = $loopstate;
}
print "$s\t$ns\t$p\t$word_or_eps\n";
$word_or_eps = "<eps>";
$s = $ns;
}
}
print "$loopstate\t0\n"; # final-cost.
......@@ -87,34 +94,27 @@ if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state
while(<L>) {
@A = split(" ", $_);
$w = shift @A;
if(@A == 0) { # For empty words (<s> and </s>) insert no optional
# silence (not needed as adjacent words supply it)....
# actually we only hit this case for the lexicon without disambig
# symbols but doesn't ever matter as training transcripts don't have <s> or </s>.
print "$loopstate\t$loopstate\t<eps>\t$w\n";
} else {
$is_silence_word = (@A == 1 && $A[0] eq $silphone); # boolean.
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if(@A > 0) {
$ns = $nextstate++;
print "$s\t$ns\t$p\t$word_or_eps\n";
$word_or_eps = "<eps>";
$s = $ns;
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if(@A > 0) {
$ns = $nextstate++;
print "$s\t$ns\t$p\t$word_or_eps\n";
$word_or_eps = "<eps>";
$s = $ns;
} else {
if(!is_sil(@A)){
# This is non-deterministic but relatively compact,
# and avoids epsilons.
print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n";
print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n";
} else {
if(! $is_silence_word) {
# This is non-deterministic but relatively compact,
# and avoids epsilons.
print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n";
print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n";
} else {
# no point putting opt-sil after silence word.
print "$s\t$loopstate\t$p\t$word_or_eps\n";
}
$word_or_eps = "<eps>";
# no point putting opt-sil after silence word.
print "$s\t$loopstate\t$p\t$word_or_eps\n";
}
$word_or_eps = "<eps>";
}
}
}
......
......@@ -74,7 +74,7 @@ grep '#' $lang/phones_disambig.txt | awk '{print $2}' > $lang/tmp/disambig_phone
clg=$lang/tmp/CLG_${N}_${P}.fst
if [[ ! -f $clg || $clg -ot $lang/LG.fst ]]; then
if [[ ! -f $clg || $clg -ot $lang/tmp/LG.fst ]]; then
fstcomposecontext --context-size=$N --central-position=$P \
--read-disambig-syms=$lang/tmp/disambig_phones.list \
--write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.list \
......
......@@ -17,9 +17,12 @@
$ignore_oov = 0;
$ignore_first_field = 0;
for($x = 0; $x < 2; $x++) {
if($ARGV[0] eq "--ignore-oov") { $ignore_oov = 1; shift @ARGV; }
for($x = 0; $x < 3; $x++) {
# Note: it will just print OOVS unmodified if you specify --ignore-oov.
# Else will complain and put nothing out.
if($ARGV[0] eq "--ignore-oov") { $ignore_oov = 1; shift @ARGV; }
if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; }
if($ARGV[0] eq "--map-oov") { shift @ARGV; $map_oov = shift @ARGV; }
}
$symtab = shift @ARGV;
......@@ -33,6 +36,9 @@ while(<F>) {
$sym2int{$A[0]} = $A[1] + 0;
}
$num_warning = 0;
$max_warning = 20;
$error = 0;
while(<>) {
@A = split(" ", $_);
if(@A == 0) {
......@@ -42,18 +48,35 @@ while(<>) {
$key = shift @A;
print $key . " ";
}
@B = ();
foreach $a (@A) {
$i = $sym2int{$a};
if(!defined ($i)) {
if($ignore_oov) {
print $a . " " ;
if (defined $map_oov) {
if (!defined $sym2int{$map_oov}) {
die "sym2int.pl: invalid map-oov option $map_oov (undefined symbol)";
}
if ($num_warning++ < $max_warning) {
print STDERR "sym2int.pl: replacing $a with $map_oov\n";
if ($num_warning == $max_warning) {
print STDERR "sym2int.pl: not warning for OOVs any more times\n";
}
}
$i = $sym2int{$map_oov};
} elsif($ignore_oov) {
$i = $a; # just print them out unmodified..
} else {
die "sym2int.pl: undefined symbol $a\n";
}
}
print $i . " ";
push @B, $i;
}
print join(" ", @B);
print "\n";
}
if($error) { exit(1); }
else { exit(0); }
......@@ -127,7 +127,7 @@ while [ $x -lt $numiters ]; do
"$feats" ark:- $dir/$x.macc ) 2> $dir/macc.$x.log || exit 1;
est-mllt $dir/$x.mat.new $dir/$x.macc 2> $dir/mupdate.$x.log || exit 1;
gmm-transform-means --binary=false $dir/$x.mat.new $dir/$x.mdl $dir/$[$x+1].mdl 2> $dir/transform_means.$x.log || exit 1;
gmm-transform-means --binary=false $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl 2> $dir/transform_means.$x.log || exit 1;
compose-transforms --print-args=false $dir/$x.mat.new $cur_lda $dir/$x.mat || exit 1;
cur_lda=$dir/$x.mat
......
......@@ -79,17 +79,17 @@ cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \
# have to add .wv1
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > eval_nov92.flist
sort > test_eval92.flist
# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > eval_nov93.flist
$local/ndx2flist.pl $* | sort > test_eval93.flist
# Dev-set for Nov'93 (503 utts)
cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
$local/ndx2flist.pl $* | sort > dev_nov93.flist
$local/ndx2flist.pl $* | sort > test_dev93.flist
# Dev-set for Nov'93 (503 utts)
# links/13-34.1/wsj1/doc/indices/h1_p0.ndx
......@@ -98,7 +98,7 @@ cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 eval_nov92 eval_nov93 dev_nov93; do
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done
......@@ -107,13 +107,13 @@ done
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 eval_nov92 eval_nov93 dev_nov93; do
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 eval_nov92 eval_nov93 dev_nov93; do
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
done
......@@ -133,7 +133,7 @@ prune-lm --threshold=1e-7 lm_tg.arpa.gz lm_tgpr.arpa || exit 1;
gzip -f lm_tgpr.arpa || exit 1;
# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 eval_nov92 eval_nov93 dev_nov93; do
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93; do
cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $scripts/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
......
......@@ -28,7 +28,7 @@
echo "Preparing train and test data"
for x in train_si284 eval_nov92 eval_nov93 dev_nov93; do
for x in train_si284 test_eval92 test_eval93 test_dev93; do
mkdir -p data/$x
cp data/local/${x}_wav.scp data/$x/wav.scp
cp data/local/$x.txt data/$x/text
......@@ -140,6 +140,18 @@ scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt 0.5 SIL '#'$ndisambi
fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \
fstarcsort --sort_type=olabel > data/lang_test/L_disambig.fst
# Create L_align.fst, which is as L.fst but with alignment symbols (#1 and #2 at the
# beginning and end of words, on the input side)... useful if we
# ever need to e.g. create ctm's-- these are used to work out the
# word boundaries.
cat data/local/lexicon.txt | \
awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' | \
scripts/make_lexicon_fst.pl - 0.5 SIL | \
fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > data/lang_test/L_align.fst
# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test directory.
......@@ -159,14 +171,14 @@ for lm_suffix in bg tgpr tg; do
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at beginning/end of utt. These can cause
# determinization failures of CLG.
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c data/local/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
scripts/remove_oovs.pl data/local/oovs_${lm_suffix}.txt | \
scripts/eps2disambig.pl | fstcompile --isymbols=$test/words.txt \
scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false \
> $test/G.fst
fstisstochastic $test/G.fst
......
......@@ -55,7 +55,7 @@ grep -v ';;;' data/local/cmudict/cmudict.0.7a | \
# Add to cmudict the silences, noises etc.
(echo '!SIL SIL'; echo '<s> '; echo '</s> '; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
cat - data/local/lexicon_nosil.txt > data/local/lexicon.txt || exit 1;
echo "Dictionary preparation succeeded"
......
......@@ -63,9 +63,31 @@ steps/train_mono.sh data/train_si84_2k data/lang exp/mono
scripts/mkgraph.sh --mono data/lang_test_tgpr exp/mono exp/mono/graph_tgpr
scripts/decode.sh steps/decode_deltas.sh exp/mono/graph_tgpr data/dev_nov93 exp/mono/decode_tgpr_dev93
scripts/decode.sh steps/decode_deltas.sh exp/mono/graph_tgpr data/eval_nov92 exp/mono/decode_tgpr_eval92
steps/align_deltas.sh data/train_si84_half data/lang exp/mono exp/mono_ali
steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1
scripts/mkgraph.sh data/lang_test_tgpr exp/tri1 exp/tri1/graph_tgpr
scripts/decode.sh steps/decode_deltas.sh exp/tri1/graph_tgpr data/eval_nov92 exp/tri1/decode_tgpr_eval92
scripts/decode.sh steps/decode_deltas.sh exp/tri1/graph_tgpr data/dev_nov93 exp/tri1/decode_tgpr_dev93
# Align tri1 system with si84 data.
steps/align_deltas.sh data/train_si84 data/lang exp/tri1 exp/tri1_ali_si84
# Train tri2a, which is deltas + delta-deltas, on si84 data.
steps/train_deltas.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2a
scripts/mkgraph.sh data/lang_test_tgpr exp/tri2a exp/tri2a/graph_tgpr
scripts/decode.sh steps/decode_deltas.sh exp/tri2a/graph_tgpr data/eval_nov92 exp/tri2a/decode_tgpr_eval92
scripts/decode.sh steps/decode_deltas.sh exp/tri2a/graph_tgpr data/dev_nov93 exp/tri2a/decode_tgpr_dev93
# Train tri2b, which is LDA+MLLT, on si84 data.
steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b
scripts/mkgraph.sh data/lang_test_tgpr exp/tri2a exp/tri2a/graph_tgpr
# exp/decode_mono_tgpr_eval92 exp/graph_mono_tg_pruned/HCLG.fst steps/decode_mono.sh data/eval_nov92.scp
# add --no-queue --num-jobs 4 after "scripts/decode.sh" below, if you don't have
......@@ -517,4 +539,7 @@ done
# 20.9 sec @ beam = 7
# 13.8 sec @ beam = 6
# 14.4 sec @ beam = 5
# 14.4 sec @ beam = 4
\ No newline at end of file
# 14.4 sec @ beam = 4
#How I moved stuff to log/ dir:
#for x in train_lda_mllt.sh train_deltas.sh; do cat $x | perl -ane 's:dir/(\S+)\.log:dir/log/$1.log:; print; ' | sed 's:mkdir -p $dir:mkdir -p $dir/log:' > tmpf; cp tmpf $x; done
\ No newline at end of file
......@@ -68,6 +68,9 @@ if [ "$num_jobs" == "" ]; then # Figure out num-jobs.
fi
echo "Decoding with num-jobs = $num_jobs"
if [[ $num_jobs -gt 1 || ! -d $data/split$num_jobs || $data/split$num_jobs -ot $data/feats.scp ]]; then
scripts/split_data.sh $data $num_jobs
fi
n=0
while [ $n -lt $num_jobs ]; do
......
......@@ -14,10 +14,14 @@
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script replaces epsilon with #0 on the input side only, of the G.fst
# acceptor.
# This script replaces <s> and </s> with <eps> (on both input and output sides),
# for the G.fst acceptor.
while(<>){
s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
print;
@A = split(" ", $_);
if ( @A >= 4 ) {
if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
}
print join("\t", @A) . "\n";
}
......@@ -58,7 +58,7 @@ while(<>) {
}
foreach $w (keys %in_arpa) {
if(!defined $seen{$w}) {
if(!defined $seen{$w} && $w ne "<s>" && $w ne "</s>") {
print "$w\n";
}
}
......@@ -40,31 +40,38 @@ open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
sub is_sil {
# Return true if the phone sequence passed in @_ represents silence.
# Two forms are accepted:
#   ( $silphone )                  -- the bare silence phone, or
#   ( "#N", $silphone, "#M" )      -- the silence phone surrounded by
#                                     disambiguation/alignment symbols,
#                                     i.e. "#" followed by digits.
# $silphone is the file-level variable holding the silence phone name.
# Note: both the first AND the last element of the 3-element form must be
# "#<digits>" symbols (previously the first element was tested twice and
# the last element was never checked).
return ( @_ == 1 && $_[0] eq $silphone ||
(@_ == 3 && $_[1] eq $silphone &&
$_[0] =~ m/^\#\d+$/ &&
$_[2] =~ m/^\#\d+$/));
}
if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
$loopstate = 0;
$nexststate = 1; # next unallocated state.
while(<L>) {
@A = split(" ", $_);
$w = shift @A;
if(@A == 0) { # For empty words (<s> and </s>) insert no optional
# silence (not needed as adjacent words supply it)....
# actually we only hit this case for the lexicon without disambig
# symbols but doesn't ever matter as training transcripts don't have <s> or </s>.
print "$loopstate\t$loopstate\t<eps>\t$w\n";
} else {
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if(@A > 0) {
$ns = $nextstate++;
} else {
$ns = $loopstate;
}
print "$s\t$ns\t$p\t$word_or_eps\n";
$word_or_eps = "<eps>";
$s = $ns;
}
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if(@A > 0) {
$ns = $nextstate++;
} else {
$ns = $loopstate;
}
print "$s\t$ns\t$p\t$word_or_eps\n";
$word_or_eps = "<eps>";
$s = $ns;
}
}
print "$loopstate\t0\n"; # final-cost.
......@@ -87,34 +94,27 @@ if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state
while(<L>) {
@A = split(" ", $_);
$w = shift @A;
if(@A == 0) { # For empty words (<s> and </s>) insert no optional
# silence (not needed as adjacent words supply it)....
# actually we only hit this case for the lexicon without disambig
# symbols but doesn't ever matter as training transcripts don't have <s> or </s>.
print "$loopstate\t$loopstate\t<eps>\t$w\n";
} else {
$is_silence_word = (@A == 1 && $A[0] eq $silphone); # boolean.
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if(@A > 0) {
$ns = $nextstate++;
print "$s\t$ns\t$p\t$word_or_eps\n";
$word_or_eps = "<eps>";
$s = $ns;
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if(@A > 0) {
$ns = $nextstate++;
print "$s\t$ns\t$p\t$word_or_eps\n";
$word_or_eps = "<eps>";
$s = $ns;
} else {
if(!is_sil(@A)){
# This is non-deterministic but relatively compact,
# and avoids epsilons.
print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n";
print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n";
} else {
if(! $is_silence_word) {
# This is non-deterministic but relatively compact,
# and avoids epsilons.
print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n";
print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n";
} else {
# no point putting opt-sil after silence word.
print "$s\t$loopstate\t$p\t$word_or_eps\n";
}
$word_or_eps = "<eps>";
# no point putting opt-sil after silence word.
print "$s\t$loopstate\t$p\t$word_or_eps\n";
}
$word_or_eps = "<eps>";
}
}
}
......
......@@ -74,7 +74,7 @@ grep '#' $lang/phones_disambig.txt | awk '{print $2}' > $lang/tmp/disambig_phone
clg=$lang/tmp/CLG_${N}_${P}.fst
if [[ ! -f $clg || $clg -ot $lang/LG.fst ]]; then
if [[ ! -f $clg || $clg -ot $lang/tmp/LG.fst ]]; then
fstcomposecontext --context-size=$N --central-position=$P \
--read-disambig-syms=$lang/tmp/disambig_phones.list \
--write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.list \
......@@ -82,7 +82,8 @@ if [[ ! -f $clg || $clg -ot $lang/LG.fst ]]; then
fstisstochastic $clg || echo "warning: CLG not stochastic."
fi
if [[ ! -f $dir/Ha.fst || $dir/Ha.fst -ot $model ]]; then
if [[ ! -f $dir/Ha.fst || $dir/Ha.fst -ot $model \
|| $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then
make-h-transducer --disambig-syms-out=$dir/disambig_tid.list \
--transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \
> $dir/Ha.fst || exit 1;
......
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Filter for the text form of the G.fst acceptor: on every line that
# starts with two integer fields followed by <eps> as the third field
# (the input-side label), rewrite that <eps> to the symbol #0.
# Anything else on the line, including an <eps> in a later field, is
# passed through unchanged. Reads the files named on the command line
# (or standard input) and writes to standard output.
while (defined($line = <>)) {
  # Only the first match per line is replaced; the captured leading
  # fields and trailing whitespace are put back verbatim.
  $line =~ s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
  print $line;
}
......@@ -50,8 +50,7 @@ for inv_acwt in 9 10 11 12 13 14 15 16; do
cat $trans | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/test_trans.filt
cat $dir/${inv_acwt}.tra | \
scripts/int2sym.pl --ignore-first-field $symtab | \
sed 's:<s>::' | sed 's:</s>::' | sed 's:<UNK>::g' | \
scripts/int2sym.pl --ignore-first-field $symtab | sed 's:<UNK>::g' | \
compute-wer --text --mode=present ark:$dir/test_trans.filt ark,p:- >& $dir/wer_$inv_acwt
done
......
......@@ -57,7 +57,10 @@ while(<>) {
die "sym2int.pl: invalid map-oov option $map_oov (undefined symbol)";
}
if ($num_warning++ < $max_warning) {
print STDERR "sym2int.pl: replacing $a with $map_oov [warning $max_warning times]\n";
print STDERR "sym2int.pl: replacing $a with $map_oov\n";
if ($num_warning == $max_warning) {
print STDERR "sym2int.pl: not warning for OOVs any more times\n";
}
}
$i = $sym2int{$map_oov};
} elsif($ignore_oov) {
......
......@@ -48,6 +48,8 @@ lang=$2
srcdir=$3
dir=$4
oov_sym="<SPOKEN_NOISE>" # Map OOVs to this in training.
grep SPOKEN_NOISE $lang/words.txt >/dev/null || echo "Warning: SPOKEN_NOISE not in dictionary" </