Commit 53fe4c4a authored by Guoguo Chen
Browse files

trunk: various changes that relate to G.fst ilabel sorting.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4619 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent a7424195
......@@ -44,7 +44,7 @@ gunzip -c $lmfile | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrhocompose "$rho" - $destdir/rho.fst | \
fstrmepsilon > $destdir/G.fst || exit 1
fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true
......
......@@ -211,7 +211,7 @@ gunzip -c $gzipped_ARPA_LM | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $lang/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst
##################################################################
......
......@@ -31,7 +31,7 @@ for lm_suffix in bg; do
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
......
......@@ -31,7 +31,7 @@ for lm_suffix in bg; do
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
......
......@@ -25,7 +25,7 @@ gunzip -c "$arpa_lm" | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
......
......@@ -46,7 +46,7 @@ for lm_suffix in tgpr; do
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
......
......@@ -12,7 +12,7 @@ tmpdir=data/local/tmp
. ./path.sh || exit 1; # for KALDI_ROOT
fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang/words.txt --keep_isymbols=false \
--keep_osymbols=false $tmpdir/G.txt > data/lang/G.fst || exit 1;
--keep_osymbols=false $tmpdir/G.txt | fstarcsort --sort_type=ilabel > data/lang/G.fst || exit 1;
# Checking that G is stochastic [note, it wouldn't be for an Arpa]
fstisstochastic data/lang/G.fst || echo Error: G is not stochastic
......
......@@ -21,7 +21,7 @@ cat data/train/text | \
$final_cost = -log($n_sent / $tot_count);
print "0 $final_cost\n"; ' | \
fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang/words.txt --keep_isymbols=false \
--keep_osymbols=false > data/lang_ug/G.fst || exit 1;
--keep_osymbols=false | fstarcsort --sort_type=ilabel > data/lang_ug/G.fst || exit 1;
# Checking that G is stochastic [note, it wouldn't be for an Arpa]
fstisstochastic data/lang_ug/G.fst || echo Error: G is not stochastic
......
......@@ -103,7 +103,8 @@ penalty=`perl -e '$prob = 1.0/12; print -log($prob); '` # negated log-prob,
done
echo 0 $penalty # format is: state final-cost
) | fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \
--keep_isymbols=false --keep_osymbols=false >$lang/G.fst
--keep_isymbols=false --keep_osymbols=false |\
fstarcsort --sort_type=ilabel > $lang/G.fst
exit 0;
......
......@@ -85,7 +85,7 @@ else
fi
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
......
......@@ -79,7 +79,7 @@ else
fi
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
......
......@@ -88,7 +88,7 @@ else
fi
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
......
......@@ -95,7 +95,8 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
$cost = -log($c / $hist); # cost on FST arc.
print "$src $cost\n"; # final-prob.
}
} ' | fstcompile --acceptor=true > $lang_out/G.fst
} ' | fstcompile --acceptor=true | \
fstarcsort --sort_type=ilabel > $lang_out/G.fst
# symbols for phones and words are the same.
# Neither has disambig symbols.
......
......@@ -62,7 +62,8 @@ arpa2fst $outdir/reverse.arpa | fstprint | \
echo "Push weights to make it stochastic (log semi-ring)"
# delta must be very small otherwise weight pushing won't succeed
#fstpush --push_weights=true --push_labels=true --delta=1E-7 $outdir/G_log.fst >$outdir/G_log_pushed.fst
fstpushspecial --delta=1E-5 $outdir/G_org.fst >$outdir/G.fst
fstpushspecial --delta=1E-5 $outdir/G_org.fst |\
fstarcsort --sort_type=ilabel >$outdir/G.fst
fstisstochastic $outdir/G.fst
# The output is like:
......
......@@ -566,26 +566,61 @@ if (-s "$lang/phones/word_boundary.int") {
# Check oov -------------------------------
check_txt_int("$lang/oov", \%wsymtab, 0); print "\n";
# Check if L.fst is olabel sorted.
if (-e "$lang/L.fst") {
$cmd = "fstinfo $lang/L.fst | grep -E 'output label sorted.*y' > /dev/null";
$res = system(". ./path.sh; $cmd");
if ($res == 0) {
print "--> $lang/L.fst is olabel sorted\n";
} else {
print "--> ERROR: $lang/L.fst is not olabel sorted\n";
$exit = 1;
}
}
# Check if L_disambig.fst is olabel sorted.
if (-e "$lang/L_disambig.fst") {
$cmd = "fstinfo $lang/L_disambig.fst | grep -E 'output label sorted.*y' > /dev/null";
$res = system(". ./path.sh; $cmd");
if ($res == 0) {
print "--> $lang/L_disambig.fst is olabel sorted\n";
} else {
print "--> ERROR: $lang/L_disambig.fst is not olabel sorted\n";
$exit = 1;
}
}
# Check if G.fst is ilabel sorted.
if (-e "$lang/G.fst") {
$cmd = "fstinfo $lang/G.fst | grep -E 'input label sorted.*y' > /dev/null";
$res = system(". ./path.sh; $cmd");
if ($res == 0) {
print "--> $lang/G.fst is ilabel sorted\n";
} else {
print "--> ERROR: $lang/G.fst is not ilabel sorted\n";
$exit = 1;
}
}
# Check determinizability of G.fst
if (-e "$lang/G.fst") {
$cmd = "fstdeterminize $lang/G.fst /dev/null";
$ret = system(". ./path.sh; $cmd");
if ($ret == 0) {
print "--> command $cmd succeeded\n";
$res = system(". ./path.sh; $cmd");
if ($res == 0) {
print "--> $lang/G.fst is determinizable\n";
} else {
print "--> ERROR: command $cmd failed\n";
print "--> ERROR: fail to determinize $lang/G.fst\n";
$exit = 1;
}
}
if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") {
$cmd = "fstcompose $lang/L_disambig.fst $lang/G.fst | fstdeterminize > /dev/null";
$ret = system(". ./path.sh; $cmd");
if ($ret == 0) {
print "--> command $cmd succeeded\n";
$res = system(". ./path.sh; $cmd");
if ($res == 0) {
print "--> L_disambig . G is determinizable\n";
} else {
print "--> ERROR: command $cmd failed\n";
print "--> ERROR: fail to determinize L_disambig . G\n";
$exit = 1;
}
}
......
......@@ -122,6 +122,14 @@ int main(int argc, char *argv[]) {
VectorFst<StdArc> *fst1 = ReadFstKaldi(fst1_in_str);
VectorFst<StdArc> *fst2 = ReadFstKaldi(fst2_in_str);
// Checks if <fst1> is olabel sorted and <fst2> is ilabel sorted.
if (fst1->Properties(fst::kOLabelSorted, true) == 0) {
KALDI_WARN << "The first FST is not olabel sorted.";
}
if (fst2->Properties(fst::kILabelSorted, true) == 0) {
KALDI_WARN << "The second FST is not ilabel sorted.";
}
VectorFst<StdArc> composed_fst;
......@@ -140,6 +148,11 @@ int main(int argc, char *argv[]) {
SequentialTableReader<VectorFstHolder> fst2_reader(fst2_in_str);
TableWriter<VectorFstHolder> fst_writer(fst_out_str);
int32 n_done = 0;
// Checks if <fst1> is olabel sorted.
if (fst1->Properties(fst::kOLabelSorted, true) == 0) {
KALDI_WARN << "The first FST is not olabel sorted.";
}
for (; !fst2_reader.Done(); fst2_reader.Next(), n_done++) {
VectorFst<StdArc> fst2(fst2_reader.Value());
VectorFst<StdArc> fst_out;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment