Commit 0ddf4bba authored by Dan Povey's avatar Dan Povey
Browse files

trunk: updating online-nnet2-decoding setup to allow for downweighting of...

trunk: updating online-nnet2-decoding setup to allow for downweighting of silence in the stats for iVector estimation.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4972 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 9045a18f
......@@ -168,7 +168,7 @@ if [ $stage -le 13 ]; then
done
fi
#exit 0;
exit 0;
###### Comment out the "exit 0" above to run the multi-threaded decoding. #####
if [ $stage -le 14 ]; then
......@@ -189,4 +189,14 @@ if [ $stage -le 15 ]; then
${dir}_online/decode_pp_${test}_tgsmall_utt_threaded_ep || exit 1;
fi
if [ $stage -le 16 ]; then
# Demonstrate the multi-threaded decoding with silence excluded
# from iVector estimation.
test=dev_clean
steps/online/nnet2/decode.sh --threaded true --silence-weight 0.0 \
--config conf/decode.config --cmd "$decode_cmd" --nj 30 \
--per-utt true exp/tri6b/graph_pp_tgsmall data/$test \
${dir}_online/decode_pp_${test}_tgsmall_utt_threaded_sil0.0 || exit 1;
fi
exit 0;
......@@ -66,7 +66,7 @@ for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do
done
sdata=$data/split$nj;
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
......
......@@ -20,6 +20,10 @@ do_endpointing=false
do_speex_compressing=false
scoring_opts=
skip_scoring=false
silence_weight=1.0 # set this to a value less than 1 (e.g. 0) to enable silence weighting.
max_state_duration=40 # This only has an effect if you are doing silence
# weighting. This default is probably reasonable. transition-ids repeated
# more than this many times in an alignment are treated as silence.
iter=final
# End configuration section.
......@@ -94,6 +98,12 @@ if $do_endpointing; then
wav_rspecifier="$wav_rspecifier extend-wav-with-silence ark:- ark:- |"
fi
if [ "$silence_weight" != "1.0" ]; then
  # Silence weighting is enabled: build the decoder options that down-weight
  # silence frames in the stats used for online iVector estimation.
  silphones=$(cat $graphdir/phones/silence.csl) || exit 1
  # NOTE(review): "silence_phones" below mixes underscore style with the
  # hyphenated options beside it; Kaldi's option parser usually accepts both
  # spellings, but confirm against the C++ option registration.
  silence_weighting_opts="--ivector-silence-weighting.max-state-duration=$max_state_duration --ivector-silence-weighting.silence_phones=$silphones --ivector-silence-weighting.silence-weight=$silence_weight"
else
  # Weight of 1.0 means no silence weighting; pass no extra options.
  silence_weighting_opts=
fi
if $threaded; then
......@@ -110,7 +120,7 @@ fi
if [ $stage -le 0 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
$decoder $opts --do-endpointing=$do_endpointing \
$decoder $opts $silence_weighting_opts --do-endpointing=$do_endpointing \
--config=$srcdir/conf/online_nnet2_decoding.conf \
--max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --word-symbol-table=$graphdir/words.txt \
......
......@@ -22,8 +22,10 @@ if [ "$1" == "--per-utt" ]; then
fi
if [ $# != 2 ]; then
echo "Usage: split_data.sh <data-dir> <num-to-split>"
echo "Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>"
echo "This script will not split the data-dir if it detects that the output is newer than the input."
echo "By default it splits per speaker (so each speaker is in only one split dir),"
echo "but with the --per-utt option it will ignore the speaker information while splitting."
exit 1
fi
......@@ -45,13 +47,11 @@ nu=`cat $data/utt2spk | wc -l`
nf=`cat $data/feats.scp 2>/dev/null | wc -l`
nt=`cat $data/text 2>/dev/null | wc -l` # take it as zero if no such file
if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then
echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); this script "
echo "** may produce incorrectly split data."
echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); you can "
echo "** use utils/fix_data_dir.sh $data to fix this."
fi
if [ -f $data/text ] && [ $nu -ne $nt ]; then
echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); this script "
echo "** may produce incorrectly split data."
echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); you can "
echo "** use utils/fix_data_dir.sh to fix this."
fi
......@@ -74,11 +74,7 @@ fi
for n in `seq $numsplit`; do
mkdir -p $data/split$numsplit/$n
feats="$feats $data/split$numsplit/$n/feats.scp"
vads="$vads $data/split$numsplit/$n/vad.scp"
texts="$texts $data/split$numsplit/$n/text"
utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk"
utt2langs="$utt2langs $data/split$numsplit/$n/utt2lang"
done
if $split_per_spk; then
......@@ -87,37 +83,51 @@ else
utt2spk_opt=
fi
utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
# If lockfile is not installed, just don't lock it. It's not a big deal.
which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
[ -f $data/feats.scp ] && utils/split_scp.pl $utt2spk_opt $data/feats.scp $feats
utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
[ -f $data/text ] && utils/split_scp.pl $utt2spk_opt $data/text $texts
for n in `seq $numsplit`; do
dsn=$data/split$numsplit/$n
utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1;
done
[ -f $data/vad.scp ] && utils/split_scp.pl $utt2spk_opt $data/vad.scp $vads
maybe_wav_scp=
if [ ! -f $data/segments ]; then
maybe_wav_scp=wav.scp # If there is no segments file, then wav file is
# indexed per utt.
fi
[ -f $data/utt2lang ] && utils/split_scp.pl $utt2spk_opt $data/utt2lang $utt2langs
# split some things that are indexed by utterance.
for f in feats.scp text vad.scp utt2lang $maybe_wav_scp; do
if [ -f $data/$f ]; then
utils/filter_scps.pl JOB=1:$numsplit \
$data/split$numsplit/JOB/utt2spk $data/$f $data/split$numsplit/JOB/$f || exit 1;
fi
done
# If lockfile is not installed, just don't lock it. It's not a big deal.
which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
# split some things that are indexed by speaker
for f in spk2gender spk2warp cmvn.scp; do
if [ -f $data/$f ]; then
utils/filter_scps.pl JOB=1:$numsplit \
$data/split$numsplit/JOB/spk2utt $data/$f $data/split$numsplit/JOB/$f || exit 1;
fi
done
for n in `seq $numsplit`; do
dsn=$data/split$numsplit/$n
utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1;
for f in spk2gender spk2warp cmvn.scp; do
[ -f $data/$f ] && \
utils/filter_scp.pl $dsn/spk2utt $data/$f > $dsn/$f
done
if [ -f $data/segments ]; then
utils/filter_scp.pl $dsn/utt2spk $data/segments > $dsn/segments
awk '{print $2;}' $dsn/segments |sort|uniq > $data/tmp.reco # recording-ids.
[ -f $data/reco2file_and_channel ] &&
utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel
[ -f $data/wav.scp ] && utils/filter_scp.pl $data/tmp.reco $data/wav.scp > $dsn/wav.scp
awk '{print $2;}' $dsn/segments | sort | uniq > $data/tmp.reco # recording-ids.
if [ -f $data/reco2file_and_channel ]; then
utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel
fi
if [ -f $data/wav.scp ]; then
utils/filter_scp.pl $data/tmp.reco $data/wav.scp >$dsn/wav.scp
fi
rm $data/tmp.reco
else # else wav indexed by utterance -> filter on this.
[ -f $data/wav.scp ] &&
utils/filter_scp.pl $dsn/utt2spk $data/wav.scp > $dsn/wav.scp
fi
fi # else it would have been handled above, see maybe_wav.
done
rm -f $data/.split_lock
......
......@@ -669,13 +669,13 @@ HTML_HEADER = doc/header.html
# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
# The allowed range is 0 to 359.
HTML_COLORSTYLE_HUE = 26
HTML_COLORSTYLE_HUE = 31
# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
# the colors in the HTML output. For a value of 0 the output will use
# grayscales only. A value of 255 will produce the most vivid colors.
HTML_COLORSTYLE_SAT = 80
HTML_COLORSTYLE_SAT = 115
# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
# the luminance component of the colors in the HTML output. Values below
......@@ -684,7 +684,7 @@ HTML_COLORSTYLE_SAT = 80
# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2,
# and 100 does not change the gamma.
HTML_COLORSTYLE_GAMMA = 90
HTML_COLORSTYLE_GAMMA = 80
......
......@@ -50,9 +50,15 @@ class LatticeFasterOnlineDecoder {
typedef Arc::Label Label;
typedef Arc::StateId StateId;
typedef Arc::Weight Weight;
struct BestPathIterator {
  void *tok;   // opaque pointer to the decoder token at this position.
  int32 frame;
  // note, "frame" is the frame-index of the frame you'll get the
  // transition-id for next time, if you call TraceBackBestPath on this
  // iterator (assuming it's not an epsilon transition). Note that this
  // is one less than you might reasonably expect, e.g. it's -1 for
  // the nonemitting transitions before the first frame.
  BestPathIterator(void *t, int32 f): tok(t), frame(f) { }
  // Returns true when the traceback is exhausted (no token left).
  // Const-qualified so it can be called on a const iterator.
  bool Done() const { return tok == NULL; }
};
......
......@@ -56,4 +56,6 @@ fi
# moved the header.html to doc/ and edited it to include the following snippet,
# and added it to the repo.
#<link rel="icon" type="image/png" href="http://kaldi.sf.net/favicon.ico">
# Also did similar with stylesheet.
......@@ -49,9 +49,9 @@ namespace kaldi {
be run in order to build the systems used for alignment.
Regarding which of the two setups you should use:
- Karel's setup (nnet1) supports training on a single GPU card, which allows
- Karel's setup (\ref dnn1 "nnet1") supports training on a single GPU card, which allows
the implementation to be simpler and relatively easy to modify.
- Dan's setup (nnet2) is more flexible in how
- Dan's setup (\ref dnn2 "nnet2") is more flexible in how
you can train: it supports using multiple GPUs, or multiple CPU's each with
multiple threads. Multiple GPU's is the recommended setup.
They don't have to all be on the same machine. Both setups give commensurate results.
......
......@@ -23,7 +23,7 @@ $mathjax
<tbody>
<tr style="height: 56px;">
<!--BEGIN PROJECT_LOGO-->
<td id="projectlogo"><img alt="Logo" src="$relpath$$projectlogo"/ style="padding: 4px 5px 1px 5px"></td>
<td id="projectlogo"><img alt="Logo" src="$relpath$$projectlogo"/ style="padding: 3px 5px 1px 5px"></td>
<!--END PROJECT_LOGO-->
<!--BEGIN PROJECT_NAME-->
<td style="padding-left: 0.5em;">
......
......@@ -534,7 +534,9 @@ void OnlineIvectorEstimationStats::AccStats(
for (size_t idx = 0; idx < gauss_post.size(); idx++) {
int32 g = gauss_post[idx].first;
double weight = gauss_post[idx].second;
KALDI_ASSERT(weight >= 0.0);
// allow negative weights; it's needed in the online iVector extraction
// with speech-silence detection based on decoder traceback (we subtract
// stuff we previously added if the traceback changes).
if (weight == 0.0)
continue;
linear_term_.AddMatVec(weight, extractor.Sigma_inv_M_[g], kTrans,
......@@ -543,8 +545,9 @@ void OnlineIvectorEstimationStats::AccStats(
quadratic_term_vec.AddVec(weight, U_g);
tot_weight += weight;
}
if (max_count_ != 0.0) {
// see comments in header RE max_count for explanation.
if (max_count_ > 0.0) {
// see comments in header RE max_count for explanation. It relates to
// prior scaling when the count exceeds max_count_
double old_num_frames = num_frames_,
new_num_frames = num_frames_ + tot_weight;
double old_prior_scale = std::max(old_num_frames, max_count_) / max_count_,
......
......@@ -538,7 +538,13 @@ int32 LinearCgd(const LinearCgdOptions &opts,
p.AddVec(-1.0, r);
r_cur_norm_sq = r_next_norm_sq;
}
if (r_cur_norm_sq > r_initial_norm_sq) {
// note: the first element of the && is only there to save compute.
// the residual r is A x - b, and r_cur_norm_sq and r_initial_norm_sq are
// of the form r * r, so it's clear that b * b has the right dimension to
// compare with the residual.
if (r_cur_norm_sq > r_initial_norm_sq &&
r_cur_norm_sq > r_initial_norm_sq + 1.0e-10 * VecVec(b, b)) {
KALDI_WARN << "Doing linear CGD in dimension " << A.NumRows() << ", after " << k
<< " iterations the squared residual has got worse, "
<< r_cur_norm_sq << " > " << r_initial_norm_sq
......
......@@ -895,6 +895,9 @@ class AffineComponentPreconditioned: public AffineComponent {
};
/// Keywords: natural gradient descent, NG-SGD, naturalgradient. For
/// the top-level of the natural gradient code look here, and also in
/// nnet-precondition-online.h.
/// AffineComponentPreconditionedOnline is, like AffineComponentPreconditioned,
/// a version of AffineComponent that has a non-(multiple of unit) learning-rate
/// matrix. See nnet-precondition-online.h for a description of the technique.
......
......@@ -32,6 +32,8 @@ namespace nnet2 {
/**
Keywords for search: natural gradient, naturalgradient, NG-SGD
It will help to first try to understand ./nnet-precondition.h before reading
this comment and trying to understand what's going on here. The motivation
for this method was that the code in nnet-precondition.h was too slow when
......
......@@ -33,6 +33,7 @@ void OnlineIvectorExtractionInfo::Init(
min_post = config.min_post;
posterior_scale = config.posterior_scale;
max_count = config.max_count;
num_cg_iters = config.num_cg_iters;
use_most_recent_ivector = config.use_most_recent_ivector;
greedy_ivector_extractor = config.greedy_ivector_extractor;
if (greedy_ivector_extractor && !use_most_recent_ivector) {
......@@ -151,31 +152,95 @@ int32 OnlineIvectorFeature::NumFramesReady() const {
return lda_->NumFramesReady();
}
void OnlineIvectorFeature::UpdateStatsUntilFrame(int32 frame) {
KALDI_ASSERT(frame >= 0 && frame < this->NumFramesReady());
int32 feat_dim = lda_normalized_->Dim(),
ivector_period = info_.ivector_period;
void OnlineIvectorFeature::UpdateFrameWeights(
const std::vector<std::pair<int32, BaseFloat> > &delta_weights) {
// add the elements to delta_weights_, which is a priority queue. The top
// element of the priority queue is the lowest numbered frame (we ensured this
// by making the comparison object std::greater instead of std::less). Adding
// elements from top (lower-numbered frames) to bottom (higher-numbered
// frames) should be most efficient, assuming it's a heap internally. So we
// go forward not backward in delta_weights while adding.
int32 num_frames_ready = NumFramesReady();
for (size_t i = 0; i < delta_weights.size(); i++) {
delta_weights_.push(delta_weights[i]);
int32 frame = delta_weights[i].first;
KALDI_ASSERT(frame >= 0 && frame < num_frames_ready);
if (frame > most_recent_frame_with_weight_)
most_recent_frame_with_weight_ = frame;
}
delta_weights_provided_ = true;
}
int32 num_cg_iters = 15; // I don't believe this is very important, so it's
// not configurable from the command line for now.
void OnlineIvectorFeature::UpdateStatsForFrame(int32 t,
                                               BaseFloat weight) {
  // Accumulates iVector-extractor stats for frame t, scaled by "weight".
  const int32 dim = lda_normalized_->Dim();
  Vector<BaseFloat> features(dim);  // frame handed to the iVector extractor
  Vector<BaseFloat> loglikes(info_.diag_ubm.NumGauss());

  // Gaussian selection uses the CMN-normalized features.
  lda_normalized_->GetFrame(t, &features);
  info_.diag_ubm.LogLikelihoods(features, &loglikes);

  // "post" holds the pruned (gaussian-index, posterior) pairs for the UBM.
  std::vector<std::pair<int32, BaseFloat> > post;
  BaseFloat frame_loglike = VectorToPosteriorEntry(loglikes, info_.num_gselect,
                                                   info_.min_post, &post);
  tot_ubm_loglike_ += weight * frame_loglike;

  const BaseFloat scale = info_.posterior_scale * weight;
  for (size_t idx = 0; idx < post.size(); idx++)
    post[idx].second *= scale;

  // The stats themselves are accumulated on the feature without CMN.
  lda_->GetFrame(t, &features);
  ivector_stats_.AccStats(info_.extractor, features, post);
}
void OnlineIvectorFeature::UpdateStatsUntilFrame(int32 frame) {
  // Accumulates stats (with unit weight) for all frames up to and including
  // "frame", refreshing current_ivector_ (and, when not using the most-recent
  // iVector, the cached ivectors_history_) at the configured period.  Only
  // valid when no per-frame weights were supplied via UpdateFrameWeights();
  // otherwise UpdateStatsUntilFrameWeighted() must be used instead.
  KALDI_ASSERT(frame >= 0 && frame < this->NumFramesReady() &&
               !delta_weights_provided_);
  updated_with_no_delta_weights_ = true;

  int32 ivector_period = info_.ivector_period;
  int32 num_cg_iters = info_.num_cg_iters;

  for (; num_frames_stats_ <= frame; num_frames_stats_++) {
    int32 t = num_frames_stats_;  // Frame whose stats we want to get.
    UpdateStatsForFrame(t, 1.0);
    if ((!info_.use_most_recent_ivector && t % ivector_period == 0) ||
        (info_.use_most_recent_ivector && t == frame)) {
      ivector_stats_.GetIvector(num_cg_iters, &current_ivector_);
      if (!info_.use_most_recent_ivector) {  // need to cache iVectors.
        int32 ivec_index = t / ivector_period;
        KALDI_ASSERT(ivec_index == static_cast<int32>(ivectors_history_.size()));
        ivectors_history_.push_back(new Vector<BaseFloat>(current_ivector_));
      }
    }
  }
}
void OnlineIvectorFeature::UpdateStatsUntilFrameWeighted(int32 frame) {
KALDI_ASSERT(frame >= 0 && frame < this->NumFramesReady() &&
delta_weights_provided_ &&
! updated_with_no_delta_weights_ &&
frame <= most_recent_frame_with_weight_);
bool debug_weights = true;
int32 ivector_period = info_.ivector_period;
int32 num_cg_iters = info_.num_cg_iters;
for (; num_frames_stats_ <= frame; num_frames_stats_++) {
int32 t = num_frames_stats_;
// Instead of just updating frame t, we update all frames that need updating
// with index <= t, in case old frames were reclassified as silence/nonsilence.
while (!delta_weights_.empty() &&
delta_weights_.top().first <= t) {
std::pair<int32, BaseFloat> p = delta_weights_.top();
delta_weights_.pop();
int32 frame = p.first;
BaseFloat weight = p.second;
UpdateStatsForFrame(frame, weight);
if (debug_weights) {
if (current_frame_weight_debug_.size() <= frame)
current_frame_weight_debug_.resize(frame + 1, 0.0);
current_frame_weight_debug_[frame] += weight;
KALDI_ASSERT(current_frame_weight_debug_[frame] >= -0.01 &&
current_frame_weight_debug_[frame] <= 1.01);
}
}
if ((!info_.use_most_recent_ivector && t % ivector_period == 0) ||
(info_.use_most_recent_ivector && t == frame)) {
ivector_stats_.GetIvector(num_cg_iters, &current_ivector_);
......@@ -188,10 +253,16 @@ void OnlineIvectorFeature::UpdateStatsUntilFrame(int32 frame) {
}
}
void OnlineIvectorFeature::GetFrame(int32 frame,
VectorBase<BaseFloat> *feat) {
UpdateStatsUntilFrame(info_.greedy_ivector_extractor ?
lda_->NumFramesReady() - 1 : frame);
int32 frame_to_update_until = (info_.greedy_ivector_extractor ?
lda_->NumFramesReady() - 1 : frame);
if (!delta_weights_provided_) // No silence weighting.
UpdateStatsUntilFrame(frame_to_update_until);
else
UpdateStatsUntilFrameWeighted(frame_to_update_until);
KALDI_ASSERT(feat->Dim() == this->Dim());
if (info_.use_most_recent_ivector) {
......@@ -218,8 +289,8 @@ void OnlineIvectorFeature::PrintDiagnostics() const {
KALDI_VLOG(3) << "Processed no data.";
} else {
KALDI_VLOG(3) << "UBM log-likelihood was "
<< (tot_ubm_loglike_ / num_frames_stats_)
<< " per frame, over " << num_frames_stats_
<< (tot_ubm_loglike_ / NumFrames())
<< " per frame, over " << NumFrames()
<< " frames.";
Vector<BaseFloat> temp_ivector(current_ivector_);
......@@ -266,7 +337,9 @@ OnlineIvectorFeature::OnlineIvectorFeature(
ivector_stats_(info_.extractor.IvectorDim(),
info_.extractor.PriorOffset(),
info_.max_count),
num_frames_stats_(0), tot_ubm_loglike_(0.0) {
num_frames_stats_(0), delta_weights_provided_(false),
updated_with_no_delta_weights_(false),
most_recent_frame_with_weight_(-1), tot_ubm_loglike_(0.0) {
info.Check();
KALDI_ASSERT(base_feature != NULL);
splice_ = new OnlineSpliceFrames(info_.splice_opts, base_);
......@@ -296,8 +369,8 @@ void OnlineIvectorFeature::SetAdaptationState(
}
BaseFloat OnlineIvectorFeature::UbmLogLikePerFrame() const {
if (num_frames_stats_ == 0) return 0;
else return tot_ubm_loglike_ / num_frames_stats_;
if (NumFrames() == 0) return 0;
else return tot_ubm_loglike_ / NumFrames();
}
BaseFloat OnlineIvectorFeature::ObjfImprPerFrame() const {
......@@ -305,4 +378,206 @@ BaseFloat OnlineIvectorFeature::ObjfImprPerFrame() const {
}
OnlineSilenceWeighting::OnlineSilenceWeighting(
    const TransitionModel &trans_model,
    const OnlineSilenceWeightingConfig &config):
    trans_model_(trans_model), config_(config),
    num_frames_output_and_correct_(0) {
  // Parse the colon/comma-separated silence-phone list from the config into a
  // set, for fast membership tests when computing frame weights.
  vector<int32> parsed_phones;
  SplitStringToIntegers(config.silence_phones_str, ":,", false,
                        &parsed_phones);
  silence_phones_.insert(parsed_phones.begin(), parsed_phones.end());
}
void OnlineSilenceWeighting::ComputeCurrentTraceback(
    const LatticeFasterOnlineDecoder &decoder) {
  // Traces back the decoder's current best path and records, per frame, the
  // transition-id on that path together with the token it passed through.
  // Only the suffix of frames whose best-path token changed since the last
  // call is re-traversed; num_frames_output_and_correct_ is lowered to the
  // first frame whose record was modified.
  int32 num_frames_decoded = decoder.NumFramesDecoded(),
      num_frames_prev = frame_info_.size();
  // note, num_frames_prev is not the number of frames previously decoded,
  // it's the generally-larger number of frames that we were requested to
  // provide weights for.
  if (num_frames_prev < num_frames_decoded)
    frame_info_.resize(num_frames_decoded);
  if (num_frames_prev > num_frames_decoded &&
      frame_info_[num_frames_decoded].transition_id != -1)
    KALDI_ERR << "Number of frames decoded decreased";  // Likely bug

  if (num_frames_decoded == 0)
    return;
  // Walk backward from the last decoded frame along the best path.
  int32 frame = num_frames_decoded - 1;
  bool use_final_probs = false;
  LatticeFasterOnlineDecoder::BestPathIterator iter =
      decoder.BestPathEnd(use_final_probs, NULL);
  while (frame >= 0) {
    LatticeArc arc;
    arc.ilabel = 0;
    while (arc.ilabel == 0)  // the while loop skips over input-epsilons
      iter = decoder.TraceBackBestPath(iter, &arc);
    // note, the iter.frame values are slightly unintuitively defined,
    // they are one less than you might expect.
    KALDI_ASSERT(iter.frame == frame - 1);

    if (frame_info_[frame].token == iter.tok) {
      // we know that the traceback from this point back will be identical, so
      // no point tracing back further.  Note: we are comparing memory addresses
      // of tokens of the decoder; this guarantees it's the same exact token
      // because tokens, once allocated on a frame, are only deleted, never
      // reallocated for that frame.
      break;
    }
    // This frame's record changed, so weights output from here on may need
    // to be re-emitted; lower the "correct so far" watermark.
    if (num_frames_output_and_correct_ > frame)
      num_frames_output_and_correct_ = frame;
    frame_info_[frame].token = iter.tok;
    frame_info_[frame].transition_id = arc.ilabel;
    frame--;
    // leave frame_info_.current_weight at zero for now (as set in the
    // constructor), reflecting that we haven't already output a weight for that
    // frame.
  }
}
int32 OnlineSilenceWeighting::GetBeginFrame() {
int32 max_duration = config_.max_state_duration;
if (max_duration <= 0 || num_frames_output_and_correct_ == 0)
return num_frames_output_and_correct_;
// t_last_untouched is the index of the last frame that is not newly touched
// by ComputeCurrentTraceback. We are interested in whether it is part of a
// run of length greater than max_duration, since this would force it
// to be treated as silence (note: typically a non-silence phone that's very
// long is really silence, for example this can happen with the word "mm").
int32 t_last_untouched = num_frames_output_and_correct_ - 1,
t_end = frame_info_.size();
int32 transition_id = frame_info_[t_last_untouched].transition_id;
// no point searching longer than max_duration; when the length of the run is
// at least that much, a longer length makes no difference.
int32 lower_search_bound = std::max(0, t_last_untouched - max_duration),
upper_search_bound = std::min(t_last_untouched + max_duration, t_end - 1),
t_lower, t_upper;
// t_lower will be the first index in the run of equal transition-ids.
for (t_lower = t_last_untouched;
t_lower > lower_search_bound &&
frame_info_[t_lower - 1].transition_id == transition_id; t_lower++);
// t_upper will be the last index in the run of equal transition-ids.
for (t_upper = t_last_untouched;
t_upper < upper_search_bound &&
frame_info_[t_upper + 1].transition_id == transition_id; t_upper++);
int32 run_length = t_upper - t_lower + 1;
if (run_length <= max_duration) {
// we wouldn't treat this run as being silence, as it's within
// the duration limit. So we return the default value
// num_frames_output_and_correct_ as our lower bound for processing.
return num_frames_output_and_correct_;
}
int32 old_run_length = t_last_untouched - t_lower + 1;
if (old_run_length > max_duration) {
// The run-length before we got this new data was already longer than the
// max-duration, so would already have been treated as silence. therefore
// we don't have to encompass it all- we just include a long enough length
// in the region we are going to process, that the run-length in that region
// is longer than max_duration.
int32 ans = t_upper - max_duration;
KALDI_ASSERT(ans >= t_lower);
return ans;
} else {
return t_lower;