Commit 70be4ae0 authored by David Snyder's avatar David Snyder
Browse files

trunk: Adding lattice-expand-ngram binary and supporting classes and...

trunk: Adding lattice-expand-ngram binary and supporting classes and functions. The primary purpose of the binary is to transform the lattice so that every state has a unique transition history up to order n.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4270 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent a6fbf1c3
// fstext/deterministic-fst-inl.h
// Copyright 2011-2012 Gilles Boulianne Johns Hopkins University (author: Daniel Povey)
// 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder)
// See ../../COPYING for clarification regarding multiple authors
//
......@@ -83,6 +84,53 @@ bool BackoffDeterministicOnDemandFst<Arc>::GetArc(
}
}
// Builds an n-gram history acceptor of order n.  The only state that exists
// up front is the start state (id 0), whose label history is empty; all other
// states are created lazily by GetArc().
template<class Arc>
UnweightedNgramFst<Arc>::UnweightedNgramFst(int n): n_(n), start_state_(0) {
  // The empty history is registered in both directions:
  // id -> history (state_vec_) and history -> id (state_map_).
  const std::vector<Label> empty_history;
  state_map_[empty_history] = 0;
  state_vec_.push_back(empty_history);
}
// Returns (in *oarc) the unique arc leaving state s with input label ilabel.
//
// The destination state is identified by the label history of s with ilabel
// appended, truncated to the most recent n_-1 labels.  If that history has not
// been seen before, a new state id is allocated for it.  The arc always has
// weight One() and carries ilabel on both input and output.
//
//   s       an id previously handed out by this object (Start() or an earlier
//           arc's nextstate); anything else trips the assertion below.
//   ilabel  the label to match; every label is accepted.
//   oarc    output parameter, must be non-NULL.
//
// Always returns true, since every label can be matched.
template<class Arc>
bool UnweightedNgramFst<Arc>::GetArc(
    StateId s, Label ilabel, Arc *oarc) {
  // The state ids increment with each state we encounter.
  // If the assert fails, then we are trying to access
  // unseen states that are not immediately traversable.
  KALDI_ASSERT(static_cast<size_t>(s) < state_vec_.size());

  // Copy the source history: it is extended below, and state_vec_ may
  // reallocate when the new state is appended.
  std::vector<Label> seq = state_vec_[s];
  seq.push_back(ilabel);

  // Compare as signed integers.  Comparing the unsigned size() against
  // n_ - 1 directly would wrap around for n_ <= 0 and the history would
  // then never be truncated.
  if (static_cast<int>(seq.size()) > n_ - 1) {
    // Remove oldest word in the history.
    seq.erase(seq.begin());
  }

  // Candidate (history, id) entry; the id is only consumed if the history is
  // new.  The id is a state id, so it is cast to StateId (not Label).
  std::pair<const std::vector<Label>, StateId> new_state(
      seq, static_cast<StateId>(state_vec_.size()));

  // Now get state id for destination state.  insert() leaves the map
  // untouched and returns the existing entry if the history is already known.
  typedef typename MapType::iterator IterType;
  std::pair<IterType, bool> result = state_map_.insert(new_state);
  if (result.second) {
    state_vec_.push_back(seq);
  }

  oarc->weight = Weight::One();  // Because the FST is unweighted.
  oarc->ilabel = ilabel;
  oarc->olabel = ilabel;
  oarc->nextstate = result.first->second;  // The next state id.

  // All arcs can be matched.
  return true;
}
// Final weight of a state.  Every state that has been created so far is final
// with weight One(), so this FST never rejects or re-weights a path.
template<class Arc>
typename Arc::Weight UnweightedNgramFst<Arc>::Final(StateId state) {
  // Only ids that have already been handed out are valid arguments.
  KALDI_ASSERT(state < static_cast<StateId>(state_vec_.size()));
  const Weight final_weight = Weight::One();
  return final_weight;
}
template<class Arc>
ComposeDeterministicOnDemandFst<Arc>::ComposeDeterministicOnDemandFst(
DeterministicOnDemandFst<Arc> *fst1,
......
// fstext/deterministic-fst.h
// Copyright 2011-2012 Gilles Boulianne Johns Hopkins University (author: Daniel Povey)
// 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder)
// See ../../COPYING for clarification regarding multiple authors
//
......@@ -126,6 +127,40 @@ class BackoffDeterministicOnDemandFst: public DeterministicOnDemandFst<Arc> {
const Fst<Arc> &fst_;
};
/**
   The class UnweightedNgramFst is a DeterministicOnDemandFst whose states
   encode an n-gram history. Conceptually, for n-gram order n and k labels,
   the FST has k^(n-1) states. However, the FST is created on demand and doesn't
   need the label vocabulary; GetArc matches on any input label. This class is
   primarily used by ComposeDeterministicOnDemand to expand the n-gram
   history of lattices.

   States are numbered in the order in which they are first reached, starting
   from the start state, which represents the empty history.
*/
template<class Arc>
class UnweightedNgramFst: public DeterministicOnDemandFst<Arc> {
 public:
  typedef typename Arc::Weight Weight;
  typedef typename Arc::StateId StateId;
  typedef typename Arc::Label Label;

  // n is the n-gram order; each state remembers at most the n-1 most recent
  // labels.  Declared explicit so that an int is never silently converted
  // into an FST.
  explicit UnweightedNgramFst(int n);

  // Id of the start state (the empty history).
  StateId Start() { return start_state_; }

  // Final weight of state s; see the definition for the exact contract.
  Weight Final(StateId s);

  // Looks up (creating the destination state if necessary) the arc leaving
  // s with label ilabel and stores it in *oarc.
  bool GetArc(StateId s, Label ilabel, Arc *oarc);

 private:
  typedef unordered_map<std::vector<Label>,
                        StateId, kaldi::VectorHasher<Label> > MapType;
  // The order of the n-gram.
  int n_;
  // Map from label history to the id of the state representing it.
  MapType state_map_;
  StateId start_state_;
  // Inverse of state_map_: state_vec_[s] is the label history of state s.
  std::vector<std::vector<Label> > state_vec_;
};
template<class Arc>
class ComposeDeterministicOnDemandFst: public DeterministicOnDemandFst<Arc> {
public:
......
// fstext/fstext-utils-inl.h
// Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
// 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder)
// See ../../COPYING for clarification regarding multiple authors
//
......@@ -1099,6 +1100,94 @@ void PhiCompose(const Fst<Arc> &fst1,
Connect(ofst);
}
template<class Arc>
void ComposeDeterministicOnDemand(const Fst<Arc> &fst1,
DeterministicOnDemandFst<Arc> *fst2,
MutableFst<Arc> *fst_composed) {
typedef typename Arc::Weight Weight;
typedef typename Arc::StateId StateId;
typedef std::pair<StateId, StateId> StatePair;
typedef unordered_map<StatePair, StateId,
kaldi::PairHasher<StateId> > MapType;
typedef typename MapType::iterator IterType;
fst_composed->DeleteStates();
MapType state_map;
std::queue<StatePair> state_queue;
// Set start state in fst_composed.
StateId s1 = fst1.Start(),
s2 = fst2->Start(),
start_state = fst_composed->AddState();
StatePair start_pair(s1, s2);
state_queue.push(start_pair);
fst_composed->SetStart(start_state);
// A mapping between pairs of states in fst1 and fst2 and the corresponding
// state in fst_composed.
std::pair<const StatePair, StateId> start_map(start_pair, start_state);
std::pair<IterType, bool> result = state_map.insert(start_map);
KALDI_ASSERT(result.second == true);
while (!state_queue.empty()) {
StatePair q = state_queue.front();
StateId q1 = q.first,
q2 = q.second;
state_queue.pop();
// If the product of the final weights of the two fsts is non-zero then
// we can create a final state in fst_composed.
Weight final_weight = Times(fst1.Final(q1), fst2->Final(q2));
if (final_weight != Weight::Zero()) {
KALDI_ASSERT(state_map.find(q) != state_map.end());
fst_composed->SetFinal(state_map[q], final_weight);
}
// for each pair of edges from fst1 and fst2 at q1 and q2.
for (ArcIterator<Fst<Arc> > aiter(fst1, q1); !aiter.Done(); aiter.Next()) {
const Arc &arc1 = aiter.Value();
Arc arc2;
StatePair next_pair;
StateId next_state1 = arc1.nextstate,
next_state2,
next_state;
// If there is an epsilon on the arc of fst1 we transition to the next
// state but keep fst2 at the current state.
if (arc1.olabel == 0) {
next_state2 = q2;
} else {
bool match = fst2->GetArc(q2, arc1.olabel, &arc2);
// This should always find a match.
KALDI_ASSERT(match == true);
next_state2 = arc2.nextstate;
}
next_pair = StatePair(next_state1, next_state2);
IterType sitr = state_map.find(next_pair);
// If sitr == state_map.end() then the state isn't in fst_composed yet.
if (sitr == state_map.end()) {
next_state = fst_composed->AddState();
std::pair<const StatePair, StateId> new_state(
next_pair, next_state);
std::pair<IterType, bool> result = state_map.insert(new_state);
// Since we already checked if state_map contained new_state,
// it should always be added if we reach here.
KALDI_ASSERT(result.second == true);
state_queue.push(next_pair);
// If sitr != state_map.end() then the next state is already in
// the state_map.
} else {
next_state = sitr->second;
}
if (arc1.olabel == 0) {
fst_composed->AddArc(state_map[q], Arc(0, 0, arc1.weight,
next_state));
} else {
fst_composed->AddArc(state_map[q], Arc(arc1.ilabel, arc2.olabel,
Times(arc1.weight, arc2.weight), next_state));
}
}
}
}
template<class Arc>
void PropagateFinalInternal(
typename Arc::Label phi_label,
......
......@@ -2,6 +2,7 @@
// Copyright 2009-2011 Microsoft Corporation
// Copyright 2012-2013 Johns Hopkins University (Authors: Guoguo Chen, Daniel Povey)
// 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder)
// See ../../COPYING for clarification regarding multiple authors
//
......@@ -27,6 +28,7 @@
#include <fst/fstlib.h>
#include <fst/fst-decl.h>
#include "fstext/determinize-star.h"
#include "fstext/deterministic-fst.h"
#include "fstext/remove-eps-local.h"
#include "../base/kaldi-common.h" // for error reporting macros.
#include "../util/text-utils.h" // for SplitStringToVector
......@@ -576,6 +578,14 @@ void PhiCompose(const Fst<Arc> &fst1,
typename Arc::Label phi_label,
MutableFst<Arc> *fst);
// Compose a left hand FST or lattice with a right hand
// DeterministicOnDemandFst and store the result in fst_composed.
// This is mainly used for expanding lattice n-gram histories, where
// fst1 is a lattice and fst2 is an UnweightedNgramFst.
//
// Any previous contents of fst_composed are deleted first.  Matching is done
// on the output labels of fst1: an arc of fst1 whose output label is
// epsilon (0) advances fst1 only and leaves fst2 in its current state.
// A state of the result is final when the product (Times) of the final
// weights of the corresponding fst1 and fst2 states is not Weight::Zero().
// fst2 is passed as a non-const pointer because it is queried through its
// non-const Start(), Final() and GetArc() methods.
template<class Arc>
void ComposeDeterministicOnDemand(const Fst<Arc> &fst1,
DeterministicOnDemandFst<Arc> *fst2,
MutableFst<Arc> *fst_composed);
// PropagateFinal propagates final-probs through
// "phi" transitions (note that here, phi_label may
......
......@@ -18,7 +18,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \
lattice-to-smbr-post lattice-determinize-pruned-parallel \
lattice-add-penalty lattice-align-words-lexicon lattice-push \
lattice-minimize lattice-limit-depth lattice-depth-per-frame \
lattice-determinize-phone-pruned lattice-determinize-phone-pruned-parallel
lattice-determinize-phone-pruned lattice-determinize-phone-pruned-parallel \
lattice-expand-ngram
OBJFILES =
......
// latbin/lattice-expand-ngram.cc
// Copyright 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "fstext/fstext-lib.h"
#include "lat/kaldi-lattice.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;
using fst::SymbolTable;
using fst::VectorFst;
using kaldi::CompactLatticeArc;
const char *usage =
"Expand lattices so that each arc has a unique n-label history, for\n"
"a specified n (defaults to 3).\n"
"Usage: lattice-expand-ngram [options] lattice-rspecifier "
"lattice-wspecifier\n"
"e.g.: lattice-expand-ngram --n=3 ark:lat ark:expanded_lat\n";
ParseOptions po(usage);
int32 n = 3;
std::string word_syms_filename;
po.Register("n", &n, "n-gram context to expand to.");
po.Read(argc, argv);
if (po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
KALDI_ASSERT(n > 0);
std::string lats_rspecifier = po.GetArg(1),
lats_wspecifier = po.GetOptArg(2);
fst::UnweightedNgramFst<CompactLatticeArc> expand_fst(n);
SequentialCompactLatticeReader lat_reader(lats_rspecifier);
CompactLatticeWriter lat_writer(lats_wspecifier);
int32 n_done = 0, n_fail = 0;
for (; !lat_reader.Done(); lat_reader.Next()) {
std::string key = lat_reader.Key();
KALDI_LOG << "Processing lattice for key " << key;
CompactLattice lat = lat_reader.Value();
CompactLattice expanded_lat;
ComposeDeterministicOnDemand(lat, &expand_fst, &expanded_lat);
if (expanded_lat.Start() == fst::kNoStateId) {
KALDI_WARN << "Empty lattice for utterance " << key << std::endl;
n_fail++;
} else {
if (lat.NumStates() == expanded_lat.NumStates()) {
KALDI_LOG << "Lattice for key " << key
<< " did not need to be expanded for order " << n << ".";
} else {
KALDI_LOG << "Lattice expanded from " << lat.NumStates() << " to "
<< expanded_lat.NumStates() << " states for order " << n << ".";
}
lat_writer.Write(key, expanded_lat);
n_done++;
}
lat_reader.FreeCurrent();
}
KALDI_LOG << "Processed " << n_done << " lattices with " << n_fail
<< " failures.";
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment