Commit 739ed027 authored by Dan Povey's avatar Dan Povey
Browse files

Added copy-tree program; documentation extensions; minor cosmetic fixes

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@608 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent b2ef575d
......@@ -53,16 +53,6 @@ CREATE_SUBDIRS = NO
OUTPUT_LANGUAGE = English
# This tag can be used to specify the encoding used in the generated output.
# The encoding is not always determined by the language that is chosen,
# but also whether or not the output is meant for Windows or non-Windows users.
# In case there is a difference, setting the USE_WINDOWS_ENCODING tag to YES
# forces the Windows encoding (this is the default for the Windows binary),
# whereas setting the tag to NO uses a Unix-style encoding (the default for
# all platforms other than Windows).
USE_WINDOWS_ENCODING = NO
# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
# include brief member descriptions after the members that are listed in
# the file and class documentation (similar to JavaDoc).
......@@ -148,13 +138,6 @@ JAVADOC_AUTOBRIEF = YES
MULTILINE_CPP_IS_BRIEF = NO
# If the DETAILS_AT_TOP tag is set to YES then Doxygen
# will output the detailed description near the top, like JavaDoc.
# If set to NO, the detailed description appears after the member
# documentation.
DETAILS_AT_TOP = NO
# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
# member inherits the documentation from any documented member that it
# re-implements.
......@@ -1193,22 +1176,6 @@ DOT_PATH = /usr/bin/
DOTFILE_DIRS =
# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width
# (in pixels) of the graphs generated by dot. If a graph becomes larger than
# this value, doxygen will try to truncate the graph, so that it fits within
# the specified constraint. Beware that most browsers cannot cope with very
# large images.
MAX_DOT_GRAPH_WIDTH = 1024
# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allows height
# (in pixels) of the graphs generated by dot. If a graph becomes larger than
# this value, doxygen will try to truncate the graph, so that it fits within
# the specified constraint. Beware that most browsers cannot cope with very
# large images.
MAX_DOT_GRAPH_HEIGHT = 1024
# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
# graphs generated by dot. A depth value of 3 means that only nodes reachable
# from the root by following a path via at most 3 edges will be shown. Nodes
......
......@@ -11,7 +11,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \
ali-to-phones ali-to-post weight-silence-post acc-lda est-lda \
ali-to-pdf est-mllt build-tree build-tree-two-level decode-faster \
decode-faster-mapped scale-vecs copy-transition-model rand-prune-post \
phones-to-prons prons-to-wordali copy-gselect
phones-to-prons prons-to-wordali copy-gselect copy-tree
OBJFILES =
......
......@@ -22,7 +22,6 @@
#include "tree/context-dep.h"
#include "tree/build-tree.h"
#include "tree/build-tree-utils.h"
#include "tree/context-dep.h"
#include "tree/clusterable-classes.h"
#include "util/text-utils.h"
......@@ -206,7 +205,7 @@ int main(int argc, char *argv[]) {
}
}
std::cerr << "Wrote tree\n";
KALDI_LOG << "Wrote tree\n";
DeleteBuildTreeStats(&stats);
} catch(const std::exception& e) {
......
// bin/copy-tree.cc
// Copyright 2009-2011 Microsoft Corporation
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/hmm-topology.h"
#include "tree/context-dep.h"
#include "tree/clusterable-classes.h"
#include "util/text-utils.h"
// copy-tree: reads a ContextDependency object (the decision tree) from
// <tree-in> and writes it to <tree-out>, optionally changing between binary
// and text format via --binary.
//
// Returns 0 on success and -1 if a std::exception is caught; calls exit(1)
// after printing the usage message when the number of positional arguments
// is not exactly two.
int main(int argc, char *argv[]) {
  using namespace kaldi;
  try {
    const char *usage =
        "Copy decision tree (possibly changing binary/text format)\n"
        "Usage: copy-tree [--binary=false] <tree-in> <tree-out>\n";

    // Output format; defaults to binary unless overridden on the command line.
    bool binary = true;

    ParseOptions po(usage);
    po.Register("binary", &binary, "Write output in binary mode");
    po.Read(argc, argv);

    if (po.NumArgs() != 2) {
      po.PrintUsage();
      exit(1);
    }

    std::string tree_in_filename = po.GetArg(1),
        tree_out_filename = po.GetArg(2);

    ContextDependency ctx_dep;
    {
      // Scoped so that the input object is destroyed before writing begins.
      // binary_in is passed by pointer to Input, which presumably sets it
      // according to the format detected in the input -- confirm in Input.
      bool binary_in;
      Input ki(tree_in_filename, &binary_in);
      ctx_dep.Read(ki.Stream(), binary_in);
    }
    {
      // Scoped so that the output object is destroyed before success is
      // logged.
      Output ko(tree_out_filename, binary);
      ctx_dep.Write(ko.Stream(), binary);
    }

    KALDI_LOG << "Copied tree";
    return 0;
  } catch(const std::exception& e) {
    std::cerr << e.what();
    return -1;
  }
}
......@@ -74,7 +74,7 @@ struct LatticeFasterDecoderConfig {
/** A bit more optimized version of the lattice decoder.
See \ref lattice_generation \ref decoders_faster and \ref decoders_simple
See \ref lattices_generation \ref decoders_faster and \ref decoders_simple
for more information.
*/
class LatticeFasterDecoder {
......
......@@ -219,6 +219,113 @@ EventMap object; see \ref tree_internals. We wanted to hide
the actual implementation of the tree as much as possible to make it
easy to refactor the code later if needed.
\section tree_example An example of a decision tree
The decision-tree file format was not created with human readability as the first priority,
but due to popular demand we will try to explain how to interpret this file.
Look at the example below this, which is a triphone tree from the Wall Street Journal recipe.
It starts with ContextDependency which is the name of the object; then N (the context-width),
which is 3; then P (the "central position" of the context window), which is 1, i.e. the
center of the phone context positions since we are in zero-based numbering. The rest of the file
contains a single EventMap object. EventMap is a polymorphic type which may
contain pointers to other EventMap objects. See \ref treei_event_map for more details;
it is a representation of a decision tree or set of decision trees, that maps from
a set of key-value pairs (e.g. left-phone=5, central-phone=10, right-phone=11, pdf-class=2)
to a pdf-id (e.g. 158). Briefly, it comes in three types: a SplitEventMap (like a split in a decision tree),
a ConstantEventMap (like a leaf of a decision tree, containing just a number representing
a pdf-id), and a TableEventMap (which is like a table lookup containing other EventMaps). The
SplitEventMap and TableEventMap both have a "key" that they query, which in this case
would be 0, 1 or 2 corresponding to left, central or right context, or -1 corresponding
to the identity of the "pdf-class". Normally the value of the pdf-class
is the same as the index of the HMM state, i.e. 0, 1 or 2. Try not to get confused by
this: the key is -1, but the value is 0, 1 or 2, and this has no connection to the 0, 1 or 2
which are the keys of the phones in the context-window.
The SplitEventMap has a set of values that will trigger the "yes" branch of the tree.
Below is a kind of quasi-BNF notation that explains the tree-file format.
\verbatim
EventMap := ConstantEventMap | SplitEventMap | TableEventMap | "NULL"
ConstantEventMap := "CE" <numeric pdf-id>
SplitEventMap := "SE" <key-to-split-on> "[" yes-value-list "]" "{" EventMap EventMap "}"
TableEventMap := "TE" <key-to-split-on> <table-size> "(" EventMapList ")"
\endverbatim
In the example below, the top-level EventMap of the tree is a SplitEventMap (SE) that
splits on key 1, which is the central phone. In square brackets are a contiguous range
of phone-ids. As it happens, these don't represent a question, but are just a way of
splitting on phones so we can get to the "real" decision trees which are per phone.
The issue is that this tree was built with "shared roots", so there are various phone-ids,
corresponding to different word-position-and-stress-marked versions of the same phone,
that share the root. We can't use a TableEventMap (TE) at the top level of the tree,
or we'd have to repeat each decision tree several times (since the EventMap is a pure
tree, not a general graph, it has no mechanism for pointers to be "shared").
The next few instances of the "SE" label are also part of this "quasi-tree" which
is initially splitting on the central phone (as we go down this file we are going
deeper into the tree; notice that the braces "{" are opening but not yet closing).
Then we have the string
"TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 )", which represents splitting with a TableEventMap
on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4.
The values represent the five pdf-ids
for the silence and noise phones SIL, NSN and SPN; in our setup, the pdfs are shared between these
three non-speech phones (only the transition matrix is specific to each non-speech phone).
Note: we have a 5-state rather than 3-state HMM for
these phones, hence 5 different pdf-ids. Next is "SE -1 [ 0 ]"; and this can be considered
the first "real" question in the tree. We can see from the SE questions above it that it
applies when the central phone takes values 4 through 19, which are
various versions of the phone AA; and the question is asking whether the pdf-class (key -1)
has value 0 (i.e. the leftmost HMM-state). Assuming the answer is "yes", the next question
is "SE 2 [ 220 221 222 223 ]", which is asking whether the phone to the right is one of various
forms of the phone "M" (a rather unintuitive question to ask, since we're
in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 286 287 ]" which is
a question about the phone to the left (key 0); if yes, then the pdf-id is 5 ("CE 5") and if
no, 696 ("CE 696").
\verbatim
s3# copy-tree --binary=false exp/tri1/tree - 2>/dev/null | head -100
ContextDependency 3 1 ToPdf SE 1 [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 \
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59\
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 9\
3 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 1\
20 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 14\
5 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170\
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 \
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 ]
{ SE 1 [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34\
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 6\
8 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 10\
1 102 103 104 105 106 107 108 109 110 111 ]
{ SE 1 [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34\
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 ]
{ SE 1 [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ]
{ SE 1 [ 1 2 3 ]
{ TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 )
SE -1 [ 0 ]
{ SE 2 [ 220 221 222 223 ]
{ SE 0 [ 104 105 106 107 112 113 114 115 172 173 174 175 208 209 210 211 212 213 214 215 264 265 266 \
267 280 281 282 283 284 285 286 287 ]
{ CE 5 CE 696 }
SE 2 [ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 132 \
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 248 249 250 251 252 253 254 255 256 257 2\
58 259 260 261 262 263 268 269 270 271 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 30\
3 ]
\endverbatim
Below is a simpler example: the monophone tree from the Resource Management
recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ...").
The key "0" is the phone-position of zero which represents the central (and only) phone
since the context width (N) is 1. The number of entries in the table is 49
(in this case, the number of phones plus one). The
first EventMap in the table (index zero) is NULL, because there is no phone with
index zero. The next one is a TableEventMap with three elements, corresponding
to the three HMM-states (technically, pdf-classes) of the first phone: "TE -1 3 ( CE 0 CE 1 CE 2 )".
\verbatim
s3# copy-tree --binary=false exp/mono/tree - 2>/dev/null| head -5
ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 )
TE -1 3 ( CE 3 CE 4 CE 5 )
TE -1 3 ( CE 6 CE 7 CE 8 )
TE -1 3 ( CE 9 CE 10 CE 11 )
TE -1 3 ( CE 12 CE 13 CE 14 )
\endverbatim
\section tree_ilabel The ilabel_info object
The final graph (HCLG in the standard notation, see \ref graph) has symbols
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment