Commit 70a3aa4e authored by Dan Povey's avatar Dan Povey
Browse files

Merging ^/sandbox/tanel back to trunk: added interface to GStreamer for online decoding

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2659 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 91401e76
......@@ -19,7 +19,6 @@
#define KALDI_DECODER_BIGLM_FASTER_DECODER_H_
#include "util/stl-utils.h"
#include "util/parse-options.h"
#include "util/hash-list.h"
#include "fst/fstlib.h"
#include "itf/decodable-itf.h"
......
......@@ -19,7 +19,7 @@
#define KALDI_DECODER_FASTER_DECODER_H_
#include "util/stl-utils.h"
#include "util/parse-options.h"
#include "itf/options-itf.h"
#include "util/hash-list.h"
#include "fst/fstlib.h"
#include "itf/decodable-itf.h"
......@@ -46,7 +46,7 @@ struct FasterDecoderOptions {
// alignment, use small default.
beam_delta(0.5),
hash_ratio(2.0) { }
void Register(ParseOptions *po, bool full) { /// if "full", use obscure
void Register(OptionsItf *po, bool full) { /// if "full", use obscure
/// options too.
/// Depends on program.
po->Register("beam", &beam, "Decoder beam");
......
......@@ -55,7 +55,7 @@ struct LatticeFasterDecoderConfig {
max_arcs(-1),
beam_delta(0.5),
hash_ratio(2.0) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
po->Register("beam", &beam, "Decoding beam.");
po->Register("max-active", &max_active, "Decoder max active states.");
po->Register("min-active", &min_active, "Decoder minimum #active states.");
......
......@@ -55,7 +55,7 @@ struct LatticeSimpleDecoderConfig {
max_loop(500000),
max_arcs(-1),
beam_ratio(0.9) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
po->Register("beam", &beam, "Decoding beam.");
po->Register("lattice-beam", &lattice_beam, "Lattice generation beam");
po->Register("prune-interval", &prune_interval, "Interval (in frames) at which to prune tokens");
......
......@@ -57,7 +57,7 @@ struct LatticeTrackingDecoderConfig {
hash_ratio(2.0),
extra_beam(4.0),
max_beam(40.0) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
po->Register("beam", &beam, "Decoding beam.");
po->Register("max-active", &max_active, "Decoder max active states.");
po->Register("lattice-beam", &lattice_beam, "Lattice generation beam");
......
......@@ -25,7 +25,7 @@
#include <tr1/unordered_map>
#endif
#include "util/stl-utils.h"
#include "util/parse-options.h"
#include "itf/options-itf.h"
#include "util/hash-list.h"
#include "fst/fstlib.h"
#include "itf/decodable-itf.h"
......@@ -44,7 +44,7 @@ struct NBestDecoderOptions {
max_active(std::numeric_limits<int32>::max()),
n_best(1),
beam_delta(0.5), hash_ratio(2.0) { }
void Register(ParseOptions *po, bool full) { /// if "full", use obscure
void Register(OptionsItf *po, bool full) { /// if "full", use obscure
/// options too.
/// Depends on program.
po->Register("beam", &beam, "Decoder beam");
......
......@@ -40,7 +40,7 @@ struct TrainingGraphCompilerOptions {
rm_eps(false),
reorder(b) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
po->Register("transition-scale", &transition_scale, "Scale of transition "
"probabilities (excluding self-loops)");
po->Register("self-loop-scale", &self_loop_scale, "Scale of self-loop vs. "
......
......@@ -37,6 +37,9 @@ script found there. The programs are as follows:
There is also a Java equivalent of the online-audio-client which contains slightly more features and has a GUI.
In addition, there is a GStreamer 1.0 compatible plugin that acts as a filter, taking raw audio as input and producing
recognized words as output. The plugin is based on \ref OnlineFasterDecoder, like the other online recognition programs.
\section audio_server Online Audio Server
The main difference between the online-server-gmm-decode-faster and online-audio-server-decode-faster programs is the input: the former accepts feature vectors, while the latter accepts RAW audio.
......@@ -116,6 +119,126 @@ java -jar online-audio-client.jar
Or simply double-click the JAR file in the graphical interface.
\section gst_plugin GStreamer plugin
The Kaldi toolkit comes with a plugin for the <a href="http://gstreamer.freedesktop.org/">GStreamer</a> media streaming framework (version 1.0 or compatible).
The plugin acts as a filter that accepts raw audio as input and produces recognized words as output.
The main benefit of the plugin is the fact that it makes Kaldi's online speech recognition functionality available to all
programming languages that support GStreamer 1.0 (that includes Python, Ruby, Java, Vala and many more). It also simplifies the integration
of the Kaldi online decoder in applications since communicating with the decoder follows GStreamer standards.
\subsection gst_plugin_installation Installation
The source of the GStreamer plugin is located in the `src/gst-plugin` directory. To compile the plugin, the rest of the Kaldi
toolkit has to be compiled with the `-fPIC` compilation option. To do this, just add `-fPIC` to the `CXXFLAGS` in
the `src/kaldi.mk` file. Then recompile Kaldi as usual. Also compile the online extensions (`make ext`).
Make sure the package that provides GStreamer 1.0 development headers is installed on your system (on Debian, the needed package is called
`libgstreamer1.0-dev`).
Finally, run `make depend` and `make` in the `src/gst-plugin` directory. This should result in a file `src/gst-plugin/libgstkaldi.so`
which contains the GStreamer plugin.
To make GStreamer able to find the Kaldi plugin, you have to add the `src/gst-plugin` directory to its plugin search path. To do this,
add the directory to the GST_PLUGIN_PATH environment variable:
\verbatim
export GST_PLUGIN_PATH=$KALDI_ROOT/src/gst-plugin
\endverbatim
Of course, replace `$KALDI_ROOT` with the actual location of the Kaldi root folder on your file system.
Now, running `gst-inspect-1.0 onlinegmmdecodefaster` should provide info about the plugin:
\verbatim
# gst-inspect-1.0 onlinegmmdecodefaster
Factory Details:
Rank: none (0)
Long-name: OnlineGmmDecodeFaster
Klass: Speech/Audio
Description: Convert speech to text
Author: Tanel Alumae <tanel.alumae@phon.ioc.ee>
[..]
Element Properties:
name : The name of the object
flags: readable, writable
String. Default: "onlinegmmdecodefaster0"
parent : The parent of the object
flags: readable, writable
Object of type "GstObject"
silent : Determines whether incoming audio is sent to the decoder or not
flags: readable, writable
Boolean. Default: false
model : Filename of the acoustic model
flags: readable, writable
String. Default: "final.mdl"
fst : Filename of the HCLG FST
flags: readable, writable
String. Default: "HCLG.fst"
[..]
min-cmn-window : Minumum CMN window used at start of decoding (adds latency only at start)
flags: readable, writable
Integer. Range: -2147483648 - 2147483647 Default: 100
Element Signals:
"hyp-word" : void user_function (GstElement* object,
gchararray arg0,
gpointer user_data);
\endverbatim
\subsection usage_cli Usage through the command-line
The simplest way to use the GStreamer plugin is via the command line. You have to specify the model files used for decoding
when launching the plugin. To do this, set the `model`, `fst`, `word-syms`, `silence-phones` and optionally the `lda-mat`
plugin properties (similarly to Kaldi's command-line online decoders). The decoder accepts only 16 kHz 16-bit mono audio. Any audio stream can be automatically converted to the
required format using GStreamer's `audioresample` and `audioconvert` plugins.
For example, to decode the file `test1.wav` using the model files in `tri2b_mmi`, and have the recognized stream of words printed to stdout, execute:
\verbatim
gst-launch-1.0 -q filesrc location=test1.wav \
! decodebin ! audioconvert ! audioresample \
! onlinegmmdecodefaster model=tri2b_mmi/model fst=tri2b_mmi/HCLG.fst \
word-syms=tri2b_mmi/words.txt silence-phones="1:2:3:4:5" lda-mat=tri2b_mmi/matrix \
! filesink location=/dev/stdout buffer-mode=2
\endverbatim
Note that the audio stream is segmented on the fly, with "<#s>" denoting silence.
You can easily try live decoding of microphone input by replacing `filesrc location=test1.wav` with `pulsesrc` (given that
your OS uses the PulseAudio framework).
An example script that uses the plugin via the command-line to process a bunch of audio files is located in `egs/voxforge/gst_demo/run-simulated.sh`.
\subsection usage_gst Usage through GStreamer bindings
An example of a Python GUI program that uses the plugin via the GStreamer bindings is located in `egs/voxforge/gst_demo/run-live.py`.
The program constructs in the `init_gst(self)` method a similar pipeline of GStreamer elements as in the command-line example.
The model files and some decoding parameters are communicated to the `onlinegmmdecodefaster` element through the standard `set_property()`
method. More interesting is this part of the code:
\verbatim
self.asr.connect('hyp-word', self._on_word)
\endverbatim
This expression orders our decoding plugin to call the GUI's `_on_word` method whenever it produces a new recognized word.
The `_on_word()` method looks like this:
\verbatim
def _on_word(self, asr, word):
Gdk.threads_enter()
if word == "<#s>":
self.textbuf.insert_at_cursor("\n")
else:
self.textbuf.insert_at_cursor(word)
self.textbuf.insert_at_cursor(" ")
Gdk.threads_leave()
\endverbatim
What it does (apart from some GUI-related chemistry), is that it inserts the recognized word into the text buffer that is connected
to the GUI's main text box. If a segmentation symbol is recognized, it inserts a line break instead.
Recognition start and stop are controlled by setting the `silent` property of the decoder plugin to `False` or `True`. Setting the
property to `True` orders the plugin not to process any incoming audio (although the audio that is already being processed might
produce some new recognized words).
*/
......
......@@ -51,7 +51,7 @@ struct FbankOptions {
htk_compat(false),
use_log_fbank(true) {}
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
frame_opts.Register(po);
mel_opts.Register(po);
po->Register("use-energy", &use_energy,
......
......@@ -47,7 +47,7 @@ struct MelBanksOptions {
: num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(400),
vtln_high(-400), debug_mel(false) {}
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
po->Register("num-mel-bins", &num_bins,
"Number of triangular mel-frequency bins");
po->Register("low-freq", &low_freq,
......@@ -88,7 +88,7 @@ struct FrameExtractionOptions {
window_type("povey"),
round_to_power_of_two(true) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
po->Register("sample-frequency", &samp_freq,
"Waveform data sample frequency (must match the waveform file, "
"if specified there)");
......@@ -185,7 +185,7 @@ struct DeltaFeaturesOptions {
DeltaFeaturesOptions(int32 order = 2, int32 window = 2):
order(order), window(window) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
po->Register("delta-order", &order, "Order of delta computation");
po->Register("delta-window", &window,
"Parameter controlling window for delta computation (actual window"
......
......@@ -55,7 +55,7 @@ struct MfccOptions {
cepstral_lifter(22.0),
htk_compat(false) {}
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
frame_opts.Register(po);
mel_opts.Register(po);
po->Register("num-ceps", &num_ceps,
......
......@@ -22,7 +22,7 @@
#include <string>
#include "feat/feature-functions.h"
#include "util/parse-options.h"
#include "itf/options-itf.h"
#include "matrix/kaldi-matrix-inl.h"
namespace kaldi {
......@@ -64,7 +64,7 @@ struct PlpOptions {
cepstral_scale(1.0),
htk_compat(false) {}
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
frame_opts.Register(po);
mel_opts.Register(po);
po->Register("lpc-order", &lpc_order,
......
......@@ -42,7 +42,7 @@ struct SpectrogramOptions {
energy_floor(0.0), // not in log scale: a small value e.g. 1.0e-10
raw_energy(true) {}
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
frame_opts.Register(po);
po->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in Spectrogram computation");
......
......@@ -33,7 +33,7 @@ struct PitchInterpolatorOptions {
interpolator_factor(1.0e-05),
max_voicing_prob(0.9),
max_pitch_change_per_frame(10.0) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
po->Register("pitch-interval", &pitch_interval, "Frequency interval in Hz, used "
"for the pitch interpolation and smoothing algorithm.");
po->Register("interpolator-factor", &interpolator_factor, "Factor affecting the "
......
......@@ -24,7 +24,7 @@
#include <set>
#include <vector>
#include "fstext/lattice-weight.h"
#include "util/parse-options.h"
#include "itf/options-itf.h"
namespace fst {
......@@ -116,7 +116,7 @@ struct DeterminizeLatticePrunedOptions {
max_loop(-1),
max_states(-1),
max_arcs(-1) { }
void Register (kaldi::ParseOptions *po) {
void Register (kaldi::OptionsItf *po) {
po->Register("delta", &delta, "Tolerance used in determinization");
po->Register("max-mem", &max_mem, "Maximum approximate memory usage in "
"determinization (real usage might be many times this)");
......
......@@ -24,7 +24,7 @@
#include "base/kaldi-common.h"
#include "gmm/diag-gmm.h"
#include "util/parse-options.h"
#include "itf/options-itf.h"
namespace kaldi {
/// @defgroup DiagGmm DiagGmm
......@@ -171,7 +171,7 @@ struct UbmClusteringOptions {
: ubm_num_gauss(ncomp), reduce_state_factor(red),
intermediate_num_gauss(interm_gauss), cluster_varfloor(vfloor),
max_am_gauss(max_am_gauss) {}
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
std::string module = "UbmClusteringOptions: ";
po->Register("max-am-gauss", &max_am_gauss, module+
"We first reduce acoustic model to this max #Gauss before clustering.");
......
......@@ -24,7 +24,7 @@
#include "gmm/mle-diag-gmm.h"
#include "gmm/mle-am-diag-gmm.h"
#include "gmm/model-common.h"
#include "util/parse-options.h"
#include "itf/options-itf.h"
namespace kaldi {
......@@ -34,7 +34,7 @@ struct EbwOptions {
BaseFloat tau; // This is only useful for smoothing "to the model":
// if you want to smooth to ML stats, you need to use gmm-ismooth-stats
EbwOptions(): E(2.0), tau(0.0) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
std::string module = "EbwOptions: ";
po->Register("E", &E, module+"Constant E for Extended Baum-Welch (EBW) update");
po->Register("tau", &tau, module+"Tau value for smoothing to the model "
......@@ -50,7 +50,7 @@ struct EbwWeightOptions {
EbwWeightOptions(): min_num_count_weight_update(10.0),
min_gaussian_weight(1.0e-05),
tau(0.0) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
std::string module = "EbwWeightOptions: ";
po->Register("min-num-count-weight-update", &min_num_count_weight_update,
module+"Minimum numerator count required at "
......
......@@ -27,7 +27,6 @@
#include "base/kaldi-common.h"
#include "gmm/model-common.h"
#include "matrix/matrix-lib.h"
#include "util/parse-options.h"
namespace kaldi {
......
......@@ -24,7 +24,6 @@
#include "gmm/mle-diag-gmm.h"
#include "gmm/mle-am-diag-gmm.h"
#include "gmm/model-common.h"
#include "util/parse-options.h"
namespace kaldi {
......
......@@ -25,7 +25,7 @@
#include "gmm/diag-gmm.h"
#include "gmm/diag-gmm-normal.h"
#include "gmm/model-common.h"
#include "util/parse-options.h"
#include "itf/options-itf.h"
namespace kaldi {
......@@ -54,7 +54,7 @@ struct MleDiagGmmOptions {
min_variance = 0.001;
remove_low_count_gaussians = true;
}
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
std::string module = "MleDiagGmmOptions: ";
po->Register("min-gaussian-weight", &min_gaussian_weight,
module+"Min Gaussian weight before we remove it.");
......@@ -88,7 +88,7 @@ struct MapDiagGmmOptions {
variance_tau(50.0),
weight_tau(10.0) { }
void Register(ParseOptions *po) {
void Register(OptionsItf *po) {
po->Register("mean-tau", &mean_tau,
"Tau value for updating means.");
po->Register("variance-tau", &mean_tau,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment