// nnet2/nnet-component.h

// Copyright 2011-2013  Karel Vesely
//                      Johns Hopkins University (author: Daniel Povey)
//                2013  Xiaohui Zhang

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_NNET2_NNET_COMPONENT_H_
#define KALDI_NNET2_NNET_COMPONENT_H_

#include "base/kaldi-common.h"
#include "itf/options-itf.h"
#include "matrix/matrix-lib.h"
#include "cudamatrix/cu-matrix-lib.h"
#include "thread/kaldi-mutex.h"
#include "nnet2/nnet-precondition-online.h"

#include <iostream>
namespace kaldi {
namespace nnet2 {

/**
 * Abstract class, basic element of the network,
 * it is a box with defined inputs, outputs,
 * and tranformation functions interface.
 *
 * It is able to propagate and backpropagate
 * exact implementation is to be implemented in descendants.
 *
 */ 

class Component {
 public:
50
  Component(): index_(-1) { }
51 52 53 54
  
  virtual std::string Type() const = 0; // each type should return a string such as
  // "SigmoidComponent".

55 56 57 58 59 60
  /// Returns the index in the sequence of layers in the neural net; intended only
  /// to be used in debugging information.
  virtual int32 Index() const { return index_; }

  virtual void SetIndex(int32 index) { index_ = index; }

61 62 63 64 65 66 67 68 69 70
  /// Initialize, typically from a line of a config file.  The "args" will
  /// contain any parameters that need to be passed to the Component, e.g.
  /// dimensions.
  virtual void InitFromString(std::string args) = 0; 
  
  /// Get size of input vectors
  virtual int32 InputDim() const = 0;
  
  /// Get size of output vectors 
  virtual int32 OutputDim() const = 0;
71
  
72 73
  /// Number of left-context frames the component sees for each output frame;
  /// nonzero only for splicing layers.
74
  virtual int32 LeftContext() const { return 0; }
75 76 77

  /// Number of right-context frames the component sees for each output frame;
  /// nonzero only for splicing layers.
78
  virtual int32 RightContext() const { return 0; }
79 80 81 82 83 84 85 86

  /// Perform forward pass propagation Input->Output.  Each row is
  /// one frame or training example.  Interpreted as "num_chunks"
  /// equally sized chunks of frames; this only matters for layers
  /// that do things like context splicing.  Typically this variable
  /// will either be 1 (when we're processing a single contiguous
  /// chunk of data) or will be the same as in.NumFrames(), but
  /// other values are possible if some layers do splicing.
87
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
88
                         int32 num_chunks,
89
                         CuMatrix<BaseFloat> *out) const = 0; 
90 91 92 93 94 95 96 97 98 99 100
  
  /// Perform backward pass propagation of the derivative, and
  /// also either update the model (if to_update == this) or
  /// update another model or compute the model derivative (otherwise).
  /// Note: in_value and out_value are the values of the input and output
  /// of the component, and these may be dummy variables if respectively
  /// BackpropNeedsInput() or BackpropNeedsOutput() return false for
  /// that component (not all components need these).
  ///
  /// num_chunks lets us treat the input matrix as n contiguous-in-time
  /// chunks of equal size; it only matters if splicing is involved.
101 102 103
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,                        
                        const CuMatrixBase<BaseFloat> &out_deriv,
104 105
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
106
                        CuMatrix<BaseFloat> *in_deriv) const = 0;
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
  
  virtual bool BackpropNeedsInput() const { return true; } // if this returns false,
  // the "in_value" to Backprop may be a dummy variable.
  virtual bool BackpropNeedsOutput() const { return true; } // if this returns false,
  // the "out_value" to Backprop may be a dummy variable.
  
  /// Read component from stream
  static Component* ReadNew(std::istream &is, bool binary);

  /// Copy component (deep copy).
  virtual Component* Copy() const = 0;

  /// Initialize the Component from one line that will contain
  /// first the type, e.g. SigmoidComponent, and then
  /// a number of tokens (typically integers or floats) that will
  /// be used to initialize the component.
  static Component *NewFromString(const std::string &initializer_line);

  /// Return a new Component of the given type e.g. "SoftmaxComponent",
126
  /// or NULL if no such type exists. 
127 128 129 130 131 132 133 134 135 136 137 138 139
  static Component *NewComponentOfType(const std::string &type);
  
  virtual void Read(std::istream &is, bool binary) = 0; // This Read function
  // requires that the Component has the correct type.
  
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const = 0;

  virtual std::string Info() const;

  virtual ~Component() { }

 private:
140
  int32 index_;
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
  KALDI_DISALLOW_COPY_AND_ASSIGN(Component);
};


/**
 * Class UpdatableComponent is a Component which has
 * trainable parameters and contains some global 
 * parameters for stochastic gradient descent
 * (learning rate, L2 regularization constant).
 * This is a base-class for Components with parameters.
 */
class UpdatableComponent: public Component {
 public:
  UpdatableComponent(const UpdatableComponent &other):
      learning_rate_(other.learning_rate_){ }
  
  void Init(BaseFloat learning_rate) {
    learning_rate_ = learning_rate;
  }
  UpdatableComponent(BaseFloat learning_rate) {
    Init(learning_rate);
  }

  /// Set parameters to zero, and if treat_as_gradient is true, we'll be
  /// treating this as a gradient so set the learning rate to 1 and make any
  /// other changes necessary (there's a variable we have to set for the
  /// MixtureProbComponent).
  virtual void SetZero(bool treat_as_gradient) = 0;
169
  
170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
  UpdatableComponent(): learning_rate_(0.001) { }
  
  virtual ~UpdatableComponent() { }

  /// Here, "other" is a component of the same specific type.  This
  /// function computes the dot product in parameters, and is computed while
  /// automatically adjusting learning rates; typically, one of the two will
  /// actually contain the gradient.
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const = 0;
  
  /// We introduce a new virtual function that only applies to
  /// class UpdatableComponent.  This is used in testing.
  virtual void PerturbParams(BaseFloat stddev) = 0;
  
  /// This new virtual function scales the parameters
  /// by this amount.  
  virtual void Scale(BaseFloat scale) = 0;

  /// This new virtual function adds the parameters of another
  /// updatable component, times some constant, to the current
  /// parameters.
  virtual void Add(BaseFloat alpha, const UpdatableComponent &other) = 0;
  
  /// Sets the learning rate of gradient descent
  void SetLearningRate(BaseFloat lrate) {  learning_rate_ = lrate; }
  /// Gets the learning rate of gradient descent
  BaseFloat LearningRate() const { return learning_rate_; }

198 199
  virtual std::string Info() const;
  
200 201 202 203 204 205 206
  // The next few functions are not implemented everywhere; they are
  // intended for use by L-BFGS code, and we won't implement them
  // for all child classes.
  
  /// The following new virtual function returns the total dimension of
  /// the parameters in this class.  E.g. used for L-BFGS update
  virtual int32 GetParameterDim() const { KALDI_ASSERT(0); return 0; }
207 208 209 210

  /// Turns the parameters into vector form.  We put the vector form on the CPU,
  /// because in the kinds of situations where we do this, we'll tend to use
  /// too much memory for the GPU.
211 212 213 214 215 216 217 218 219 220 221 222
  virtual void Vectorize(VectorBase<BaseFloat> *params) const { KALDI_ASSERT(0); }
  /// Converts the parameters from vector form.
  virtual void UnVectorize(const VectorBase<BaseFloat> &params) {
    KALDI_ASSERT(0);
  }
  
 protected: 
  BaseFloat learning_rate_; ///< learning rate (0.0..0.01)
 private:
  const UpdatableComponent &operator = (const UpdatableComponent &other); // Disallow.
};

223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
/// Augments a scalar variable with powers of itself, e.g. x => {x, x^2}.
class PowerExpandComponent: public Component {
 public:
  void Init(int32 dim, int32 max_power = 2, BaseFloat higher_power_scale = 1.0);
  
  explicit PowerExpandComponent(int32 dim, int32 max_power = 2,
                                BaseFloat higher_power_scale = 1.0) {
    Init(dim, max_power, higher_power_scale);
  }
  PowerExpandComponent(): input_dim_(0), max_power_(2),
                          higher_power_scale_(1.0) { }
  virtual std::string Type() const { return "PowerExpandComponent"; }
  virtual void InitFromString(std::string args); 
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return max_power_ * input_dim_; }
238
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
239
                         int32 num_chunks,
240 241 242 243
                         CuMatrix<BaseFloat> *out) const;
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
244 245
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
246
                        CuMatrix<BaseFloat> *in_deriv) const;
247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return false; }
  virtual Component* Copy() const { return new PowerExpandComponent(input_dim_,
                                                                    max_power_,
                                                                    higher_power_scale_); }
  
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

  virtual std::string Info() const;
 private:
  int32 input_dim_;
  int32 max_power_;
  BaseFloat higher_power_scale_; // Scale put on all powers
  // except the first one.
};


268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
/// This kind of Component is a base-class for things like
/// sigmoid and softmax.
class NonlinearComponent: public Component {
 public:
  void Init(int32 dim) { dim_ = dim; count_ = 0.0; }
  explicit NonlinearComponent(int32 dim) { Init(dim); }
  NonlinearComponent(): dim_(0) { } // e.g. prior to Read().
  explicit NonlinearComponent(const NonlinearComponent &other);
  
  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }
  
  /// We implement InitFromString at this level.
  virtual void InitFromString(std::string args);
  
  /// We implement Read at this level as it just needs the Type().
  virtual void Read(std::istream &is, bool binary);
  
  /// Write component to stream.
  virtual void Write(std::ostream &os, bool binary) const;
288 289 290 291
  
  void Scale(BaseFloat scale); // relates to scaling stats, not parameters.
  void Add(BaseFloat alpha, const NonlinearComponent &other); // relates to
                                                              // adding stats
292

293 294
  // The following functions are unique to NonlinearComponent.
  // They mostly relate to diagnostics.
295 296
  const CuVector<double> &ValueSum() const { return value_sum_; }
  const CuVector<double> &DerivSum() const { return deriv_sum_; }
297
  double Count() const { return count_; }
298 299 300 301

  // The following function is used when "widening" neural networks.
  void SetDim(int32 dim);
  
302
 protected:
303
  friend class NormalizationComponent;
304 305 306
  friend class SigmoidComponent;
  friend class TanhComponent;
  friend class SoftmaxComponent;
307 308
  friend class RectifiedLinearComponent;
  friend class SoftHingeComponent;
309 310 311 312
  
  // This function updates the stats "value_sum_", "deriv_sum_", and
  // count_. (If deriv == NULL, it won't update "deriv_sum_").
  // It will be called from the Backprop function of child classes.
313 314
  void UpdateStats(const CuMatrixBase<BaseFloat> &out_value,
                   const CuMatrixBase<BaseFloat> *deriv = NULL);
315 316 317
  
  const NonlinearComponent &operator = (const NonlinearComponent &other); // Disallow.
  int32 dim_;
318 319
  CuVector<double> value_sum_; // stats at the output.
  CuVector<double> deriv_sum_; // stats of the derivative of the nonlinearity (only
320 321 322 323
  // applicable to element-by-element nonlinearities, not Softmax.
  double count_;
};

/// Maxout nonlinearity: maps input_dim_ inputs to output_dim_ outputs.
/// Presumably takes the max over groups of input_dim_/output_dim_
/// consecutive inputs — implementation not visible here; confirm
/// against the .cc file.
class MaxoutComponent: public Component {
 public:
  void Init(int32 input_dim, int32 output_dim);
  explicit MaxoutComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  MaxoutComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "MaxoutComponent"; }
  virtual void InitFromString(std::string args); 
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
                         int32 num_chunks,
                         CuMatrix<BaseFloat> *out) const;
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv) const;
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return true; }
  virtual Component* Copy() const { return new MaxoutComponent(input_dim_,
                                                              output_dim_); }
  
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

  virtual std::string Info() const;
 protected:
  int32 input_dim_;   // dimension of the input vectors.
  int32 output_dim_;  // dimension of the output vectors.
};

/// P-norm nonlinearity: maps input_dim_ inputs to output_dim_ outputs.
/// Presumably computes the p-norm (with exponent p_) over groups of
/// input_dim_/output_dim_ consecutive inputs — implementation not
/// visible here; confirm against the .cc file.
class PnormComponent: public Component {
 public:
  void Init(int32 input_dim, int32 output_dim, BaseFloat p);
  explicit PnormComponent(int32 input_dim, int32 output_dim, BaseFloat p) {
    Init(input_dim, output_dim, p);
  }
  PnormComponent(): input_dim_(0), output_dim_(0), p_(0) { }
  virtual std::string Type() const { return "PnormComponent"; }
  virtual void InitFromString(std::string args); 
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
                         int32 num_chunks,
                         CuMatrix<BaseFloat> *out) const;
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv) const;
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return true; }
  virtual Component* Copy() const { return new PnormComponent(input_dim_,
                                                              output_dim_, p_); }
  
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

  virtual std::string Info() const;
 protected:
  int32 input_dim_;   // dimension of the input vectors.
  int32 output_dim_;  // dimension of the output vectors.
  BaseFloat p_;       // the exponent of the p-norm.
};

class NormalizeComponent: public NonlinearComponent {
 public:
  explicit NormalizeComponent(int32 dim): NonlinearComponent(dim) { }
  explicit NormalizeComponent(const NormalizeComponent &other): NonlinearComponent(other) { }
  NormalizeComponent() { }
  virtual std::string Type() const { return "NormalizeComponent"; }
  virtual Component* Copy() const { return new NormalizeComponent(*this); }
406 407
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return true; }
408 409 410 411 412 413 414 415 416 417 418
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
                         int32 num_chunks,
                         CuMatrix<BaseFloat> *out) const; 
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv) const;
 private:
  NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
419 420 421 422
  static const BaseFloat kNormFloor;
  // about 0.7e-20.  We need a value that's exactly representable in
  // float and whose inverse square root is also exactly representable
  // in float (hence, an even power of two).
423 424
};

425

426 427 428 429 430 431
class SigmoidComponent: public NonlinearComponent {
 public:
  explicit SigmoidComponent(int32 dim): NonlinearComponent(dim) { }
  explicit SigmoidComponent(const SigmoidComponent &other): NonlinearComponent(other) { }    
  SigmoidComponent() { }
  virtual std::string Type() const { return "SigmoidComponent"; }
432 433
  virtual bool BackpropNeedsInput() const { return false; }
  virtual bool BackpropNeedsOutput() const { return true; }
434
  virtual Component* Copy() const { return new SigmoidComponent(*this); }
435
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
436
                         int32 num_chunks,
437 438 439 440
                         CuMatrix<BaseFloat> *out) const; 
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
441 442
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
443
                        CuMatrix<BaseFloat> *in_deriv) const;
444 445 446 447 448 449 450 451 452 453 454
 private:
  SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow.
};

/// Element-wise tanh nonlinearity (dimension unchanged).
class TanhComponent: public NonlinearComponent {
 public:
  explicit TanhComponent(int32 dim): NonlinearComponent(dim) { }
  explicit TanhComponent(const TanhComponent &other): NonlinearComponent(other) { }
  TanhComponent() { }
  virtual std::string Type() const { return "TanhComponent"; }
  virtual Component* Copy() const { return new TanhComponent(*this); }
  virtual bool BackpropNeedsInput() const { return false; }
  virtual bool BackpropNeedsOutput() const { return true; }
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
                         int32 num_chunks,
                         CuMatrix<BaseFloat> *out) const;
  virtual void Backprop(const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv) const;
 private:
  TanhComponent &operator = (const TanhComponent &other); // Disallow.
};

/// Take the absolute values of an input vector to a power.
/// The derivative for zero input will be treated as zero.
class PowerComponent: public NonlinearComponent {
 public:
  void Init(int32 dim, BaseFloat power = 2);
  explicit PowerComponent(int32 dim, BaseFloat power = 2) {
    Init(dim, power);
  }
  PowerComponent(): dim_(0), power_(2) { }
  virtual std::string Type() const { return "PowerComponent"; }
  virtual void InitFromString(std::string args); 
  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
                         int32 num_chunks,
                         CuMatrix<BaseFloat> *out) const;
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv) const;
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return true; }
  virtual Component* Copy() const { return new PowerComponent(dim_, power_); }
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

  virtual std::string Info() const;

 private:
  // NOTE(review): dim_ shadows NonlinearComponent::dim_; looks intentional
  // but worth confirming against the .cc implementation.
  int32 dim_;
  BaseFloat power_;  // the exponent applied to the absolute values.
};

/// Element-wise rectified-linear (ReLU) nonlinearity (dimension unchanged).
class RectifiedLinearComponent: public NonlinearComponent {
 public:
  explicit RectifiedLinearComponent(int32 dim): NonlinearComponent(dim) { }
  explicit RectifiedLinearComponent(const RectifiedLinearComponent &other): NonlinearComponent(other) { }
  RectifiedLinearComponent() { }
  virtual std::string Type() const { return "RectifiedLinearComponent"; }
  virtual Component* Copy() const { return new RectifiedLinearComponent(*this); }
  virtual bool BackpropNeedsInput() const { return false; }
  virtual bool BackpropNeedsOutput() const { return true; }
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
                         int32 num_chunks,
                         CuMatrix<BaseFloat> *out) const;
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv) const;
 private:
  RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow.
};

/// Element-wise soft-hinge nonlinearity (dimension unchanged).
class SoftHingeComponent: public NonlinearComponent {
 public:
  explicit SoftHingeComponent(int32 dim): NonlinearComponent(dim) { }
  explicit SoftHingeComponent(const SoftHingeComponent &other): NonlinearComponent(other) { }
  SoftHingeComponent() { }
  virtual std::string Type() const { return "SoftHingeComponent"; }
  virtual Component* Copy() const { return new SoftHingeComponent(*this); }
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return true; }
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
                         int32 num_chunks,
                         CuMatrix<BaseFloat> *out) const;
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv) const;
 private:
  SoftHingeComponent &operator = (const SoftHingeComponent &other); // Disallow.
};

552 553 554 555 556 557 558 559 560 561 562 563

// This class scales the input by a specified constant.  This is, of course,
// useless, but we use it when we want to change how fast the next layer learns.
// (e.g. a smaller scale will make the next layer learn slower.)
class ScaleComponent: public Component {
 public:
  explicit ScaleComponent(int32 dim, BaseFloat scale): dim_(dim), scale_(scale) { }
  explicit ScaleComponent(const ScaleComponent &other):
      dim_(other.dim_), scale_(other.scale_) { }
  ScaleComponent(): dim_(0), scale_(0.0) { }
  virtual std::string Type() const { return "ScaleComponent"; }
  virtual Component* Copy() const { return new ScaleComponent(*this); }
564 565
  virtual bool BackpropNeedsInput() const { return false; }
  virtual bool BackpropNeedsOutput() const { return false; }
566
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
567
                         int32 num_chunks,
568 569 570 571
                         CuMatrix<BaseFloat> *out) const; 
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
572 573
                        int32 num_chunks,
                        Component *to_update,
574
                        CuMatrix<BaseFloat> *in_deriv) const;
575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594

  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }
  virtual void Read(std::istream &is, bool binary);
  
  virtual void Write(std::ostream &os, bool binary) const;

  void Init(int32 dim, BaseFloat scale);
  
  virtual void InitFromString(std::string args); 

  virtual std::string Info() const;
  
 private:
  int32 dim_;
  BaseFloat scale_;
  ScaleComponent &operator = (const ScaleComponent &other); // Disallow.
};


class SumGroupComponent; // Forward declaration.
class AffineComponent; // Forward declaration.

class SoftmaxComponent: public NonlinearComponent {
 public:
  explicit SoftmaxComponent(int32 dim): NonlinearComponent(dim) { }
  explicit SoftmaxComponent(const SoftmaxComponent &other): NonlinearComponent(other) { }  
  SoftmaxComponent() { }
  virtual std::string Type() const { return "SoftmaxComponent"; }  // Make it lower case
  // because each type of Component needs a different first letter.
  virtual bool BackpropNeedsInput() const { return false; }
  virtual bool BackpropNeedsOutput() const { return true; }
608
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
609
                         int32 num_chunks,
610 611 612 613
                         CuMatrix<BaseFloat> *out) const; 
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
614 615
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
616
                        CuMatrix<BaseFloat> *in_deriv) const;
617
  
618
  void MixUp(int32 num_mixtures,
619 620 621 622
             BaseFloat power,
             BaseFloat min_count,
             BaseFloat perturb_stddev,
             AffineComponent *ac,
623 624
             SumGroupComponent *sc);
  
625 626 627 628 629
  virtual Component* Copy() const { return new SoftmaxComponent(*this); }
 private:
  SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
};

class FixedAffineComponent;  // Forward declaration.

// Affine means a linear function plus an offset.
// Note: although this class can be instantiated, it also
// function as a base-class for more specialized versions of
// AffineComponent.
class AffineComponent: public UpdatableComponent {
  friend class SoftmaxComponent; // Friend declaration relates to mixing up.
 public:
  explicit AffineComponent(const AffineComponent &other);
641 642 643 644 645
  // The next constructor is used in converting from nnet1.
  AffineComponent(const CuMatrix<BaseFloat> &linear_params,
                  const CuVector<BaseFloat> &bias_params,
                  BaseFloat learning_rate);
  
646 647 648 649
  virtual int32 InputDim() const { return linear_params_.NumCols(); }
  virtual int32 OutputDim() const { return linear_params_.NumRows(); }
  void Init(BaseFloat learning_rate,
            int32 input_dim, int32 output_dim,
650 651 652 653 654 655 656 657 658 659 660 661
            BaseFloat param_stddev, BaseFloat bias_stddev);
  void Init(BaseFloat learning_rate,
            std::string matrix_filename);

  // The following functions are used for collapsing multiple layers
  // together.  They return a pointer to a new Component equivalent to
  // the sequence of two components.  We haven't implemented this for
  // FixedLinearComponent yet.
  Component *CollapseWithNext(const AffineComponent &next) const ;
  Component *CollapseWithNext(const FixedAffineComponent &next) const;
  Component *CollapseWithPrevious(const FixedAffineComponent &prev) const;

662 663 664 665 666 667 668
  virtual std::string Info() const;
  virtual void InitFromString(std::string args);
  
  AffineComponent(): is_gradient_(false) { } // use Init to really initialize.
  virtual std::string Type() const { return "AffineComponent"; }
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return false; }
669
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
670
                         int32 num_chunks,
671
                         CuMatrix<BaseFloat> *out) const;
672 673
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const UpdatableComponent &other);
674 675 676
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value, // dummy
                        const CuMatrixBase<BaseFloat> &out_deriv,
677 678
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
679
                        CuMatrix<BaseFloat> *in_deriv) const;
680 681 682 683 684 685 686 687 688
  virtual void SetZero(bool treat_as_gradient);
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual Component* Copy() const;
  virtual void PerturbParams(BaseFloat stddev);
  // This new function is used when mixing up:
  virtual void SetParams(const VectorBase<BaseFloat> &bias,
                         const MatrixBase<BaseFloat> &linear);
689 690
  const CuVector<BaseFloat> &BiasParams() { return bias_params_; }
  const CuMatrix<BaseFloat> &LinearParams() { return linear_params_; }
691 692 693 694

  virtual int32 GetParameterDim() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
695 696 697 698 699 700 701 702 703 704 705 706 707

  /// This function is for getting a low-rank approximations of this
  /// AffineComponent by two AffineComponents.
  virtual void LimitRank(int32 dimension,
                         AffineComponent **a, AffineComponent **b) const;

  /// This function is implemented in widen-nnet.cc
  void Widen(int32 new_dimension,
             BaseFloat param_stddev,
             BaseFloat bias_stddev,
             std::vector<NonlinearComponent*> c2, // will usually have just one
                                                  // element.
             AffineComponent *c3);
708 709 710 711
 protected:
  friend class AffineComponentA;
  // This function Update() is for extensibility; child classes may override this.
  virtual void Update(
712 713
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv) {
714 715 716 717 718
    UpdateSimple(in_value, out_deriv);
  }
  // UpdateSimple is used when *this is a gradient.  Child classes may
  // or may not override this.
  virtual void UpdateSimple(
719 720
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);  
721 722

  const AffineComponent &operator = (const AffineComponent &other); // Disallow.
723 724
  CuMatrix<BaseFloat> linear_params_;
  CuVector<BaseFloat> bias_params_;
725 726 727 728

  bool is_gradient_; // If true, treat this as just a gradient.
};

729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758

/// PiecewiseLinearComponent is a kind of trainable version of the
/// RectifiedLinearComponent, in which each dimension of the nonlinearity has a
/// number of parameters that can be trained.  it's of the form 
/// alpha + beta x + gamma_1 |x - c_1| + gamma_2 |x - c_2| + ... + gamma_N |x - c_N|
/// where c_1 ... c_N on are constants (by default, equally
/// spaced between -1 and 1), and the alpha, beta and gamma quantities are trainable.
/// (Each dimension has separate alpha, beta and gamma quantities).
/// We require that N be odd so that the "middle" gamma quantity corresponds
/// to zero; this is for convenience of initialization so that it corresponds
/// to ReLus.
class PiecewiseLinearComponent: public UpdatableComponent {
 public:
  explicit PiecewiseLinearComponent(const PiecewiseLinearComponent &other);
  virtual int32 InputDim() const { return params_.NumRows(); }
  virtual int32 OutputDim() const { return params_.NumRows(); }

  void Init(int32 dim, int32 N,
            BaseFloat learning_rate,
            BaseFloat max_change);

  virtual std::string Info() const;
  
  virtual void InitFromString(std::string args);
  
  PiecewiseLinearComponent(): is_gradient_(false), max_change_(0.0) { } // use Init to really initialize.
  
  virtual std::string Type() const { return "PiecewiseLinearComponent"; }
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return false; }
759
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
760
                         int32 num_chunks,
761
                         CuMatrix<BaseFloat> *out) const;
762 763
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const UpdatableComponent &other);
764 765 766
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value, // dummy
                        const CuMatrixBase<BaseFloat> &out_deriv,
767 768
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
769
                        CuMatrix<BaseFloat> *in_deriv) const;
770 771 772 773 774 775 776
  virtual void SetZero(bool treat_as_gradient);
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual Component* Copy() const;
  virtual void PerturbParams(BaseFloat stddev);

777
  const CuMatrix<BaseFloat> &Params() { return params_; }
778 779 780 781 782 783 784 785
  
  virtual int32 GetParameterDim() const;

  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

 protected:
  const PiecewiseLinearComponent &operator = (const PiecewiseLinearComponent &other); // Disallow.
786
  CuMatrix<BaseFloat> params_;
787 788 789
  
  bool is_gradient_; // If true, treat this as just a gradient.
  BaseFloat max_change_; // If nonzero, maximum change allowed per individual
790
                         // parameter per minibatch.  
791 792 793
};


794 795 796 797 798 799 800 801 802 803 804 805 806
// This is an idea Dan is trying out, a little bit like
// preconditioning the update with the Fisher matrix, but the
// Fisher matrix has a special structure.
// [note: it is currently used in the standard receipe].
class AffineComponentPreconditioned: public AffineComponent {
 public:
  virtual std::string Type() const { return "AffineComponentPreconditioned"; }

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  void Init(BaseFloat learning_rate,
            int32 input_dim, int32 output_dim,
            BaseFloat param_stddev, BaseFloat bias_stddev,
807 808 809 810
            BaseFloat alpha, BaseFloat max_change);
  void Init(BaseFloat learning_rate, BaseFloat alpha,
            BaseFloat max_change, std::string matrix_filename);
  
811 812 813
  virtual void InitFromString(std::string args);
  virtual std::string Info() const;
  virtual Component* Copy() const;
814
  AffineComponentPreconditioned(): alpha_(1.0), max_change_(0.0) { }
815
  void SetMaxChange(BaseFloat max_change) { max_change_ = max_change; }
816 817
 protected:
  friend class AffineComponentPreconditionedOnline;
818 819
  KALDI_DISALLOW_COPY_AND_ASSIGN(AffineComponentPreconditioned);
  BaseFloat alpha_;
820 821 822 823 824
  BaseFloat max_change_; // If > 0, this is the maximum amount of parameter change (in L2 norm)
                         // that we allow per minibatch.  This was introduced in order to
                         // control instability.  Instead of the exact L2 parameter change,
                         // for efficiency purposes we limit a bound on the exact change.
                         // The limit is applied via a constant <= 1.0 for each minibatch,
825
                         // A suitable value might be, for example, 10 or so; larger if there are
826 827 828 829
                         // more parameters.

  /// The following function is only called if max_change_ > 0.  It returns the
  /// greatest value alpha <= 1.0 such that (alpha times the sum over the
830
  /// row-index of the two matrices of the product the l2 norms of the two rows
831 832
  /// times learning_rate_)
  /// is <= max_change.
833 834
  BaseFloat GetScalingFactor(const CuMatrix<BaseFloat> &in_value_precon,
                             const CuMatrix<BaseFloat> &out_deriv_precon);
835

836
  virtual void Update(
837 838
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);
839 840 841
};


842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862
/// AffineComponentPreconditionedOnline is, like AffineComponentPreconditioned,
/// a version of AffineComponent that has a non-(multiple of unit) learning-rate
/// matrix.  See nnet-precondition-online.h for a description of the technique.
/// This method maintains an orthogonal matrix N with a small number of rows,
/// actually two (for input and output dims) which gets modified each time;
/// we maintain a mutex for access to this (we just use it to copy it when
/// we need it and write to it when we change it).  For multi-threaded use,
/// the parallelization method is to lock a mutex whenever we want to
/// read N or change it, but just quickly make a copy and release the mutex;
/// this is to ensure operations on N are atomic.
class AffineComponentPreconditionedOnline: public AffineComponent {
 public:
  virtual std::string Type() const {
    return "AffineComponentPreconditionedOnline";
  }

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  void Init(BaseFloat learning_rate,
            int32 input_dim, int32 output_dim,
            BaseFloat param_stddev, BaseFloat bias_stddev,
863
            int32 rank_in, int32 rank_out, int32 update_period,
864
            BaseFloat num_samples_history, BaseFloat alpha,
865
            BaseFloat max_change_per_sample);
866
  void Init(BaseFloat learning_rate, int32 rank_in,
867 868
            int32 rank_out, int32 update_period,
            BaseFloat num_samples_history,
869 870 871 872 873 874 875
            BaseFloat alpha, BaseFloat max_change_per_sample,
            std::string matrix_filename);

  // This constructor is used when converting neural networks partway
  // through training, from AffineComponentPreconditioned to
  // AffineComponentPreconditionedOnline.
  AffineComponentPreconditionedOnline(const AffineComponentPreconditioned &orig,
876
                                      int32 rank_in, int32 rank_out,
877
                                      int32 update_period,
878
                                      BaseFloat eta, BaseFloat alpha);
879 880 881 882
  
  virtual void InitFromString(std::string args);
  virtual std::string Info() const;
  virtual Component* Copy() const;
883
  AffineComponentPreconditionedOnline(): max_change_per_sample_(0.0) { }
884 885 886 887

 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(AffineComponentPreconditionedOnline);

888

889 890 891 892
  // Configs for preconditioner.  The input side tends to be better conditioned ->
  // smaller rank needed, so make them separately configurable.
  int32 rank_in_;
  int32 rank_out_;
893
  int32 update_period_;
894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926
  BaseFloat num_samples_history_;
  BaseFloat alpha_;
  
  OnlinePreconditioner preconditioner_in_;

  OnlinePreconditioner preconditioner_out_;

  BaseFloat max_change_per_sample_;
  // If > 0, max_change_per_sample_ this is the maximum amount of parameter
  // change (in L2 norm) that we allow per sample, averaged over the minibatch.
  // This was introduced in order to control instability.
  // Instead of the exact L2 parameter change, for
  // efficiency purposes we limit a bound on the exact
  // change.  The limit is applied via a constant <= 1.0
  // for each minibatch, A suitable value might be, for
  // example, 10 or so; larger if there are more
  // parameters.

  /// The following function is only called if max_change_per_sample_ > 0, it returns a
  /// scaling factor alpha <= 1.0 (1.0 in the normal case) that enforces the
  /// "max-change" constraint.  "in_products" is the inner product with itself
  /// of each row of the matrix of preconditioned input features; "out_products"
  /// is the same for the output derivatives.  gamma_prod is a product of two
  /// scalars that are output by the preconditioning code (for the input and
  /// output), which we will need to multiply into the learning rate.
  /// out_products is a pointer because we modify it in-place.
  BaseFloat GetScalingFactor(const CuVectorBase<BaseFloat> &in_products,
                             BaseFloat gamma_prod,
                             CuVectorBase<BaseFloat> *out_products);

  // Sets the configs rank, alpha and eta in the preconditioner objects,
  // from the class variables.
  void SetPreconditionerConfigs();
927 928 929 930 931 932 933

  virtual void Update(
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);
};


934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972
/// AffineComponentModified as as AffineComponent but we are careful about
/// the lengths of rows of the parameter matrix, when we do the update.
/// That means, for a given row, we first do an update along the direction of
/// the existing vector; we then take the update orthogonal to that direction,
/// but keep the length of the vector fixed.
class AffineComponentModified: public AffineComponent {
 public:
  virtual std::string Type() const { return "AffineComponentModified"; }

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  void Init(BaseFloat learning_rate,
            int32 input_dim, int32 output_dim,
            BaseFloat param_stddev, BaseFloat bias_stddev,
            BaseFloat cutoff_length, BaseFloat max_change);
  void Init(BaseFloat learning_rate, BaseFloat cutoff_length,
            BaseFloat max_change, std::string matrix_filename);
  
  virtual void InitFromString(std::string args);
  virtual std::string Info() const;
  virtual Component* Copy() const;
  AffineComponentModified(): cutoff_length_(10.0), max_change_(0.1) { }
  
 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(AffineComponentModified);

  BaseFloat cutoff_length_; /// If the length of the vector corresponding to
  /// this row of the parameter matrix is less than this, we just do a regular
  /// gradient descent update.  This would typically be less than
  /// sqrt(InputDim())-- a value smaller than the expected length of the
  /// parameter vector.
  
  BaseFloat max_change_; /// [if above the cutoff], this is the maximum
                         /// change allowed in the vector per minibatch,
                         /// as a proportion of the previous value.  We separately
                         /// apply this constraint to both the length and direction.  Should
                         /// be less than one, e.g. 0.1 or 0.01.

  virtual void Update(
973 974
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);
975 976 977
};


978 979 980 981 982 983 984 985 986 987 988 989
class RandomComponent: public Component {
 public:
  // This function is required in testing code and in other places we need
  // consistency in the random number generation (e.g. when optimizing
  // validation-set performance), but check where else we call srand().  You'll
  // need to call srand as well as making this call.  
  void ResetGenerator() { random_generator_.SeedGpu(0); }
 protected:
  CuRand<BaseFloat> random_generator_;
};


990

991 992 993 994 995 996 997
struct PreconditionConfig { // relates to AffineComponentA
  BaseFloat alpha;
  bool do_precondition;
  bool renormalize;
  
  PreconditionConfig(): alpha(0.1), do_precondition(true),
                        renormalize(true) { }
998
  void Register(OptionsItf *po) {
999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079
    po->Register("alpha", &alpha, "Smoothing constant used in "
                 "preconditioning of updates.");
    po->Register("do-precondition", &do_precondition, "Controls whether "
                 "or not preconditioning is applied in the L-BFGS update.");
    po->Register("renormalize", &renormalize, "If true, in the preconditioning "
                 "we renormalize with a scalar so the projected scatter has the "
                 "same trace as before preconditioning.");
  }
};


/**
   AffineComponentA is a special type of AffineComponent, that
   stores matrices for preconditioning similar to those used
   in the update function of AffineComponentPreconditioned.  This is
   intended for use as a preconditioner in L-BFGS updates.
   In this case we optionally store the preconditioning
   information with the gradient information, in a separate
   copy of the component.
*/
class AffineComponentA: public AffineComponent {
 public:
  AffineComponentA() { }
  
  virtual std::string Type() const { return "AffineComponentA"; }
  
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  // There is no Init function for now; we only have the
  // ability to initialize from another AffineComponent (or child
  // class).  This is because we imagine that the L-BFGS training
  // will be initialized from a system trained with SGD, for which
  // something like AffineComponentPreconditioned will be more
  // appropriate; we'll then convert the model.
  AffineComponentA(const AffineComponent &component);

  // We're not supporting initializing as this type.
  virtual void InitFromString(std::string args) { KALDI_ASSERT(0); }
  virtual Component* Copy() const;

  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const UpdatableComponent &other);

  
  // Some functions that are specific to this class:
  void InitializeScatter(); // Lets the class
  // know that it should accumulate the scatter matrix; sets
  // up input_scatter_ and output_scatter_.


  // This function uses the input_scatter_ and output_scatter_ variables of the
  // current class to transform the linear_params_ and bias_params_ variables of
  // "component".  If forward == true then we transform to the preconditioned
  // space; otherwise we transform back from the preconditioned to the canonical
  // space.  This is done differently depending if component->is_gradient_ ==
  // true, because gradients and parameters transform differently.  The alpha
  // value relates to smoothing with the unit matrix; it's not defined in quite
  // the same way as for AffineComponentPreconditioned.  See the code for
  // details.
  void Transform(const PreconditionConfig &config,
                 bool forward,
                 AffineComponent *component);

  // This function uses the input_scatter_ and output_scatter_ variables
  // current class to transform the linear_params_ and bias_params_ variables of
  // "component".  It is equivalent to multiplying by the inverse Fisher,
  // or approximate inverse Hessian.  It's the operation that you need
  // in optimization methods like L-BFGS, to transform from "gradient space"
  // into "model space".
  // Note: it's not const in this object, because we may cache stuff with the model.
  // See also the function "PreconditionNnet" in nnet-lbfgs.h, which
  // does this at the whole-neural-net level (by calling this function).
  void Precondition(const PreconditionConfig &config,
                    AffineComponent *component);
  
 private:

  // The following variables are not used for the actual neural net, but
  // only when is_gradient_ == true (when it's being used to store gradients),

1080
  CuSpMatrix<double> input_scatter_; // scatter of (input vectors extended with 1.)
1081 1082
  // This is only set up if this->is_gradient = true, and InitializeScatter()
  // has been called.
1083
  CuSpMatrix<double> output_scatter_;
1084 1085 1086

  // The following four quantities may be cached by the function "Transform",
  // to avoid duplicating work.
1087 1088 1089 1090
  CuTpMatrix<double> in_C_;
  CuTpMatrix<double> in_C_inv_;
  CuTpMatrix<double> out_C_;
  CuTpMatrix<double> out_C_inv_;
1091 1092 1093

  // The following two quantities may be cached by the function "Precondition",
  // to avoid duplicating work.
1094 1095
  CuSpMatrix<double> inv_fisher_in_;
  CuSpMatrix<double> inv_fisher_out_;
1096 1097 1098 1099
  
  // This function computes the matrix (and corresponding transpose-ness) that
  // we'd left-multiply a vector by when transforming the parameter/gradient
  // space.
1100
  static void ComputeTransforms(const CuSpMatrix<double> &scatter,
1101 1102
                                const PreconditionConfig &config,
                                double tot_count,
1103 1104
                                CuTpMatrix<double> *C,
                                CuTpMatrix<double> *C_inv);
1105 1106 1107

  // This function is called by "Precondition"; it pre-computes
  // certain quantities we'll need.
1108
  static void ComputePreconditioner(const CuSpMatrix<double> &scatter,
1109 1110
                                    const PreconditionConfig &config,
                                    double tot_count,
1111
                                    CuSpMatrix<double> *inv_fisher);
1112 1113 1114 1115 1116 1117

  void ClearPrecomputedQuantities();
  
  // The following update function is called when *this is
  // a gradient.  We only override this one.
  virtual void UpdateSimple(
1118 1119
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);
1120 1121 1122
};


1123
/// Splices a context window of frames together [over time]
1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135
class SpliceComponent: public Component {
 public:
  SpliceComponent() { }  // called only prior to Read() or Init().
  void Init(int32 input_dim,
            int32 left_context,
            int32 right_context,
            int32 const_component_dim=0);
  virtual std::string Type() const { return "SpliceComponent"; }
  virtual std::string Info() const;
  virtual void InitFromString(std::string args);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const;
1136 1137
  virtual int32 LeftContext() const { return left_context_; }
  virtual int32 RightContext() const { return right_context_; }
1138
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
1139
                         int32 num_chunks,
1140 1141 1142 1143
                         CuMatrix<BaseFloat> *out) const;
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
1144 1145
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
1146
                        CuMatrix<BaseFloat> *in_deriv) const;
1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160
  virtual bool BackpropNeedsInput() const { return false; }
  virtual bool BackpropNeedsOutput() const { return false; }
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(SpliceComponent);
  int32 input_dim_;
  int32 left_context_;
  int32 right_context_;
  int32 const_component_dim_;
};


1161

1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174
/// This is as SpliceComponent but outputs the max of
/// any of the inputs (taking the max across time).
class SpliceMaxComponent: public Component {
 public:
  SpliceMaxComponent() { }  // called only prior to Read() or Init().
  void Init(int32 dim,
            int32 left_context,
            int32 right_context);
  virtual std::string Type() const { return "SpliceMaxComponent"; }
  virtual std::string Info() const;
  virtual void InitFromString(std::string args);
  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }
1175 1176
  virtual int32 LeftContext() const { return left_context_; }
  virtual int32 RightContext() const { return right_context_; }
1177
  virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
1178
                         int32 num_chunks,
1179 1180 1181 1182
                         CuMatrix<BaseFloat> *out) const;
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
1183 1184
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
1185
                        CuMatrix<BaseFloat> *in_deriv) const;
1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return false; }
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(SpliceMaxComponent);
  int32 dim_;
  int32 left_context_;
  int32 right_context_;
};

// Affine means a linear function plus an offset.  PreconInput means we
// precondition using the inverse of the variance of each dimension of the input
// data.  Note that this doesn't take into account any scaling of the samples,
// but this doesn't really matter.  This has some relation to AdaGrad, except
// it's being done not per input dimension, rather than per parameter, and also
// we multiply by a separately supplied and updated learning rate which will
// typically vary with time.  Note: avg_samples is the number of samples over
// which we average the variance of the input data.
class AffinePreconInputComponent: public AffineComponent {
 public:
  void Init(BaseFloat learning_rate,
                    int32 input_dim, int32 output_dim,
                    BaseFloat param_stddev,
                    BaseFloat bias_stddev,
                    BaseFloat avg_samples);
  virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value, // dummy
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        int32 num_chunks,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv)