// feat/pitch-functions.h

// Copyright     2013  Pegah Ghahremani
//               2014  IMSL, PKU-HKUST (author: Wei Shi)
//               2014  Yanqing Sun, Junjie Wang,
//                     Daniel Povey, Korbinian Riedhammer
//                     Xin Lei

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_FEAT_PITCH_FUNCTIONS_H_
#define KALDI_FEAT_PITCH_FUNCTIONS_H_

#include <cassert>
#include <cstdlib>
#include <string>
#include <vector>

#include "base/kaldi-error.h"
#include "feat/mel-computations.h"
#include "itf/online-feature-itf.h"
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"

namespace kaldi {
/// @addtogroup  feat FeatureExtraction
/// @{

struct PitchExtractionOptions {
43
  // FrameExtractionOptions frame_opts;
44
  BaseFloat samp_freq;          // sample frequency in hertz
Yanqing Sun's avatar
Yanqing Sun committed
45 46
  BaseFloat frame_shift_ms;     // in milliseconds.
  BaseFloat frame_length_ms;    // in milliseconds.
Yanqing Sun's avatar
Yanqing Sun committed
47
  BaseFloat preemph_coeff;      // Preemphasis coefficient. [use is deprecated.]
Yanqing Sun's avatar
Yanqing Sun committed
48 49 50
  BaseFloat min_f0;             // min f0 to search (Hz)
  BaseFloat max_f0;             // max f0 to search (Hz)
  BaseFloat soft_min_f0;        // Minimum f0, applied in soft way, must not
51
                                // exceed min-f0
Yanqing Sun's avatar
Yanqing Sun committed
52 53 54
  BaseFloat penalty_factor;     // cost factor for FO change
  BaseFloat lowpass_cutoff;     // cutoff frequency for Low pass filter
  BaseFloat resample_freq;      // Integer that determines filter width when
55
                                // upsampling NCCF
Yanqing Sun's avatar
Yanqing Sun committed
56 57
  BaseFloat delta_pitch;        // the pitch tolerance in pruning lags
  BaseFloat nccf_ballast;       // Increasing this factor reduces NCCF for
58 59
                                // quiet frames, helping ensure pitch
                                // continuity in unvoiced region
60
  int32 lowpass_filter_width;   // Integer that determines filter width of
61
                                // lowpass filter
62
  int32 upsample_filter_width;  // Integer that determines filter width when
63
                                // upsampling NCCF
64

65 66
  // Below are newer config variables, not present in the original paper,
  // that relate to the online pitch extraction algorithm.
67 68 69 70 71

  // The maximum number of frames of latency that we allow the pitch-processing
  // to introduce, for online operation. If you set this to a large value,
  // there would be no inaccuracy from the Viterbi traceback (but it might make
  // you wait to see the pitch). This is not very relevant for the online
72
  // operation: normalization-right-context is more relevant, you
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
  // can just leave this value at zero.
  int32 max_frames_latency;

  // Only relevant for the function ComputeKaldiPitch which is called by
  // compute-kaldi-pitch-feats. If nonzero, we provide the input as chunks of
  // this size. This affects the energy normalization which has a small effect
  // on the resulting features, especially at the beginning of a file. For best
  // compatibility with online operation (e.g. if you plan to train models for
  // the online-deocding setup), you might want to set this to a small value,
  // like one frame.
  int32 frames_per_chunk;

  // Only relevant for the function ComputeKaldiPitch which is called by
  // compute-kaldi-pitch-feats, and only relevant if frames_per_chunk is
  // nonzero. If true, it will query the features as soon as they are
  // available, which simulates the first-pass features you would get in online
  // decoding. If false, the features you will get will be the same as those
  // available at the end of the utterance, after InputFinished() has been
  // called: e.g. during lattice rescoring.
  bool simulate_first_pass_online;

  // Only relevant for online operation or when emulating online operation
  // (e.g. when setting frames_per_chunk). This is the frame-index on which we
  // recompute the NCCF (e.g. frame-index 500 = after 5 seconds); if the
  // segment ends before this we do it when the segment ends. We do this by
  // re-computing the signal average energy, which affects the NCCF via the
  // "ballast term", scaling the resampled NCCF by a factor derived from the
  // average change in the "ballast term", and re-doing the backtrace
  // computation. Making this infinity would be the most exact, but would
  // introduce unwanted latency at the end of long utterances, for little
  // benefit.
  int32 recompute_frame;

  // This is a "hidden config" used only for testing the online pitch
  // extraction. If true, we compute the signal root-mean-squared for the
  // ballast term, only up to the current frame, rather than the end of the
  // current chunk of signal. This makes the output insensitive to the
  // chunking, which is useful for testing purposes.
  bool nccf_ballast_online;
112
  bool snip_edges;
113
  PitchExtractionOptions():
114 115
      samp_freq(16000),
      frame_shift_ms(10.0),
116
      frame_length_ms(25.0),
117
      preemph_coeff(0.0),
118 119 120 121
      min_f0(50),
      max_f0(400),
      soft_min_f0(10.0),
      penalty_factor(0.1),
122
      lowpass_cutoff(1000),
123
      resample_freq(4000),
124
      delta_pitch(0.005),
125
      nccf_ballast(7000),
126
      lowpass_filter_width(1),
127
      upsample_filter_width(5),
128
      max_frames_latency(0),
129
      frames_per_chunk(0),
130 131
      simulate_first_pass_online(false),
      recompute_frame(500),
132 133
      nccf_ballast_online(false),
      snip_edges(true) { }
134

Yanqing Sun's avatar
Yanqing Sun committed
135
  void Register(OptionsItf *po) {
136
    po->Register("sample-frequency", &samp_freq,
Yanqing Sun's avatar
Yanqing Sun committed
137 138 139 140
                 "Waveform data sample frequency (must match the waveform "
                 "file, if specified there)");
    po->Register("frame-length", &frame_length_ms, "Frame length in "
                 "milliseconds");
141 142
    po->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds");
    po->Register("preemphasis-coefficient", &preemph_coeff,
143
                 "Coefficient for use in signal preemphasis (deprecated)");
144 145 146 147 148 149 150 151 152
    po->Register("min-f0", &min_f0,
                 "min. F0 to search for (Hz)");
    po->Register("max-f0", &max_f0,
                 "max. F0 to search for (Hz)");
    po->Register("soft-min-f0", &soft_min_f0,
                 "Minimum f0, applied in soft way, must not exceed min-f0");
    po->Register("penalty-factor", &penalty_factor,
                 "cost factor for FO change.");
    po->Register("lowpass-cutoff", &lowpass_cutoff,
153
                 "cutoff frequency for LowPass filter (Hz) ");
154 155 156
    po->Register("resample-frequency", &resample_freq,
                 "Frequency that we down-sample the signal to.  Must be "
                 "more than twice lowpass-cutoff");
157
    po->Register("delta-pitch", &delta_pitch,
158 159
                 "Smallest relative change in pitch that our algorithm "
                 "measures");
160 161
    po->Register("nccf-ballast", &nccf_ballast,
                 "Increasing this factor reduces NCCF for quiet frames");
162 163 164
    po->Register("nccf-ballast-online", &nccf_ballast_online,
                 "This is useful mainly for debug; it affects how the NCCF "
                 "ballast is computed.");
165
    po->Register("lowpass-filter-width", &lowpass_filter_width,
Pegah Ghahremani's avatar
 
Pegah Ghahremani committed
166 167
                 "Integer that determines filter width of "
                 "lowpass filter, more gives sharper filter");
168 169
    po->Register("upsample-filter-width", &upsample_filter_width,
                 "Integer that determines filter width when upsampling NCCF");
170 171
    po->Register("frames-per-chunk", &frames_per_chunk, "Only relevant for "
                 "offline pitch extraction (e.g. compute-kaldi-pitch-feats), "
172
                 "you can set it to a small nonzero value, such as 10, for "
173 174
                 "better feature compatibility with online decoding (affects "
                 "energy normalization in the algorithm)");
175
    po->Register("simulate-first-pass-online", &simulate_first_pass_online,
176
                 "If true, compute-kaldi-pitch-feats will output features "
177 178 179 180 181 182 183 184 185 186
                 "that correspond to what an online decoder would see in the "
                 "first pass of decoding-- not the final version of the "
                 "features, which is the default.  Relevant if "
                 "--frames-per-chunk > 0");
    po->Register("recompute-frame", &recompute_frame, "Only relevant for "
                 "online pitch extraction, or for compatibility with online "
                 "pitch extraction.  A non-critical parameter; the frame at "
                 "which we recompute some of the forward pointers, after "
                 "revising our estimate of the signal energy.  Relevant if"
                 "--frames-per-chunk > 0");
Yanqing Sun's avatar
Yanqing Sun committed
187 188
    po->Register("max-frames-latency", &max_frames_latency, "Maximum number "
                 "of frames of latency that we allow pitch tracking to "
189
                 "introduce into the feature processing (affects output only "
190 191
                 "if --frames-per-chunk > 0 and "
                 "--simulate-first-pass-online=true");
192 193 194
    po->Register("snip-edges", &snip_edges, "If this is set to false, the "
                 "incomplete frames near the ending edge won't be snipped, so "
                 "that the number of frames is the file size divided by the "
195
                 "frame-shift. This makes different types of features give the "
196 197
                 "same number of frames.");

198
  }
199 200
  /// Returns the window-size in samples, after resampling.  This is the
  /// "basic window size", not the full window size after extending by max-lag.
201
  int32 NccfWindowSize() const {
202
    return static_cast<int32>(resample_freq * 0.001 * frame_length_ms);
203
  }
204
  /// Returns the window-shift in samples, after resampling.
205
  int32 NccfWindowShift() const {
206
    return static_cast<int32>(resample_freq * 0.001 * frame_shift_ms);
207 208
  }
};
struct ProcessPitchOptions {
211 212 213 214 215 216 217
  BaseFloat pitch_scale;  // the final normalized-log-pitch feature is scaled
                          // with this value
  BaseFloat pov_scale;    // the final POV feature is scaled with this value
  BaseFloat pov_offset;   // An offset that can be added to the final POV
                          // feature (useful for online-decoding, where we don't
                          // do CMN to the pitch-derived features.

218
  BaseFloat delta_pitch_scale;
Yanqing Sun's avatar
Yanqing Sun committed
219
  BaseFloat delta_pitch_noise_stddev;  // stddev of noise we add to delta-pitch
220 221
  int32 normalization_left_context;    // left-context used for sliding-window
                                       // normalization
222 223
  int32 normalization_right_context;   // this should be reduced in online
                                       // decoding to reduce latency
224

225
  int32 delta_window;
226
  int32 delay;
227 228 229
  
  bool add_pov_feature;  
  bool add_normalized_log_pitch;  
230
  bool add_delta_pitch;
231
  bool add_raw_log_pitch;
232
  
233
  ProcessPitchOptions() :
234 235 236 237 238 239 240 241
      pitch_scale(2.0),
      pov_scale(2.0),
      pov_offset(0.0),
      delta_pitch_scale(10.0),
      delta_pitch_noise_stddev(0.005),
      normalization_left_context(75),
      normalization_right_context(75),
      delta_window(2),
242
      delay(0),
243 244 245 246 247
      add_pov_feature(true),
      add_normalized_log_pitch(true),
      add_delta_pitch(true),
      add_raw_log_pitch(false) { }

248 249 250

  void Register(ParseOptions *po) {
    po->Register("pitch-scale", &pitch_scale,
251
                 "Scaling factor for the final normalized log-pitch value");
252
    po->Register("pov-scale", &pov_scale,
Yanqing Sun's avatar
Yanqing Sun committed
253 254
                 "Scaling factor for final POV (probability of voicing) "
                 "feature");
255 256
    po->Register("pov-offset", &pov_offset,
                 "This can be used to add an offset to the POV feature. "
257 258
                 "Intended for use in online decoding as a substitute for "
                 " CMN.");
259
    po->Register("delta-pitch-scale", &delta_pitch_scale,
260
                 "Term to scale the final delta log-pitch feature");
261
    po->Register("delta-pitch-noise-stddev", &delta_pitch_noise_stddev,
Yanqing Sun's avatar
Yanqing Sun committed
262 263 264 265 266
                 "Standard deviation for noise we add to the delta log-pitch "
                 "(before scaling); should be about the same as delta-pitch "
                 "option to pitch creation.  The purpose is to get rid of "
                 "peaks in the delta-pitch caused by discretization of pitch "
                 "values.");
267 268 269
    po->Register("normalization-left-context", &normalization_left_context,
                 "Left-context (in frames) for moving window normalization");
    po->Register("normalization-right-context", &normalization_right_context,
270
                 "Right-context (in frames) for moving window normalization");
271
    po->Register("delta-window", &delta_window,
Yanqing Sun's avatar
Yanqing Sun committed
272 273
                 "Number of frames on each side of central frame, to use for "
                 "delta window.");
274 275
    po->Register("delay", &delay,
                 "Number of frames by which the pitch information is delayed.");
276
    po->Register("add-pov-feature", &add_pov_feature,
277
                 "If true, the warped NCCF is added to output features");
278
    po->Register("add-normalized-log-pitch", &add_normalized_log_pitch,
279 280
                 "If true, the log-pitch with POV-weighted mean subtraction "
                 "over 1.5 second window is added to output features");
281
    po->Register("add-delta-pitch", &add_delta_pitch,
282 283
                 "If true, time derivative of log-pitch is added to output "
                 "features");
284
    po->Register("add-raw-log-pitch", &add_raw_log_pitch,
285
                 "If true, log(pitch) is added to output features");
286 287
  }
};

// We don't want to expose the pitch-extraction internals here as it's
// quite complex, so we use a private implementation.
class OnlinePitchFeatureImpl;


// Note: to start on a new waveform, just construct a new version
// of this object.
class OnlinePitchFeature: public OnlineBaseFeature {
 public:
  explicit OnlinePitchFeature(const PitchExtractionOptions &opts);
Yanqing Sun's avatar
Yanqing Sun committed
300

301
  virtual int32 Dim() const { return 2; /* (NCCF, pitch) */ }
302 303 304 305 306 307

  virtual int32 NumFramesReady() const;

  virtual bool IsLastFrame(int32 frame) const;

  /// Outputs the two-dimensional feature consisting of (pitch, NCCF).  You
308
  /// should probably post-process this using class OnlineProcessPitch.
309 310 311 312 313 314 315 316
  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  virtual void AcceptWaveform(BaseFloat sampling_rate,
                              const VectorBase<BaseFloat> &waveform);

  virtual void InputFinished();

  virtual ~OnlinePitchFeature();
317

318 319
 private:
  OnlinePitchFeatureImpl *impl_;
Yanqing Sun's avatar
Yanqing Sun committed
320 321 322 323
};


/// This online-feature class implements post processing of pitch features.
324
/// Inputs are original 2 dims (nccf, pitch).  It can produce various
325 326
/// kinds of outputs, using the default options it will be (pov-feature,
/// normalized-log-pitch, delta-log-pitch).
327
class OnlineProcessPitch: public OnlineFeatureInterface {
Yanqing Sun's avatar
Yanqing Sun committed
328 329 330 331
 public:
  virtual int32 Dim() const { return dim_; }

  virtual bool IsLastFrame(int32 frame) const {
332 333 334 335 336 337 338
    if (frame <= -1)
      return src_->IsLastFrame(-1);
    else if (frame < opts_.delay) 
      return src_->IsLastFrame(-1) == true ? false : src_->IsLastFrame(0);
    else
      return src_->IsLastFrame(frame - opts_.delay); 
  }
Yanqing Sun's avatar
Yanqing Sun committed
339

340
  virtual int32 NumFramesReady() const;
Yanqing Sun's avatar
Yanqing Sun committed
341 342 343

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

344
  virtual ~OnlineProcessPitch() {  }
Yanqing Sun's avatar
Yanqing Sun committed
345

346 347 348
  // Does not take ownership of "src".
  OnlineProcessPitch(const ProcessPitchOptions &opts,
                     OnlineFeatureInterface *src);
349

Yanqing Sun's avatar
Yanqing Sun committed
350
 private:
351
  static const int32 kRawFeatureDim = 2;  // input: (nccf, pitch)
Yanqing Sun's avatar
Yanqing Sun committed
352

353
  ProcessPitchOptions opts_;
Yanqing Sun's avatar
Yanqing Sun committed
354
  OnlineFeatureInterface *src_;
355 356
  int32 dim_;  // Output feature dimension, set in initializer.

357
  struct NormalizationStats {
358 359 360 361 362 363 364
    int32 cur_num_frames;      // value of src_->NumFramesReady() when
                               // "mean_pitch" was set.
    bool input_finished;       // true if input data was finished when
                               // "mean_pitch" was computed.
    double sum_pov;            // sum of pov over relevant range
    double sum_log_pitch_pov;  // sum of log(pitch) * pov over relevant range

365 366 367 368 369
    NormalizationStats(): cur_num_frames(-1), input_finished(false),
                          sum_pov(0.0), sum_log_pitch_pov(0.0) { }
  };

  std::vector<BaseFloat> delta_feature_noise_;
370

371 372 373 374 375
  std::vector<NormalizationStats> normalization_stats_;

  /// Computes and returns the POV feature for this frame.
  /// Called from GetFrame().  
  inline BaseFloat GetPovFeature(int32 frame) const;  
Yanqing Sun's avatar
Yanqing Sun committed
376

377 378 379
  /// Computes and returns the delta-log-pitch feature for this frame.
  /// Called from GetFrame().
  inline BaseFloat GetDeltaPitchFeature(int32 frame);
380

381 382 383
  /// Computes and returns the raw log-pitch feature for this frame.
  /// Called from GetFrame().  
  inline BaseFloat GetRawLogPitchFeature(int32 frame) const;
Yanqing Sun's avatar
Yanqing Sun committed
384

385 386 387
  /// Computes and returns the mean-subtracted log-pitch feature for this frame.
  /// Called from GetFrame().
  inline BaseFloat GetNormalizedLogPitchFeature(int32 frame);
Yanqing Sun's avatar
Yanqing Sun committed
388

389
  /// Computes the normalization window sizes.
390 391 392 393
  inline void GetNormalizationWindow(int32 frame,
                                     int32 src_frames_ready,
                                     int32 *window_begin,
                                     int32 *window_end) const;
394

395 396
  /// Makes sure the entry in normalization_stats_ for this frame is up to date;
  /// called from GetNormalizedLogPitchFeature.
397
  inline void UpdateNormalizationStats(int32 frame);
398 399
};
/// This function extracts (pitch, NCCF) per frame, using the pitch extraction
/// method described in "A Pitch Extraction Algorithm Tuned for Automatic Speech
/// Recognition", Pegah Ghahremani, Bagher BabaAli, Daniel Povey, Korbinian
/// Riedhammer, Jan Trmal and Sanjeev Khudanpur, ICASSP 2014.  The output will
/// have as many rows as there are frames, and two columns corresponding to
406
/// (NCCF, pitch)
407 408 409 410
void ComputeKaldiPitch(const PitchExtractionOptions &opts,
                       const VectorBase<BaseFloat> &wave,
                       Matrix<BaseFloat> *output);
/// This function processes the raw (NCCF, pitch) quantities computed by
412 413 414 415 416 417 418 419 420
/// ComputeKaldiPitch, and processes them into features.  By default it will
/// output three-dimensional features, (POV-feature, mean-subtracted-log-pitch,
/// delta-of-raw-pitch), but this is configurable in the options.  The number of
/// rows of "output" will be the number of frames (rows) in "input", and the
/// number of columns will be the number of different types of features
/// requested (by default, 3; 4 is the max).  The four config variables
/// --add-pov-feature, --add-normalized-log-pitch, --add-delta-pitch,
/// --add-raw-log-pitch determine which features we create; by default we create
/// the first three.
421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
void ProcessPitch(const ProcessPitchOptions &opts,
                  const MatrixBase<BaseFloat> &input,
                  Matrix<BaseFloat> *output);

/// This function combines ComputeKaldiPitch and ProcessPitch.  The reason
/// why we need a separate function to do this is in order to be able to
/// accurately simulate the online pitch-processing, for testing and for
/// training models matched to the "first-pass" features.  It is sensitive to
/// the variables in pitch_opts that relate to online processing,
/// i.e. max_frames_latency, frames_per_chunk, simulate_first_pass_online,
/// recompute_frame.
///
/// @param [in]  pitch_opts    Options for the pitch-extraction stage,
///                            including the online-related settings above.
/// @param [in]  process_opts  Options for the post-processing stage.
/// @param [in]  wave          Input waveform samples.
/// @param [out] output        Receives the processed pitch features.
///                            NOTE(review): presumably one row per frame, as
///                            for ProcessPitch -- confirm in the .cc file.
void ComputeAndProcessKaldiPitch(const PitchExtractionOptions &pitch_opts,
                                 const ProcessPitchOptions &process_opts,
                                 const VectorBase<BaseFloat> &wave,
                                 Matrix<BaseFloat> *output);

/// @} End of "addtogroup feat"
}  // namespace kaldi
#endif  // KALDI_FEAT_PITCH_FUNCTIONS_H_