// nnet2bin/nnet-shuffle-egs.cc

// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
// Copyright 2014  Vimal Manohar

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet2/nnet-example-functions.h"

int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
29
    using namespace kaldi::nnet2;
30 31 32 33 34
    typedef kaldi::int32 int32;
    typedef kaldi::int64 int64;

    const char *usage =
        "Copy examples (typically single frames) for neural network training,\n"
35
        "from the input to output, but randomly shuffle the order. This program will keep\n"
36 37 38 39 40
        "all of the examples in memory at once, so don't give it too many.\n"
        "\n"
        "Usage:  nnet-shuffle-egs [options] <egs-rspecifier> <egs-wspecifier>\n"
        "\n"
        "nnet-shuffle-egs --srand=1 ark:train.egs ark:shuffled.egs\n";
41

42
    int32 srand_seed = 0;
43
    int32 buffer_size = 0;
44 45
    ParseOptions po(usage);
    po.Register("srand", &srand_seed, "Seed for random number generator ");
46
    po.Register("buffer-size", &buffer_size, "If >0, size of a buffer we use "
47 48
                "to do limited-memory partial randomization.  Otherwise, do "
                "full randomization.");
49

50 51 52
    po.Read(argc, argv);

    srand(srand_seed);
53

54 55 56 57 58 59 60 61
    if (po.NumArgs() != 2) {
      po.PrintUsage();
      exit(1);
    }

    std::string examples_rspecifier = po.GetArg(1),
        examples_wspecifier = po.GetArg(2);

62
    int64 num_done = 0;
63

64
    std::vector<std::pair<std::string, NnetExample*> > egs;
65 66
    SequentialNnetExampleReader example_reader(examples_rspecifier);
    NnetExampleWriter example_writer(examples_wspecifier);
67
    if (buffer_size == 0) {  // Do full randomization
68 69
      // Putting in an extra level of indirection here to avoid excessive
      // computation and memory demands when we have to resize the vector.
70

71
      for (; !example_reader.Done(); example_reader.Next())
72
        egs.push_back(std::make_pair(example_reader.Key(),
73
                                    new NnetExample(example_reader.Value())));
74

75 76 77
      std::random_shuffle(egs.begin(), egs.end());
    } else {
      KALDI_ASSERT(buffer_size > 0);
78 79
      egs.resize(buffer_size, 
          std::pair<std::string, NnetExample*>("", static_cast<NnetExample *>(NULL)));
80 81
      for (; !example_reader.Done(); example_reader.Next()) {
        int32 index = RandInt(0, buffer_size - 1);
82
        if (egs[index].second == NULL) {
83
          egs[index] = std::make_pair(example_reader.Key(),
84
                                    new NnetExample(example_reader.Value()));
85
        } else {
86
          example_writer.Write(egs[index].first, *(egs[index].second));
87
          egs[index].first = example_reader.Key();
88
          *(egs[index].second) = example_reader.Value();
89 90
          num_done++;
        }
91
      }
92
    }
93
    for (size_t i = 0; i < egs.size(); i++) {
94 95 96
      if (egs[i].second != NULL) {
        example_writer.Write(egs[i].first, *(egs[i].second));
        delete egs[i].second;
97
      }
98
      num_done++;
99
    }
100 101 102 103

    KALDI_LOG << "Shuffled order of " << num_done
              << " neural-network training examples "
              << (buffer_size ? "using a buffer (partial randomization)" : "");
104

105
    return (num_done == 0 ? 1 : 0);
106 107 108 109 110 111 112
  } catch(const std::exception &e) {
    std::cerr << e.what() << '\n';
    return -1;
  }
}