#!/bin/bash

# Copyright 2014  Brno University of Technology (Author: Karel Vesely)
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script appends the features in two or more data directories.

# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
# Default settings used by the rest of this script.
compress=true          # handed to copy-feats as --compress
length_tolerance=10    # handed to paste-feats; length tolerance in frames (trim to shortest)
nj=4                   # number of parallel jobs (and of data splits)
cmd=run.pl             # program used to launch the parallel jobs
# End configuration section.

# Record the exact invocation (script name followed by every argument) in the log.
echo "$0 $*"

# Load the environment setup when the current directory provides a path.sh.
if [ -f path.sh ]; then
  . ./path.sh
fi

# Source parse_options.sh (no slash in the name, so it is looked up via PATH);
# give up if that fails.
if ! . parse_options.sh; then
  exit 1
fi

# At least five positional arguments are required: two or more source data
# dirs followed by <dest-data-dir> <log-dir> <path-to-storage-dir>.
# Otherwise print the usage text and stop with a non-zero status.
if [ $# -lt 5 ]; then
   echo "usage: $0 [options] <src-data-dir1> <src-data-dir2> [<src-data-dirN>] <dest-data-dir> <log-dir> <path-to-storage-dir>"
   echo "e.g.: $0 data/train_mfcc data/train_bottleneck data/train_combined exp/append_mfcc_plp mfcc"
   echo "options: "
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --nj <nj>                                        # number of parallel jobs."
   echo "  --length-tolerance <frames>                      # length tolerance in frames (trim to shortest)."
   echo "  --compress (true|false)                          # passed to copy-feats --compress."
   exit 1
fi

# Split the positional arguments: all but the last three are source data
# dirs; the last three are <dest-data-dir> <log-dir> <path-to-storage-dir>.
# The expansions are quoted so that a path is never word-split or
# glob-expanded into several entries.
data_src_arr=("${@:1:$(( $# - 3 ))}")  # array of source data-dirs
data="${@: -3:1}"
logdir="${@: -2:1}"
ark_dir="${@: -1:1}"  # last arg.

data_src_first=${data_src_arr[0]}  # first source dir

# Make $ark_dir an absolute pathname: a value that does not start with '/'
# is taken relative to the current working directory.
case $ark_dir in
  /*) ;;
  *) ark_dir=$PWD/$ark_dir ;;
esac

# Split every source data dir into $nj parts; abort on the first failure.
for data_src in "${data_src_arr[@]}"; do
  utils/split_data.sh "$data_src" "$nj" || exit 1
done

# Create the archive, log and destination dirs up front and stop right away
# if that is not possible, instead of failing later in a less obvious place.
mkdir -p "$ark_dir" "$logdir" "$data" || exit 1

# Seed the destination with the files of the first source dir, so we get the
# other files, such as utt2spk. cp errors are silenced on purpose: without -r
# cp complains about sub-directories (e.g. the split dirs) and skips them.
cp "$data_src_first"/* "$data"/ 2>/dev/null

# These two were copied from the first source and do not describe the pasted
# features. -f keeps rm quiet when a file is absent but still reports real
# failures.
rm -f "$data/cmvn.scp" "$data/feats.scp"

# The last path component of the destination dir becomes part of every
# archive file name.
name=$(basename $data)

# Collect one scp input specifier per source dir. The list is kept as a plain
# string and expanded unquoted further down, so it splits into one argument
# per source.
data_src_args=
for data_src in ${data_src_arr[@]}; do
  data_src_args+=" scp:${data_src}/split${nj}/JOB/feats.scp"
done

# Output specifier: an archive plus its matching scp index for each job.
out_spec=ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp

# Launch jobs 1..$nj through $cmd. $cmd stays unquoted because it may carry
# its own options (see the --cmd usage text). The pipe is escaped so that it
# is passed on to $cmd instead of being interpreted by this shell.
$cmd JOB=1:$nj $logdir/append.JOB.log \
  paste-feats --length-tolerance=$length_tolerance $data_src_args ark:- \| \
  copy-feats --compress=$compress ark:- $out_spec \
  || exit 1
              
# Concatenate the per-job .scp files, in job order, into the destination
# feats.scp. Only the loop as a whole is redirected: the file is truncated
# once when the loop starts and every cat writes to the loop's stdout.
for ((n = 1; n <= nj; n++)); do
  cat "$ark_dir/pasted_$name.$n.scp" || exit 1
done > "$data/feats.scp" || exit 1

# Sanity check: compare the number of feature entries with the number of
# utterances in utt2spk. "cat | wc" is used rather than "wc < file" so that
# a missing file is counted as 0 lines instead of leaving the variable empty.
nf=$(cat "$data/feats.scp" | wc -l)
nu=$(cat "$data/utt2spk" | wc -l)
# A mismatch is only reported; it does not make the script fail.
if [ "$nf" -ne "$nu" ]; then
  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
  echo "consider using utils/fix_data_dir.sh $data"
fi

echo "Succeeded pasting features for $name into $data"