Commit d9d13e11 authored by Dan Povey's avatar Dan Povey

sandbox/online: copying various files from trunk that had been skipped when merging trunk to online previously.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/online@3568 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent da133ee9
#!/usr/bin/perl -w
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This program is a bit like ./sym2int.pl in that it applies a map
# to things in a file, but it's more general in that it doesn't
# assume the things being mapped to are single tokens; they may
# be sequences of tokens. See the usage message.
# This version preserves tabs.
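# For example (hypothetical input), with the map file
#   A a1 a2
#   B b
# the input line "A<TAB>B" becomes "a1 a2<TAB>b"; the tab is preserved,
# unlike with ./apply_map.pl. With -f 2, only the second field of the
# line (here B) would be mapped.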
if (@ARGV > 0 && $ARGV[0] eq "-f") {
  shift @ARGV;
  $field_spec = shift @ARGV;
  if ($field_spec =~ m/^\d+$/) {
    $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
  }
  if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
    if ($1 ne "") {
      $field_begin = $1 - 1;  # Change to zero-based indexing.
    }
    if ($2 ne "") {
      $field_end = $2 - 1;  # Change to zero-based indexing.
    }
  }
  if (!defined $field_begin && !defined $field_end) {
    die "Bad argument to -f option: $field_spec";
  }
}
# By default, mapping is obligatory (an unmapped key is an error).
$permissive = 0;
if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
  shift @ARGV;
  # Mapping is optional (a missing key is printed to the output unchanged).
  $permissive = 1;
}
if (@ARGV != 1) {
  print STDERR "Usage: apply_map_tab_preserving.pl [options] map <input >output\n" .
    "options: [-f <field-range>]\n" .
    "Applies the map 'map' to all input text, where each line of the map\n" .
    "is interpreted as a map from the first field to the list of the other fields.\n" .
    "Note: <field-range> can look like 4-5, or 4-, or -5, or 1; it means the field\n" .
    "range in the input to apply the map to.\n" .
    "e.g.: echo A B | apply_map_tab_preserving.pl a.txt\n" .
    "where a.txt is:\n" .
    "A a1 a2\n" .
    "B b\n" .
    "will produce:\n" .
    "a1 a2 b\n";
  exit(1);
}
($map) = @ARGV;
open(M, "<$map") || die "Error opening map file $map: $!";
while (<M>) {
  @A = split(" ", $_);
  @A >= 1 || die "apply_map_tab_preserving.pl: empty line in map file.";
  $i = shift @A;
  $o = join(" ", @A);
  $map{$i} = $o;
}
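# Process the input: split each line first on tabs, then each tab-separated
# chunk on whitespace; the -f field range is counted across the whole line
# (via $field_offset), and the tabs are restored when printing.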
while (<STDIN>) {
  @A = split("\t", $_);
  $field_offset = 0;
  for ($n = 0; $n < @A; $n++) {
    @B = split(" ", $A[$n]);
    for ($x = 0; $x < @B; $x++) {
      $y = $x + $field_offset;
      if ( (!defined $field_begin || $y >= $field_begin)
           && (!defined $field_end || $y <= $field_end)) {
        $b = $B[$x];
        if (!defined $map{$b}) {
          if (!$permissive) {
            die "apply_map_tab_preserving.pl: undefined key $b\n";
          } else {
            print STDERR "apply_map_tab_preserving.pl: warning! missing key $b\n";
          }
        } else {
          $B[$x] = $map{$b};
        }
      }
    }
    $field_offset += @B;
    $A[$n] = join(" ", @B);
  }
  print join("\t", @A) . "\n";
}
#!/bin/bash
# Copyright 2014 Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0
# Begin configuration section.
nj=4
cmd=run.pl
acwt=0.1
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
  echo "Usage: $0 [options] <data-dir> <model-dir> <ali-dir> <decode-dir> <out-dir>"
  echo " e.g.: local/prepare_confusions.sh --nj 32 exp/sgmm5/graph exp/sgmm5 exp/sgmm5_ali exp/sgmm5_denlats exp/conf_matrix"
  echo ""
  echo "main options (for others, see top of script file)"
  echo "  --nj <nj>                  # number of parallel jobs"
  echo "  --cmd <cmd>                # command to run the parallel jobs with"
  echo "  --acwt <value|default=0.1> # acoustic model weight, used for 1-best path decoding of the lattices"
  echo ""
  echo "Please note that the output confusion matrix will be phoneme-based"
  echo "and all the phone contexts (singleton, intra, begin, end) or phoneme"
  echo "tags (such as tone or stress) will be collapsed into a single monophone."
  echo ""
  echo "The output format is line-oriented."
  echo "Each line can have one of these four formats (A and B being different phones, <eps> a special symbol):"
  echo "  A A count      # Number of hits, i.e. correctly determined phones"
  echo "  A B count      # Number of substitutions of A with B"
  echo "  A <eps> count  # Number of deletions"
  echo "  <eps> A count  # Number of insertions"
  exit 1;
fi
set -u
set -e
data=$1; shift
modeldir=$1; shift
alidir=$1; shift
latdir=$1; shift
wdir=$1; shift
model=$modeldir/final.mdl
[ ! -f $model ] && echo "File $model does not exist!" && exit 1
phones=$data/phones.txt
[ ! -f $phones ] && echo "File $phones does not exist!" && exit 1
! ali_nj=`cat $alidir/num_jobs` && echo "Could not open the file $alidir/num_jobs" && exit 1
! lat_nj=`cat $latdir/num_jobs` && echo "Could not open the file $latdir/num_jobs" && exit 1
if [ $ali_nj -ne $lat_nj ] ; then
  echo "Alignment num_jobs and lattice num_jobs mismatch!"
  exit 1
fi
[ $nj -gt $ali_nj ] && echo "Number of jobs is too high (max is $ali_nj)." && nj=$ali_nj
mkdir -p $wdir/log
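# Collapse tagged phones to monophones by stripping everything from the first
# underscore up to the end of the phone name, e.g. (hypothetical entries)
# "a_B 15" -> "a 15" and "a_S 23" -> "a 23".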
cat $data/phones.txt | sed "s/^\([^ _\t][^ _\t]*\)_[^ \t][^ \t]* /\1 /g" > $wdir/phones.txt
echo "Converting alignments to phone sequences..."
$cmd JOB=1:$nj $wdir/log/ali_to_phones.JOB.log \
  compute-wer --text --mode=all \
    ark:\<\( \
      ali-to-phones $model ark:"gunzip -c $alidir/ali.JOB.gz|" ark,t:- \| \
      int2sym.pl -f 2- $wdir/phones.txt - \) \
    ark:\<\( \
      lattice-to-phone-lattice $model ark:"gunzip -c $latdir/lat.JOB.gz|" ark:- \| \
      lattice-best-path --acoustic-scale=$acwt ark:- ark,t:- ark:/dev/null \| \
      int2sym.pl -f 2- $wdir/phones.txt - \) \
    $wdir/confusions.JOB.txt
confusion_files=""
for i in `seq 1 $nj` ; do
confusion_files="$confusion_files $wdir/confusions.$i.txt"
done
echo "Converting statistics..."
cat $confusion_files | sort | uniq -c | grep -v -E '<oov>|<sss>|<vns>|SIL' | \
  perl -ane '
    if ($F[1] eq "correct") {
      die "Unknown format " . join(" ", @F) . "\n" if ($#F != 2);
      print "$F[2] $F[2] $F[0]\n";
    } elsif ($F[1] eq "deletion") {
      die "Unknown format " . join(" ", @F) . "\n" if ($#F != 2);
      print "$F[2] <eps> $F[0]\n";
    } elsif ($F[1] eq "insertion") {
      die "Unknown format " . join(" ", @F) . "\n" if ($#F != 2);
      print "<eps> $F[2] $F[0]\n";
    } elsif ($F[1] eq "substitution") {
      die "Unknown format " . join(" ", @F) . "\n" if ($#F != 3);
      print "$F[2] $F[3] $F[0]\n";
    } else {
      die "Unknown line " . join(" ", @F) . "\n";
    }' > $wdir/confusions.txt
exit 0
#-echo "Converting alignments to phone sequences..."
#-$cmd JOB=1:$nj $wdir/log/ali_to_phones.JOB.log \
#- ali-to-phones $model ark:"gunzip -c $alidir/ali.JOB.gz|" ark,t:- \|\
#- int2sym.pl -f 2- $wdir/phones.txt - \> $wdir/ali.JOB.txt
#-
#-echo "Converting lattices to phone sequences..."
#-$cmd JOB=1:$nj $wdir/log/lat_to_phones.JOB.log \
#- lattice-to-phone-lattice $model ark:"gunzip -c $latdir/lat.JOB.gz|" ark:- \| \
#- lattice-best-path --acoustic-scale=$acwt ark:- ark,t:- ark:/dev/null \| \
#- int2sym.pl -f 2- $wdir/phones.txt - \> $wdir/lat.JOB.txt
#!/bin/bash
# Copyright 2014 Guoguo Chen
# Apache 2.0.
# Begin configuration section.
nj=8
cmd=run.pl
beam=-1           # Beam for proxy FST; -1 means no prune.
phone_beam=-1     # Beam for KxL2xE FST; -1 means no prune.
nbest=-1          # Use top n best proxy keywords in proxy FST; -1 means all
                  # proxies.
phone_nbest=50    # Use top n best phone sequences in KxL2xE; -1 means all
                  # phone sequences.
phone_cutoff=5    # We don't generate proxy keywords for OOV keywords that
                  # have fewer phones than the specified cutoff, as they may
                  # introduce a lot of false alarms.
confusion_matrix= # If supplied, use the corresponding E transducer.
count_cutoff=1    # Minimal count to be considered in the confusion matrix;
                  # phone pairs with a count less than this will be ignored.
pron_probs=false  # If true, the lexicon looks like:
                  # Word Prob Phone1 Phone2...
case_insensitive=true
icu_transform="Any-Lower"
proxy_set=        # List of keywords to generate proxies for, one KWID per
                  # line. If empty, generate proxies for all OOV keywords
                  # by default.
# End configuration section.
[ -f ./path.sh ] && . ./path.sh; # source the path.
echo $0 "$@"
. parse_options.sh || exit 1;
if [ $# -ne 5 ]; then
  echo "Usage: local/kws_data_prep_proxy.sh <lang-dir> <data-dir> \\"
  echo "         <L1-lexicon> <L2-lexicon> <kws-data-dir>"
  echo " e.g.: local/kws_data_prep_proxy.sh data/lang/ data/dev10h/ \\"
  echo "       data/local/tmp.lang/lexiconp.txt oov_lexicon.txt data/dev10h/kws/"
  echo "allowed options:"
  echo "  --case-insensitive <true|false>  # Whether to do case-insensitive matching"
  echo "  --icu-transform <string>         # Transliteration for upper/lower case"
  echo "                                   # mapping"
  echo "  --proxy-set <IV/OOV>             # Keyword set for generating proxies"
  exit 1
fi
set -e
set -o pipefail
langdir=$1
datadir=$2
l1_lexicon=$3
l2_lexicon=$4
kwsdatadir=$5
# Checks some files.
for f in $langdir/words.txt $kwsdatadir/kwlist.xml $l1_lexicon $l2_lexicon; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1
done
keywords=$kwsdatadir/kwlist.xml
mkdir -p $kwsdatadir/tmp/
cat $keywords | perl -e '
  #binmode STDIN, ":utf8";
  binmode STDOUT, ":utf8";
  use XML::Simple;
  use Data::Dumper;
  my $data = XMLin(\*STDIN);
  #print Dumper($data->{kw});
  foreach $kwentry (@{$data->{kw}}) {
    #print Dumper($kwentry);
    print "$kwentry->{kwid}\t$kwentry->{kwtext}\n";
  }' > $kwsdatadir/raw_keywords_all.txt
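# As an illustration (hypothetical kwlist entry): an entry such as
#   <kw kwid="KW204-0001"><kwtext>example phrase</kwtext></kw>
# becomes the line "KW204-0001<TAB>example phrase" in raw_keywords_all.txt.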
# Takes care of upper/lower case.
cp $langdir/words.txt $kwsdatadir/words.txt
cat $l1_lexicon | sed 's/\s/ /g' > $kwsdatadir/tmp/L1.tmp.lex
if $case_insensitive; then
  echo "$0: Running case insensitive processing"
  echo "$0: Using ICU with transform \"$icu_transform\""

  # Processing words.txt
  cat $kwsdatadir/words.txt |\
    uconv -f utf8 -t utf8 -x "${icu_transform}" > $kwsdatadir/words.norm.txt

  # Processing lexicon
  cat $l2_lexicon | sed 's/\s/ /g' | cut -d ' ' -f 1 |\
    uconv -f utf8 -t utf8 -x "${icu_transform}" |\
    paste -d ' ' - <(cat $l2_lexicon | sed 's/\s/ /g' | cut -d ' ' -f 2-) \
    > $kwsdatadir/tmp/L2.tmp.lex

  paste <(cut -f 1 $kwsdatadir/raw_keywords_all.txt) \
        <(cut -f 2 $kwsdatadir/raw_keywords_all.txt |\
          uconv -f utf8 -t utf8 -x "${icu_transform}") \
    > $kwsdatadir/keywords_all.txt

  cat $kwsdatadir/keywords_all.txt |\
    local/kwords2indices.pl --map-oov 0 $kwsdatadir/words.norm.txt \
    > $kwsdatadir/keywords_all.int
else
  cat $l2_lexicon | sed 's/\s/ /g' > $kwsdatadir/tmp/L2.tmp.lex
  cp $kwsdatadir/raw_keywords_all.txt $kwsdatadir/keywords_all.txt
  cat $kwsdatadir/keywords_all.txt | \
    sym2int.pl --map-oov 0 -f 2- $kwsdatadir/words.txt \
    > $kwsdatadir/keywords_all.int
fi
# Writes some scoring related files.
cat $kwsdatadir/keywords_all.int |\
  egrep -v " 0 | 0$" | cut -f 1 -d ' ' |\
  local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_invocab.xml
cat $kwsdatadir/keywords_all.int |\
  egrep " 0 | 0$" | cut -f 1 -d ' ' |\
  local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_outvocab.xml

# Selects a set to generate proxies for. By default, generate proxies for OOV
# keywords.
if [ -z "$proxy_set" ]; then
  cat $kwsdatadir/keywords_all.int |\
    egrep " 0 | 0$" | awk '{print $1;}' | sort -u \
    > $kwsdatadir/keywords_proxy.list
else
  cp $proxy_set $kwsdatadir/keywords_proxy.list
fi
cat $kwsdatadir/keywords_all.txt |\
  grep -f $kwsdatadir/keywords_proxy.list > $kwsdatadir/keywords_proxy.txt
cat $kwsdatadir/keywords_proxy.txt |\
  cut -f 2- | awk '{for(x=1;x<=NF;x++) {print $x;}}' |\
  sort -u > $kwsdatadir/keywords_proxy_words.list
# Maps the original phone set to a "reduced" phone set. We limit L2 to cover
# only the words that are actually used in keywords_proxy.txt, for efficiency.
# Besides, if L1 and L2 contain the same words, we use the pronunciation from
# L1, since it is the lexicon used for the LVCSR training.
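# For illustration (hypothetical entries): a tagged pronunciation such as
#   WORD w_B o_I r_I d_E
# is reduced to
#   WORD w o r d
# before the two lexicons are merged.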
cat $kwsdatadir/tmp/L1.tmp.lex | cut -d ' ' -f 1 |\
  paste -d ' ' - <(cat $kwsdatadir/tmp/L1.tmp.lex | cut -d ' ' -f 2- |\
    sed 's/_[B|E|I|S]//g' | sed 's/_[%|"]//g') |\
  awk '{if(NF>=2) {print $0}}' > $kwsdatadir/tmp/L1.lex
cat $kwsdatadir/tmp/L2.tmp.lex | cut -d ' ' -f 1 |\
  paste -d ' ' - <(cat $kwsdatadir/tmp/L2.tmp.lex | cut -d ' ' -f 2- |\
    sed 's/_[B|E|I|S]//g' | sed 's/_[%|"]//g') |\
  awk '{if(NF>=2) {print $0}}' | perl -e '
    ($lex1, $words) = @ARGV;
    open(L, "<$lex1") || die "Failed to open $lex1.\n";
    open(W, "<$words") || die "Failed to open $words.\n";
    while (<L>) {
      chomp;
      @col = split;
      @col >= 2 || die "Too few columns in \"$_\".\n";
      $w = $col[0];
      $w_p = $_;
      if (defined($lex1{$w})) {
        push(@{$lex1{$w}}, $w_p);
      } else {
        $lex1{$w} = [$w_p];
      }
    }
    close(L);
    while (<STDIN>) {
      chomp;
      @col = split;
      @col >= 2 || die "Too few columns in \"$_\".\n";
      $w = $col[0];
      $w_p = $_;
      if (defined($lex1{$w})) {
        next;
      }
      if (defined($lex2{$w})) {
        push(@{$lex2{$w}}, $w_p);
      } else {
        $lex2{$w} = [$w_p];
      }
    }
    %lex = (%lex1, %lex2);
    while (<W>) {
      chomp;
      if (defined($lex{$_})) {
        foreach $x (@{$lex{$_}}) {
          print "$x\n";
        }
      }
    }
    close(W);
  ' $kwsdatadir/tmp/L1.lex $kwsdatadir/keywords_proxy_words.list \
  > $kwsdatadir/tmp/L2.lex
rm -f $kwsdatadir/tmp/L1.tmp.lex $kwsdatadir/tmp/L2.tmp.lex
# Creates words.txt that covers all the words in L1.lex and L2.lex. We append
# new words to the original word symbol table.
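# For illustration (hypothetical ids): if the highest id in words.txt is
# "zebra 42100", then max_id=42100 and the first new proxy word is appended
# as "<new-word> 42101", the next as 42102, and so on.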
max_id=`cat $kwsdatadir/words.txt | awk '{print $2}' | sort -n | tail -1`;
cat $kwsdatadir/keywords_proxy.txt |\
  awk '{for(i=2; i <= NF; i++) {print $i;}}' |\
  cat - <(cat $kwsdatadir/tmp/L2.lex | awk '{print $1;}') |\
  cat - <(cat $kwsdatadir/tmp/L1.lex | awk '{print $1;}') |\
  sort -u | grep -F -v -x -f <(cat $kwsdatadir/words.txt | awk '{print $1;}') |\
  awk 'BEGIN{x='$max_id'+1}{print $0"\t"x; x++;}' |\
  cat $kwsdatadir/words.txt - > $kwsdatadir/tmp/words.txt
# Creates keyword list that we need to generate proxies for.
cat $kwsdatadir/keywords_proxy.txt | perl -e '
  open(W, "<'$kwsdatadir/tmp/L2.lex'") ||
    die "Failed to open L2 lexicon: '$kwsdatadir/tmp/L2.lex'\n";
  my %lexicon;
  while (<W>) {
    chomp;
    my @col = split();
    @col >= 2 || die "'$0': Bad line in lexicon: $_\n";
    if ('$pron_probs' eq "false") {
      $lexicon{$col[0]} = scalar(@col) - 1;
    } else {
      $lexicon{$col[0]} = scalar(@col) - 2;
    }
  }
  while (<>) {
    chomp;
    my $line = $_;
    my @col = split();
    @col >= 2 || die "Bad line in keywords file: $_\n";
    my $len = 0;
    for (my $i = 1; $i < scalar(@col); $i++) {
      if (defined($lexicon{$col[$i]})) {
        $len += $lexicon{$col[$i]};
      } else {
        print STDERR "'$0': No pronunciation found for word: $col[$i]\n";
      }
    }
    if ($len >= '$phone_cutoff') {
      print "$line\n";
    } else {
      print STDERR "'$0': Keyword $col[0] is too short, not generating proxy\n";
    }
  }' > $kwsdatadir/tmp/keywords.txt
# Creates proxy keywords.
local/generate_proxy_keywords.sh \
  --cmd "$cmd" --nj "$nj" --beam "$beam" --nbest "$nbest" \
  --phone-beam $phone_beam --phone-nbest $phone_nbest \
  --confusion-matrix "$confusion_matrix" --count-cutoff "$count_cutoff" \
  --pron-probs "$pron_probs" $kwsdatadir/tmp/
cp $kwsdatadir/tmp/keywords.fsts $kwsdatadir
# Creates utterance id for each utterance.
cat $datadir/segments | \
  awk '{print $1}' | \
  sort | uniq | perl -e '
    $idx = 1;
    while (<>) {
      chomp;
      print "$_ $idx\n";
      $idx++;
    }' > $kwsdatadir/utter_id
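# For illustration (hypothetical ids): sorted utterances "utt_a utt_b utt_c"
# yield the lines "utt_a 1", "utt_b 2", "utt_c 3" in utter_id.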
# Maps each utterance to the name that will appear in the rttm file. You may
# have to modify the commands below according to your rttm file.
cat $datadir/segments | awk '{print $1" "$2}' |\
  sort | uniq > $kwsdatadir/utter_map;
echo "$0: Kws data preparation succeeded"
#!/bin/bash
# This is not necessarily the top-level run.sh as it is in other directories. See README.txt first.
tri5_only=true
[ ! -f ./lang.conf ] && echo 'Language configuration does not exist! Use the configurations in conf/lang/* as a starting point.' && exit 1
[ ! -f ./conf/common_vars.sh ] && echo 'The file conf/common_vars.sh does not exist!' && exit 1
. conf/common_vars.sh || exit 1;
. ./lang.conf || exit 1;
[ -f local.conf ] && . ./local.conf
. ./utils/parse_options.sh
set -e           # Exit on non-zero return code from any command.
set -o pipefail  # Exit if any command in a pipeline returns a non-zero
                 # return code.
#set -u          # Fail on an undefined variable.
lexicon=data/local/lexicon.txt
if $extend_lexicon; then
  lexicon=data/local/lexiconp.txt
fi
#Preparing dev2h and train directories
if [ ! -d data/raw_train_data ]; then
  echo ---------------------------------------------------------------------
  echo "Subsetting the TRAIN set"
  echo ---------------------------------------------------------------------
  local/make_corpus_subset.sh "$train_data_dir" "$train_data_list" ./data/raw_train_data
  train_data_dir=`readlink -f ./data/raw_train_data`
fi
nj_max=`cat $train_data_list | wc -l`
if [[ "$nj_max" -lt "$train_nj" ]] ; then
  echo "The maximum reasonable number of jobs is $nj_max (you have $train_nj)! (The training and decoding process has file-granularity)"
  exit 1;
fi
train_data_dir=`readlink -f ./data/raw_train_data`
if [ ! -d data/raw_dev2h_data ]; then
  echo ---------------------------------------------------------------------
  echo "Subsetting the DEV2H set"
  echo ---------------------------------------------------------------------
  local/make_corpus_subset.sh "$dev2h_data_dir" "$dev2h_data_list" ./data/raw_dev2h_data || exit 1
fi
if [ ! -d data/raw_dev10h_data ]; then
  echo ---------------------------------------------------------------------
  echo "Subsetting the DEV10H set"
  echo ---------------------------------------------------------------------
  local/make_corpus_subset.sh "$dev10h_data_dir" "$dev10h_data_list" ./data/raw_dev10h_data || exit 1
fi
nj_max=`cat $dev2h_data_list | wc -l`
if [[ "$nj_max" -lt "$decode_nj" ]] ; then
  echo "The maximum reasonable number of jobs is $nj_max -- you have $decode_nj! (The training and decoding process has file-granularity)"
  exit 1
fi
# Move data/dev2h preparation forward so we can get data/dev2h/text for
# diagnostic purpose when extending the lexicon.
if [[ ! -f data/dev2h/wav.scp || data/dev2h/wav.scp -ot ./data/raw_dev2h_data/audio ]]; then
  echo ---------------------------------------------------------------------
  echo "Preparing dev2h data lists in data/dev2h on" `date`
  echo ---------------------------------------------------------------------
  mkdir -p data/dev2h
  local/prepare_acoustic_training_data.pl \
    --fragmentMarkers \-\*\~ \
    `pwd`/data/raw_dev2h_data data/dev2h > data/dev2h/skipped_utts.log || exit 1
fi
if [[ ! -f data/dev2h/glm || data/dev2h/glm -ot "$glmFile" ]]; then
  echo ---------------------------------------------------------------------
  echo "Preparing dev2h stm files in data/dev2h on" `date`
  echo ---------------------------------------------------------------------
  if [ -z "$dev2h_stm_file" ]; then
    echo "WARNING: You should define the variable dev2h_stm_file to point to the IndusDB stm."
    echo "WARNING: Doing so will give you scoring close to the NIST scoring."
    local/prepare_stm.pl --fragmentMarkers \-\*\~ data/dev2h || exit 1
  else
    local/augment_original_stm.pl $dev2h_stm_file data/dev2h || exit 1
  fi
  [ ! -z "$glmFile" ] && cp $glmFile data/dev2h/glm
fi
mkdir -p data/local
if [[ ! -f $lexicon || $lexicon -ot "$lexicon_file" ]]; then
  echo ---------------------------------------------------------------------
  echo "Preparing lexicon in data/local on" `date`
  echo ---------------------------------------------------------------------
  local/make_lexicon_subset.sh $train_data_dir/transcription $lexicon_file data/local/filtered_lexicon.txt
  local/prepare_lexicon.pl --phonemap "$phoneme_mapping" \
    $lexiconFlags data/local/filtered_lexicon.txt data/local
  if $extend_lexicon; then
    # Extend the original lexicon. This creates the files
    # data/local/extend/{lexiconp.txt,oov2prob}.
    mv data/local/lexicon.txt data/local/lexicon_orig.txt
    local/extend_lexicon.sh --cmd "$train_cmd" \
      --num-sent-gen $num_sent_gen --num-prons $num_prons \
      data/local/lexicon_orig.txt data/local/extend data/dev2h/text
    cp data/local/extend/lexiconp.txt data/local/
  fi
fi
mkdir -p data/lang
if [[ ! -f data/lang/L.fst || data/lang/L.fst -ot $lexicon ]]; then
  echo ---------------------------------------------------------------------
  echo "Creating L.fst etc in data/lang on" `date`
  echo ---------------------------------------------------------------------
  utils/prepare_lang.sh \
    --share-silence-phones true \
    data/local $oovSymbol data/local/tmp.lang data/lang
fi
if [[ ! -f data/train/wav.scp || data/train/wav.scp -ot "$train_data_dir" ]]; then
  echo ---------------------------------------------------------------------
  echo "Preparing acoustic training lists in data/train on" `date`
  echo ---------------------------------------------------------------------
  mkdir -p data/train
  local/prepare_acoustic_training_data.pl \
    --vocab $lexicon --fragmentMarkers \-\*\~ \
    $train_data_dir data/train > data/train/skipped_utts.log
fi