Commit db81aa6f authored by Dan Povey's avatar Dan Povey

Add yesno example scripts; various updates to the WSJ and Switchboard s5/...

Add yesno example scripts; various updates to the WSJ and Switchboard s5/ example scripts (making more of them work).

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@925 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 7a95fa85
......@@ -21,6 +21,12 @@ Note: the easiest examples to work with are rm/s3 and wsj/s3.
get the same data using combinations of other catalog numbers, but this
is the one we used).
yesno: This is a simple recipe with data consisting of a single person
saying the words "yes" and "no"; the data can be downloaded from the Kaldi
website. It's a very easy task, but useful for checking that the scripts
run, or if you don't yet have any of the LDC data.
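For a quick sanity check, something like the following should work (a
hypothetical sketch: the archive name, download location, and exact run.sh
interface may differ from what the Kaldi website currently provides):
cd egs/yesno/s5
wget <kaldi-website>/waves_yesno.tar.gz  # placeholder URL; see the Kaldi website
tar -xzf waves_yesno.tar.gz
./run.sh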
Recipes in progress (these may be less polished than the ones above).
swbd: Switchboard. A fairly large amount of telephone speech (2-channel, 8kHz
......
for x in exp/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null
exp/tri1/decode/score_12/eval2000.ctm.filt.sys: | Sum/Avg | 4459 42989 | 56.8 30.3 12.9 4.0 *47.2* 74.9 |
exp/tri2/decode/score_12/eval2000.ctm.filt.sys: | Sum/Avg | 4459 42989 | 57.3 30.1 12.6 4.0 46.7 74.9 |
......@@ -108,27 +108,29 @@ steps/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train_30k_nodup data/lang exp/tri1 exp/tri1_ali || exit 1;
steps/train_deltas.sh --nj 30 --cmd "$train_cmd" \
steps/train_deltas.sh --cmd "$train_cmd" \
2500 20000 data/train_30k_nodup data/lang exp/tri1_ali exp/tri2 || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1;
steps/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri1/graph data/eval2000 exp/tri1/decode || exit 1;
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1;
steps/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri2/graph data/eval2000 exp/tri2/decode || exit 1;
)&
exit 0;
# I AM HERE
steps/align_deltas.sh --nj 30 --cmd "$train_cmd" \
data/train_30k_nodup data/lang exp/tri2 exp/tri2_ali || exit 1;
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train_30k_nodup data/lang exp/tri2 exp/tri2_ali || exit 1;
# Train tri3a, which is LDA+MLLT, on 30k_nodup data.
steps/train_lda_mllt.sh --nj 30 --cmd "$train_cmd" \
steps/train_lda_mllt.sh --cmd "$train_cmd" \
2500 20000 data/train_30k_nodup data/lang exp/tri2_ali exp/tri3a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
steps/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri3a/graph data/eval2000 exp/tri3a/decode || exit 1;
)&
# From now, we start building a more serious system (with SAT), and we'll
# do the alignment with fMLLR.
......@@ -136,16 +138,18 @@ steps/train_lda_mllt.sh --nj 30 --cmd "$train_cmd" \
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_100k_nodup data/lang exp/tri3a exp/tri3a_ali_100k_nodup || exit 1;
steps/train_sat.sh --nj 30 --cmd "$train_cmd" \
steps/train_sat.sh --cmd "$train_cmd" \
2500 20000 data/train_100k_nodup data/lang exp/tri3a_ali_100k_nodup exp/tri4a || exit 1;
# HERE.
exit 0;
(
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri4a/graph data/eval2000 exp/tri4a/decode
)&
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
utils/decode.sh -l data/lang_test --nj 30 --cmd "$decode_cmd" --opts "$decode_opts2" \
steps/decode_lda_mllt_sat.sh exp/tri4a/graph data/eval2000 exp/tri4a/decode_eval2000
# HERE.
exit 0;
utils/decode.sh --nj 30 --cmd "$decode_cmd" --opts "$decode_opts2" \
steps/decode_lda_mllt_sat.sh exp/tri4a/graph data/train_dev exp/tri4a/decode_train_dev
......@@ -252,8 +256,6 @@ steps/align_lda_mllt_sat.sh --nj 30 --cmd "$train_cmd" \
# getting results (see RESULTS file)
for x in exp/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null
......@@ -23,14 +23,26 @@
if [ -f ./path.sh ]; then . ./path.sh; fi
beam=13.0
lattice_beam=6.0
numjobs=1
jobid=0
if [ "$1" == "-j" ]; then
shift;
numjobs=$1;
jobid=$2;
shift; shift;
fi
for x in `seq 3`; do
if [ "$1" == "-j" ]; then
shift;
numjobs=$1;
jobid=$2;
shift 2;
fi
if [ "$1" == "--beam" ]; then
beam=$2;
shift 2;
fi
if [ "$1" == "--lattice-beam" ]; then
lattice_beam=$2;
shift 2;
fi
done
if [ $# != 3 ]; then
echo "Usage: steps/decode_deltas.sh [-j num-jobs job-number] <graph-dir> <data-dir> <decode-dir>"
......@@ -65,7 +77,7 @@ done
# CMVN stats-- we make them part of a pipe.
feats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | add-deltas ark:- ark:- |"
gmm-latgen-faster --max-active=7000 --beam=13.0 --lattice-beam=6.0 --acoustic-scale=0.083333 \
gmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=0.083333 \
--allow-partial=true --word-symbol-table=$graphdir/words.txt \
$srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
2> $dir/decode$jobid.log || exit 1;
......
......@@ -18,14 +18,26 @@
# Flat start and monophone training, with delta-delta features.
# This script applies cepstral mean normalization (per speaker).
numgauss=300 # Initial num-Gauss (must be more than #states=3*phones).
totgauss=1000 # Target #Gaussians.
nj=4
cmd=scripts/run.pl
for x in 1 2 3 4; do  # allow up to four leading options
if [ "$1" == "--num-jobs" ]; then
shift
nj=$1
shift
fi
if [ "$1" == "--start-gauss" ]; then
numgauss=$2;
shift 2;
fi
if [ "$1" == "--end-gauss" ]; then
totgauss=$2;
shift 2;
fi
if [ "$1" == "--cmd" ]; then
shift
cmd=$1
......@@ -50,8 +62,6 @@ if [ -f path.sh ]; then . path.sh; fi
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
numiters=40 # Number of iterations of training
maxiterinc=30 # Last iter to increase #Gauss on.
numgauss=300 # Initial num-Gauss (must be more than #states=3*phones).
totgauss=1000 # Target #Gaussians.
incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
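# With the defaults above this is (1000 - 300) / 30 = 23 extra Gaussians per
# iteration (the $[...] arithmetic is integer division, so it truncates).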
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
oov_sym=`cat $lang/oov.txt`
......
......@@ -9,6 +9,7 @@
# This script takes one command-line argument (the working directory, e.g.
# data/local/rnnlm.h30.voc10k) plus options such as --cmd.
# Begin configuration section.
rand_seed=0
cmd=run.pl
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
hidden=30
......@@ -108,6 +109,11 @@ for type in train valid; do
done
rm $dir/train.in # no longer needed-- and big.
# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train
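# (The awk/sort/cut pipeline above is a decorate-sort-strip shuffle: each line
# gets a random numeric key as a prefix, the lines are sorted on the keys, and
# the keys are cut away; this avoids depending on less portable tools such as
# shuf.)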
# OK we'll train the RNNLM on this data.
# todo: change 100 to 320.
......
......@@ -24,29 +24,30 @@ utils/prepare_lang.sh data/local/dict "<SPOKEN_NOISE>" data/local/lang data/lang
local/wsj_format_data.sh || exit 1;
# # We suggest to run the next three commands in the background,
# # as they are not a precondition for the system building and
# # most of the tests: these commands build a dictionary
# # containing many of the OOVs in the WSJ LM training data,
# # and an LM trained directly on that data (i.e. not just
# # copying the arpa files from the disks from LDC).
# (
# on CSLP: local/wsj_extend_dict.sh /export/corpora5/LDC/LDC94S13B/13-32.1/ && \
# local/wsj_extend_dict.sh /mnt/matylda2/data/WSJ1/13-32.1 && \
# utils/prepare_lang.sh data/local/dict_larger "<SPOKEN_NOISE>" data/local/lang_larger data/lang_bd && \
# local/wsj_train_lms.sh && \
# local/wsj_format_local_lms.sh &&
# ( local/wsj_train_rnnlms.sh --cmd "$train_cmd -l mem_free=10G" data/local/rnnlm.h30.voc10k &
# sleep 20; # wait till tools compiled.
# local/wsj_train_rnnlms.sh --cmd "$train_cmd -l mem_free=12G" \
# --hidden 100 --nwords 20000 --class 350 --direct 1500 data/local/rnnlm.h100.voc20k &
# local/wsj_train_rnnlms.sh --cmd "$train_cmd -l mem_free=14G" \
# --hidden 200 --nwords 30000 --class 350 --direct 1500 data/local/rnnlm.h200.voc30k &
# local/wsj_train_rnnlms.sh --cmd "$train_cmd -l mem_free=16G" \
# --hidden 300 --nwords 40000 --class 400 --direct 2000 data/local/rnnlm.h300.voc40k &
# )
# ) &
# We suggest running the next few commands in the background,
# as they are not a precondition for the system building and
# most of the tests: these commands build a dictionary
# containing many of the OOVs in the WSJ LM training data,
# and an LM trained directly on that data (i.e. not just
# copying the ARPA files from the LDC disks).
# Caution: the commands below will only work if $decode_cmd
# is set up to use qsub. Otherwise, just remove the --cmd options.
(
# on CSLP: local/wsj_extend_dict.sh /export/corpora5/LDC/LDC94S13B/13-32.1/ && \
local/wsj_extend_dict.sh /mnt/matylda2/data/WSJ1/13-32.1 && \
utils/prepare_lang.sh data/local/dict_larger "<SPOKEN_NOISE>" data/local/lang_larger data/lang_bd && \
local/wsj_train_lms.sh && \
local/wsj_format_local_lms.sh &&
( local/wsj_train_rnnlms.sh --cmd "$decode_cmd -l mem_free=10G" data/local/rnnlm.h30.voc10k &
sleep 20; # wait till tools compiled.
local/wsj_train_rnnlms.sh --cmd "$decode_cmd -l mem_free=12G" \
--hidden 100 --nwords 20000 --class 350 --direct 1500 data/local/rnnlm.h100.voc20k &
local/wsj_train_rnnlms.sh --cmd "$decode_cmd -l mem_free=14G" \
--hidden 200 --nwords 30000 --class 350 --direct 1500 data/local/rnnlm.h200.voc30k &
local/wsj_train_rnnlms.sh --cmd "$decode_cmd -l mem_free=16G" \
--hidden 300 --nwords 40000 --class 400 --direct 2000 data/local/rnnlm.h300.voc40k &
)
) &
# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
......@@ -339,7 +340,37 @@ steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_
steps/rnnlmrescore.sh \
--N 1000 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 &
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75_N1000
rm -rf $dir
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 $dir
steps/rnnlmrescore.sh \
--stage 7 --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75
rm -rf $dir
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir
steps/rnnlmrescore.sh \
--stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.25
rm -rf $dir
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir
steps/rnnlmrescore.sh \
--stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.25 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
steps/rnnlmrescore.sh \
--N 10 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N10 \
|| exit 1;
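The reruns above differ only in the interpolation weight and the N-best
depth; an equivalent compact sweep over the two N=100 runs might look like
the following (a sketch only; as above, it relies on --stage 7 reusing the
N-best lists copied from the 0.5-weight directory):
for w in 0.75 0.25; do
  dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_${w}
  rm -rf $dir
  cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir
  steps/rnnlmrescore.sh --stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \
    $w data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
    exp/tri3b/decode_bd_tgpr_eval92_fg $dir
done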
# The following two steps, which are a kind of side-branch, try mixing up
......@@ -358,7 +389,7 @@ steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
# From 3b system, train another SAT system with all the si284 data.
# Use the letter tri4a, as tri4b was used in s3/ for a "quick-retrained" system.
steps/train_sat.sh --nj 20 --cmd "$train_cmd" \
steps/train_sat.sh --cmd "$train_cmd" \
4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4a || exit 1;
utils/mkgraph.sh data/lang_test_tgpr exp/tri4a exp/tri4a/graph_tgpr || exit 1;
......@@ -389,6 +420,9 @@ steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
data/train_si284 data/lang exp/tri4b_ali_si284 exp/tri4b_denlats_si284 \
exp/tri4b_mmi_b0.1 || exit 1;
steps/decode.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93
## I AM HERE. Things below here are commands from the old run.sh,
# and I have to change them for the current run.sh.
exit 0;
......
--use-energy=false # only non-default option.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 </State>
</TopologyEntry>
</Topology>
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>
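(In these topology files, each <State> line gives the state's PdfClass and
its outgoing <Transition> pairs of destination state and probability; the
last state, which has no PdfClass, is the non-emitting final state. The
first topology uses two emitting states per phone; the second uses the
usual three emitting states for non-silence phones, plus a five-state,
more densely connected model for silence.)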
<SIL> SIL
YES Y_S
NO N_S
<eps> 0
SIL 1
Y_S 2
N_S 3
\data\
ngram 1=3
\1-grams:
-1 NO
-1 YES
-99 <s>
-1 </s>
\end\
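(The numbers in the first column are base-10 log-probabilities, as usual
for ARPA-format LMs: -1 corresponds to probability 10^-1 = 0.1 for each of
NO, YES and </s>, while -99 makes the probability of <s> as a predicted
word negligible.)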
#!/usr/bin/perl
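# create_yesno_txt.pl: given a list of yesno wave filenames (one per line,
# e.g. 0_1_0_0_1_0_1_1.wav), strips the .wav suffix and expands the 0/1
# codes into a transcript, producing lines like:
#   0_1_0_0_1_0_1_1 NO YES NO NO YES NO YES YES
# Usage: create_yesno_txt.pl waves.list > text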
$in_list = $ARGV[0];
open IL, $in_list;
while ($l = <IL>)
{
chomp($l);
$l =~ s/\.wav//;
$trans = $l;
$trans =~ s/0/NO/g;
$trans =~ s/1/YES/g;
$trans =~ s/\_/ /g;
print "$l $trans\n";
}
#!/usr/bin/perl
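# create_yesno_wav_scp.pl: given a waves directory and a list of wave
# filenames, writes a Kaldi wav.scp with lines of the form
#   <utterance-id> <waves_dir>/<utterance-id>.wav
# Usage: create_yesno_wav_scp.pl <waves-dir> waves.list > wav.scp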
$waves_dir = $ARGV[0];
$in_list = $ARGV[1];
open IL, $in_list;
while ($l = <IL>)
{
chomp($l);
$full_path = $waves_dir . "\/" . $l;
$l =~ s/\.wav//;
print "$l $full_path\n";
}
#!/usr/bin/perl
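# create_yesno_waves_test_train.pl: splits a full list of wave files into
# a train list (the first half of the lines) and a test list (the rest).
# Usage: create_yesno_waves_test_train.pl waves_all.list waves.test waves.train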
$full_list = $ARGV[0];
$test_list = $ARGV[1];
$train_list = $ARGV[2];
open FL, $full_list;
$nol = 0;
while ($l = <FL>)
{
$nol++;
}
close FL;
$i = 0;
open FL, $full_list;
open TESTLIST, ">$test_list";
open TRAINLIST, ">$train_list";
while ($l = <FL>)
{
chomp($l);
$i++;
if ($i <= $nol/2 )
{
print TRAINLIST "$l\n";
}
else
{
print TESTLIST "$l\n";
}
}
#!/bin/bash
mkdir -p data/local
local=`pwd`/local
scripts=`pwd`/scripts
export PATH=$PATH:`pwd`/../../../tools/irstlm/bin
train_base_name=train_yesno
test_base_name=test_yesno
waves_dir=$1
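# Note: $waves_dir should be an absolute path: we cd into data/local below,
# and the wav.scp entries created there prepend $waves_dir verbatim.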
cd data/local
ls -1 $waves_dir > waves_all.list
../../local/create_yesno_waves_test_train.pl waves_all.list waves.test waves.train
../../local/create_yesno_wav_scp.pl ${waves_dir} waves.test > ${test_base_name}_wav.scp
../../local/create_yesno_wav_scp.pl ${waves_dir} waves.train > ${train_base_name}_wav.scp
../../local/create_yesno_txt.pl waves.test > ${test_base_name}.txt
../../local/create_yesno_txt.pl waves.train > ${train_base_name}.txt
cp ../../input/task.arpabo lm_tg.arpa
cd ../..
echo "Data preparation succeeded"
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes on its standard input a list of utterance
# ids, one per line (e.g. 4k0c030a is an utterance id).
# It takes as its single command-line argument a file listing the .dot
# files, and extracts from those dot files the transcripts for the
# given dataset (represented by the file list).
#
@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
$dot_flist = shift @ARGV;
open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
while(<L>){
chop;
m:\S+/(\w{6})00\.dot: || die "Bad line in dot file list: $_";
$spk = $1;
$spk2dot{$spk} = $_;
}
while(<STDIN>){
chop;
$uttid = $_;
$uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
$spk = $1;
if($spk ne $curspk) {
%utt2trans = ( ); # Don't keep all the transcripts in memory...
$curspk = $spk;
$dotfile = $spk2dot{$spk};
defined $dotfile || die "No dot file for speaker $spk\n";
open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
while(<F>) {
$_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
$trans = $1;
$utt = $2;
$utt2trans{$utt} = $trans;
}
}
if(!defined $utt2trans{$uttid}) {
print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
} else {
print "$uttid $utt2trans{$uttid}\n";
}
}
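# A typical invocation (file names hypothetical, matching the usage line above):
# find_transcripts.pl data/local/dot_files.flist < data/local/utt_ids > data/local/transcripts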
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# takes in a file list with lines like
# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# and outputs an scp in kaldi format with lines like
# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# (the first thing is the utterance-id, which is the same as the basename of the file.)
while(<>){
m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_";
$id = $1;
$id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames)
print "$id $_";
}