Commit af7e69f2 authored by Peng Qi's avatar Peng Qi
Browse files

trunk: Adding a new recipe that combines Switchboard and Fisher English

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4340 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 4da19ae7
#!/bin/bash
# Fisher + Switchboard combined recipe, adapted from respective Fisher and Switchboard
# recipes by Peng Qi (pengqi@cs.stanford.edu).
# (Aug 2014)
# It's best to run the commands in this script one by one.
. cmd.sh
# NOTE(review): the "......" line below looks like a collapsed-diff artifact
# from the commit viewer, not shell code -- confirm against the checked-in file.
......
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine).
#a) JHU cluster options
#export train_cmd="queue.pl -l arch=*64*"
#export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G"
#export cuda_cmd="..."
#export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G"
#b) BUT cluster options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"
#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"
#c) run it locally...
#export train_cmd=run.pl
#export decode_cmd=run.pl
#export cuda_cmd=run.pl
#export mkgraph_cmd=run.pl
#d) Gorgon cluster
# Active configuration: submit all jobs to the "gorgon" queue.
export train_cmd="gorgon_queue.pl -q gorgon"
export decode_cmd="gorgon_queue.pl -q gorgon"
export cuda_cmd="gorgon_queue.pl -q gorgon"
export mkgraph_cmd="gorgon_queue.pl -q gorgon"
# NOTE(review): the lines below appear to belong to separate config files
# (a decode config, and conf/mfcc.conf) concatenated by the commit viewer;
# the "--option=value" lines are not valid shell -- confirm file boundaries.
beam=11.0 # beam for decoding. Was 13.0 in the scripts.
first_beam=8.0 # beam for 1st-pass decoding in SAT.
--use-energy=false # only non-default option.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>
#!/bin/bash
# Hub-5 Eval 2000 data preparation
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
# The input is two directory names (possibly the same) containing the
# 2000 Hub5 english evaluation test set and transcripts, which are
# respectively: LDC2002S09 LDC2002T43
# e.g. see
# http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2002S09
# http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002T43
#
# Example usage:
# local/eval2000_data_prep_edin.sh /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000 /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000/transcr
# The first directory ($sdir) contains the speech data, and the directory
# $sdir/english/ must exist.
# The second directory ($tdir) contains the transcripts, and the directory
# $tdir/reference must exist; in particular we need the file
# $tdir/reference/hub5e00.english.000405.stm
# Require exactly the two directory arguments described above.
if [ $# -ne 2 ]; then
echo "Usage: "`basename $0`" <speech-dir> <transcription-dir>"
echo "See comments in the script for more details"
exit 1
fi
sdir=$1
tdir=$2
[ ! -d $sdir/english ] \
&& echo Expecting directory $sdir/english to be present && exit 1;
[ ! -d $tdir/reference ] \
&& echo Expecting directory $tdir/reference to be present && exit 1;
. path.sh
dir=data/local/eval2000
mkdir -p $dir
# List all .sph audio files; sph.scp maps file basename -> full path.
find $sdir/english -iname '*.sph' | sort > $dir/sph.flist
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
> $dir/sph.scp
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
[ ! -x $sph2pipe ] \
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
# wav.scp: one entry per conversation side, decoding SPHERE to wav on the
# fly via a piped sph2pipe command (channel 1 = side A, channel 2 = side B).
awk -v sph2pipe=$sph2pipe '{
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
#side A - channel 1, side B - channel 2
# Get segments file...
# segments file format is: utt-id side-id start-time end-time, e.g.:
# sw02001-A_000098-001156 sw02001-A 0.98 11.56
pem=$sdir/english/hub5e_00.pem
[ ! -f $pem ] && echo "No such file $pem" && exit 1;
# pem file has lines like:
# en_4156 A unknown_speaker 301.85 302.48
# Utterance ids embed start/end times in centiseconds, zero-padded to 6 digits.
grep -v ';;' $pem \
| awk '{
spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
print utt,spk,$4,$5;}' \
| sort -u > $dir/segments
# stm file has lines like:
# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
# TODO(arnab): We should really be lowercasing this since the Edinburgh
# recipe uses lowercase. This is not used in the actual scoring.
# Fields 1-6 are metadata; the transcript words start at field 7.
grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
| awk '{
spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \
| sort > $dir/text.all
# We'll use the stm file for sclite scoring. There seem to be various errors
# in the stm file that upset hubscr.pl, and we fix them here.
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \
$tdir/reference/hub5e00.english.000405.stm > $dir/stm
cp $tdir/reference/en20000405_hub5.glm $dir/glm
# next line uses process substitution
# Just checking that the segments are the same in pem vs. stm.
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \
echo "Segments from pem file and stm file do not match." && exit 1;
# Drop utterances the stm marks as not-to-be-scored.
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text
# create an utt2spk file that assumes each conversation side is
# a separate speaker.
awk '{print $1,$2;}' $dir/segments > $dir/utt2spk
utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
# (disabled: optional 50 ms padding of segment boundaries)
# cp $dir/segments $dir/segments.tmp
# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
# $dir/segments.tmp > $dir/segments
# reco2file_and_channel maps "<file>-<side>" -> file and channel, as
# needed for sclite scoring.
awk '{print $1}' $dir/wav.scp \
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1;
# Copy the finished data directory into place.
dest=data/eval2000
mkdir -p $dest
for x in wav.scp segments text utt2spk spk2utt stm glm reco2file_and_channel; do
cp $dir/$x $dest/$x
done
echo Data preparation and formatting completed for Eval 2000
echo "(but not MFCC extraction)"
#!/bin/bash
# Build data/lang_test from data/lang by compiling the ARPA trigram LM into
# G.fst, then run sanity checks (stochasticity, determinizability of G, L,
# and their composition).  To be run from one directory above this script;
# expects data/lang/ and the LM at
# data/local/lm/3gram-mincount/lm_unpruned.gz to exist.
if [ -f path.sh ]; then . path.sh; fi
mkdir -p data/lang_test
arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
cp -rT data/lang data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
  grep -v '<s> <s>' | \
  grep -v '</s> <s>' | \
  grep -v '</s> </s>' | \
  arpa2fst - | fstprint | \
  utils/remove_oovs.pl /dev/null | \
  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
    --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon > data/lang_test/G.fst
# BUGFIX: a second, identical fstisstochastic call used to run here, BEFORE
# the explanatory echo, printing an orphaned copy of the numbers; removed.
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
echo "First few lines of lexicon FST:"
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head
echo Performing further checks
# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.
# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
# Checking that disambiguated lexicon times G is determinizable
# Note: we do this with fstdeterminizestar not fstdeterminize, as
# fstdeterminize was taking forever (presumably relates to a bug
# in this version of OpenFst that makes determinization slow for
# some case).
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
  fstdeterminizestar >/dev/null || echo Error
# Checking that LG is stochastic:
fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
  fstisstochastic || echo "[log:] LG is not stochastic"
echo "$0 succeeded"
#!/bin/bash
# Build data/lang_test_fsh from data/lang by compiling the ARPA trigram LM
# into G.fst, then run sanity checks -- same procedure as the data/lang_test
# script, but writing to the Fisher+Switchboard test directory.
# To be run from one directory above this script; expects data/lang/ and
# data/local/lm/3gram-mincount/lm_unpruned.gz to exist.
if [ -f path.sh ]; then . path.sh; fi
mkdir -p data/lang_test_fsh
arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
cp -rT data/lang data/lang_test_fsh
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
  grep -v '<s> <s>' | \
  grep -v '</s> <s>' | \
  grep -v '</s> </s>' | \
  arpa2fst - | fstprint | \
  utils/remove_oovs.pl /dev/null | \
  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test_fsh/words.txt \
    --osymbols=data/lang_test_fsh/words.txt --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon > data/lang_test_fsh/G.fst
# BUGFIX: a second, identical fstisstochastic call used to run here, BEFORE
# the explanatory echo, printing an orphaned copy of the numbers; removed.
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test_fsh/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
echo "First few lines of lexicon FST:"
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head
echo Performing further checks
# Checking that G.fst is determinizable.
fstdeterminize data/lang_test_fsh/G.fst /dev/null || echo Error determinizing G.
# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test_fsh/L_disambig.fst /dev/null || echo Error determinizing L.
# Checking that disambiguated lexicon times G is determinizable
# Note: we do this with fstdeterminizestar not fstdeterminize, as
# fstdeterminize was taking forever (presumably relates to a bug
# in this version of OpenFst that makes determinization slow for
# some case).
fsttablecompose data/lang_test_fsh/L_disambig.fst data/lang_test_fsh/G.fst | \
  fstdeterminizestar >/dev/null || echo Error
# Checking that LG is stochastic:
fsttablecompose data/lang/L_disambig.fst data/lang_test_fsh/G.fst | \
  fstisstochastic || echo "[log:] LG is not stochastic"
echo "$0 succeeded"
#!/bin/bash
# Copyright 2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# Fisher data preparation: option parsing and argument validation.
stage=0
. utils/parse_options.sh

# With no corpus directories given, print usage and bail out.
if [ $# -eq 0 ]; then
  echo "$0 <fisher-dir-1> [<fisher-dir-2> ...]"
  echo " e.g.: $0 /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19\\"
  echo " /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13"
  echo " (We also support a single directory that has the contents of all of them)"
  exit 1;
fi

# Every corpus directory must be an absolute path (i.e. start with "/"):
# stripping a leading "/" must change the string.
for dir in $*; do
  if [ "${dir#/}" = "$dir" ]; then
    echo "$0: all arguments must be absolute pathnames."; exit 1;
  fi
done

# Start from a clean symlink farm for the corpus sub-directories.
rm -r data/local/data_fisher/links 2>/dev/null
mkdir -p data/local/data_fisher/links || exit 1;
# Locate each expected Fisher sub-directory among the given corpus dirs and
# symlink it under data/local/data_fisher/links/.  Newer LDC distributions
# name the part-2 audio directories fisher_eng_tr_sp_d*, so we fall back to
# that naming if the old-style name is absent.
for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \
  fe_03_p2_sph1 fe_03_p2_sph3 fe_03_p2_sph5 fe_03_p2_sph7 fe_03_p1_sph2 \
  fe_03_p1_sph4 fe_03_p1_sph6 fe_03_p1_tran fe_03_p2_sph2 fe_03_p2_sph4 \
  fe_03_p2_sph6 fe_03_p2_tran; do
  found_subdir=false
  for dir in $*; do
    # BUGFIX: $dir/$subdir is a directory, so test with -d (it was -f, which
    # is never true for directories), and link the sub-directory itself
    # rather than its parent $dir -- matching the new-style branch below.
    if [ -d $dir/$subdir ]; then
      found_subdir=true
      ln -s $dir/$subdir data/local/data_fisher/links/$subdir
    else
      new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/)
      if [ -d $dir/$new_style_subdir ]; then
        found_subdir=true
        ln -s $dir/$new_style_subdir data/local/data_fisher/links/$subdir
      fi
    fi
  done
  if ! $found_subdir; then
    echo "$0: could not find the subdirectory $subdir in any of $*"
    exit 1;
  fi
done
tmpdir=`pwd`/data/local/data_fisher
links=data/local/data_fisher/links
. ./path.sh # Needed for KALDI_ROOT
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
# (1) Get transcripts in one file, and clean them up ...
# Stage 0: list all transcript (.txt) and audio (.sph) files via the
# symlink farm, and sanity-check that both counts match the expected
# size of the full Fisher corpus (11699 conversations).
if [ $stage -le 0 ]; then
find $links/fe_03_p1_tran/data $links/fe_03_p2_tran/data -iname '*.txt' > $tmpdir/transcripts.flist
for dir in fe_03_p{1,2}_sph{1,2,3,4,5,6,7}; do
find $links/$dir/ -iname '*.sph'
done > $tmpdir/sph.flist
n=`cat $tmpdir/transcripts.flist | wc -l`
if [ $n -ne 11699 ]; then
echo "Expected to find 11699 transcript files in the Fisher data, found $n"
exit 1;
fi
n=`cat $tmpdir/sph.flist | wc -l`
if [ $n -ne 11699 ]; then
echo "Expected to find 11699 .sph files in the Fisher data, found $n"
exit 1;
fi
fi
dir=data/train_fisher
# Stage 1: parse the raw transcript files into a single Kaldi-style text
# file ($tmpdir/text.1) plus reco2file_and_channel.  Each transcript file
# starts with a header like the following (the next comment lines quote
# the file format):
if [ $stage -le 1 ]; then
mkdir -p $dir
## fe_03_00004.sph
## Transcpribed at the LDC
#
#7.38 8.78 A: an- so the topic is
echo -n > $tmpdir/text.1 || exit 1;
# The embedded perl program below reads the transcript file list, emits two
# reco2file_and_channel entries per call (sides A and B), checks that the
# header call-id matches the filename, and writes one line per utterance
# with id "<call>-<side>-<start>-<end>" (times in zero-padded centiseconds).
# NOTE(review): $file keeps its trailing newline from <F>; this relies on
# perl's 2-arg open stripping trailing whitespace from filenames -- confirm.
perl -e '
use File::Basename;
($tmpdir)=@ARGV;
open(F, "<$tmpdir/transcripts.flist") || die "Opening list of transcripts";
open(R, "|sort >data/train_fisher/reco2file_and_channel") || die "Opening reco2file_and_channel";
open(T, ">$tmpdir/text.1") || die "Opening text output";
while (<F>) {
$file = $_;
m:([^/]+)\.txt: || die "Bad filename $_";
$call_id = $1;
print R "$call_id-A $call_id A\n";
print R "$call_id-B $call_id B\n";
open(I, "<$file") || die "Opening file $_";
$line1 = <I>;
$line1 =~ m/# (.+)\.sph/ || die "Bad first line $line1 in file $file";
$call_id eq $1 || die "Mismatch call-id $call_id vs $1\n";
while (<I>) {
if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.+\S|\S)\s*$/) {
$start = sprintf("%06d", $1 * 100.0);
$end = sprintf("%06d", $2 * 100.0);
length($end) > 6 && die "Time too long $end in file $file";
$side = $3;
$words = $4;
$utt_id = "${call_id}-$side-$start-$end";
print T "$utt_id $words\n" || die "Error writing to text file";
}
}
}
close(R); close(T) ' $tmpdir || exit 1;
fi
# Stage 2: normalize the transcript text and derive utt2spk/segments/spk2utt.
if [ $stage -le 2 ]; then
  # Drop unintelligible '((' markers and empty utterances, then map the
  # Fisher noise tags onto the Switchboard-style tag set.
  # BUGFIX: removed a duplicated "sed 's:\[sigh\]:[noise]:g'" stage -- the
  # same substitution appeared twice in this pipeline, the second a no-op.
  sort $tmpdir/text.1 | grep -v '((' | \
    awk '{if (NF > 1){ print; }}' | \
    sed 's:\[laugh\]:[laughter]:g' | \
    sed 's:\[sigh\]:[noise]:g' | \
    sed 's:\[cough\]:[noise]:g' | \
    sed 's:\[mn\]:[noise]:g' | \
    sed 's:\[breath\]:[noise]:g' | \
    sed 's:\[lipsmack\]:[noise]:g' > $tmpdir/text.2
  cp $tmpdir/text.2 $dir/text
  # create segments file and utt2spk file...
  # Speaker id is "<call>-<side>": each conversation side is one speaker.
  ! cat $dir/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/utt2spk \
    && echo "Error producing utt2spk file" && exit 1;
  # segments: "utt-id reco-id start end", times converted back from
  # centiseconds to seconds with two decimals.
  cat $dir/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; $s = sprintf("%.2f", 0.01*$3);
$e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' > $dir/segments
  utils/utt2spk_to_spk2utt.pl <$dir/utt2spk > $dir/spk2utt
fi
# Stage 3: build sph.scp (utterance-id -> sph path) and wav.scp, where each
# conversation side is decoded on the fly by a piped sph2pipe command
# (channel 1 = side A, channel 2 = side B).
if [ $stage -le 3 ]; then
  perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' < $tmpdir/sph.flist > $tmpdir/sph.scp
  awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' < $tmpdir/sph.scp | \
    sort -k1,1 -u > $dir/wav.scp || exit 1;
fi
# Stage 4: derive spk2gender from the corpus filetable files.
if [ $stage -le 4 ]; then
# get the spk2gender information. This is not a standard part of our
# file formats
# The filetable files contain lines like "xxx fe_03_05852.sph ff", where
# the two trailing letters ([fm][fm]) give the genders of the side-A and
# side-B speakers respectively (see the regex below).
cat $links/fe_03_p1_sph{1,2,3,4,5,6,7}/filetable.txt \
$links/fe_03_p2_sph{1,2,3,4,5,6,7}/docs/filetable2.txt | \
perl -ane 'm:^\S+ (\S+)\.sph ([fm])([fm]): || die "bad line $_;"; print "$1-A $2\n", "$1-B $3\n"; ' | \
sort | uniq | utils/filter_scp.pl $dir/spk2utt > $dir/spk2gender
# Fallback for older distributions where the filetables live under the
# transcript directories in tab-separated .tbl files instead.
if [ ! -s $dir/spk2gender ]; then
echo "It looks like our first try at getting the spk2gender info did not work."
echo "(possibly older distribution?) Trying something else."
cat $links/fe_03_p1_tran/doc/fe_03_p1_filelist.tbl $links/fe_03_p2_tran/doc/fe_03_p2_filelist.tbl | \
perl -ane 'm:fe_03_p[12]_sph\d\t(\d+)\t([mf])([mf]): || die "Bad line $_";
print "fe_03_$1-A $2\n", "fe_03_$1-B $3\n"; ' | \
sort | uniq | utils/filter_scp.pl $dir/spk2utt > $dir/spk2gender
fi
fi
#!/usr/bin/perl -w
# Copyright 2013 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script cleans up the Fisher English transcripts and maps the words to
# be similar to the Switchboard Mississippi State transcripts
# Reads from STDIN and writes to STDOUT
# NOTE: the order of the substitutions below is load-bearing; each one may
# consume characters that a later pattern depends on.
use strict;
while (<>) {
chomp;
$_ = lc($_); # a few things aren't lowercased in the data, e.g. I'm
s/\*//g; # *mandatory -> mandatory
s/\(//g; s/\)//g; # Remove parentheses
next if /^\s*$/; # Skip empty lines
# In one conversation people speak some German phrases that are tagged as
# <german (( ja wohl )) > -- we remove these
s/<[^>]*>//g;
s/\.\_/ /g; # Abbreviations: a._b._c. -> a b c.
s/(\w)\.s( |$)/$1's /g; # a.s -> a's
s/\./ /g; # Remove remaining .
s/(\w)\,(\w| )/$1 $2/g; # commas don't appear within numbers, but still
# Drop the leading apostrophe from these words (e.g. 'cause -> cause):
s/( |^)\'(blade|cause|course|frisco|okay|plain|specially)( |$)/ $2 /g;
s/\'em/-em/g; # 'em -> -em (fragment-style annotation)
# Remove an opening ' if there is a matching closing ' since some word
# fragments are annotated as: 'kay, etc.
# The substitution is done twice, since matching once doesn't capture
# consecutive quoted segments (the space in between is used up).
s/(^| )\'(.*?)\'( |$)/ $2 /g;
s/(^| )\'(.*?)\'( |$)/ $2 /g;
# NOTE(review): the replacement below discards $2, so a single quoted
# letter like 'a- is deleted entirely rather than mapped to "a" as the
# trailing comment claims -- confirm which behavior is intended.
s/( |^)\'(\w)( |-|$)/$1 /g; # 'a- -> a
s/( |^)-( |$)/ /g; # Remove dangling -
s/\?//g; # Remove ?
s/( |^)non-(\w+)( |$)/ non $2 /g; # non-stop -> non stop
# Some words that are annotated as fragments are actual dictionary words
s/( |-)(acceptable|arthritis|ball|cause|comes|course|eight|eighty|field|giving|habitating|heard|hood|how|king|ninety|okay|paper|press|scripts|store|till|vascular|wood|what|york)(-| )/ $2 /g;
# Remove [[skip]] and [pause]
s/\[\[skip\]\]/ /g;
s/\[pause\]/ /g;
# [breath], [cough], [lipsmack], [sigh], [sneeze] -> [noise]
s/\[breath\]/[noise]/g;
s/\[cough\]/[noise]/g;
s/\[lipsmack\]/[noise]/g;
s/\[sigh\]/[noise]/g;
s/\[sneeze\]/[noise]/g;
s/\[mn\]/[vocalized-noise]/g; # [mn] -> [vocalized-noise]
s/\[laugh\]/[laughter]/g; # [laugh] -> [laughter]
# Now, mapping individual words
# Any remaining word-initial apostrophe marks a fragment; rewrite it with
# the Switchboard-style leading hyphen ('til -> -til).
my @words = split /\s+/;
for my $i (0..$#words) {
my $w = $words[$i];
$w =~ s/^'/-/;
$words[$i] = $w;
}
print join(" ", @words) . "\n";
}
#!/bin/bash
#
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
# for example /mnt/matylda2/data/SWITCHBOARD_1R2
. path.sh
# The parts of the output of this that will be needed are
# [in data/local/dict/ ]
# lexicon.txt
# extra_questions.txt
# nonsilence_phones.txt
# optional_silence.txt
# silence_phones.txt