Commit 7a8f5a0f authored by Arnab Ghoshal

Scripts for the Switchboard recipe that I am using in Edinburgh

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1990 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 5a2bc358
#!/bin/bash
# Hub-5 Eval 1997 data preparation
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
# The input is a directory name containing the 1997 Hub5 english evaluation
# test set and transcripts, which is LDC2002S10
# e.g. see
# http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002S10
#
# It is assumed that the transcripts are in a subdirectory called transcr
# However, we download the STM from NIST site:
# ftp://jaguar.ncsl.nist.gov/lvcsr/mar97/eval/hub5e97.english.980618.stm
if [ $# -ne 1 ]; then
echo "Usage: "`basename $0`" <speech-dir>"
echo "See comments in the script for more details"
exit 1
fi
sdir=$1
[ ! -d $sdir/speech ] \
&& echo Expecting directory $sdir/speech to be present && exit 1;
[ ! -d $sdir/transcr ] \
&& echo Expecting directory $sdir/transcr to be present && exit 1;
. path.sh
dir=data/local/eval1997
mkdir -p $dir
find $sdir/speech -iname '*.sph' | sort > $dir/sph.flist
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
> $dir/sph.scp
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
[ ! -x $sph2pipe ] \
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
awk -v sph2pipe=$sph2pipe '{
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
#side A - channel 1, side B - channel 2
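# Each wav.scp line is a piped-command entry of the form:
#   <sph-basename>-A /path/to/sph2pipe -f wav -p -c 1 /path/to/<sph-basename>.sph |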
# Get segments file...
# segments file format is: utt-id side-id start-time end-time, e.g.:
# sw02001-A_000098-001156 sw02001-A 0.98 11.56
pem=$sdir/speech/97_hub5e.pem
[ ! -f $pem ] && echo "No such file $pem" && exit 1;
# pem file has lines like:
# en_4156 A unknown_speaker 301.85 302.48
# There is one line in the 97_hub5e.pem with an extra : on the channel
# sw_10022 B: unknown_speaker 281.21 284.37 -- the : is removed
# There are two other mistakes in the pem that are also corrected.
grep -v ';;' $pem | sed -e 's?:??g' \
| awk '{
spk=$1"-"$2; start=$4; end=$5;
if (spk == "en_4763-A" && start == 389.14) end=389.40;
if (spk == "en_5153-A" && start == 593.84) end=594.31;
utt=sprintf("%s_%06d-%06d", spk, start*100, end*100);
printf "%s %s %.2f %.2f\n", utt, spk, start, end; }' \
| sort -u > $dir/segments
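# e.g. the pem line shown above produces a segments line of the form:
#   en_4156-A_030185-030248 en_4156-A 301.85 302.48
# (start/end times appear in the utterance-id scaled to centiseconds)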
# Download the STM and GLM files:
( cd $dir
rm -f stm glm
[ -f hub5e97.english.980618.stm ] || \
wget ftp://jaguar.ncsl.nist.gov/lvcsr/mar97/eval/hub5e97.english.980618.stm
ln -s hub5e97.english.980618.stm stm
[ -f en20010117_hub5.glm ] || \
wget ftp://jaguar.ncsl.nist.gov/rt/rt02/software/en20010117_hub5.glm
ln -s en20010117_hub5.glm glm
)
# stm file has lines like:
# en_4042 A en_4042_A 227.71 232.26 <O> BEANS RIGHT THAT IS WHY I SAID BEANS
# One of the segments (sw_10022-B_028120-028437) is removed since it is not
# scored and does not show up in the pem file.
grep -v ';;' $dir/hub5e97.english.980618.stm \
| awk '{
spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \
| sort -k1,1 -u > $dir/text.all
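# e.g. the stm line shown above yields a text.all line of the form:
#   en_4042-A_022771-023226 BEANS RIGHT THAT IS WHY I SAID BEANS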
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text
# The next line uses process substitution.
# Just checking that the segments are the same in pem vs. stm.
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \
echo "Segments from pem file and stm file do not match." && exit 1;
# create an utt2spk file that assumes each conversation side is
# a separate speaker.
awk '{print $1,$2;}' $dir/segments > $dir/utt2spk
utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
# cp $dir/segments $dir/segments.tmp
# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
# $dir/segments.tmp > $dir/segments
awk '{print $1}' $dir/wav.scp \
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1;
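# e.g. the wav.scp key en_4156-A maps to the line: en_4156-A en_4156 A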
dest=data/eval1997
mkdir -p $dest
for x in wav.scp segments text utt2spk spk2utt stm glm reco2file_and_channel; do
cp $dir/$x $dest/$x
done
echo Data preparation and formatting completed for Eval 1997
echo "(but not MFCC extraction)"
#!/bin/bash
# Hub-5 Eval 2000 data preparation
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
# The input is two directory names (possibly the same) containing the
# 2000 Hub5 english evaluation test set and transcripts, which are
# respectively: LDC2002S09 LDC2002T43
# e.g. see
# http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2002S09
# http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002T43
#
# Example usage:
# local/eval2000_data_prep_edin.sh /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000 /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000/transcr
# The first directory ($sdir) contains the speech data, and the directory
# $sdir/english/ must exist.
# The second directory ($tdir) contains the transcripts, and the directory
# $tdir/reference must exist; in particular we need the file
# $tdir/reference/hub5e00.english.000405.stm
if [ $# -ne 2 ]; then
echo "Usage: "`basename $0`" <speech-dir> <transcription-dir>"
echo "See comments in the script for more details"
exit 1
fi
sdir=$1
tdir=$2
[ ! -d $sdir/english ] \
&& echo Expecting directory $sdir/english to be present && exit 1;
[ ! -d $tdir/reference ] \
&& echo Expecting directory $tdir/reference to be present && exit 1;
. path.sh
dir=data/local/eval2000
mkdir -p $dir
find $sdir/english -iname '*.sph' | sort > $dir/sph.flist
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
> $dir/sph.scp
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
[ ! -x $sph2pipe ] \
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
awk -v sph2pipe=$sph2pipe '{
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
#side A - channel 1, side B - channel 2
# Get segments file...
# segments file format is: utt-id side-id start-time end-time, e.g.:
# sw02001-A_000098-001156 sw02001-A 0.98 11.56
pem=$sdir/english/hub5e_00.pem
[ ! -f $pem ] && echo "No such file $pem" && exit 1;
# pem file has lines like:
# en_4156 A unknown_speaker 301.85 302.48
grep -v ';;' $pem \
| awk '{
spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
print utt,spk,$4,$5;}' \
| sort -u > $dir/segments
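# e.g. the pem line shown above produces a segments line of the form:
#   en_4156-A_030185-030248 en_4156-A 301.85 302.48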
# stm file has lines like:
# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
# TODO(arnab): We should really be lowercasing this since the Edinburgh
# recipe uses lowercase. This is not used in the actual scoring.
grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
| awk '{
spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \
| sort > $dir/text.all
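# e.g. the stm line shown above yields a text.all line of the form:
#   en_4156-A_035764-035964 HE IS A POLICE OFFICER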
# We'll use the stm file for sclite scoring. There seem to be various errors
# in the stm file that upset hubscr.pl, and we fix them here.
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \
$tdir/reference/hub5e00.english.000405.stm > $dir/stm
cp $tdir/reference/en20000405_hub5.glm $dir/glm
# The next line uses process substitution.
# Just checking that the segments are the same in pem vs. stm.
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \
echo "Segments from pem file and stm file do not match." && exit 1;
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text
# create an utt2spk file that assumes each conversation side is
# a separate speaker.
awk '{print $1,$2;}' $dir/segments > $dir/utt2spk
utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
# cp $dir/segments $dir/segments.tmp
# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
# $dir/segments.tmp > $dir/segments
awk '{print $1}' $dir/wav.scp \
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1;
dest=data/eval2000
mkdir -p $dest
for x in wav.scp segments text utt2spk spk2utt stm glm reco2file_and_channel; do
cp $dir/$x $dest/$x
done
echo Data preparation and formatting completed for Eval 2000
echo "(but not MFCC extraction)"
#!/usr/bin/perl -w
# Copyright 2013 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script cleans up the Fisher English transcripts and maps the words to
# be similar to the Switchboard Mississippi State transcripts
# Reads from STDIN and writes to STDOUT
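# Example usage (illustrative file names; the path of this script within the
# recipe is an assumption):
#   local/fisher_map_words.pl < fisher_transcripts.txt > fisher_transcripts.clean.txt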
use strict;
while (<>) {
chomp;
$_ = lc($_); # a few things aren't lowercased in the data, e.g. I'm
s/\*//g; # *mandatory -> mandatory
s/\(//g; s/\)//g; # Remove parentheses
next if /^\s*$/; # Skip empty lines
# In one conversation people speak some German phrases that are tagged as
# <german (( ja wohl )) > -- we remove these
s/<[^>]*>//g;
s/\.\_/ /g; # Abbreviations: a._b._c. -> a b c.
s/(\w)\.s( |$)/$1's /g; # a.s -> a's
s/\./ /g; # Remove remaining .
s/(\w)\,(\w| )/$1 $2/g; # commas don't appear within numbers, but remove them anyway
s/( |^)\'(blade|cause|course|frisco|okay|plain|specially)( |$)/ $2 /g;
s/\'em/-em/g;
# Remove an opening ' if there is a matching closing ' since some word
# fragments are annotated as: 'kay, etc.
# The substitution is done twice, since matching once doesn't capture
# consecutive quoted segments (the space in between is used up).
s/(^| )\'(.*?)\'( |$)/ $2 /g;
s/(^| )\'(.*?)\'( |$)/ $2 /g;
s/( |^)\'(\w)( |-|$)/$1 /g; # 'a- -> a
s/( |^)-( |$)/ /g; # Remove dangling -
s/\?//g; # Remove ?
s/( |^)non-(\w+)( |$)/ non $2 /g; # non-stop -> non stop
# Some words that are annotated as fragments are actual dictionary words
s/( |-)(acceptable|arthritis|ball|cause|comes|course|eight|eighty|field|giving|habitating|heard|hood|how|king|ninety|okay|paper|press|scripts|store|till|vascular|wood|what|york)(-| )/ $2 /g;
# Remove [[skip]] and [pause]
s/\[\[skip\]\]/ /g;
s/\[pause\]/ /g;
# [breath], [cough], [lipsmack], [sigh], [sneeze] -> [noise]
s/\[breath\]/[noise]/g;
s/\[cough\]/[noise]/g;
s/\[lipsmack\]/[noise]/g;
s/\[sigh\]/[noise]/g;
s/\[sneeze\]/[noise]/g;
s/\[mn\]/[vocalized-noise]/g; # [mn] -> [vocalized-noise]
s/\[laugh\]/[laughter]/g; # [laugh] -> [laughter]
# Now, mapping individual words
my @words = split /\s+/;
for my $i (0..$#words) {
my $w = $words[$i];
$w =~ s/^'/-/;
$words[$i] = $w;
}
print join(" ", @words) . "\n";
}
@@ -69,8 +69,9 @@ if [ $stage -le 1 ]; then
   # Remove some stuff we don't want to score, from the ctm.
   for x in $dir/score_*/$name.ctm; do
     cp $x $dir/tmpf;
-    cat $dir/tmpf | grep -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \
-      grep -v -E '<UNK>|%HESITATION' > $x;
+    cat $dir/tmpf | grep -i -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \
+      grep -i -v -E '<UNK>' > $x;
+    # grep -i -v -E '<UNK>|%HESITATION' > $x; # hesitation is scored
   done
 fi
@@ -58,8 +58,9 @@ if [ $stage -le 1 ]; then
   # Remove some stuff we don't want to score, from the ctm.
   for x in $dir/score_*/$name.ctm; do
     cp $x $dir/tmpf;
-    cat $dir/tmpf | grep -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \
-      grep -v -E '<UNK>|%HESITATION' > $x;
+    cat $dir/tmpf | grep -i -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \
+      grep -i -v -E '<UNK>' > $x;
+    # grep -i -v -E '<UNK>|%HESITATION' > $x;
   done
 fi
#!/bin/bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
. path.sh
#check existing directories
if [ $# != 1 ]; then
echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD"
exit 1;
fi
SWBD_DIR=$1
dir=data/local/train
mkdir -p $dir
# Audio data directory check
if [ ! -d $SWBD_DIR ]; then
echo "Error: run.sh requires a directory argument"
exit 1;
fi
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
[ ! -x $sph2pipe ] \
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
# Trans directory check
if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then
# To get the SWBD transcriptions and dict, do:
echo " *** Downloading trascriptions and dictionary ***"
(
cd $dir;
wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz
tar -xf switchboard_word_alignments.tar.gz
)
else
echo "Directory with transcriptions exists, skipping downloading"
[ -e $dir/swb_ms98_transcriptions ] \
|| ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/
fi
# Option A: SWBD dictionary file check
[ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \
echo "SWBD dictionary file does not exist" && exit 1;
# find sph audio files
find $SWBD_DIR -iname '*.sph' | sort > $dir/sph.flist
n=`cat $dir/sph.flist | wc -l`
[ $n -ne 2435 ] && \
echo Warning: expected 2435 data files, found $n
# (1a) Transcriptions preparation
# make basic transcription file (add segments info)
# **NOTE: In the default Kaldi recipe, everything is made uppercase, while we
# make everything lowercase here. This is because we will be using SRILM which
# can optionally make everything lowercase (but not uppercase) when mapping
# LM vocabs.
awk '{
name=substr($1,1,6); gsub("^sw","sw0",name); side=substr($1,7,1);
stime=$2; etime=$3;
printf("%s-%s_%06.0f-%06.0f",
name, side, int(100*stime+0.5), int(100*etime+0.5));
for(i=4;i<=NF;i++) printf(" %s", tolower($i)); printf "\n"
}' $dir/swb_ms98_transcriptions/*/*/*-trans.text > $dir/transcripts1.txt
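# e.g. an input line such as "sw2001A-ms98-a-0001 0.98 11.56 HI UM YEAH"
# (illustrative, matching the utterance-id format used below) becomes:
#   sw02001-A_000098-001156 hi um yeah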
# test if trans. file is sorted
export LC_ALL=C;
sort -c $dir/transcripts1.txt || exit 1; # check it's sorted.
# Remove SILENCE, <B_ASIDE> and <E_ASIDE>.
# Note: we have [NOISE], [VOCALIZED-NOISE], [LAUGHTER], [SILENCE].
# removing [SILENCE], and the <B_ASIDE> and <E_ASIDE> markers that mark
# speech to someone else; we will give phones to the other three (NSN, SPN, LAU).
# There will also be a silence phone, SIL.
# **NOTE: modified the pattern matches to make them case insensitive
cat $dir/transcripts1.txt \
| perl -ane 's:\s\[SILENCE\](\s|$):$1:gi;
s/<B_ASIDE>//gi;
s/<E_ASIDE>//gi;
print;' \
| awk '{if(NF > 1) { print; } } ' > $dir/transcripts2.txt
# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches
# case insensitive
local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final transcripts
# (1c) Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56
awk '{
segment=$1;
split(segment,S,"[_-]");
side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100
}' < $dir/text > $dir/segments
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
> $dir/sph.scp
awk -v sph2pipe=$sph2pipe '{
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
#side A - channel 1, side B - channel 2
# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
# to the file name sw02001 and the A, e.g.
# sw02001-A sw02001 A
# In this case it's trivial, but in other corpora the information might
# be less obvious. Later it will be needed for ctm scoring.
awk '{print $1}' $dir/wav.scp \
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1;
awk '{spk=substr($1,4,6); print $1 " " spk}' $dir/segments > $dir/utt2spk \
|| exit 1;
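# e.g. utterance sw02001-A_000098-001156 gets speaker-id 2001-A, since
# substr($1,4,6) drops the leading "sw0" and keeps the conversation side.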
sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
# We assume each conversation side is a separate speaker. This is a very
# reasonable assumption for Switchboard. The actual speaker info file is at:
# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary
# Copy stuff into its final locations [this has been moved from the format_data
# script]
mkdir -p data/train
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
cp data/local/train/$f data/train/$f || exit 1;
done
echo Switchboard-1 data preparation succeeded.
#!/usr/bin/perl
# Modified from swbd_map_words.pl in Kaldi s5 recipe to make pattern
# matches case-insensitive --Arnab (Jan 2013)
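# Usage (as invoked from swbd1_data_prep_edin.sh above):
#   local/swbd1_map_words.pl -f 2- transcripts2.txt > text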
if ($ARGV[0] eq "-f") {
shift @ARGV;
$field_spec = shift @ARGV;
if ($field_spec =~ m/^\d+$/) {
$field_begin = $field_spec - 1; $field_end = $field_spec - 1;
}
if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
if ($1 ne "") {
$field_begin = $1 - 1; # Change to zero-based indexing.
}
if ($2 ne "") {
$field_end = $2 - 1; # Change to zero-based indexing.
}
}
if (!defined $field_begin && !defined $field_end) {
die "Bad argument to -f option: $field_spec";
}
}
while (<>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if ( (!defined $field_begin || $n >= $field_begin)
&& (!defined $field_end || $n <= $field_end)) {
# e.g. [LAUGHTER-STORY] -> STORY;
$a =~ s:^(|\-)\[LAUGHTER-(.+)\](|\-)$:$1$2$3:i;
# $1 and $3 relate to preserving trailing "-"
$a =~ s:^\[(.+)/.+\](|\-)$:$1$2:; # e.g. [IT'N/ISN'T] -> IT'N ... note,
# 1st part may include partial-word stuff, which we process further below,
# e.g. [LEM[GUINI]-/LINGUINI]
# the (|\-) at the end is to accept and preserve trailing -'s.
$a =~ s:^(|\-)\[[^][]+\](.+)$:-$2:; # e.g. -[AN]Y -> -Y;
# let the leading - be optional on input, as sometimes omitted.
$a =~ s:^(.+)\[[^][]+\](|\-)$:$1-:; # e.g. AB[SOLUTE]- -> AB-;
# let the trailing - be optional on input, as sometimes omitted.
$a =~ s:([^][]+)\[.+\]$:$1:; # e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX-
# which is a mistake in the input.
$a =~ s:^\{(.+)\}$:$1:; # e.g. {YUPPIEDOM} -> YUPPIEDOM
$a =~ s:([A-Z])\[([^][]+)\]([A-Z]):$1-$3:i; # e.g. AMMU[N]IT- -> AMMU-IT-
$a =~ s:_\d$::; # e.g. THEM_1 -> THEM
}
$A[$n] = $a;
}
print join(" ", @A) . "\n";
}
#!/bin/bash
# Formatting the Mississippi State dictionary for use in Edinburgh. Differs
# from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013)
# To be run from one directory above this script.
. path.sh
#check existing directories
[ $# != 0 ] && echo "Usage: local/swbd1_data_prep.sh" && exit 1;
srcdir=data/local/train # This is where we downloaded some stuff..
dir=data/local/dict
mkdir -p $dir
srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
# assume swbd_p1_data_prep.sh was done already.
[ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1;
#(2a) Dictionary preparation:
# Pre-processing (lower-case, remove comments)
awk 'BEGIN{getline}($0 !~ /^#/) {$0=tolower($0); print}' \
$srcdict | sort | awk '($0 !~ /^[[:space:]]*$/) {print}' \
> $dir/lexicon1.txt || exit 1;
cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
grep -v sil > $dir/nonsilence_phones.txt || exit 1;
( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt
echo sil > $dir/optional_silence.txt
# No "extra questions" in the input to this setup, as we don't
# have stress or tone.
echo -n >$dir/extra_questions.txt
# Add to the lexicon the silences, noises etc.
( echo '!sil sil'; echo '[vocalized-noise] spn'; echo '[noise] nsn'; \
echo '[laughter] lau'; echo '<unk> spn' ) \
| cat - $dir/lexicon1.txt > $dir/lexicon2.txt || exit 1;
# Map the words in the lexicon. That is-- for each word in the lexicon, we map it
# to a new written form. The transformations we do are:
# remove laughter markings, e.g.
# [LAUGHTER-STORY] -> STORY
# Remove partial-words, e.g.
# -[40]1K W AH N K EY
# becomes -1K
# and
# -[AN]Y IY
# becomes
# -Y
# -[A]B[OUT]- B
# becomes
# -B-
# Also, curly braces, which appear to be used for "nonstandard"
# words or non-words, are removed, e.g.
# {WOLMANIZED} W OW L M AX N AY Z D
# -> WOLMANIZED
# Also, mispronounced words, e.g.
# [YEAM/YEAH] Y AE M
# are changed to just e.g. YEAM, i.e. the orthography
# of the mispronounced version.
# Note-- this is only really to be used in training. The main practical