Commit cd4a6454 authored by Arnab Ghoshal's avatar Arnab Ghoshal
Browse files

Added GlobalPhone recipe in trunk/egs/gp; removed sandbox/discrim;

fixed sandbox/karel/src/configure to use CUDA with MKL.


git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@758 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 954cf3ee
......@@ -20,7 +20,10 @@ Explanations of the corpora are below:
get the same data using combinations of other catalog numbers, but this
is the one we used).
Recipes in progress:
swbd: Switchboard. A fairly large amount of telephone speech (2-channel, 8kHz
sampling rate).
This directory is a work in progress.
\ No newline at end of file
gp: GlobalPhone. This is a multilingual speech corpus.
About the GlobalPhone corpus:
This is a corpus of sentences read from newspapers in 19
different languages, recorded under varying degrees of "clean"
conditions. There are roughly 15-20 hours of training data for
each language, as well as DEV and EVAL sets of roughly 2 hours
each.
Each subdirectory of this directory contains the
scripts for a sequence of experiments.
Note: s1 is the "default" set of scripts at the moment.
s1: This setup contains experiments with GMM-based systems using various
Maximum Likelihood
techniques, including global and speaker-specific transforms.
See a parallel setup in ../wsj/s3
AR TBA
BL 051 055 058 084 090 100 106
CR 033 034 035 036 046 048 051 053 054 057
CZ 083 085 087 089 091 093 095 097 099 101
FR XXX
GE 001 002 003 004 008 010
JA XXX
KO 006 012 025 040 045 061 084 086 091 098
CH 028 029 030 031 032 039 040 041 042 043 044
PO 064 065 072 073 102 103 104 132 133 134
PL TBA
RU XXX
WU TBA
SP 001 002 003 004 005 006 007 008 009 010
SW 045 046 047 048 049 066 067 068 069
TH 023 025 028 037 045 061 073 085
TA TBA
TU 001 002 003 005 006 008 013 014 015 016 019
VN 200 201 202 203 204 205 206 207 208 209
AR TBA
BL 040 059 063 068 095 109 110
CR 037 038 039 040 041 042 043 044 045 047
CZ 084 086 088 090 092 094 096 098 100 102
FR 091 092 093 094 095 096 097 098
GE 018 020 021 026 029 073
JA XXX
KO 019 029 032 042 051 064 069 080 082 088
CH 080 081 082 083 084 085 086 087 088 089
PO 135 136 137 138 139 142 143 312
PL TBA
RU XXX
WU TBA
SP 011 012 013 014 015 016 017 018
SW 040 041 042 043 044 060 061 062 063 064
TH 101 102 103 104 105 106 107 108
TA TBA
TU 025 030 031 032 037 039 041 046 056 063
VN 92 94 96 98 102 103 106 107 110 113
AR Arabic
BL Bulgarian
CR Croatian
CZ Czech
FR French
GE German
JA Japanese
KO Korean
CH Mandarin
PL Polish
PO Portuguese
RU Russian
SP Spanish
SW Swedish
TA Tamil
TH Thai
TU Turkish
VN Vietnamese
WU Wu
--use-energy=false # only non-default option.
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>
#!/bin/bash
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Print the given message(s) on stderr (backslash escapes are expanded) and
# terminate the current shell or subshell with exit status 1.
errexit() {
  echo -e "$@" 1>&2
  exit 1
}
# Download, build and install shorten 3.6.1 under tools/shorten-3.6.1, unless
# that directory already exists. All build output is appended to
# tools/install.log; only a one-line status is printed on stdout.
if [ -d tools/shorten-3.6.1 ]; then
  echo "tools/shorten-3.6.1 already exists. Remove manually before continuing."
else
  echo -n "Installing shorten ... "
  mkdir -p tools
  # The build runs entirely inside a subshell, so the working directory of the
  # calling script never changes. The subshell must be run as a plain command
  # (not as the condition of an 'if'), otherwise bash would ignore the
  # 'set -e' used inside it.
  (
    cd tools || errexit "Cannot change to directory 'tools'."
    rm -f shorten-3.6.1.tar.gz
    wget http://etree.org/shnutils/shorten/dist/src/shorten-3.6.1.tar.gz \
      || errexit "Download failed for shorten-3.6.1."
    set -e  # From here on, any failing step aborts the subshell.
    tar -zxf shorten-3.6.1.tar.gz
    cd shorten-3.6.1
    # Install into the source directory itself; quoted so that a path
    # containing spaces is passed as a single argument.
    ./configure --prefix="$(pwd)"
    make
    # make check -- Run this manually. 1 test fails when run from here, but
    # not when run directly from the command line!
    make install
  ) >> tools/install.log 2>&1
  if [ $? -ne 0 ]; then
    echo "installation failed (see tools/install.log)."
  else
    echo "installation succeeded."
  fi
fi
# Download, build and install sox 14.3.2 under tools/sox-14.3.2, unless that
# directory already exists. All build output is appended to tools/install.log;
# only a one-line status is printed on stdout.
if [ -d tools/sox-14.3.2 ]; then
  echo "tools/sox-14.3.2 already exists. Remove manually before continuing."
else
  echo -n "Installing sox ... "
  mkdir -p tools
  # The build runs entirely inside a subshell, so the working directory of the
  # calling script never changes. The subshell must be run as a plain command
  # (not as the condition of an 'if'), otherwise bash would ignore the
  # 'set -e' used inside it.
  (
    cd tools || errexit "Cannot change to directory 'tools'."
    rm -f sox-14.3.2.tar.bz2
    wget http://sourceforge.net/projects/sox/files/sox/14.3.2/sox-14.3.2.tar.bz2 \
      || errexit "Download failed for sox-14.3.2."
    set -e  # From here on, any failing step aborts the subshell.
    tar -jxf sox-14.3.2.tar.bz2
    cd sox-14.3.2
    # Install into the source directory itself; quoted so that a path
    # containing spaces is passed as a single argument.
    ./configure --prefix="$(pwd)"
    make -j 4
    make install
  ) >> tools/install.log 2>&1
  if [ $? -ne 0 ]; then
    echo "installation failed (see tools/install.log)."
  else
    echo "installation succeeded."
  fi
fi
#!/bin/bash -u
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
set -o errexit
# Extract DIR from an argument of the form --option=DIR and print its
# absolute path on stdout.
# Arguments: $1 - option string; everything after the first '=' is the
#                 directory name.
# Outputs:   the absolute path of the directory, or the name as given if the
#            directory cannot be entered.
# Exits with status 1 (message on stderr) if DIR is not a directory.
read_dirname() {
  local dir_name abs_path
  case "$1" in
    *=*) dir_name=${1#*=} ;;
    *)   dir_name= ;;
  esac
  if [ ! -d "$dir_name" ]; then
    echo "Argument '$dir_name' not a directory" >&2
    exit 1
  fi
  # The assignment is kept separate from 'local' so that the status of the
  # command substitution is not masked. cd's own output (printed when CDPATH
  # is in effect) is discarded so that only 'pwd' contributes to the result.
  abs_path=$(cd -- "$dir_name" > /dev/null 2>&1 && pwd) || abs_path=$dir_name
  printf '%s\n' "$abs_path"
}
# Command-line handling: sets INLIST (file listing the .adc.shn inputs), ODIR
# (absolute output directory) and OLIST (file receiving the names of the
# converted WAV files; defaults to /dev/null).
PROG=$(basename "$0")
# The text is always expanded inside double quotes: unquoted, the '[options]'
# word would be treated as a glob pattern by the shell.
usage="Usage: $PROG <arguments> [options]\n\
Converts GlobalPhone audio files from shorten to WAV with error checking.\n\
(Must have shorten and sox on PATH).\n\n\
Required arguments:\n\
  --input-list=FILE\tList of shorten-compressed files to process.\n\
  --output-dir=DIR\tDirectory to write the WAV files to.\n\
Options:\n\
  --output-list=FILE\tWrite list of converted files.\n\
  --help\t\t\tPrint this help and exit.\n"

while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      echo -e "$usage"; exit 0 ;;
    --input-list=*)
      INLIST=${1#*=}
      [ -f "$INLIST" ] || { echo "Argument '$INLIST' not a file" >&2; exit 1; }
      shift ;;
    --output-dir=*)
      # read_dirname exits non-zero for a non-directory; with 'errexit' set
      # the failed assignment terminates the script.
      ODIR=$(read_dirname "$1"); shift ;;
    --output-list=*)
      OLIST=${1#*=}; shift ;;
    *)
      echo "Unknown argument: $1, exiting"; echo -e "$usage"; exit 1 ;;
  esac
done

# Both required options must have been supplied (the script runs with -u, so
# referencing them unset later would otherwise die with an obscure message).
if [ -z "${INLIST:-}" ] || [ -z "${ODIR:-}" ]; then
  echo "$PROG: both --input-list and --output-dir are required." >&2
  echo -e "$usage" >&2
  exit 1
fi
OLIST=${OLIST:-/dev/null} # Default for output list
# Convert every file listed in $INLIST from shorten to 16 kHz, 16-bit signed
# WAV in $ODIR. Files that fail to convert are collected and reported on
# stderr at the end; the names of the good WAV files are appended to $OLIST.

# Fail early, with a clear message, if a required tool is missing.
for tool in shorten sox soxi; do
  command -v "$tool" > /dev/null \
    || { echo "$PROG: '$tool' not found on PATH." >&2; exit 1; }
done

tmpdir=$(mktemp -d)
trap 'rm -rf "$tmpdir"' EXIT
mkdir -p "$tmpdir/raw" "$ODIR"

shnerr=$tmpdir/shnerr   # names of files shorten could not decompress
soxerr=$tmpdir/soxerr   # files sox could not convert, or that are too short
nshnerr=0
nsoxerr=0

# '|| [[ -n $line ]]' also processes a last line with no trailing newline.
while read -r line || [[ -n "$line" ]]; do
  [[ "$line" =~ ^.*/.*\.adc\.shn$ ]] \
    || { echo "Bad line: '$line'" >&2; exit 1; }
  b=$(basename "$line" .adc.shn)
  raw=$tmpdir/raw/${b}.raw
  wav=$ODIR/${b}.wav

  # Conversion failures are tested explicitly below, so they are recorded
  # instead of terminating the script under 'errexit'.
  if ! shorten -x "$line" "$raw"; then
    echo "$line" >> "$shnerr"
    nshnerr=$((nshnerr + 1))
  else
    rc=0
    sox -t raw -r 16000 -e signed-integer -b 16 "$raw" -t wav "$wav" || rc=$?
    if [ "$rc" -ne 0 ]; then
      # Report the status of sox itself, captured before any other command
      # could overwrite $?.
      echo "$raw: exit status = $rc" >> "$soxerr"
      nsoxerr=$((nsoxerr + 1))
    else
      # Just in case there are empty files! Setting the cutoff at 1000
      # samples, which, assuming 16KHz sampling, is 0.0625 seconds.
      nsamples=$(soxi -s "$wav") || true
      if [[ "$nsamples" -gt 1000 ]]; then
        echo "$wav" >> "$OLIST"
      else
        echo "$raw: #samples = $nsamples" >> "$soxerr"
        nsoxerr=$((nsoxerr + 1))
      fi
    fi
  fi
  # The intermediate raw file is no longer needed; removing it now keeps the
  # temporary directory from growing with the size of the corpus.
  rm -f "$raw"
done < "$INLIST"

if [[ "$nshnerr" -gt 0 ]]; then
  echo "shorten: error converting following $nshnerr file(s):" >&2
  cat "$shnerr" >&2
fi
if [[ "$nsoxerr" -gt 0 ]]; then
  echo "sox: error converting following $nsoxerr file(s):" >&2
  cat "$soxerr" >&2
fi
exit 0;
\ No newline at end of file
#!/bin/bash -u
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
set -o errexit
# Print the given message(s) on stderr (backslash escapes are expanded) and
# terminate the current shell or subshell with exit status 1.
error_exit() {
  echo -e "$@" 1>&2
  exit 1
}
# Extract DIR from an argument of the form --option=DIR and print its
# absolute path on stdout.
# Arguments: $1 - option string; everything after the first '=' is the
#                 directory name.
# Outputs:   the absolute path of the directory.
# Exits with status 1 if DIR is not a directory (message via error_exit) or
# cannot be entered.
read_dirname() {
  local dir_name abs_path
  case "$1" in
    *=*) dir_name=${1#*=} ;;
    *)   dir_name= ;;
  esac
  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"
  # The assignment is kept separate from 'local' so that a failing 'cd' is
  # not masked. cd's own output (printed when CDPATH is in effect) is
  # discarded so that only 'pwd' contributes to the result.
  abs_path=$(cd -- "$dir_name" > /dev/null 2>&1 && pwd) || exit 1
  printf '%s\n' "$abs_path"
}
# Command-line handling: sets CONFDIR, GPDIR, LMDIR and WDIR to the absolute
# paths of the directories given on the command line.
PROG=$(basename "$0")
usage="Usage: $PROG <arguments>\n\
Prepare train, dev, eval file lists for a language.\n\n\
Required arguments:\n\
  --config-dir=DIR\tDirectory containing the necessary config files\n\
  --corpus-dir=DIR\tDirectory for the GlobalPhone corpus\n\
  --lm-dir=DIR\t\tDirectory containing language models\n\
  --work-dir=DIR\t\tWorking directory\n"

# Arguments are parsed before being counted, so that '--help' on its own
# prints the usage and exits successfully.
while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      echo -e "$usage"; exit 0 ;;
    # read_dirname exits non-zero for a non-directory; with 'errexit' set the
    # failed assignment terminates the script.
    --config-dir=*)
      CONFDIR=$(read_dirname "$1"); shift ;;
    --corpus-dir=*)
      GPDIR=$(read_dirname "$1"); shift ;;
    --lm-dir=*)
      LMDIR=$(read_dirname "$1"); shift ;;
    --work-dir=*)
      WDIR=$(read_dirname "$1"); shift ;;
    *)
      echo "Unknown argument: $1, exiting"; echo -e "$usage"; exit 1 ;;
  esac
done

# Every option is required (the script runs with -u, so an unset variable
# would otherwise abort later with an obscure 'unbound variable' message).
[ -n "${CONFDIR:-}" ] || error_exit "$PROG: missing argument --config-dir\n\n$usage"
[ -n "${GPDIR:-}" ] || error_exit "$PROG: missing argument --corpus-dir\n\n$usage"
[ -n "${LMDIR:-}" ] || error_exit "$PROG: missing argument --lm-dir\n\n$usage"
[ -n "${WDIR:-}" ] || error_exit "$PROG: missing argument --work-dir\n\n$usage"
# Main body of the data preparation: checks the config files, builds the file
# lists, normalizes dictionaries and transcripts, and prepares the LMs for the
# languages GE, PO, SP and SW.

# (1) Check that the config files are in place. The existence tests below use
# relative names, hence the change into $CONFDIR first.
cd $CONFDIR
[ -f dev_spk.list ] || error_exit "$PROG: Dev-set speaker list not found.";
[ -f eval_spk.list ] || error_exit "$PROG: Eval-set speaker list not found.";
[ -f lang_codes.txt ] || error_exit "$PROG: Mapping for language name to 2-letter code not found.";
# Everything below writes to paths relative to the working directory.
cd $WDIR
[ -f path.sh ] && . path.sh # Sets the PATH to contain necessary executables
# (2) Get the various file lists (for audio, transcription, etc.) for each
# language. One background job is started per language; stderr of each job
# goes to data/$LCODE/prep_flists.log.
for LCODE in GE PO SP SW; do
mkdir -p data/$LCODE
gp_prep_flists.sh --corpus-dir=$GPDIR --dev-spk=$CONFDIR/dev_spk.list \
--eval-spk=$CONFDIR/eval_spk.list --lang-map=$CONFDIR/lang_codes.txt \
--work-dir=data $LCODE 2>data/$LCODE/prep_flists.log &
# Running these in parallel since this does audio conversion (to figure out
# which files cannot be processed) and takes some time to run.
done
# Barrier: block until all background jobs have finished.
# NOTE(review): 'wait' without arguments returns 0 regardless of the exit
# statuses of the jobs, so a failed gp_prep_flists.sh run is not detected here.
wait;
# (3) Normalize the dictionary and transcripts.
# NOTE(review): the steps below write into data/$LCODE/local, which this
# script never creates -- presumably gp_prep_flists.sh does; confirm.
for LCODE in GE PO SP SW; do
# Second field of the lang_codes.txt line matching the language code.
# NOTE(review): the awk pattern matches $LCODE anywhere on the line, not only
# in the first field.
full_name=`awk '/'$LCODE'/ {print $2}' $CONFDIR/lang_codes.txt`;
# Normalized, de-duplicated lexicon without the silence/unknown entries.
gp_norm_dict_${LCODE}.pl -i $GPDIR/Dictionaries/${LCODE}/${full_name}-GPDict.txt | sort -u > data/$LCODE/local/lexicon_nosil_${LCODE}.txt
# Full lexicon: the entries '!SIL -> SIL' and '<UNK> -> SPN' are prepended.
(echo -e '!SIL\tSIL\n<UNK>\tSPN';) \
| cat - data/$LCODE/local/lexicon_nosil_${LCODE}.txt \
> data/$LCODE/local/lexicon_${LCODE}.txt;
# Add disambig symbols to the lexicon. The stdout of add_lex_disambig.pl is
# used as a number below (presumably the count of symbols it added; confirm
# against that script).
ndisambig=`add_lex_disambig.pl data/$LCODE/local/lexicon_${LCODE}.txt data/$LCODE/local/lexicon_disambig_${LCODE}.txt`
ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence
echo $ndisambig > data/$LCODE/local/lex_ndisambig
# Get the list of phones and map them to integers (adding silence and spoken
# noise to the list). The sed expression deletes everything from the first
# '_' to the end of each pronunciation line; <eps>, SIL and SPN take the ids
# 0, 1 and 2, and the sorted unique phones are numbered from 3.
cut -f2 data/$LCODE/local/lexicon_nosil_${LCODE}.txt | sed -e "s?_.*??g" \
| tr ' ' '\n' | sort -u \
| awk 'BEGIN{ print "<eps> 0"; print "SIL 1"; print "SPN 2"; N=3; }
{ printf("%s %d\n", $1, N++); }' > data/$LCODE/local/phones.txt
# If using word-boundary markers on phones, use this in the awk command above
# { printf("%s_WB %d\n", $1, N++); }
# If using position markers on phones, use these in the awk command above
# { printf("%s_B %d\n", $1, N++); }
# { printf("%s_E %d\n", $1, N++); }
# { printf("%s_S %d\n", $1, N++); }
# Get the list of words: <eps> is 0, the sorted unique words are numbered
# from 1, and '#0' gets the next free id.
cut -f1 data/$LCODE/local/lexicon_${LCODE}.txt | sort -u \
| awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);}
END{printf("#0 %d\n", NR+1);}' > data/$LCODE/local/words.txt
# Normalize the transcripts of each data set (*.trans -> *.trans2).
for x in train dev eval; do
gp_norm_trans_${LCODE}.pl -i data/$LCODE/local/${x}_${LCODE}.trans \
> data/$LCODE/local/${x}_${LCODE}.trans2;
done
done
# (4) Normalize the LMs - this is very Edinburgh-specific since we have some
# LMs that came with the GlobalPhone corpus.
gp_prep_lms_edin.sh --lm-dir=$LMDIR --work-dir=$WDIR
echo "Finished data preparation."
#!/usr/bin/perl -w
use strict;
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script reads a list of Romanized GlobalPhone transcript files from the
# standard input (e.g. German/rmn/GE008.rmn).
# Following the conventions of the corpus, the basename of the transcript file
# is assumed to be the ID of the speaker (i.e. GE008 in this case).
# The transcript files are assumed to have the following format:
# ; 10:
# man mag es drehen und wenden wie man will
# where the number is the utterance ID. The script prints the utterance ID
# followed by the transcript, e.g.:
# GE008_10 man mag es drehen und wenden wie man will
# Main loop: for every transcript file named on STDIN, print one line per
# utterance of the form "<speaker>_<utterance-id>\t<transcript>".
while (my $file = <STDIN>) {
  # Drop the line terminator and any surrounding whitespace.
  $file =~ s/^\s+|\s+$//g;
  # The speaker ID is the basename of the file, without the .rmn extension.
  $file =~ m:\S+/(\S+)\.rmn: || die "Bad line in transcription file list: $file";
  my $spk = $1;
  open(my $fh, '<', $file) || die "Error opening transcription file $file\n";
  while (my $line = <$fh>) {
    $line =~ s/\r//g;  # Since the transcriptions are in DOS format!
    chomp $line;
    # Only header lines of the form "; <number>:" start an utterance.
    next unless ($line =~ /^;\s*(\d+)\:/);
    my $utt = $1;
    # The transcript is the line right after the header. A header on the last
    # line of the file is treated as having an empty transcript.
    my $trans = <$fh>;
    $trans = '' unless defined $trans;
    # The transcript line is in DOS format too: without this, a carriage
    # return would end up at the end of every transcript printed.
    $trans =~ s/\r//g;
    chomp $trans;
    die "Unexpected line: $trans\n" if ($trans =~ /^;/);
    if ($trans =~ /^\s*$/) {
      print STDERR "Empty transcript found for utterance '${spk}_${utt}'.\n";
    } else {
      print "${spk}_${utt}\t$trans\n";
    }
  }
  close($fh);
}
#!/bin/bash -u
# Copyright 2012 Arnab Ghoshal
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
set -o errexit
set -o pipefail
# Print the given message(s) on stderr (backslash escapes are expanded) and
# terminate the current shell or subshell with exit status 1.
error_exit() {
  echo -e "$@" 1>&2
  exit 1
}
# Extract DIR from an argument of the form --option=DIR and print its
# absolute path on stdout.
# Arguments: $1 - option string; everything after the first '=' is the
#                 directory name.
# Outputs:   the absolute path of the directory.
# Exits with status 1 if DIR is not a directory (message via error_exit) or
# cannot be entered.
read_dirname() {
  local dir_name abs_path
  case "$1" in
    *=*) dir_name=${1#*=} ;;
    *)   dir_name= ;;
  esac
  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"
  # The assignment is kept separate from 'local' so that a failing 'cd' is
  # not masked. cd's own output (printed when CDPATH is in effect) is
  # discarded so that only 'pwd' contributes to the result.
  abs_path=$(cd -- "$dir_name" > /dev/null 2>&1 && pwd) || exit 1
  printf '%s\n' "$abs_path"
}
# Command-line handling: sets PROTO (absolute path of the HMM topology
# prototype) and WDIR (absolute path of the working directory).
PROG=$(basename "$0")
usage="Usage: $PROG <arguments>\n\
Format the prepared data (lang directories, lexicon FST, HMM topology) for\n\
training and testing.\n\n\
Required arguments:\n\
  --hmm-proto=FILE\tPrototype of the HMM topology\n\
  --work-dir=DIR\t\tWorking directory\n"

# Arguments are parsed before being counted, so that '--help' on its own
# prints the usage and exits successfully.
while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      echo -e "$usage"; exit 0 ;;
    --hmm-proto=*)
      PROTO=${1#*=}
      # Quoted, so that an empty value fails the test instead of passing it.
      [ -f "$PROTO" ] || error_exit "Cannot find HMM prototype file '$PROTO'"
      # The script changes into the working directory before the prototype
      # is read, so a relative name is anchored to the current directory now.
      case "$PROTO" in
        /*) ;;
        *) PROTO="$PWD/$PROTO" ;;
      esac
      shift ;;
    --work-dir=*)
      # read_dirname exits non-zero for a non-directory; with 'errexit' set
      # the failed assignment terminates the script.
      WDIR=$(read_dirname "$1"); shift ;;
    *)
      echo "Unknown argument: $1, exiting"; error_exit "$usage" ;;
  esac
done

# Both options are required (the script runs with -u, so an unset variable
# would otherwise abort later with an obscure 'unbound variable' message).
[ -n "${PROTO:-}" ] || error_exit "$PROG: missing argument --hmm-proto\n\n$usage"
[ -n "${WDIR:-}" ] || error_exit "$PROG: missing argument --work-dir\n\n$usage"
# Build the training-time directories data/$LCODE/{train,dev,eval,lang} for
# each language from the files in data/$LCODE/local.
# All paths below are relative to the working directory.
# NOTE(review): $PROTO is read after this 'cd', so a relative prototype path
# is resolved against $WDIR rather than the directory the script was started
# from.
cd $WDIR
# path.sh is sourced unconditionally here (its contents are not visible from
# this script).
. path.sh
echo "Preparing train data"
for LCODE in GE PO SP SW; do
# (0) Create a directory to contain files needed in training:
for x in train dev eval; do
mkdir -p data/$LCODE/$x
# Copy the per-set files under the fixed names wav.scp, text, spk2utt and
# utt2spk.
cp data/$LCODE/local/${x}_${LCODE}_wav.scp data/$LCODE/$x/wav.scp
cp data/$LCODE/local/${x}_${LCODE}.trans2 data/$LCODE/$x/text
cp data/$LCODE/local/${x}_${LCODE}.spk2utt data/$LCODE/$x/spk2utt
cp data/$LCODE/local/${x}_${LCODE}.utt2spk data/$LCODE/$x/utt2spk
done
mkdir -p data/$LCODE/lang
# '-t DIR' names the destination directory of the copy.
cp data/$LCODE/local/phones.txt -t data/$LCODE/lang/
cp data/$LCODE/local/words.txt -t data/$LCODE/lang/
# (1) Generate colon-separated lists of silence and non-silence phones, and
# the file 'oov.txt' containing a word that all OOVs map to during training.
silphones="SIL SPN";
silphones.pl data/$LCODE/lang/phones.txt "$silphones" \
data/$LCODE/lang/silphones.csl data/$LCODE/lang/nonsilphones.csl
echo "<UNK>" > data/$LCODE/lang/oov.txt
# (2) Create the L.fst without disambiguation symbols, for use in training.
# The lexicon FST is compiled with phones as input symbols and words as
# output symbols, then arc-sorted on the output labels.
# NOTE(review): the arguments '0.5 SIL' are presumably the optional-silence
# probability and the silence phone; confirm against make_lexicon_fst.pl.
make_lexicon_fst.pl data/$LCODE/local/lexicon_${LCODE}.txt 0.5 SIL \
| fstcompile --isymbols=data/$LCODE/lang/phones.txt \
--osymbols=data/$LCODE/lang/words.txt --keep_isymbols=false \
--keep_osymbols=false \
| fstarcsort --sort_type=olabel > data/$LCODE/lang/L.fst
# (3) Create phonesets_mono.txt, roots.txt and phonesets_cluster.txt (the
# generation of extra_questions.txt is currently commented out).
gp_make_questions.pl -i data/$LCODE/lang/phones.txt \
-m data/$LCODE/lang/phonesets_mono.txt -r data/$LCODE/lang/roots.txt
# gp_extra_questions_${LCODE}.pl -i data/$LCODE/lang/phones.txt \
# -e data/$LCODE/lang/extra_questions.txt
# phonesets_cluster.txt is phonesets_mono.txt without the lines that
# contain 'SIL'.
grep -v SIL data/$LCODE/lang/phonesets_mono.txt \
> data/$LCODE/lang/phonesets_cluster.txt
# (4) Finally, for training, create the HMM topology from the prototype:
# the colon-separated phone lists are turned into space-separated lists and
# substituted for the placeholders in $PROTO.
silphonelist=`cat data/$LCODE/lang/silphones.csl | sed 's/:/ /g'`
nonsilphonelist=`cat data/$LCODE/lang/nonsilphones.csl | sed 's/:/ /g'`
# The order of the two expressions matters: SILENCEPHONES is a substring of
# NONSILENCEPHONES, so the longer placeholder has to be replaced first.
sed -e "s:NONSILENCEPHONES:$nonsilphonelist:" \
-e "s:SILENCEPHONES:$silphonelist:" $PROTO > data/$LCODE/lang/topo
done
echo "Preparing test data"
for LCODE in GE PO SP SW; do
# (0) Copy over some files common to train and test:
mkdir -p data/$LCODE/lang_test
for f in phones.txt words.txt L.fst silphones.csl nonsilphones.csl; do
cp data/$LCODE/lang/$f -t data/$LCODE/lang_test/
done
# (1) Create a list of phones including the disambiguation symbols.
# --include-zero includes the #0 symbol that is passed from G.fst
ndisambig=`cat data/$LCODE/local/lex_ndisambig`;
add_disambig.pl --include-zero data/$LCODE/lang_test/phones.txt $ndisambig \
> data/$LCODE/lang_test/phones_disambig.txt
cp data/$LCODE/lang_test/phones_disambig.txt -t data/$LCODE/lang/ # for MMI.
# (2) Create the lexicon FST with disambiguation symbols. There is an extra
# step where we create a loop to "pass through" the disambiguation symbols
# from G.fst.
phone_disambig_symbol=`grep \#0 data/$LCODE/lang_test/phones_disambig.txt | awk '{print $2}'`