Commit d4584aec authored by Jan Trmal
Browse files

(trunk/babel/s5b) Submitting modifications and tweaks done during eval +...

(trunk/babel/s5b) Submitting modifications and tweaks done during eval + changes of names of the nnet scripts (as the scripts moved to steps/nnet)


git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4248 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 71ef8633
......@@ -50,6 +50,10 @@ shadow_data_list=(
/export/babel/data/splits/Haitian_Babel201/dev.list
/export/babel/data/splits/Haitian_Babel201/eval.list
)
shadow_data_scoring=(
[dev10h]=/export/babel/data/splits/Haitian_Babel201/dev.list
[eval]=/export/babel/data/splits/Haitian_Babel201/eval.list
)
shadow_data_cmudb=/export/babel/data/splits/Haitian_Babel201/uem/201-shadow-v0-utt.dat
shadow_ecf_file=/export/babel/data/scoring/IndusDB/IARPA-babel201b-v0.2b_conv-eval.ecf.xml
shadow_kwlist_file=/export/babel/data/scoring/IndusDB/IARPA-babel201b-v0.2b_conv-eval.kwlist4.xml
......
......@@ -29,10 +29,11 @@ dev10h_ecf_file=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-de
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev/IARPA-babel204b-v1.1b_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/splits/Tamil_Babel204/IARPA-babel204b-v1.1b_conv-dev.radical.kwlist.xml
dev10h_more_kwlists=(
[bbn1]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist.xml
[bbn2]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist2.xml
[ibm1]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist3.xml
[ibm2]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist4.xml
#[bbn1]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist.xml
#[bbn2]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist2.xml
#[ibm1]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist3.xml
#[ibm2]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist4.xml
[eval]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist5.xml
)
dev10h_nj=32
......@@ -47,6 +48,33 @@ unsup_data_list=(
)
unsup_nj=64
# Evaluation set: audio directory, utterance database under uem/, and file list.
# eval_nj is presumably the number of parallel jobs for this set -- confirm
# against the scripts that consume this config.
eval_data_dir=/export/babel/data/204-tamil/release-current/conversational/eval/
eval_data_cmudb=/export/babel/data/splits/Tamil_Babel204/uem/db-shadow-jhuseg-v8-utt.dat
eval_data_list=/export/babel/data/splits/Tamil_Babel204/eval.list
eval_nj=64
# "shadow" set: lists both the dev and the eval audio directories.
shadow_data_dir=(
/export/babel/data/204-tamil/release-current/conversational/dev/
/export/babel/data/204-tamil/release-current/conversational/eval/
)
shadow_data_cmudb=/export/babel/data/splits/Tamil_Babel204/uem/204-shadow-v0-utt.dat
# File lists for the shadow set, given in the same dev/eval order as shadow_data_dir.
shadow_data_list=(
/export/babel/data/splits/Tamil_Babel204/dev.list
/export/babel/data/splits/Tamil_Babel204/eval.list
)
# Scoring resources for the shadow set; every one of them points at the
# conv-dev release files.
shadow_stm_file=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev/IARPA-babel204b-v1.1b_conv-dev.stm
shadow_ecf_file=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev/IARPA-babel204b-v1.1b_conv-dev.scoring.ecf.xml
shadow_rttm_file=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev/IARPA-babel204b-v1.1b_conv-dev.mitllfa3.rttm
shadow_kwlist_file=/export/babel/data/splits/Tamil_Babel204/IARPA-babel204b-v1.1b_conv-dev.radical.kwlist.xml
# Additional keyword lists keyed by an id; only [eval] is active, the other
# entries are commented out.
# NOTE(review): string subscripts need the array to be declared associative
# (declare -A) before this file is sourced -- confirm in the sourcing script.
shadow_more_kwlists=(
#[bbn1]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist.xml
#[bbn2]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist2.xml
#[ibm1]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist3.xml
#[ibm2]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist4.xml
[eval]=/export/babel/data/scoring/IndusDB/IARPA-babel204b-v1.1b_conv-dev.kwlist5.xml
)
shadow_nj=64
# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
......
......@@ -89,7 +89,15 @@ if [ ! -z "$oov_prob_file" ]; then
lmfile=$destdir/lm_tmp.gz
fi
gunzip -c $lmfile | \
if [[ $lmfile == *.bz2 ]] ; then
decompress="bunzip2 -c $lmfile"
elif [[ $lmfile == *.gz ]] ; then
decompress="gunzip -c $lmfile"
else
decompress="cat $lmfile"
fi
$decompress | \
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
arpa2fst - | \
fstprint | \
......@@ -97,7 +105,7 @@ gunzip -c $lmfile | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $destdir/G.fst || exit 1
fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true;
if $cleanup; then
......
......@@ -22,13 +22,23 @@ function register_extraid {
}
function setup_oov_search {
local nbest=500
#Basic lexicon
#local phone_beam=-1
#local phone_nbest=-1
#local beam=5
#local nbest=500
#Extended lexicon
local nbest=-1
local beam=-1
local phone_nbest=300
local phone_beam=5
local phone_cutoff=5
local g2p_nbest=10
local g2p_mass=0.95
local beam=5
local phone_beam=4
local phone_nbest=-1
local phone_cutoff=5
local data_dir=$1
local source_dir=$2
......@@ -37,10 +47,15 @@ function setup_oov_search {
local kwsdatadir=$data_dir/${extraid}_kws
mkdir -p $kwsdatadir
cp $source_dir/kwlist*.xml $kwsdatadir
cp $source_dir/ecf.xml $kwsdatadir
cp $source_dir/utter_* $kwsdatadir
[ -f $source_dir/rttm ] && cp $source_dir/rttm $kwsdatadir
if [ "${dataset_kind}" == "supervised" ] ; then
for file in $source_dir/rttm ; do
cp -f $file $kwsdatadir
done
fi
for file in $source_dir/utter_* $source_dir/kwlist*.xml $source_dir/ecf.xml ; do
cp -f $file $kwsdatadir
done
kwlist=$source_dir/kwlist_outvocab.xml
#Get the KW list
......@@ -84,55 +99,50 @@ function setup_oov_search {
}
if [ "$dataset_kind" == "shadow" ]; then
true #we do not support multiple kw lists for shadow set system
else # This will work for both supervised and unsupervised dataset kinds
kws_flags=( --use-icu true )
if [ "${dataset_kind}" == "supervised" ] ; then
#The presence of the file had been already verified, so just
#add the correct switches
kws_flags+=(--rttm-file $my_rttm_file )
fi
if $my_subset_ecf ; then
kws_flags+=(--subset-ecf $my_data_list)
fi
kws_flags=( --use-icu true )
if [ "${dataset_kind}" == "supervised" ] ; then
#The presence of the file had been already verified, so just
#add the correct switches
kws_flags+=(--rttm-file $my_rttm_file )
fi
if $my_subset_ecf ; then
kws_flags+=(--subset-ecf $my_data_list)
fi
if [ ! -f $dataset_dir/.done.kws.oov ] ; then
setup_oov_search $dataset_dir $dataset_dir/kws oov
register_extraid $dataset_dir oov
touch $dataset_dir/.done.kws.oov
fi
if [ ${#my_more_kwlists[@]} -ne 0 ] ; then
touch $dataset_dir/extra_kws_tasks
if [ ! -f $dataset_dir/.done.kws.oov ] ; then
setup_oov_search $dataset_dir $dataset_dir/kws oov || exit 1
register_extraid $dataset_dir oov
touch $dataset_dir/.done.kws.oov
fi
if [ ${#my_more_kwlists[@]} -ne 0 ] ; then
touch $dataset_dir/extra_kws_tasks
for extraid in "${!my_more_kwlists[@]}" ; do
#The next line will help us in running only one. We don't really
#know in which directory the KWS setup will reside in, so we will
#place the .done file directly into the data directory
[ -f $dataset_dir/.done.kws.$extraid ] && continue;
kwlist=${my_more_kwlists[$extraid]}
local/kws_setup.sh --extraid $extraid --case_insensitive $case_insensitive \
"${kws_flags[@]}" "${icu_opt[@]}" \
$my_ecf_file $kwlist data/lang ${dataset_dir} || exit 1
for extraid in "${!my_more_kwlists[@]}" ; do
#The next line will help us in running only one. We don't really
#know in which directory the KWS setup will reside in, so we will
#place the .done file directly into the data directory
[ -f $dataset_dir/.done.kws.$extraid ] && continue;
kwlist=${my_more_kwlists[$extraid]}
local/kws_setup.sh --extraid $extraid --case_insensitive $case_insensitive \
"${kws_flags[@]}" "${icu_opt[@]}" \
$my_ecf_file $kwlist data/lang ${dataset_dir} || exit 1
#Register the dataset for default running...
#We can do it without any problem here -- the kws_stt_tasks will not
#run it, unless called with --run-extra-tasks true switch
register_extraid $dataset_dir $extraid
touch $dataset_dir/.done.kws.$extraid
done
for extraid in "${!my_more_kwlists[@]}" ; do
#The next line will help us in running only one. We don't really
#know in which directory the KWS setup will reside in, so we will
#place the .done file directly into the data directory
[ -f $dataset_dir/.done.kws.${extraid}_oov ] && continue;
setup_oov_search $dataset_dir $dataset_dir/${extraid}_kws ${extraid}_oov
register_extraid $dataset_dir ${extraid}_oov
touch $dataset_dir/.done.kws.${extraid}_oov
done
fi
#Register the dataset for default running...
#We can do it without any problem here -- the kws_stt_tasks will not
#run it, unless called with --run-extra-tasks true switch
register_extraid $dataset_dir $extraid
touch $dataset_dir/.done.kws.$extraid
done
for extraid in "${!my_more_kwlists[@]}" ; do
#The next line will help us in running only one. We don't really
#know in which directory the KWS setup will reside in, so we will
#place the .done file directly into the data directory
[ -f $dataset_dir/.done.kws.${extraid}_oov ] && continue;
setup_oov_search $dataset_dir $dataset_dir/${extraid}_kws ${extraid}_oov
register_extraid $dataset_dir ${extraid}_oov
touch $dataset_dir/.done.kws.${extraid}_oov
done
fi
#! /usr/bin/env python
import argparse, sys
from argparse import ArgumentParser
import re
def main():
    """Convert a Kaldi data directory into UEM ``db-<prefix>-{utt,spk}.dat`` files.

    Reads ``segments`` and ``wav.scp`` (and ``text`` when ``--get-text`` is
    given) from the ``kaldi_dir`` argument and writes two files into
    ``output_dir``:

    * ``db-<prefix>-utt.dat`` -- one line per segment with the utterance id,
      recording id, start/end times and (optionally) the transcript.
    * ``db-<prefix>-spk.dat`` -- one line per recording listing its segments
      and the audio file name taken from ``wav.scp``.

    When ``--prefix`` is omitted, the prefix is the last path component of
    ``kaldi_dir`` up to its first dot.

    Exits with status 1 (after a message on stderr) when an input file cannot
    be read, a ``wav.scp`` line names no ``.sph``/``.wav`` file, the output
    files cannot be written, or a segment refers to a recording that is
    missing from ``wav.scp``.
    """
    parser = ArgumentParser(description='Convert kaldi data directory to uem dat files',
                            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--verbose', type=int,
                        dest='verbose', default=0,
                        help='Give higher verbose for more logging')
    parser.add_argument('--get-text', action='store_true',
                        help='Get text in dat file')
    parser.add_argument('--prefix', type=str,
                        help='Add db file name as db-<prefix>-{utt/spk}.dat')
    parser.add_argument('kaldi_dir',
                        help='Kaldi data directory')
    parser.add_argument('output_dir',
                        help='Directory to store uem dat files')
    # Drop the leading "usage:" label of the generated usage text and append
    # an example invocation.
    parser.usage = ':'.join(parser.format_usage().split(':')[1:]) \
        + 'e.g. : %(prog)s --prefix 203-lao-v0 data/dev10h.seg CMU_db'
    options = parser.parse_args()

    def read_lines(name):
        """Return all lines of <kaldi_dir>/<name>; exit(1) if unreadable."""
        path = options.kaldi_dir + '/' + name
        try:
            # The context manager closes the handle on every path.
            with open(path, 'r') as handle:
                return handle.readlines()
        except IOError:
            sys.stderr.write("%s: No such file %s\n" % (sys.argv[0], path))
            sys.exit(1)

    # Transcripts are looked up by utterance id, so a 'text' file that is
    # shorter than 'segments', or not perfectly aligned with it, can neither
    # crash the conversion nor attach a transcript to the wrong utterance.
    text_map = {}
    if options.get_text:
        for line in read_lines('text'):
            fields = line.split()
            if fields:
                text_map[fields[0]] = ' '.join(fields[1:])

    segment_lines = read_lines('segments')
    scp_lines = read_lines('wav.scp')

    # Map recording id -> bare audio file name (.sph or .wav).
    reco2file_map = {}
    for line in scp_lines:
        splits = line.strip().split()
        m = re.search(r".*/(?P<file_name>[0-9A-Za-z_]*\.(sph|wav)).*", line)
        if not m:
            sys.stderr.write("%s does not contain a valid speech file (.wav or .sph)\n" % line.strip())
            sys.exit(1)
        reco2file_map[splits[0]] = m.group('file_name')

    if options.prefix is None:
        # rstrip: a trailing '/' on kaldi_dir would otherwise give an empty prefix.
        prefix = options.kaldi_dir.rstrip('/').split('/')[-1].split('.')[0]
    else:
        prefix = options.prefix

    spk2utt_map = {}
    try:
        with open(options.output_dir + '/db-' + prefix + '-utt.dat', 'w') as utt_dat, \
                open(options.output_dir + '/db-' + prefix + '-spk.dat', 'w') as spk_dat:
            for line in segment_lines:
                if not line.strip():
                    continue  # tolerate blank lines in 'segments'
                utt_id, file_id, start, end = line.strip().split()
                text = text_map.get(utt_id, "")
                utt_dat.write("{UTTID %s} {UTT %s} {SPK %s} {FROM %s} {TO %s} {TEXT %s}\n"
                              % (utt_id, utt_id, file_id, start, end, text))
                spk2utt_map.setdefault(file_id, []).append(utt_id)

            for spk, utts in spk2utt_map.items():
                if spk not in reco2file_map:
                    sys.stderr.write("%s: Error in getting file for %s\n" % (sys.argv[0], spk))
                    sys.exit(1)
                spk_dat.write("{SEGS %s} {ADC %s} {CONV %s.wav} {CHANNEL 1} {DUR }\n"
                              % (' '.join(utts), reco2file_map[spk], spk))
    except IOError:
        sys.stderr.write("%s: Could not write dat files in %s\n" % (sys.argv[0], options.output_dir))
        sys.exit(1)
if __name__ == '__main__':
main()
......@@ -85,6 +85,7 @@ fi
mkdir -p $kwsdatadir
if [ -z $subset_ecf ] ; then
test -f $kwsdatadir/ecf.xml && rm -f $kwsdatadir/ecf.xml
cp "$ecf_file" $kwsdatadir/ecf.xml || exit 1
else
local/make_ecf_subset.sh $subset_ecf $ecf_file > $kwsdatadir/ecf.xml
......@@ -107,10 +108,12 @@ if $kwlist_wordlist ; then
echo '</kwlist>'
) > $kwsdatadir/kwlist.xml || exit 1
else
test -f $kwsdatadir/kwlist.xml && rm -f $kwsdatadir/kwlist.xml
cp "$kwlist_file" $kwsdatadir/kwlist.xml || exit 1
fi
if [ ! -z $rttm_file ] ; then
test -f $kwsdatadir/rttm && rm -f $kwsdatadir/rttm
cp "$rttm_file" $kwsdatadir/rttm || exit 1
fi
......
......@@ -96,6 +96,7 @@ use Getopt::Long;
#
########################################################################
print STDERR "$0 " . join(" ", @ARGV) . "\n";
GetOptions("fragmentMarkers=s" => \$fragMarkers,
"oov=s" => \$OOV_symbol,
"vocab=s" => \$vocabFile,
......@@ -165,7 +166,7 @@ if (-d $TranscriptionDir) {
open (TRANSCRIPT, $inputspec) || die "Unable to open $filename";
while ($line=<TRANSCRIPT>) {
chomp $line;
if ($line =~ m:^\[([0-9]+\.*[0-9]*)\]$:) {
if ($line =~ m:^\s*\[([0-9]+\.*[0-9]*)\]\s*$:) {
$thisTimeMark = $1;
if ($thisTimeMark < $prevTimeMark) {
print STDERR ("$0 ERROR: Found segment with negative duration in $filename\n");
......@@ -245,6 +246,7 @@ if (-d $TranscriptionDir) {
} else {
# This is a just regular spoken word
if ($vocabFile && (! $inVocab{$w}) && $fragMarkers) {
print "Not in vocab: $w\n";
# $w is a potential OOV token
# Remove fragMarkers to see if $w becomes in-vocabulary
while ($w =~ m:^(\S+[$fragMarkers]|[$fragMarkers]\S+)$:) {
......
#!/bin/bash
# Draw the lattice of one utterance as a graph and open it in a viewer.
#
# Usage: <this-script> [--format pdf|svg] [--output <path>] <utt-id> <lattice-ark> <word-list>
#   <utt-id>       id of the utterance to draw
#   <lattice-ark>  gzipped lattice archive(s); may be a quoted glob
#   <word-list>    symbol table used to label the output side of the arcs
# The drawing is produced in a temporary directory that is removed on exit;
# pass --output to keep a copy.

. path.sh

format=pdf # pdf svg
output=

. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "usage: $0 [--format pdf|svg] [--output <path-to-output>] <utt-id> <lattice-ark> <word-list>"
  echo "e.g.: $0 utt-0001 \"test/lat.*.gz\" tri1/graph/words.txt"
  exit 1;
fi

uttid=$1
lat=$2
words=$3

# Stop right away if no scratch directory could be created; otherwise the
# commands below would write to (and clean up) paths rooted at "/".
tmpdir=$(mktemp -d) || { echo "ERROR : Could not create a temporary directory" >&2; exit 1; }
trap 'rm -rf -- "$tmpdir"' EXIT # cleanup

# $lat is intentionally left unquoted: it may be a glob such as "test/lat.*.gz".
gunzip -c $lat | lattice-to-fst ark:- "ark,scp:$tmpdir/fst.ark,$tmpdir/fst.scp" || exit 1

if ! grep "^$uttid " "$tmpdir/fst.scp"; then
  echo "ERROR : Missing utterance '$uttid' from gzipped lattice ark '$lat'"
  exit 1
fi

# Extract only the requested utterance into its own FST file.
fstcopy "scp:grep '^$uttid ' $tmpdir/fst.scp |" "scp:echo $uttid $tmpdir/$uttid.fst |" || exit 1
fstdraw --portrait=true --osymbols="$words" "$tmpdir/$uttid.fst" | dot -T"${format}" > "$tmpdir/$uttid.${format}"

if [ -n "$output" ]; then
  cp "$tmpdir/$uttid.${format}" "$output"
fi

# Open the viewer matching the chosen format.
[ "$format" == "pdf" ] && evince "$tmpdir/$uttid.pdf"
[ "$format" == "svg" ] && eog "$tmpdir/$uttid.svg"

exit 0
#!/bin/bash
lp=
lr=
ar=
split=BaEval
team=RADICAL
corpusid=
partition=
scase=BaEval #BaDev|BaEval
sysid=
master=
version=1
relname=
exp=c
sysid=
prim=c
cer=0
dryrun=true
dir="exp/sgmm5_mmi_b0.1/"
extrasys=""
data=data/dev10h.seg
master=dev10h
final=false
dev2shadow=dev10h.uem
eval2shadow=eval.uem
team=RADICAL
#end of configuration
echo $0 " " "$@"
[ -f ./cmd.sh ] && . ./cmd.sh
[ -f ./path.sh ] && . ./path.sh
. ./utils/parse_options.sh
if [ $# -ne 2 ] ; then
echo "Invalid number of parameters!"
echo "Parameters " "$@"
echo "$0 --ar <NTAR|TAR> --lr <BaseLR|BabelLR|OtherLR> --lp <FullLP|LimitedLP> --relname <NAME> [--version <version-nr> ] <config> <output>"
exit 1
fi
[ -z $lp ] && echo "Error -- you must specify --lp <FullLP|LimitedLP>" && exit 1
if [ "$lp" != "FullLP" ] && [ "$lp" != "LimitedLP" ] ; then
echo "Error -- you must specify --lp <FullLP|LimitedLP>" && exit 1
fi
[ -z $lr ] && echo "Error -- you must specify --lr <BaseLR|BabelLR|OtherLR>" && exit 1
if [ "$lr" != "BaseLR" ] && [ "$lr" != "BabelLR" ] && [ "$lr" != "OtherLR" ] ; then
echo "Error -- you must specify --lr <BaseLR|BabelLR|OtherLR>" && exit 1
fi
[ -z $ar ] && echo "Error -- you must specify --ar <NTAR|TAR>" && exit 1
if [ "$ar" != "NTAR" ] && [ "$ar" != "TAR" ] ; then
echo "Error -- you must specify --ar <NTAR|TAR>" && exit 1
fi
[ -z $relname ] && echo "Error -- you must specify name" && exit 1
[ ! -f $1 ] && echo "Configuration $1 does not exist! " && exit 1
. $1
outputdir=$2
set -e
set -o pipefail
function export_file {
set -x
#set -x
source_file=$1
target_file=$2
if [ ! -f $source_file ] ; then
......@@ -61,12 +43,17 @@ function export_file {
if [ ! -f $target_file ] ; then
if ! $dryrun ; then
ln -s `readlink -f $source_file` $target_file || exit 1
ls -al $target_file
else
echo "$source_file -> $target_file"
fi
else
echo "The file is already there, not doing anything. Either change the version (using --version), or delete that file manually)"
exit 1
fi
fi
#set +x
return 0
}
......@@ -97,38 +84,227 @@ function export_kws_file {
return 0
}
if [[ "$eval_kwlist_file" == *.kwlist.xml ]] ; then
corpus=`basename $eval_kwlist_file .kwlist.xml`
elif [[ "$eval_kwlist_file" == *.kwlist2.xml ]] ; then
corpus=`basename $eval_kwlist_file .kwlist2.xml`
# Pick a KWS result directory from the sum.txt files found below a directory.
# Arguments: $1 - directory to search
#            $2 - pattern for find's -path test, restricting which sum.txt count
# Outputs:   stdout - directory holding the selected sum.txt (empty if none)
#            stderr - the selected record, for logging
# Selection: from each line starting with "| Occ", '|'-separated fields 13 and
# 17 are kept; the records are sorted numerically in reverse order on the
# third column and the LAST one is taken.
# NOTE(review): reverse sort + tail keeps the smallest third-column value --
# confirm this is the intended notion of "best".
function find_best_kws_result {
  local dir=$1
  local mask=$2
  local record file path=""

  # grep -H always prints the "<file>:" prefix. Without it, a search that
  # matches a single sum.txt yields records with no file name, and the
  # extraction below returns garbage.
  # "|| true": an empty search must not abort a caller that runs under
  # "set -e -o pipefail"; an empty path is printed instead.
  record=$( (find "$dir" -name "sum.txt" -path "$mask" | xargs grep -H "^| *Occ") \
    | cut -f 1,13,17 -d '|' | sed 's/|//g' | column -t \
    | sort -r -n -k 3 | tail -n 1) || true
  echo "$record" >&2

  # The record starts with "<file>:", so the file is everything before the first ':'.
  file=$(echo "$record" | awk -F ":" '{print $1}')
  if [ -n "$file" ]; then
    path=$(dirname -- "$file")
  fi
  echo "$path"
}
# Pick the STT result directory with the lowest score among *.ctm.sys files.
# Arguments: $1 - directory to search
#            $2 - pattern for find's -path test, restricting which files count
# Outputs:   stdout - directory holding the selected *.ctm.sys (empty if none)
#            stderr - the selected record, for logging
# Selection: lines containing "Avg" are stripped of '|' characters, aligned
# into whitespace-separated columns and sorted numerically (ascending) on
# column 9; the first record is taken.
function find_best_stt_result {
  local dir=$1
  local mask=$2
  local record file path=""

  # grep -H always prints the "<file>:" prefix. Without it, a search that
  # matches a single *.ctm.sys yields records with no file name, and the
  # extraction below returns garbage.
  # "|| true": an empty search must not abort a caller that runs under
  # "set -e -o pipefail"; an empty path is printed instead.
  record=$( (find "$dir" -name "*.ctm.sys" -path "$mask" | xargs grep -H Avg) \
    | sed 's/|//g' | column -t | sort -n -k 9 | head -n 1) || true
  echo "$record" >&2

  # The record starts with "<file>:", so the file is everything before the first ':'.
  file=$(echo "$record" | awk -F ":" '{print $1}')
  if [ -n "$file" ]; then
    path=$(dirname -- "$file")
  fi
  echo "$path"
}
# Derive a system id from the path of a result directory.
# Arguments: $1 - path of the selected result directory
#            $2 - optional extra tag appended to the system id
# Globals:   sysid (written; deliberately not local, as in the original)
# Outputs:   stdout - "<SYS>[-<extra>][-<kwsid>]", where <SYS> is PLP, DNN or
#            BNF depending on whether the parent path contains sgmm5, nnet or
#            sgmm7 (checked in that order), and <kwsid> is the last path
#            component with everything from "kws_" onwards and all
#            underscores removed.
# Exits with status 1 when the parent path matches none of the known systems.
function create_sysid {
  local best_one=$1
  local extrasys=$2
  local taskid system_path kwsid

  # Quoted so that a path containing whitespace is not split into several
  # operands for basename/dirname.
  taskid=$(basename -- "$best_one")
  system_path=$(dirname -- "$best_one")

  if [[ $system_path =~ .*sgmm5.* ]] ; then
    sysid=PLP
  elif [[ $system_path =~ .*nnet.* ]] ; then
    sysid=DNN
  elif [[ $system_path =~ .*sgmm7.* ]] ; then
    sysid=BNF
  else
    echo "Unknown system path ($system_path), cannot deduce the systemID" >&2
    exit 1
  fi

  if [ -n "$extrasys" ]; then
    sysid="${sysid}-${extrasys}"
  fi

  # "oov_kws_10" -> "oov_" -> "oov"; a plain "kws_10" leaves nothing.
  kwsid=${taskid//kws_*/}
  kwsid=${kwsid//_/}

  if [ -z "$kwsid" ]; then
    echo "${sysid}"
  else
    echo "${sysid}-$kwsid"
  fi
}
# Print the ECF file name recorded in the keyword list of a KWS task.
# Arguments: $1 - path of the selected result directory; its last component
#                 (e.g. "oov_kws_10") selects the KWS setup directory under
#                 $data ("oov_kws"); an empty component selects "$data/kws"
# Globals:   data (read)    - data directory holding the KWS setup directories
#            ecf  (written) - deliberately not local, as in the original
# Outputs:   stdout - value of the ecf_filename="..." attribute on the first
#                     line of kwlist.xml
#            stderr - a "Found ECF" log line
function get_ecf_name {
  local best_one=$1
  local taskid kwstask kwlist=

  taskid=$(basename -- "$best_one")
  # Collapse everything from "kws_" onwards into "kws": "oov_kws_10" -> "oov_kws".
  kwstask=${taskid//kws_*/kws}

  if [ -z "$kwstask" ] ; then
    # No blank after '=': with one, the shell runs the resolved path as a
    # command and kwlist stays empty.
    kwlist=$(readlink -f "$data/kws/kwlist.xml")
  else
    kwlist=$(readlink -f "$data/$kwstask/kwlist.xml")
  fi

  ecf=$(head -n 1 "$kwlist" | grep -Po "(?<=ecf_filename=\")[^\"]*")
  echo -e "\tFound ECF: $ecf" >&2
  echo "$ecf"
  return 0
}
function compose_expid {
local task=$1
local best_one=$2
local extraid=$3
[ ! -z $extraid ] && extraid="-$extraid"
local sysid=`create_sysid $best_one $extrasys`
if [ "$task" == "KWS" ]; then
ext="kwslist.xml"
elif [ "$task" == "STT" ]; then