Commit 57a19b6a authored by Jan Trmal

(sandbox/tacc_kaldi) Merging the trunk in again to simplify reintegration

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/tacc_kaldi@4251 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parents d4ad1dfb 57596e1d
......@@ -19,6 +19,8 @@ tools/irstlm/
tools/openfst
tools/openfst-1.3.2.tar.gz
tools/openfst-1.3.2/
tools/openfst-1.3.4.tar.gz
tools/openfst-1.3.4/
tools/pa_stable_v19_20111121.tgz
tools/portaudio/
tools/sctk-2.4.0-20091110-0958.tar.bz2
......
This directory contains example scripts that demonstrate how to
use Kaldi. Each subdirectory corresponds to a corpus that we have
example scripts for. Currently these are all corpora available from
the Linguistic Data Consortium (LDC).
example scripts for.
Explanations of the corpora are below.
Note: the easiest examples to work with are rm/s3 and wsj/s3.
wsj: The Wall Street Journal corpus. This is a corpus of read
sentences from the Wall Street Journal, recorded under clean conditions.
The vocabulary is quite large.
Available from the LDC as either: [ catalog numbers LDC93S6A (WSJ0) and LDC94S13A (WSJ1) ]
or: [ catalog numbers LDC93S6B (WSJ0) and LDC94S13B (WSJ1) ]
The latter option is cheaper and includes only the Sennheiser
microphone data (which is all we use in the example scripts).
rm: Resource Management. Clean speech in a medium-vocabulary task consisting
of commands to a (presumably imaginary) computer system.
Available from the LDC as catalog number LDC93S3A (it may be possible to
get the same data using combinations of other catalog numbers, but this
is the one we used).
tidigits: The TI Digits database, available from the LDC (catalog number LDC93S10).
This is one of the oldest speech databases; it consists of a bunch of speakers
saying digit strings. It's not considered a "real" task any more, but can be useful
for demos, tutorials, and the like.
yesno: This is a simple recipe with data consisting of a single person saying the
  words "yes" and "no", which can be downloaded from the Kaldi website.
  It's a very easy task, but useful for checking that the scripts run, or if you
  don't yet have any of the LDC data (see the quick-start sketch below, after the list of recipes).
Recipes in progress (these may be less polished than the ones above).
swbd: Switchboard (from LDC). A fairly large amount of telephone speech (2-channel, 8kHz
sampling rate).
This directory is a work in progress.
gp: GlobalPhone (from ELDA). This is a multilingual speech corpus.
timit: TIMIT (from LDC), which is an old corpus of carefully read speech.
LDC catalog number LDC93S1.
voxforge: A recipe for the free speech data available from voxforge.org
hkust: A recipe for HKUST Mandarin Telephone Speech (available from LDC)
Note: we now have some scripts using free data, including voxforge,
vystadial_{cz,en} and yesno. Most of the others are available from
the Linguistic Data Consortium (LDC), which requires money (unless you
have a membership).
If you have an LDC membership, probably rm/s5 or wsj/s5 should be your first
choice to try out the scripts.
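If you do not have any LDC data yet, the yesno recipe mentioned above is a quick way
to confirm that your Kaldi build and these scripts run end to end; a minimal sketch,
assuming the standard egs/ layout and a finished build of tools/ and src/:

  cd egs/yesno/s5
  ./run.sh    # downloads the small yes/no corpus, then trains and decodes a monophone system

The WER it prints at the end should be at or very close to 0% on this toy task.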
beam=18.0 # beam for decoding. Was 13.0 in the scripts.
latbeam=10.0 # this has most effect on size of the lattices.
lattice_beam=10.0 # this has most effect on size of the lattices.
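Since the option above was renamed from latbeam to lattice_beam, one illustrative way to
spot any other scripts that still pass the old name (run from the repository root):

  grep -rn 'latbeam' egs/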
......@@ -89,7 +89,15 @@ if [ ! -z "$oov_prob_file" ]; then
lmfile=$destdir/lm_tmp.gz
fi
gunzip -c $lmfile | \
if [[ $lmfile == *.bz2 ]] ; then
decompress="bunzip2 -c $lmfile"
elif [[ $lmfile == *.gz ]] ; then
decompress="gunzip -c $lmfile"
else
decompress="cat $lmfile"
fi
$decompress | \
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
arpa2fst - | \
fstprint | \
......@@ -97,7 +105,7 @@ gunzip -c $lmfile | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $destdir/G.fst || exit 1
fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true;
if $cleanup; then
......
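The added fstarcsort --sort_type=olabel writes G.fst with its arcs sorted on output labels.
A minimal hedged check of the result, assuming OpenFst's fstinfo is on the PATH and using
$destdir as in the script above:

  fstinfo $destdir/G.fst | grep 'label sorted'   # the 'output label sorted' line should read 'y'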
......@@ -22,21 +22,23 @@ function register_extraid {
}
function setup_oov_search {
local nbest=500
local beam=5
local phone_nbest=-1
local phone_beam=-1
#local nbest=-1
#local beam=-1
#local phone_nbest=300
#local phone_beam=5
#Basic lexicon
#local phone_beam=-1
#local phone_nbest=-1
#local beam=5
#local nbest=500
#Extended lexicon
local nbest=-1
local beam=-1
local phone_nbest=300
local phone_beam=5
local phone_cutoff=5
local g2p_nbest=10
local g2p_mass=0.95
#local phone_nbest=150
local data_dir=$1
local source_dir=$2
......@@ -45,7 +47,7 @@ function setup_oov_search {
local kwsdatadir=$data_dir/${extraid}_kws
mkdir -p $kwsdatadir
if [ "${dataset_kind}" == "supervised" ] ; then
for file in $source_dir/rttm ; do
cp -f $file $kwsdatadir
......@@ -86,7 +88,7 @@ function setup_oov_search {
L1_lex=data/local/lexiconp.txt
local/kws_data_prep_proxy.sh \
--cmd "$decode_cmd " --nj $my_nj \
--cmd "$decode_cmd" --nj $my_nj \
--case-insensitive true \
--confusion-matrix $confusion \
--phone-cutoff $phone_cutoff \
......
#! /usr/bin/env python
import argparse, sys
from argparse import ArgumentParser
import re
def main():
    parser = ArgumentParser(description='Convert kaldi data directory to uem dat files',
                            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--verbose', type=int, \
                        dest='verbose', default=0, \
                        help='Give higher verbose for more logging')
    parser.add_argument('--get-text', action='store_true', \
                        help='Get text in dat file')
    parser.add_argument('--prefix', type=str, \
                        help='Add db file name as db-<prefix>-{utt/spk}.dat')
    parser.add_argument('kaldi_dir', \
                        help='Kaldi data directory')
    parser.add_argument('output_dir', \
                        help='Directory to store uem dat files')
    parser.usage = ':'.join(parser.format_usage().split(':')[1:]) \
        + 'e.g. : %(prog)s --prefix 203-lao-v0 data/dev10h.seg CMU_db'
    options = parser.parse_args()

    if options.get_text:
        try:
            text_file = open(options.kaldi_dir+'/text', 'r')
        except IOError as e:
            repr(e)
            sys.stderr.write("%s: No such file %s\n" % (sys.argv[0], options.kaldi_dir+'/text'))
            sys.exit(1)

    try:
        segments_file = open(options.kaldi_dir+'/segments', 'r')
    except IOError as e:
        repr(e)
        sys.stderr.write("%s: No such file %s\n" % (sys.argv[0], options.kaldi_dir+'/segments'))
        sys.exit(1)

    try:
        scp_file = open(options.kaldi_dir+'/wav.scp', 'r')
    except IOError as e:
        repr(e)
        sys.stderr.write("%s: No such file %s\n" % (sys.argv[0], options.kaldi_dir+'/wav.scp'))
        sys.exit(1)

    reco2file_map = {}
    for line in scp_file.readlines():
        splits = line.strip().split()
        m = re.search(r".*/(?P<file_name>[0-9A-Za-z_]*\.(sph|wav)).*", line)
        if not m:
            sys.stderr.write("%s does not contain a valid speech file (.wav or .sph)\n" % line.strip())
            sys.exit(1)
        reco2file_map[splits[0]] = m.group('file_name')
    # End for

    spk2utt_map = {}

    if options.prefix == None:
        prefix = options.kaldi_dir.split('/')[-1].split('.')[0]
    else:
        prefix = options.prefix

    try:
        utt_dat = open(options.output_dir+'/db-'+prefix+'-utt.dat', 'w')
        spk_dat = open(options.output_dir+'/db-'+prefix+'-spk.dat', 'w')
    except IOError as e:
        repr(e)
        sys.stderr.write("%s: Could not write dat files in %s\n" % (sys.argv[0], options.output_dir))
        sys.exit(1)

    for line in segments_file.readlines():
        utt_id, file_id, start, end = line.strip().split()
        if (options.get_text):
            splits = text_file.readline().split()
            while splits[0] < utt_id:
                splits = text_file.readline().split()
            text = ' '.join(splits[1:])
        else:
            text = ""
        utt_dat.write("{UTTID %s} {UTT %s} {SPK %s} {FROM %s} {TO %s} {TEXT %s}\n" % (utt_id, utt_id, file_id, start, end, text))
        spk2utt_map.setdefault(file_id, [])
        spk2utt_map[file_id].append(utt_id)

    for spk, utts in spk2utt_map.items():
        try:
            spk_dat.write("{SEGS %s} {ADC %s} {CONV %s.wav} {CHANNEL 1} {DUR }\n" % (' '.join(utts), reco2file_map[spk], spk))
        except KeyError as e:
            repr(e)
            sys.stderr.write("%s: Error in getting file for %s\n" % (sys.argv[0], spk))
            sys.exit(1)
    # End for

    segments_file.close()
    utt_dat.close()
    spk_dat.close()

if __name__ == '__main__':
    main()
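For reference, the invocation pattern baked into the parser's usage string above; the script
file name here is only illustrative (the diff does not show its path), and the output
directory must already exist:

  mkdir -p CMU_db
  python kaldi_dir_to_uem.py --prefix 203-lao-v0 data/dev10h.seg CMU_db
  # writes CMU_db/db-203-lao-v0-utt.dat and CMU_db/db-203-lao-v0-spk.dat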
......@@ -85,6 +85,7 @@ fi
mkdir -p $kwsdatadir
if [ -z $subset_ecf ] ; then
test -f $kwsdatadir/ecf.xml && rm -f $kwsdatadir/ecf.xml
cp "$ecf_file" $kwsdatadir/ecf.xml || exit 1
else
local/make_ecf_subset.sh $subset_ecf $ecf_file > $kwsdatadir/ecf.xml
......@@ -107,10 +108,12 @@ if $kwlist_wordlist ; then
echo '</kwlist>'
) > $kwsdatadir/kwlist.xml || exit 1
else
test -f $kwsdatadir/kwlist.xml && rm -f $kwsdatadir/kwlist.xml
cp "$kwlist_file" $kwsdatadir/kwlist.xml || exit 1
fi
if [ ! -z $rttm_file ] ; then
test -f $kwsdatadir/rttm && rm -f $kwsdatadir/rttm
cp "$rttm_file" $kwsdatadir/rttm || exit 1
fi
......
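After the copies above, a quick well-formedness check of the XML inputs can catch a bad ecf
or kwlist early; a minimal sketch, assuming libxml2's xmllint is installed:

  xmllint --noout $kwsdatadir/ecf.xml $kwsdatadir/kwlist.xml   # silent on success, prints parse errors otherwise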
......@@ -96,6 +96,7 @@ use Getopt::Long;
#
########################################################################
print STDERR "$0 " . join(" ", @ARGV) . "\n";
GetOptions("fragmentMarkers=s" => \$fragMarkers,
"oov=s" => \$OOV_symbol,
"vocab=s" => \$vocabFile,
......@@ -165,7 +166,7 @@ if (-d $TranscriptionDir) {
open (TRANSCRIPT, $inputspec) || die "Unable to open $filename";
while ($line=<TRANSCRIPT>) {
chomp $line;
if ($line =~ m:^\[([0-9]+\.*[0-9]*)\]$:) {
if ($line =~ m:^\s*\[([0-9]+\.*[0-9]*)\]\s*$:) {
$thisTimeMark = $1;
if ($thisTimeMark < $prevTimeMark) {
print STDERR ("$0 ERROR: Found segment with negative duration in $filename\n");
......@@ -245,6 +246,7 @@ if (-d $TranscriptionDir) {
} else {
# This is just a regular spoken word
if ($vocabFile && (! $inVocab{$w}) && $fragMarkers) {
print "Not in vocab: $w\n";
# $w is a potential OOV token
# Remove fragMarkers to see if $w becomes in-vocabulary
while ($w =~ m:^(\S+[$fragMarkers]|[$fragMarkers]\S+)$:) {
......
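The relaxed pattern now tolerates leading and trailing whitespace around the [seconds] time
marks. A standalone check of the new expression (not part of the recipe):

  perl -e 'for ("[12.5]", "  [12.5]  ", "x [12.5]") { print "<$_> ", (m:^\s*\[([0-9]+\.*[0-9]*)\]\s*$:) ? "time=$1" : "no match", "\n"; }'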
......@@ -1502,5 +1502,7 @@ def main():
global_analysis_final.write_length_stats()
if __name__ == '__main__':
main()
with Timer() as t:
main()
sys.stderr.write("\nSegmentation done!\nTook %f sec\n" % t.interval)
......@@ -73,10 +73,9 @@ mkdir -p $dir/scoring/log
if [ $stage -le 0 ] ; then
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
set -e';' set -o pipefail';' \
cp $data/stm $dir/score_LMWT/stm.unsorted '&&' \
cp $dir/score_LMWT/${name}.ctm $dir/score_LMWT/${name}.ctm.unsorted '&&'\
cp -f $data/stm $dir/score_LMWT/stm.unsorted '&&' \
cp -f $dir/score_LMWT/${name}.ctm $dir/score_LMWT/${name}.ctm.unsorted '&&'\
$SortingProgram sortSTM \<$dir/score_LMWT/stm.unsorted \>$dir/score_LMWT/stm.sorted '&&' \
utils/fix_ctm.sh $dir/score_LMWT/stm.sorted $dir/score_LMWT/${name}.ctm.unsorted '&&' \
$SortingProgram sortCTM \<$dir/score_LMWT/${name}.ctm.unsorted \>$dir/score_LMWT/${name}.ctm.sorted '&&' \
paste -d ' ' \<\(cut -f 1-5 -d ' ' $dir/score_LMWT/stm.sorted \) \
\<\(cut -f 6- -d ' ' $dir/score_LMWT/stm.sorted \| uconv -f utf8 -t utf8 -x "$icu_transform" \) \
......@@ -84,6 +83,7 @@ if [ $stage -le 0 ] ; then
paste -d ' ' \<\(cut -f 1-4 -d ' ' $dir/score_LMWT/${name}.ctm.sorted \) \
\<\(cut -f 5- -d ' ' $dir/score_LMWT/${name}.ctm.sorted \| uconv -f utf8 -t utf8 -x "$icu_transform" \) \
\> $dir/score_LMWT/${name}.ctm '&&' \
utils/fix_ctm.sh $dir/score_LMWT/stm $dir/score_LMWT/${name}.ctm '&&' \
$ScoringProgram -s -r $dir/score_LMWT/stm stm -h $dir/score_LMWT/${name}.ctm ctm \
-n "$name.ctm" -f 0 -D -F -o sum rsum prf dtl sgml -e utf-8 || exit 1
fi
......
#!/bin/bash
. path.sh
format=pdf # pdf svg
output=
. utils/parse_options.sh
if [ $# != 3 ]; then
echo "usage: $0 [--format pdf|svg] [--output <path-to-output>] <utt-id> <lattice-ark> <word-list>"
echo "e.g.: $0 utt-0001 \"test/lat.*.gz\" tri1/graph/words.txt"
exit 1;
fi
uttid=$1
lat=$2
words=$3
tmpdir=$(mktemp -d); trap "rm -r $tmpdir" EXIT # cleanup
gunzip -c $lat | lattice-to-fst ark:- ark,scp:$tmpdir/fst.ark,$tmpdir/fst.scp || exit 1
! grep "^$uttid " $tmpdir/fst.scp && echo "ERROR : Missing utterance '$uttid' from gzipped lattice ark '$lat'" && exit 1
fstcopy "scp:grep '^$uttid ' $tmpdir/fst.scp |" "scp:echo $uttid $tmpdir/$uttid.fst |" || exit 1
fstdraw --portrait=true --osymbols=$words $tmpdir/$uttid.fst | dot -T${format} > $tmpdir/$uttid.${format}
if [ ! -z $output ]; then
cp $tmpdir/$uttid.${format} $output
fi
[ $format == "pdf" ] && evince $tmpdir/$uttid.pdf
[ $format == "svg" ] && eog $tmpdir/$uttid.svg
exit 0
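A concrete, purely illustrative invocation, assuming the script is saved as
utils/show_lattice.sh and that graphviz (dot) plus evince/eog are installed:

  utils/show_lattice.sh --format pdf --output utt-0001.pdf utt-0001 "exp/tri1/decode_dev/lat.*.gz" exp/tri1/graph/words.txt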
......@@ -4,6 +4,7 @@ team=RADICAL
corpusid=
partition=
scase=BaEval #BaDev|BaEval
sysid=
master=
version=1
sysid=
......@@ -11,9 +12,9 @@ prim=c
cer=0
dryrun=true
dir="exp/sgmm5_mmi_b0.1/"
extrasys=""
data=data/dev10h.seg
master=dev10h
extrasys=
final=false
#end of configuration
......@@ -31,17 +32,6 @@ outputdir=$2
set -e
set -o pipefail
function submit_to_google {
SYSPATH=$1
#curl 'https://docs.google.com/forms/d/1MV4gf-iVOX79ZEAekEiLIo7L_UVrJnoPjdtICK5F-nc/formResponse' \
# --data 'entry.1721972547='$MTWV'&entry.485509816='$ATWV'&entry.694031153='$RESPATH'&entry.1851048707='$(whoami)'&submit=Submit' \
# --compressed
curl -sS 'https://docs.google.com/forms/d/1MV4gf-iVOX79ZEAekEiLIo7L_UVrJnoPjdtICK5F-nc/formResponse' \
--data 'entry.1721972547='$MTWV'&entry.485509816='$ATWV'&entry.694031153='$SYSPATH'&entry.1851048707='$(whoami)'&entry.880350279='$STWV'&entry.60995624='$OTWV'&entry.1338769660='$LatticeRecall'&entry.1333349334='$THRESHOLD'&entry.1423358838='$(pwd)'&submit=Submit' --compressed |\
grep --color "Your response has been recorded." || return 1
return 0
}
function export_file {
#set -x
source_file=$1
......@@ -97,7 +87,7 @@ function export_kws_file {
function find_best_kws_result {
local dir=$1
local mask=$2
local record=`(find $dir -name "sum.txt" -path "$mask" -not -ipath "*rescored*" | xargs grep "^| *Occ") | cut -f 1,13,17 -d '|' | sed 's/|//g' | column -t | sort -r -n -k 3 | head -n 1`
local record=`(find $dir -name "sum.txt" -path "$mask" | xargs grep "^| *Occ") | cut -f 1,13,17 -d '|' | sed 's/|//g' | column -t | sort -r -n -k 3 | tail -n 1`
echo $record >&2
local file=`echo $record | awk -F ":" '{print $1}'`
#echo $file >&2
......@@ -109,7 +99,7 @@ function find_best_kws_result {
function find_best_stt_result {
local dir=$1
local mask=$2
local record=`(find $dir -name "*.ctm.sys" -path "$mask" -not -ipath "*rescore*" | xargs grep Avg) | sed 's/|//g' | column -t | sort -n -k 9 | head -n 1`
local record=`(find $dir -name "*.ctm.sys" -path "$mask" | xargs grep Avg) | sed 's/|//g' | column -t | sort -n -k 9 | head -n 1`
echo $record >&2
local file=`echo $record | awk -F ":" '{print $1}'`
......@@ -121,7 +111,7 @@ function find_best_stt_result {
function create_sysid {
local best_one=$1
local sysid=
local extrasys=$2
local taskid=`basename $best_one`
local system_path=`dirname $best_one`
if [[ $system_path =~ .*sgmm5.* ]] ; then
......@@ -130,12 +120,13 @@ function create_sysid {
sysid=DNN
elif [[ $system_path =~ .*sgmm7.* ]] ; then
sysid=BNF
elif [[ $system_path =~ .*4way.* ]] ; then
sysid=4way-comb
else
echo "Unknown system path ($system_path), cannot deduce the systemID" >&2
exit 1
fi
if [ ! -z $extrasys ]; then
sysid="${sysid}-${extrasys}"
fi
local kwsid=${taskid//kws_*/}
kwsid=${kwsid//_/}
if [ -z $kwsid ]; then
......@@ -168,13 +159,8 @@ function compose_expid {
local task=$1
local best_one=$2
local extraid=$3
echo "TASK: $task" >&2
echo "BEST ONE: $best_one" >&2
echo "EXTRA ID: $extraid" >&2
[ ! -z $extraid ] && extraid="-$extraid"
local sysid=`create_sysid $best_one`
echo "SYS ID: $sysid" >&2
local sysid=`create_sysid $best_one $extrasys`
if [ "$task" == "KWS" ]; then
ext="kwslist.xml"
elif [ "$task" == "STT" ]; then
......@@ -183,9 +169,6 @@ function compose_expid {
echo "Incorrect task ID ($task) given to compose_expid function!" >&2
exit 1
fi
echo "${corpusid}" >&2
echo "${partition}" >&2
echo "${scase}" >&2
echo "KWS14_${team}_${corpusid}_${partition}_${scase}_${task}_${prim}-${sysid}${extraid}_$version.$ext"
return 0
}
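To make the naming scheme above concrete: with team=RADICAL, prim=c, version=1, an empty
extraid, a DNN system and a kwslist export, compose_expid emits a name of the form below
(the corpusid/partition/scase values are illustrative):

  KWS14_RADICAL_IARPA-babel204b-v1.1b_conv-eval_BaEval_KWS_c-DNN_1.kwslist.xml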
......@@ -196,19 +179,17 @@ function figure_out_scase {
local basnam=${ecf%%.ecf.xml}
local scase=`echo $basnam | awk -F _ '{print $2}'`
if [[ $scase =~ conv-dev(\..*)? ]]; then
if [ "$scase" = "conv-dev" ]; then
echo "BaDev"
elif [[ $scase =~ conv-eval(\..*)? ]]; then
elif [ "$scase" = "conv-eval" ]; then
echo "BaEval"
else
echo "WARNING: The ECF file $ecf is probably not an official file" >&2
echo "WARNING: Does not contain conv-dev|conv-eval ($scase)" >&2
echo "BaDev"
return 1
fi
else
echo "WARNING: The ECF file $ecf is probably not an official file" >&2
echo "WARNING: Does not match the mask IARPA-babel.*.ecf.xml" >&2
echo "BaDev"
return 1
fi
......@@ -221,9 +202,9 @@ function figure_out_partition {
local basnam=${ecf%%.ecf.xml}
local scase=`echo $basnam | awk -F _ '{print $2}'`
if [[ $scase =~ conv-dev(\..*)? ]]; then
if [ "$scase" = "conv-dev" ]; then
echo "conv-dev"
elif [[ $scase =~ conv-eval(\..*)? ]]; then
elif [ "$scase" = "conv-eval" ]; then
echo "conv-eval"
else
echo "WARNING: The ECF file $ecf is probably not an official file" >&2
......@@ -250,11 +231,6 @@ function figure_out_corpusid {
echo $corpusid
}
extrasys_unnorm="unnorm"
if [ ! -z $extrasys ] ; then
extrasys_unnorm="${extrasys}-unnorm"
fi
#data=data/shadow.uem
dirid=`basename $data`
kws_tasks="kws "
......@@ -265,44 +241,31 @@ if [ -z "$compounds" ] ; then
for kws in $kws_tasks ; do
echo $kws
best_one=`find_best_kws_result "$dir/decode_*${dirid}*/${kws}_*" "*"`
sysid=`create_sysid $best_one`
sysid=`create_sysid $best_one $extrasys`
ecf=`get_ecf_name $best_one`
scase=`figure_out_scase $ecf` || break
partition=`figure_out_partition $ecf` || break
corpusid=`figure_out_corpusid $ecf`
expid=`compose_expid KWS $best_one "$extrasys"`
echo -e "\tEXPORT NORMALIZED as: $expid"
expid_unnormalized=`compose_expid KWS $best_one "$extrasys_unnorm"`
echo -e "\tEXPORT UNNORMALIZED as: $expid_unnormalized"
export_kws_file $best_one/kwslist.xml $best_one/kwslist.fixed.xml $data/$kws/kwlist.xml $outputdir/$expid
export_kws_file $best_one/kwslist.unnormalized.xml $best_one/kwslist.unnormalized.fixed.xml $data/$kws/kwlist.xml $outputdir/$expid_unnormalized
echo -e "\tEXPORT as:" `compose_expid KWS $best_one`
done
else
[ -z $master ] && echo "You must choose the master compound (--master <compound>) for compound data set" && exit 1
for kws in $kws_tasks ; do
echo $kws
best_one=`find_best_kws_result "$dir/decode_*${dirid}*/$master/${kws}_*" "*"`
(
eval "`cat $best_one/metrics.txt | sed 's/ *= */=/g' | sed 's/,/;/g' | sed 's/Lattice Recall/LatticeRecall/g' `"
submit_to_google $best_one $ATWV $MTWV
) || echo "Submission failed!"
for compound in $compounds ; do
compound_best_one=`echo $best_one | sed "s:$master/${kws}_:$compound/${kws}_:g"`
echo "From ($kws) $best_one going to $compound_best_one"
compound_best_one=`echo $best_one | sed ":$master/${kws}_:$compound/${kws}_:g"`
echo -e "\tPREPARE EXPORT: $compound_best_one"
sysid=`create_sysid $compound_best_one`
sysid=`create_sysid $compound_best_one $extrasys`
#ecf=`get_ecf_name $best_one`
ecf=`readlink -f $data/compounds/$compound/ecf.xml`
scase=`figure_out_scase $ecf`
partition=`figure_out_partition $ecf`
corpusid=`figure_out_corpusid $ecf`
expid=`compose_expid KWS $compound_best_one "$extrasys"`
expid=`compose_expid KWS $compound_best_one`
echo -e "\tEXPORT NORMALIZED as: $expid"
expid_unnormalized=`compose_expid KWS $compound_best_one "$extrasys_unnorm"`
expid_unnormalized=`compose_expid KWS $compound_best_one "unnorm"`
echo -e "\tEXPORT UNNORMALIZED as: $expid_unnormalized"
export_kws_file $compound_best_one/kwslist.xml $compound_best_one/kwslist.fixed.xml $data/$kws/kwlist.xml $outputdir/$expid
......@@ -313,32 +276,30 @@ fi
##Exporting STT -- more straightforward, because there is only one task
if [ -z "$compounds" ] ; then
#best_one=`find_best_stt_result "$dir/decode_*${dirid}*/score_*" "*"`
best_one=`find_best_stt_result "$dir/*${dirid}*/score_*" "*"`
best_one=`find_best_stt_result "$dir/decode_*${dirid}*/score_*" "*"`
echo -e "\tERROR: I don't know how to do this, yet"
ecf=`get_ecf_name kws`
sysid=`create_sysid $best_one`
sysid=`create_sysid $best_one $extrasys`
scase=`figure_out_scase $ecf` || break
partition=`figure_out_partition $ecf`
corpusid=`figure_out_corpusid $ecf`
expid=`compose_expid STT $best_one "$extrasys"`
expid=`compose_expid STT $best_one`
echo -e "\tEXPORT NORMALIZED as: $expid"
export_file $best_one/${dirid}.ctm $outputdir/$expid
else
[ -z $master ] && echo "You must choose the master compound (--master <compound>) for compound data set" && exit 1
#best_one=`find_best_stt_result "$dir/decode_*${dirid}*/$master/score_*" "*"`
best_one=`find_best_stt_result "$dir/*${dirid}*/$master/score_*" "*"`
best_one=`find_best_stt_result "exp/sgmm5_mmi_b0.1/decode_*${dirid}*/$master/score_*" "*"`
for compound in $compounds ; do
compound_best_one=`echo $best_one | sed "s:$master/score_:$compound/score_:g"`
compound_best_one=`echo $best_one | sed ":$master/${kws}_:$compound/${kws}_:g"`
echo -e "\tPREPARE EXPORT: $compound_best_one"
sysid=`create_sysid $compound_best_one`
sysid=`create_sysid $compound_best_one $extrasys`
#ecf=`get_ecf_name $best_one`
ecf=`readlink -f $data/compounds/$compound/ecf.xml`
scase=`figure_out_scase $ecf`
partition=`figure_out_partition $ecf`
corpusid=`figure_out_corpusid $ecf`
expid=`compose_expid STT $compound_best_one $extrasys`
expid=`compose_expid STT $compound_best_one`
echo -e "\tEXPORT NORMALIZED as: $expid"
export_file $compound_best_one/${compound}.ctm $outputdir/$expid
......
......@@ -100,7 +100,7 @@ if [[ ! -f $lexicon || $lexicon -ot "$lexicon_file" ]]; then
# Extend the origina