Commit 519109f4 authored by Dan Povey

trunk: merging changes from ^/sandbox/akirkedal, adding a Danish-language recipe.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4274 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parents 1a429da1 7e6af54b
About the sprakbanken corpus:
This free corpus was originally collected by NST for ASR purposes and is currently
hosted by the National Library of Norway. The corpus is multilingual, containing Swedish,
Norwegian (Bokmål) and Danish; the current setup works for Danish. The vocabulary is
large, and there are approximately 350 hours of read-aloud speech with associated text scripts.
s1: This is the current recommended recipe. (Danish)
%WER 49.19 [ 5318 / 10811, 481 ins, 1511 del, 3326 sub ] exp/mono0a/decode_3g_test1k/wer_9
%WER 47.28 [ 5111 / 10811, 443 ins, 1489 del, 3179 sub ] exp/mono0a/decode_b3g_test1k/wer_10
%WER 16.19 [ 1750 / 10811, 397 ins, 323 del, 1030 sub ] exp/sgmm2_5a/decode_3g_test1k/wer_9
%WER 15.10 [ 1632 / 10811, 404 ins, 305 del, 923 sub ] exp/sgmm2_5b/decode_3g_test1k/wer_9
%WER 14.94 [ 1615 / 10811, 390 ins, 310 del, 915 sub ] exp/sgmm2_5b/decode_4g_test1k/wer_9
%WER 14.36 [ 1553 / 10811, 376 ins, 264 del, 913 sub ] exp/sgmm2_5c/decode_3g_test1k/wer_9
%WER 14.18 [ 1533 / 10811, 367 ins, 266 del, 900 sub ] exp/sgmm2_5c/decode_4g_test1k/wer_9
%WER 25.61 [ 2769 / 10811, 511 ins, 539 del, 1719 sub ] exp/tri1/decode_3g_test1k/wer_10
%WER 25.12 [ 2716 / 10811, 444 ins, 571 del, 1701 sub ] exp/tri1/decode_b3g_test1k/wer_11
%WER 23.81 [ 2574 / 10811, 426 ins, 564 del, 1584 sub ] exp/tri2a/decode_3g_test1k/wer_12
%WER 23.22 [ 2510 / 10811, 457 ins, 517 del, 1536 sub ] exp/tri2a/decode_3g_test1k_fromlats/wer_11
%WER 22.18 [ 2398 / 10811, 436 ins, 495 del, 1467 sub ] exp/tri2b/decode_3g_test1k/wer_11
%WER 21.87 [ 2364 / 10811, 380 ins, 553 del, 1431 sub ] exp/tri2b/decode_3g_test1k_mbr/wer_13
%WER 18.98 [ 2052 / 10811, 451 ins, 372 del, 1229 sub ] exp/tri3b_20k/decode_3g_test1k/wer_11
%WER 22.62 [ 2445 / 10811, 468 ins, 460 del, 1517 sub ] exp/tri3b_20k/decode_3g_test1k.si/wer_10
%WER 19.31 [ 2088 / 10811, 440 ins, 388 del, 1260 sub ] exp/tri3b/decode_3g_test1k/wer_11
%WER 23.19 [ 2507 / 10811, 435 ins, 520 del, 1552 sub ] exp/tri3b/decode_3g_test1k.si/wer_12
%WER 19.06 [ 2061 / 10811, 427 ins, 384 del, 1250 sub ] exp/tri3b/decode_4g_test1k/wer_11
%WER 23.20 [ 2508 / 10811, 447 ins, 520 del, 1541 sub ] exp/tri3b/decode_4g_test1k.si/wer_11
%WER 17.42 [ 1883 / 10811, 416 ins, 359 del, 1108 sub ] exp/tri4a/decode_3g_test1k/wer_13
%WER 20.86 [ 2255 / 10811, 403 ins, 473 del, 1379 sub ] exp/tri4a/decode_3g_test1k.si/wer_13
%WER 17.52 [ 1894 / 10811, 396 ins, 372 del, 1126 sub ] exp/tri4b/decode_3g_test1k/wer_13
%WER 20.82 [ 2251 / 10811, 399 ins, 471 del, 1381 sub ] exp/tri4b/decode_3g_test1k.si/wer_13
%WER 17.53 [ 1895 / 10811, 403 ins, 375 del, 1117 sub ] exp/tri4b/decode_4g_test1k/wer_13
%WER 20.99 [ 2269 / 10811, 438 ins, 436 del, 1395 sub ] exp/tri4b/decode_4g_test1k.si/wer_11
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of CPUs on your machine).
#a) JHU cluster options
#export train_cmd="queue.pl -l arch=*64"
#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"
#export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G"
#export cuda_cmd="queue.pl -l gpu=1"
#b) BUT cluster options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"
#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"
#c) run it locally...
export train_cmd=run.pl
export decode_cmd=run.pl
export cuda_cmd=run.pl
export mkgraph_cmd=run.pl
& &+
0
3
e
E
i I
o
O
u U
V
W W+
y Y
@ @-
a
A
aI
n n-
b B
d
D
dZ tS
f
g
h
j ; J
k
l L
m
N ~
p
r R R3 3-
s z
S x Z
t
T
v
w
beam=18.0 # beam for decoding. Was 13.0 in the scripts.
latbeam=10.0 # this has the most effect on the size of the lattices.
# No non-default options for now.
--use-energy=false # only non-default option.
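For reference, a minimal sketch of how config files like these are typically consumed by the standard steps/ scripts (the data, graph and exp paths are illustrative, not taken from this recipe):

steps/make_mfcc.sh --mfcc-config conf/mfcc.conf data/train exp/make_mfcc/train mfcc
steps/decode.sh --config conf/decode.config exp/tri1/graph_3g data/test1k exp/tri1/decode_3g_test1k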
#!/bin/bash
# Copyright 2010-2012 Microsoft Corporation; Johns Hopkins University (Author: Daniel Povey)
# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
KALDI_ROOT=$(pwd)/../../..
exproot=$(pwd)
dir=data/local/dict
mkdir -p $dir
# Dictionary preparation:
# This lexicon was created using eSpeak.
# To extend the setup, see local/dict_prep.sh
# Copy pre-made phone table
cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
# Copy pre-made lexicon
wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=data/local/data/download
tar -xzf data/local/data/download/lexicon-da.tar.gz -C $dir
# silence phones, one per line.
echo SIL > $dir/silence_phones.txt
echo SIL > $dir/optional_silence.txt
touch $dir/extra_questions.txt
wait
## TODO: add cleanup commands
echo "Dictionary preparation succeeded"
#!/bin/bash
# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
if [ $# != 2 ]; then
echo "Usage: create_dataset.sh <src-data-dir> <dest-dir> "
exit 1
fi
src=$1
dest=$2
mkdir $dest
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
paste -d ' ' $src/onlyids $src/onlytext > $dest/text
for f in wav.scp utt2spk; do
cp $src/$f $dest/$f
done
utils/utt2spk_to_spk2utt.pl $dest/utt2spk > $dest/spk2utt
utils/validate_data_dir.sh --no-feats $dest || exit 1;
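A hypothetical invocation, assuming data/local/data/train was produced by the earlier data preparation stage:

local/create_dataset.sh data/local/data/train data/train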
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1
# and as command-line argument it takes the names of the WSJ disk locations, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.
$wsj_dir = $ARGV[0];
while(<STDIN>){
if(m/^;/){ next; } # Comment. Ignore it.
else {
m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
$filename = $2; # as a subdirectory of the distributed disk.
if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }
$filename = "$wsj_dir/$filename";
if (-e $filename) {
print "$filename\n";
} else {
print STDERR "File $filename found in the index but not on disk\n";
}
}
}
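A usage sketch under the assumptions above (the script name and corpus location are illustrative): the .ndx file is piped in on stdin, the disk location is the single argument, and the absolute pathnames land in a file list:

local/cstr_ndx2flist.pl /group/corpora/public/wsjcam0/data < tr_s_wv1.ndx > train.flist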
'''
# Copyright 2013-2014 Mirsk Digital Aps (Author: Andreas Kirkedal)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
This script outputs text.unnormalised, wav.scp and utt2spk in the directory specified
'''
import sys
import os
import codecs
import locale
# set locale for sorting purposes
locale.setlocale(locale.LC_ALL, "C")
def unique(items):
found = set([])
keep = []
for item in items:
if item not in found:
found.add(item)
keep.append(item)
return keep
def list2string(wordlist, lim=" ", newline=False):
'''Converts a list to a string with a delimiter $lim and the possible
addition of newline characters.'''
strout = ""
for w in wordlist:
strout += w + lim
if newline:
return strout.strip(lim) + "\n"
else:
return strout.strip(lim)
def get_sprak_info(abspath):
'''Returns the Sprakbank session id, utterance id and speaker id from the
filename.'''
return os.path.split(abspath)[-1].strip().split(".")[:-1]
def kaldi_utt_id(abspath, string=True):
'''Creates the kaldi utterance id from the filename.'''
fname = get_sprak_info(abspath)
spkutt_id = [fname[1].strip(), fname[0].strip(), fname[2].strip()]
if string:
return list2string(spkutt_id, lim="-")
else:
return spkutt_id
def make_kaldi_text(line):
'''Creates each line in the kaldi "text" file. '''
txtfile = codecs.open(line.strip(), "r", "utf8").read()
utt_id = kaldi_utt_id(line)
return utt_id + " " + txtfile
def txt2wav(fstring):
'''Changes the extension from .txt to .wav. '''
return os.path.splitext(fstring)[0] + ".wav"
def make_kaldi_scp(line, sph, wav=False):
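'''Creates each line in the kaldi wav.scp file: the utterance id followed by a piped extraction command.'''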
if not wav:
wavpath = txt2wav(line)
else:
wavpath = wav
utt_id = kaldi_utt_id(line)
return utt_id + " " + sph + " " + wavpath + " |\n"
def make_utt2spk(line):
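'''Creates each line in the kaldi utt2spk file, mapping the utterance id to its speaker id (the first field of the utterance id).'''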
info = kaldi_utt_id(line, string=False)
utt_id = list2string(info, lim="-")
return list2string([utt_id, info[0]], newline=True)
def create_parallel_kaldi(filelist, sphpipe, snd=False):
'''Creates the "text" file that maps a transcript to an utterance id and
the corresponding wav.scp file. '''
transcripts = []
waves = []
utt2spk = []
for num, line in enumerate(filelist):
transcriptline = make_kaldi_text(line)
transcripts.append(transcriptline)
if snd:
scpline = make_kaldi_scp(line, sphpipe, snd[num].strip())
else:
scpline = make_kaldi_scp(line)
waves.append(scpline)
utt2spkline = make_utt2spk(line)
utt2spk.append(utt2spkline)
return (sorted(unique(transcripts)),
sorted(unique(waves)),
sorted(unique(utt2spk))
)
if __name__ == '__main__':
flist = codecs.open(sys.argv[1], "r").readlines()
outpath = sys.argv[2]
if len(sys.argv) == 5:
sndlist = codecs.open(sys.argv[3], "r").readlines()
sph2pipe = sys.argv[4] + " -f wav -p -c 1"
traindata = create_parallel_kaldi(flist, sph2pipe, snd=sndlist)
else:
traindata = create_parallel_kaldi(flist, "")
textout = codecs.open(os.path.join(outpath, "text.unnormalised"), "w", "utf8")
wavout = codecs.open(os.path.join(outpath, "wav.scp"), "w")
utt2spkout = codecs.open(os.path.join(outpath, "utt2spk"), "w")
textout.writelines(traindata[0])
wavout.writelines(traindata[1])
utt2spkout.writelines(traindata[2])
textout.close()
wavout.close()
utt2spkout.close()
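A hypothetical invocation (the script name and paths are illustrative): argv[1] is a list of transcript files, argv[2] the output directory, and optionally argv[3] and argv[4] give a matching list of sph files and the sph2pipe binary:

python3 local/sprak2kaldi.py data/local/data/train_txtfiles data/local/data/train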
#!/usr/bin/perl
# Add counts to an oovlist.
# Reads in counts as output by uniq -c, and
# an oovlist, and prints out the counts of the oovlist.
(@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n";
$counts = shift @ARGV;
open(C, "<$counts") || die "Opening counts file $counts";
while(<C>) {
@A = split(" ", $_);
@A == 2 || die "Bad line in counts file: $_";
($count, $word) = @A;
$count =~ m:^\d+$: || die "Bad count $A[0]\n";
$counts{$word} = $count;
}
while(<>) {
chop;
$w = $_;
$w =~ m:\S+: || die "Bad word $w";
defined $counts{$w} || die "Word $w not present in counts file";
print "\t$counts{$w}\t$w\n";
}
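A usage sketch (paths illustrative): build the counts file as "uniq -c" output (count, then word), then look up each OOV word:

tr -s ' ' '\n' < data/train/onlytext | sort | uniq -c > word_counts
local/dict/add_counts.pl word_counts oovlist > oovlist_with_counts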
#!/usr/bin/perl
# This program takes the output of score_prons.pl and collates
# it for each (rule, destress) pair so that we get the
# counts of right/partial/wrong for each pair.
# The input is a 7-tuple on each line, like:
# word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong
#
# The output format is a 5-tuple like:
#
# rule;destress;right-count;partial-count;wrong-count
#
if (@ARGV != 0 && @ARGV != 1) {
die "Usage: count_rules.pl < scored_candidate_prons > rule_counts";
}
while(<>) {
chop;
$line = $_;
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line);
my $key = $rulename . ";" . $destress;
if (!defined $counts{$key}) {
$counts{$key} = [ 0, 0, 0 ]; # new anonymous array.
}
$ref = $counts{$key};
if ($score eq "right") {
$$ref[0]++;
} elsif ($score eq "partial") {
$$ref[1]++;
} elsif ($score eq "wrong") {
$$ref[2]++;
} else {
die "Bad score $score\n";
}
}
while ( my ($key, $value) = each(%counts)) {
print $key . ";" . join(";", @$value) . "\n";
}
#!/usr/bin/perl
# This program reads and writes either a dictionary or just a list
# of words, and it removes any words containing ";" or "," as these
# characters are used by these programs. It will warn about such words.
# It will die if the pronunciations have these symbols in them.
while(<>) {
chop;
@A = split(" ", $_);
$word = shift @A;
if ($word =~ m:[;,]:) {
print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n" ;
} else {
$_ =~ m:[;,]: && die "Phones cannot have ; or , in them.";
print $_ . "\n";
}
}
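It is a plain stdin-to-stdout filter, e.g. (paths illustrative):

local/dict/filter_dict.pl < lexicon.txt > lexicon_filtered.txt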
#!/usr/bin/perl
# Reads a dictionary, and prints out a list of words that seem to be pronounced
# as acronyms (not including plurals of acronyms, just acronyms). Uses
# the prons of the individual letters (A., B. and so on) to judge this.
# Note: this is somewhat dependent on the convention used in CMUdict, that
# the individual letters are spelled this way (e.g. "A.").
$max_length = 6; # Max length of words that might be
# acronyms.
while(<>) { # Read the dict.
chop;
@A = split(" ", $_);
$word = shift @A;
$pron = join(" ", @A);
if ($word =~ m/^([A-Z])\.$/ ) {
chop $word; # Remove trailing "." to get just the letter
$letter = $1;
if (!defined $letter_prons{$letter} ) {
$letter_prons{$letter} = [ ]; # new anonymous array
}
$arrayref = $letter_prons{$letter};
push @$arrayref, $pron;
} elsif( length($word) <= $max_length ) {
$pronof{$word . "," . $pron} = 1;
$isword{$word} = 1;
#if (!defined $prons{$word} ) {
# $prons{$word} = [ ];
#}
# push @{$prons{$word}}, $pron;
}
}
sub get_letter_prons;
foreach $word (keys %isword) {
my @letter_prons = get_letter_prons($word);
foreach $pron (@letter_prons) {
if (defined $pronof{$word.",".$pron}) {
print "$word $pron\n";
}
}
}
sub get_letter_prons {
@acronym = split("", shift); # The letters in the word.
my @prons = ( "" );
while (@acronym > 0) {
$l = shift @acronym;
$n = 1; # num-repeats of letter $l.
while (@acronym > 0 && $acronym[0] eq $l) {
$n++;
shift @acronym;
}
my $arrayref = $letter_prons{$l};
my @prons_of_block = ();
if ($n == 1) { # Just one repeat.
foreach $lpron ( @$arrayref ) {
push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
}
} elsif ($n == 2) { # Two repeats. Can be "double a" or "a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "D AH1 B AH0 L " . $lpron;
push @prons_of_block, $lpron . " " . $lpron; # join the letter prons with a space.
}
} elsif ($n == 3) { # can be "triple a" or "a a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
push @prons_of_block, $lpron . " " . $lpron . " " . $lpron;
}
} elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
# not sure really.
foreach $lpron ( @$arrayref ) {
$nlpron = "";
for ($m = 0; $m < $n; $m++) { $nlpron = $nlpron . $lpron; }
push @prons_of_block, $nlpron;
}
}
my @new_prons = ();
foreach $pron (@prons) {
foreach $pron_of_block(@prons_of_block) {
if ($pron eq "") {
push @new_prons, $pron_of_block;
} else {
push @new_prons, $pron . " " . $pron_of_block;
}
}
}
@prons = @new_prons;
}
return @prons;
}
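A usage sketch (path illustrative): the dictionary is read on stdin and the detected acronyms with their prons go to stdout:

local/dict/get_acronym_prons.pl < cmudict.txt > acronym_prons.txt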
#!/usr/bin/perl
# Reads a dictionary (for prons of letters), and an OOV list,
# and puts out candidate pronunciations of words in that list
# that could plausibly be acronyms.
# We judge that a word can plausibly be an acronym if it is
# a sequence of just letters (no non-letter characters such
# as "'"), or something like U.K.,
# and the number of letters is four or less.
#
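The "plausibly an acronym" test described above can be roughly approximated with a single extended regular expression; a sketch only (the real script additionally needs the letter prons from the dictionary to generate the candidate pronunciations):

grep -E '^([A-Za-z]\.?){1,4}$' oovlist > candidate_acronym_words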