Commit eefb92b7 authored by Dan Povey's avatar Dan Povey
Browse files

trunk: further improvements and bug-fixes in validate_dict_dir.pl

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4239 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 535a51dd
......@@ -15,6 +15,10 @@ $dict = shift @ARGV;
$dict =~ s:/$::;
$exit = 0;
$success = 1; # this is re-set each time we read a file.
sub set_to_fail { $exit = 1; $success = 0; }
# Checking silence_phones.txt -------------------------------
print "Checking $dict/silence_phones.txt ...\n";
if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
......@@ -26,19 +30,19 @@ print "--> reading $dict/silence_phones.txt\n";
while(<S>) {
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
$exit = 1;
set_to_fail();
}
my @col = split(" ", $_);
if (@col == 0) {
$exit = 1;
set_to_fail();
print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
}
foreach(0 .. @col-1) {
my $p = $col[$_];
if($silence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; }
if($silence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; }
else {$silence{$p} = 1;}
if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){
$exit = 1;
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form";
}
......@@ -60,9 +64,9 @@ while(<OS>) {
chomp;
my @col = split(" ", $_);
if ($idx > 1 or @col > 1) {
$exit = 1; print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
} elsif (!$silence{$col[0]}) {
$exit = 1; print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
}
$idx ++;
}
......@@ -81,19 +85,19 @@ print "--> reading $dict/nonsilence_phones.txt\n";
while(<NS>) {
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
$exit = 1;
set_to_fail();
}
my @col = split(" ", $_);
if (@col == 0) {
$exit = 1;
set_to_fail();
print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
}
foreach(0 .. @col-1) {
my $p = $col[$_];
if($nonsilence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; }
if($nonsilence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; }
else {$nonsilence{$p} = 1;}
if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){
$exit = 1;
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form";
}
......@@ -121,34 +125,37 @@ sub intersect {
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if(@itset == 0) {print "--> disjoint property is OK.\n";}
else {$exit = 1; print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
print "\n";
sub check_lexicon {
my ($lexfn, $pron_probs) = @_;
print "Checking $lexfn\n";
if(-z "$lexfn") {$exit = 1; print "--> ERROR: $lexfn is empty or not exists\n";}
if(!open(L, "<$lexfn")) {$exit = 1; print "--> ERROR: fail to open $lexfn\n";}
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or not exists\n";}
if(!open(L, "<$lexfn")) {set_to_fail(); print "--> ERROR: fail to open $lexfn\n";}
$idx = 1;
$success = 1;
print "--> reading $lexfn\n";
while (<L>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $lexfn does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
$word = shift @col;
if (!defined $word) {
$exit = 1; print "--> ERROR: empty lexicon line in $lexfn\n";
set_to_fail(); print "--> ERROR: empty lexicon line in $lexfn\n";
}
if ($pron_probs) {
$prob = shift @col;
if (!($prob > 0.0 && $prob <= 1.0)) {
$exit = 1; print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n";
set_to_fail(); print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n";
}
}
foreach (0 .. @col-1) {
if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
$exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n";
set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n";
}
}
$idx ++;
......@@ -162,7 +169,7 @@ if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1); }
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n";
$exit = 1;
set_to_fail();
}
# If both lexicon.txt and lexiconp.txt exist, we check that they correspond to
# each other. If not, it could be that the user overwrote one and we need to
......@@ -175,19 +182,19 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
while(<L>) {
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/lexicon.txt does not end in newline.\n";
$exit = 1;
set_to_fail();
last;
}
@A = split;
$x = <P>;
if ($x !~ s/\n$//) {
print "--> ERROR: last line '$x' of $dict/lexiconp.txt does not end in newline.\n";
$exit = 1;
set_to_fail();
last;
}
if (!defined $x) {
print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
$exit = 1;
set_to_fail();
last;
}
@B = split(" ", $x);
......@@ -197,13 +204,13 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
# now @A and @B should be the same.
if ($#A != $#B) {
print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
$exit = 1;
set_to_fail();
last;
}
for ($n = 0; $n < @A; $n++) {
if ($A[$n] ne $B[$n]) {
print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
$exit = 1;
set_to_fail();
last;
}
}
......@@ -211,32 +218,40 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
$x = <P>;
if (defined $x && $exit == 0) {
print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
$exit = 1;
set_to_fail();
}
}
# Checking extra_questions.txt -------------------------------
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
if(!open(EX, "<$dict/extra_questions.txt")) {$exit = 1; print "--> ERROR: fail to open $dict/extra_questions.txt\n";}
if (!open(EX, "<$dict/extra_questions.txt")) {
set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
}
$idx = 1;
$success = 1;
print "--> reading $dict/extra_questions.txt\n";
while(<EX>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
foreach(0 .. @col-1) {
if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
$exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n";
}
if (@col == 0) {
set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
}
}
foreach(0 .. @col-1) {
if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n";
}
$idx ++;
}
}
close(EX);
$success == 0 || print "--> $dict/extra_questions.txt is OK\n";
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
if ($exit == 1) { print " [Error detected ]\n"; exit 1;}
if ($exit == 1) { print "--> ERROR validating dictionary directory $dict (see detailed error messages above)\n"; exit 1;}
else { print "--> SUCCESS [validating dictionary directory $dict]\n"; }
exit 0;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment