Commit 4e97dff3 authored by Gaurav Kumar's avatar Gaurav Kumar
Browse files

Modified the rule based lexicon generation script from LDC and added to local/ .

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4552 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 5fa5fbf8
......@@ -45,11 +45,12 @@ fi
#Get pronunciation for each word using the spron.pl file in the lexicon folder
if [ $stage -le 2 ]; then
cd $lexicon/callhome_spanish_lexicon_970908
#cd $lexicon/callhome_spanish_lexicon_970908
# Replace all words for which no pronunciation was generated with an orthographic
# representation
cat $tmpdir/uniquewords | ./spron.pl | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \
> $tmpdir/lexicon_raw
cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \
| cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \
> $tmpdir/lexicon_raw
fi
#Break the pronunciation down according to the format required by Kaldi
......
#!/usr/bin/perl
# Oct 21, 2015 : Gaurav Kumar (Johns Hopkins University)
# Apache 2.0
#
# This script was modified under GPL and is being distributed with
# Kaldi under a new license. It requires the preference and rule files
# (under LDC copyright) from LDC96L16. The main changes were
# - Outdated usage of perl conventions updated @_ => $_ or @A
# - This script no longer needs the preference and rule files to
# be in the same directory as this script.
# - Accepts tokens from <STDIN> instead of <>
# spron.pl Version 0.1 Jan. 11 1995
# Written by Zhibiao Wu, LDC, wzb@unagi.cis.upenn.edu
# This program needs the basic_rules file to run. The rules must be sorted
# in alphabetical order. The most specific rules should precede the more
# general ones. The conventions used in the basic rules are the same as
# regular expressions used in Perl.
# Revised history: Feb. 10 1995
# The file "preferences" (assumed to be in your current directory)
# gives an "oracle" of correct pronunciations that override the
# machine-generated ones.
# slightly changed 97/09/05 robertm:
# - look for basic_rules and preferences in $PWD instead of ~wzb/...
# - use next to shortcut loop instead of if/else
# - added a bit of documentation, without really trying to decipher this thing
$vfile = "";
$preference_file = "";
$rules_file = "";
$print_input = 0;
if ($#ARGV < 1) {
# Print Usage
print "Usage : local/spron.pl pref-file rules-file <v-file> <print-input>\n";
exit 1;
} else {
$preference_file = $ARGV[0];
$rules_file = $ARGV[1];
if ($#ARGV > 1) {
$vfile = $ARGV[2];
}
if ($#ARGV > 2) {
$print_input = 1;
}
}
$rule_num = 0;
$previous = "";
if ($vfile ne "") {
open(VF, $vfile) || die "Can't find file $vfile!\n";
while (<VF>) {
chop;
@A = split(//);
if (($A[0] ne '#') && ($_ ne "")) {
if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) {
$head[$rule_num] = $1;
$end[$rule_num] = $2;
$pre[$rule_num] = $3;
if ($4 =~ /#/) {
$nex[$rule_num] = "";
$some[$rule_num] = $4;
} else {
$nex[$rule_num] = $4;
$some[$rule_num] = $5;
}
if ($previous ne substr($head[$rule_num],0,1)) {
$first{$head[$rule_num]} = $rule_num;
$last{$previous} = $rule_num - 1;
}
$previous = substr($head[$rule_num++],0,1);
} else {
print "Rule format error: Cannot parse $_\n";
exit(1);
}
}
}
$last{$previous} = $rule_num - 1;
close(VF);
}
open(PF, $preference_file) || die "Can't read `preferences' file";
while (<PF>) {
chop;
if ($_ ne "") {
@A = split;
$pron{$A[0]} = $A[1];
$stre{$A[0]} = $A[2];
}
}
$previous = "";
$brule_num = 0;
open(BF, $rules_file) || die "Can't read `basic_rules' file";
while (<BF>) {
chop;
@A = split(//);
if (($A[0] ne '#') && ($_ ne "")) {
if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) {
$bhead[$brule_num] = $1;
$bend[$brule_num] = $2;
$bpre[$brule_num] = $3;
if ($4 =~ /#/) {
$bnex[$brule_num] = "";
$bsome[$brule_num] = $4;
} else {
$bnex[$brule_num] = $4;
$bsome[$brule_num] = $5;
}
if ($previous ne substr($bhead[$brule_num],0,1)) {
$bfirst{substr($bhead[$brule_num],0,1)} = $brule_num;
$blast{$previous} = $brule_num - 1;
}
$previous = substr($bhead[$brule_num++],0,1);
} else {
print "Rule format error in file basic_rules: Cannot parse $_\n";
exit(1);
}
}
}
$blast{$previous} = $brule_num - 1;
close(BF);
if ($brule_num == 0) {
print "No basic rules, Program exit!\n";
exit(1);
}
while(<STDIN>){
next if ((/^#/) || (/^\s*$/) );
chop;
if ($print_input) {
print $_, "\t";
}
if ($pron{$_}) {
# print answer from preferences and skip to next word
print "$pron{$_}\t$stre{$_}\n";
next;
}
$original = $_;
tr/A-Z/a-z/;
$orig = "#" . $_ . "#";
@l = ();
push(@l,split("",$orig));
@pron = &transfer(1);
foreach (@pron) {
$a = $_;
y/aeiou//cd;
if ($_ eq "") {
print "#No stressable vowel in $original\n";
} else {
s/[aeiou]/0/go;
s/[]/1/go;
if (!/1/) {
if(length() == 1){
s/\b./1/o;
} elsif($l[$#l - 1] =~ /[aeiouns]/o){
s/00\b/10/o;
} else {
s/0\b/1/o;
}
}
$a =~ s//a/g;
$a =~ s//e/g;
$a =~ s//i/g;
$a =~ s//o/g;
$a =~ s//u/g;
print "$a\t$_\n";
}
}
}
sub transfer{
local($_) = @_;
local(@p) = ();
local($s) = 0;
local($over) = 0;
local($i,$j,$k) = (0,0,0);
if ($_ >= length($orig) - 1) {
push(@p, "");
return(@p);
} else {
if ($vfile ne "") {
for ($i= $first{substr($orig, $_, 1)};
$i <= $last{substr($orig, $_, 1)} ; $i++) {
if (&matchv($_,$i)) {
$s = $_ + length($head[$i]);
foreach $w (&transfer($s)) {
push(@p, $end[$i] . $w);
if ($some[$i] ne "") {
$over = 0;
} else {
$over = 1;
}
}
}
}
}
if ($over == 0 ) {
$i = $bfirst{substr($orig, $_, 1)};
while (($i <= $blast{substr($orig, $_, 1)}) && ($over == 0)) {
if (&matchb($_,$i)) {
$over = 1;
$s = $_ + length($bhead[$i]);
foreach $w (&transfer($s)) {
push(@p, $bend[$i] . $w);
}
}
$i++;
}
if ($over == 0) {
$s = $_ + 1;
foreach $w (&transfer($s)) {
push(@p, substr($orig,$_,1) . $w);
}
}
}
return(@p);
}
}
sub matchv {
$h = $head[$_[1]];
$p = $pre[$_[1]];
$n = $nex[$_[1]];
return(&match($_[0],$h,$p,$n));
}
sub matchb {
$h = $bhead[$_[1]];
$p = $bpre[$_[1]];
$n = $bnex[$_[1]];
return(&match($_[0],$h,$p,$n));
}
sub match {
if (substr($orig, $_[0], length($_[1])) eq $_[1]) {
return ( &match_n($_[0] + length($_[1]) - 1, $_[3]) &&
&match_p($_[0], $_[2]));
} else {
return (0);
}
}
sub match_p {
local($a) = $_[0];
local($b) = $_[1];
local($_);
if ($b eq "" ) {
return (1);
} else {
$_ = substr($orig, 0, $a) . "!";
if (/($b)!/) {
return(1);
} else {
return(0);
}
}
}
sub match_n {
local($a) = $_[0];
local($b) = $_[1];
local($_);
if ($b eq "" ) {
return (1);
} else {
$_ = "!" . substr($orig, $a + 1, length($orig) - $a - 1);
if (/!($b)/) {
return(1);
} else {
return(0);
}
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment