Commit d13597fd authored by Dan Povey's avatar Dan Povey
Browse files

Adding scripts for dictionary and LM preparation for WSJ.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@604 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 36a18b6a
#!/usr/bin/perl
# Add counts to an oovlist.
# Reads in counts as output by uniq -c, and
# an oovlist, and prints out the counts of the oovlist.
(@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n";
$counts = shift @ARGV;
open(C, "<$counts") || die "Opening counts file $counts";
while(<C>) {
@A = split(" ", $_);
@A == 2 || die "Bad line in counts file: $_";
($count, $word) = @A;
$count =~ m:^\d+$: || die "Bad count $A[0]\n";
$counts{$word} = $count;
}
while(<>) {
chop;
$w = $_;
$w =~ m:\S+: || die "Bad word $w";
defined $counts{$w} || die "Word $w not present in counts file";
print "\t$counts{$w}\t$w\n";
}
#!/usr/bin/perl
# This program takes the output of score_prons.pl and collates
# it for each (rule, destress) pair so that we get the
# counts of right/partial/wrong for each pair.
# The input is a 7-tuple on each line, like:
# word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong
#
# The output format is a 5-tuple like:
#
# rule;destress;right-count;partial-count;wrong-count
#
if (@ARGV != 0 && @ARGV != 1) {
die "Usage: count_rules.pl < scored_candidate_prons > rule_counts";
}
while(<>) {
chop;
$line = $_;
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line);
my $key = $rulename . ";" . $destress;
if (!defined $counts{$key}) {
$counts{$key} = [ 0, 0, 0 ]; # new anonymous array.
}
$ref = $counts{$key};
if ($score eq "right") {
$$ref[0]++;
} elsif ($score eq "partial") {
$$ref[1]++;
} elsif ($score eq "wrong") {
$$ref[2]++;
} else {
die "Bad score $score\n";
}
}
while ( my ($key, $value) = each(%counts)) {
print $key . ";" . join(";", @$value) . "\n";
}
#!/usr/bin/perl
# This program reads and writes either a dictionary or just a list
# of words, and it removes any words containing ";" or "," as these
# are used in these programs. It will warn about these.
# It will die if the pronunciations have these symbols in.
while(<>) {
chop;
@A = split(" ", $_);
$word = shift @A;
if ($word =~ m:[;,]:) {
print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n" ;
} else {
$_ =~ m:[;,]: && die "Phones cannot have ; or , in them.";
print $_ . "\n";
}
}
#!/usr/bin/perl
# Reads a dictionary, and prints out a list of words that seem to be pronounced
# as acronyms (not including plurals of acronyms, just acronyms). Uses
# the prons of the individual letters (A., B. and so on) to judge this.
# Note: this is somewhat dependent on the convention used in CMUduct, that
# the individual letters are spelled this way (e.g. "A.").
$max_length = 6; # Max length of words that might be
# acronyms.
while(<>) { # Read the dict.
chop;
@A = split(" ", $_);
$word = shift @A;
$pron = join(" ", @A);
if ($word =~ m/^([A-Z])\.$/ ) {
chop $word; # Remove trailing "." to get just the letter
$letter = $1;
if (!defined $letter_prons{$letter} ) {
$letter_prons{$letter} = [ ]; # new anonymous array
}
$arrayref = $letter_prons{$letter};
push @$arrayref, $pron;
} elsif( length($word) <= $max_length ) {
$pronof{$word . "," . $pron} = 1;
$isword{$word} = 1;
#if (!defined $prons{$word} ) {
# $prons{$word} = [ ];
#}
# push @{$prons{$word}}, $pron;
}
}
sub get_letter_prons;
foreach $word (keys %isword) {
my @letter_prons = get_letter_prons($word);
foreach $pron (@letter_prons) {
if (defined $pronof{$word.",".$pron}) {
print "$word $pron\n";
}
}
}
sub get_letter_prons {
@acronym = split("", shift); # The letters in the word.
my @prons = ( "" );
while (@acronym > 0) {
$l = shift @acronym;
$n = 1; # num-repeats of letter $l.
while (@acronym > 0 && $acronym[0] eq $l) {
$n++;
shift @acronym;
}
my $arrayref = $letter_prons{$l};
my @prons_of_block = ();
if ($n == 1) { # Just one repeat.
foreach $lpron ( @$arrayref ) {
push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
}
} elsif ($n == 2) { # Two repeats. Can be "double a" or "a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "D AH1 B AH0 L " . $lpron;
push @prons_of_block, $lpron . $lpron;
}
} elsif ($n == 3) { # can be "triple a" or "a a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
push @prons_of_block, $lpron . $lpron . $lpron;
}
} elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
# not sure really.
foreach $lpron ( @$arrayref ) {
$nlpron = "";
for ($m = 0; $m < $n; $m++) { $nlpron = $nlpron . $lpron; }
push @prons_of_block, $nlpron;
}
}
my @new_prons = ();
foreach $pron (@prons) {
foreach $pron_of_block(@prons_of_block) {
if ($pron eq "") {
push @new_prons, $pron_of_block;
} else {
push @new_prons, $pron . " " . $pron_of_block;
}
}
}
@prons = @new_prons;
}
return @prons;
}
#!/usr/bin/perl
# Reads a dictionary (for prons of letters), and an OOV list,
# and puts out candidate pronunciations of words in that list
# that could plausibly be acronyms.
# We judge that a word can plausibly be an acronym if it is
# a sequence of just letters (no non-letter characters such
# as "'"), or something like U.K.,
# and the number of letters is four or less.
#
# If the text were not already pre-normalized, there would
# be other hints such as capitalization.
# This program appends
# the prons of the individual letters (A., B. and so on) to work out
# the pron of the acronym.
# Note: this is somewhat dependent on the convention used in CMUduct, that
# the individual letters are spelled this way (e.g. "A."). [it seems
# to also have the separated versions.
if (!(@ARGV == 1 || @ARGV == 2)) {
print "Usage: get_acronym_prons.pl dict [oovlist]";
}
$max_length = 4; # Max #letters in an acronym. (Longer
# acronyms tend to have "pseudo-pronunciations", e.g. think about UNICEF.
$dict = shift @ARGV;
open(D, "<$dict") || die "Opening dictionary $dict";
while(<D>) { # Read the dict, to get the prons of the letters.
chop;
@A = split(" ", $_);
$word = shift @A;
$pron = join(" ", @A);
if ($word =~ m/^([A-Z])\.$/ ) {
chop $word; # Remove trailing "." to get just the letter
$letter = $1;
if (!defined $letter_prons{$letter} ) {
$letter_prons{$letter} = [ ]; # new anonymous array
}
$arrayref = $letter_prons{$letter};
push @$arrayref, $pron;
} elsif( length($word) <= $max_length ) {
$pronof{$word . "," . $pron} = 1;
$isword{$word} = 1;
#if (!defined $prons{$word} ) {
# $prons{$word} = [ ];
#}
# push @{$prons{$word}}, $pron;
}
}
sub get_letter_prons;
while(<>) { # Read OOVs.
# For now, just do the simple cases without "." in
# between... things with "." in the OOV list seem to
# be mostly errors.
chop;
$word = $_;
if ($word =~ m/^[A-Z]{1,5}$/) {
foreach $pron ( get_letter_prons($word) ) { # E.g. UNPO
print "$word $pron\n";
}
} elsif ($word =~ m:^(\w\.){1,4}\w\.?$:) { # E.g. U.K. Make the final "." optional.
$letters = $word;
$letters =~ s:\.::g;
foreach $pron ( get_letter_prons($letters) ) {
print "$word $pron\n";
}
}
}
sub get_letter_prons {
@acronym = split("", shift); # The letters in the word.
my @prons = ( "" );
while (@acronym > 0) {
$l = shift @acronym;
$n = 1; # num-repeats of letter $l.
while (@acronym > 0 && $acronym[0] eq $l) {
$n++;
shift @acronym;
}
my $arrayref = $letter_prons{$l};
my @prons_of_block = ();
if ($n == 1) { # Just one repeat.
foreach $lpron ( @$arrayref ) {
push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
}
} elsif ($n == 2) { # Two repeats. Can be "double a" or "a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "D AH1 B AH0 L " . $lpron;
push @prons_of_block, $lpron . " " . $lpron;
}
} elsif ($n == 3) { # can be "triple a" or "a a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
push @prons_of_block, "$lpron $lpron $lpron";
}
} elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
# not sure really.
foreach $lpron ( @$arrayref ) {
$nlpron = $lpron;
for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; }
push @prons_of_block, $nlpron;
}
}
my @new_prons = ();
foreach $pron (@prons) {
foreach $pron_of_block(@prons_of_block) {
if ($pron eq "") {
push @new_prons, $pron_of_block;
} else {
push @new_prons, $pron . " " . $pron_of_block;
}
}
}
@prons = @new_prons;
}
return @prons;
}
#!/usr/bin/perl
# This script takes three command-line arguments (typically files, or "-"):
# the suffix rules (as output by get_rules.pl), the rule-hierarchy
# (from get_rule_hierarchy.pl), and the words that we want prons to be
# generated for (one per line).
# The output consists of candidate generated pronunciations for those words,
# together with information about how we generated those pronunciations.
# This does not do pruning of the candidates using the restriction
# "you can't use a more general rule when a more specific one is applicable".
# That is done by limit_candidate_prons.pl.
# Each line of the output consists of a 4-tuple, separated by ";", of the
# form:
# word;pron;base-word;base-pron;rule-name;destress[;rule-score]
# [the last field is only present if you supplied rules with score information].
# where:
# - "word" is the input word that we queried for, e.g. WASTED
# - "pron" is the generated pronunciation, e.g. "W EY1 S T AH0 D"
# - rule-name is a 4-tuple separated by commas that describes the rule, e.g.
# "STED,STING,D,NG",
# - "base-word" is the base-word we're getting the pron from,
# e.g. WASTING
# - "base-pron" is the pron of the base-word, e.g. "W EY1 S T IH0 NG"
# - "destress" is either "yes" or "no" and corresponds to whether we destressed the
# base-word or not [de-stressing just corresponds to just taking any 2's down to 1's,
# although we may extend this in future]...
# - "rule-score" is a numeric score of the rule (this field is only present
# if there was score information in your rules.
(@ARGV == 2 || @ARGV == 3) || die "Usage: get_candidate_prons.pl rules base-dict [ words ]";
$min_prefix_len = 3; # this should probably match with get_rules.pl
$rules = shift @ARGV; # Note: rules may be with destress "yes/no" indicators or without...
# if without, it's treated as if both "yes" and "no" are present.
$dict = shift @ARGV;
open(R, "<$rules") || die "Opening rules file: $rules";
sub process_word;
while(<R>) {
chop $_;
my ($rule, $destress, $rule_score) = split(";", $_); # We may have "destress" markings (yes|no),
# and scores, or we may have just rule, in which case
# $destress and $rule_score will be undefined.
my @R = split(",", $rule, 4); # "my" means new instance of @R each
# time we do this loop -> important because we'll be creating
# a reference to @R below.
# Note: the last arg to SPLIT tells it how many fields max to get.
# This stops it from omitting empty trailing fields.
@R == 4 || die "Bad rule $_";
$suffix = $R[0]; # Suffix of word we want pron for.
if (!defined $isrule{$rule}) {
$isrule{$rule} = 1; # make sure we do this only once for each rule
# (don't repeate for different stresses).
if (!defined $suffix2rule{$suffix}) {
# The syntax [ $x, $y, ... ] means a reference to a newly created array
# containing $x, $y, etc. \@R creates an array reference to R.
# so suffix2rule is a hash from suffix to ref to array of refs to
# 4-dimensional arrays.
$suffix2rule{$suffix} = [ \@R ];
} else {
# Below, the syntax @{$suffix2rule{$suffix}} dereferences the array
# reference inside the hash; \@R pushes onto that array a new array
# reference pointing to @R.
push @{$suffix2rule{$suffix}}, \@R;
}
}
if (!defined $rule_score) { $rule_score = -1; } # -1 means we don't have the score info.
# Now store information on which destress markings (yes|no) this rule
# is valid for, and the associated scores (if supplied)
# If just the rule is given (i.e. no destress marking specified),
# assume valid for both.
if (!defined $destress) { # treat as if both "yes" and "no" are valid.
$rule_and_destress_to_rule_score{$rule.";yes"} = $rule_score;
$rule_and_destress_to_rule_score{$rule.";no"} = $rule_score;
} else {
$rule_and_destress_to_rule_score{$rule.";".$destress} = $rule_score;
}
}
open(D, "<$dict") || die "Opening base dictionary: $dict";
while(<D>) {
@A = split(" ", $_);
$word = shift @A;
$pron = join(" ", @A);
if (!defined $word2prons{$word}) {
$word2prons{$word} = [ $pron ]; # Ref to new anonymous array containing just "pron".
} else {
push @{$word2prons{$word}}, $pron; # Push $pron onto array referred to (@$ref derefs array).
}
}
foreach $word (%word2prons) {
# Set up the hash "prefixcount", which says how many times a char-sequence
# is a prefix (not necessarily a strict prefix) of a word in the dict.
$len = length($word);
for ($l = 0; $l <= $len; $l++) {
$prefixcount{substr($word, 0, $l)}++;
}
}
open(R, "<$rules") || die "Opening rules file: $rules";
while(<>) {
chop;
m/^\S+$/ || die;
process_word($_);
}
sub process_word {
my $word = shift @_;
$len = length($word);
# $owncount is used in evaluating whether a particular prefix is a prefix
# of some other word in the dict... if a word itself may be in the dict
# (usually because we're running this on the dict itself), we need to
# correct for this.
if (defined $word2prons{$word}) { $owncount = 1; } else { $owncount = 0; }
for ($prefix_len = $min_prefix_len; $prefix_len <= $len; $prefix_len++) {
my $prefix = substr($word, 0, $prefix_len);
my $suffix = substr($word, $prefix_len);
if ($prefixcount{$prefix} - $owncount == 0) {
# This prefix is not a prefix of any word in the dict, so no point
# checking the rules below-- none of them can match.
next;
}
$rules_array_ref = $suffix2rule{$suffix};
if (defined $rules_array_ref) {
foreach $R (@$rules_array_ref) { # @$rules_array_ref dereferences the array.
# $R is a refernce to a 4-dimensional array, whose elements we access with
# $$R[0], etc.
my $base_suffix = $$R[1];
my $base_word = $prefix . $base_suffix;
my $base_prons_ref = $word2prons{$base_word};
if (defined $base_prons_ref) {
my $psuffix = $$R[2];
my $base_psuffix = $$R[3];
if ($base_psuffix ne "") {
$base_psuffix = " " . $base_psuffix;
# Include " ", the space between phones, to prevent
# matching partial phones below.
}
my $base_psuffix_len = length($base_psuffix);
foreach $base_pron (@$base_prons_ref) { # @$base_prons_ref derefs
# that reference to an array.
my $base_pron_prefix_len = length($base_pron) - $base_psuffix_len;
# Note: these lengths are in characters, not phones.
if ($base_pron_prefix_len >= 0 &&
substr($base_pron, $base_pron_prefix_len) eq $base_psuffix) {
# The suffix of the base_pron is what it should be.
my $pron_prefix = substr($base_pron, 0, $base_pron_prefix_len);
my $rule = join(",", @$R); # we'll output this..
my $len = @R;
for ($destress = 0; $destress <= 1; $destress++) { # Two versions
# of each rule: with destressing and without.
# pron is the generated pron.
if ($destress) { $pron_prefix =~ s/2/1/g; }
my $pron;
if ($psuffix ne "") { $pron = $pron_prefix . " " . $psuffix; }
else { $pron = $pron_prefix; }
# Now print out the info about the generated pron.
my $destress_mark = ($destress ? "yes" : "no");
my $rule_score = $rule_and_destress_to_rule_score{$rule.";".$destress_mark};
if (defined $rule_score) { # Means that the (rule,destress) combination was
# seen [note: this if-statement may be pointless, as currently we don't
# do any pruning of rules].
my @output = ($word, $pron, $base_word, $base_pron, $rule, $destress_mark);
if ($rule_score != -1) { push @output, $rule_score; } # If scores were supplied,
# we also output the score info.
print join(";", @output) . "\n";
}
}
}
}
}
}
}
}
}
#!/usr/bin/perl
#This reads in rules, of the form put out by get_rules.pl, e.g.:
# ERT,,ER0 T,
# MENT,ING,M AH0 N T,IH0 NG
# S,TON,Z,T AH0 N
# ,ER,IH0 NG,IH0 NG ER0
# ,'S,M AH0 N,M AH0 N Z
#TIONS,TIVE,SH AH0 N Z,T IH0 V
# and it works out a hierarchy that says which rules are sub-cases
# of which rules: it outputs on each line a pair separated by ";", where
# each member of the pair is a rule, first one is the specialization, the
# second one being more general.
# E.g.:
# RED,RE,D,/ED,E,D,
# RED,RE,D,/D,,D,
# GING,GE,IH0 NG,/ING,I,IH0 NG,
# TOR,TING,T ER0,T IH0 NG/OR,OR,T ER0,T ER0
# ERED,ER,D,/RED,R,D,
# ERED,ER,D,/ED,,D,
while(<>) {
chop;
$rule = $_;
$isrule{$rule} = 1;
push @rules, $rule;
}
foreach my $rule (@rules) {
# Truncate the letters and phones in the rule, while we
# can, to get more general rules; if the more general rule
# exists, put out the pair.
@A = split(",", $rule);
@suffixa = split("", $A[0]);
@suffixb = split("", $A[1]);
@psuffixa = split(" ", $A[2]);
@psuffixb = split(" ", $A[3]);
for ($common_suffix_len = 0; $common_suffix_len < @suffixa && $common_suffix_len < @suffixb;) {
if ($suffixa[$common_suffix_len] eq $suffixb[$common_suffix_len]) {
$common_suffix_len++;
} else {
last;
}
}
for ($common_psuffix_len = 0; $common_psuffix_len < @psuffixa && $common_psuffix_len < @psuffixb;) {
if ($psuffixa[$common_psuffix_len] eq $psuffixb[$common_psuffix_len]) {
$common_psuffix_len++;
} else {
last;
}
}
# Get all combinations of pairs of integers <= (common_suffix_len, common_psuffix_len),
# except (0,0), and print out this rule together with the corresponding rule (if it exists).
for ($m = 0; $m <= $common_suffix_len; $m++) {
$sa = join("", @suffixa[$m...$#suffixa]); # @x[a..b] is array slice notation.
$sb = join("", @suffixb[$m...$#suffixb]);
for ($n = 0; $n <= $common_psuffix_len; $n++) {
if (!($m == 0 && $n == 0)) {
$psa = join(" ", @psuffixa[$n...$#psuffixa]);
$psb = join(" ", @psuffixb[$n...$#psuffixb]);
$more_general_rule = join(",", ($sa, $sb, $psa, $psb));
if (defined $isrule{$more_general_rule}) {
print $rule . ";" . $more_general_rule . "\n";
}
}
}
}
}
#!/usr/bin/perl
# This program creates suggested suffix rules from a dictionary.
# It outputs quadruples of the form:
# suffix,base-suffix,psuffix,base-psuffix
# where "suffix" is the suffix of the letters of a word, "base-suffix" is
# the suffix of the letters of the base-word, "psuffix" is the suffix of the
# pronunciation of the word (a space-separated list of phonemes), and
# "base-psuffix" is the suffix of the pronunciation of the baseword.
# As far as this program is concerned, there is no distinction between
# "word" and "base-word". To simplify things slightly, what it does
# is return all tuples (a,b,c,d) [with a != b] such that there are
# at least $min_suffix_count instances in the dictionary of
# a (word-prefix, pron-prefix) pair where there exists (word,pron)
# pairs of the form