From d13597fd56a26c04cc445a2bf020d43467ca73de Mon Sep 17 00:00:00 2001 From: Dan Povey <dpovey@gmail.com> Date: Tue, 15 Nov 2011 07:32:53 +0000 Subject: [PATCH] Adding scripts for dictionary and LM preparation for WSJ. git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@604 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8 --- egs/wsj/s3/local/dict/add_counts.pl | 31 ++ egs/wsj/s3/local/dict/count_rules.pl | 44 ++ egs/wsj/s3/local/dict/filter_dict.pl | 19 + egs/wsj/s3/local/dict/find_acronyms.pl | 95 ++++ egs/wsj/s3/local/dict/get_acronym_prons.pl | 123 ++++++ egs/wsj/s3/local/dict/get_candidate_prons.pl | 187 ++++++++ egs/wsj/s3/local/dict/get_rule_hierarchy.pl | 73 ++++ egs/wsj/s3/local/dict/get_rules.pl | 204 +++++++++ .../s3/local/dict/limit_candidate_prons.pl | 103 +++++ egs/wsj/s3/local/dict/reverse_candidates.pl | 50 +++ egs/wsj/s3/local/dict/reverse_dict.pl | 14 + egs/wsj/s3/local/dict/score_prons.pl | 50 +++ egs/wsj/s3/local/dict/score_rules.pl | 52 +++ .../s3/local/dict/select_candidate_prons.pl | 84 ++++ egs/wsj/s3/local/wsj_extend_dict.sh | 168 ++++++++ egs/wsj/s3/local/wsj_format_data.sh | 4 +- egs/wsj/s3/local/wsj_format_data_local.sh | 151 +++++++ egs/wsj/s3/local/wsj_prepare_local_dict.sh | 48 +++ egs/wsj/s3/local/wsj_train_lms.sh | 404 ++++++++---------- egs/wsj/s3/run.sh | 21 +- 20 files changed, 1701 insertions(+), 224 deletions(-) create mode 100755 egs/wsj/s3/local/dict/add_counts.pl create mode 100755 egs/wsj/s3/local/dict/count_rules.pl create mode 100755 egs/wsj/s3/local/dict/filter_dict.pl create mode 100755 egs/wsj/s3/local/dict/find_acronyms.pl create mode 100755 egs/wsj/s3/local/dict/get_acronym_prons.pl create mode 100755 egs/wsj/s3/local/dict/get_candidate_prons.pl create mode 100755 egs/wsj/s3/local/dict/get_rule_hierarchy.pl create mode 100755 egs/wsj/s3/local/dict/get_rules.pl create mode 100755 egs/wsj/s3/local/dict/limit_candidate_prons.pl create mode 100755 egs/wsj/s3/local/dict/reverse_candidates.pl create mode 100755 egs/wsj/s3/local/dict/reverse_dict.pl create mode 100755 egs/wsj/s3/local/dict/score_prons.pl create mode 100755 egs/wsj/s3/local/dict/score_rules.pl create mode 100755 egs/wsj/s3/local/dict/select_candidate_prons.pl create mode 100755 egs/wsj/s3/local/wsj_extend_dict.sh create mode 100755 egs/wsj/s3/local/wsj_format_data_local.sh create mode 100755 egs/wsj/s3/local/wsj_prepare_local_dict.sh diff --git a/egs/wsj/s3/local/dict/add_counts.pl b/egs/wsj/s3/local/dict/add_counts.pl new file mode 100755 index 000000000..409277c72 --- /dev/null +++ b/egs/wsj/s3/local/dict/add_counts.pl @@ -0,0 +1,31 @@ +#!/usr/bin/perl + + +# Add counts to an oovlist. +# Reads in counts as output by uniq -c, and +# an oovlist, and prints out the counts of the oovlist. 
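+#
+# Illustrative example (hypothetical data, just to show the formats involved):
+# if the counts file (output of "uniq -c") contains lines like
+#      12 ABCDE
+#       3 FGHIJ
+# and the oovlist contains ABCDE, then for that word this script prints
+# a line of the form "<tab>12<tab>ABCDE".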
+ +(@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n"; + +$counts = shift @ARGV; + +open(C, "<$counts") || die "Opening counts file $counts"; + +while(<C>) { + @A = split(" ", $_); + @A == 2 || die "Bad line in counts file: $_"; + ($count, $word) = @A; + $count =~ m:^\d+$: || die "Bad count $A[0]\n"; + $counts{$word} = $count; +} + +while(<>) { + chop; + $w = $_; + $w =~ m:\S+: || die "Bad word $w"; + defined $counts{$w} || die "Word $w not present in counts file"; + print "\t$counts{$w}\t$w\n"; +} + + + diff --git a/egs/wsj/s3/local/dict/count_rules.pl b/egs/wsj/s3/local/dict/count_rules.pl new file mode 100755 index 000000000..2805e98c3 --- /dev/null +++ b/egs/wsj/s3/local/dict/count_rules.pl @@ -0,0 +1,44 @@ +#!/usr/bin/perl + +# This program takes the output of score_prons.pl and collates +# it for each (rule, destress) pair so that we get the +# counts of right/partial/wrong for each pair. + +# The input is a 7-tuple on each line, like: +# word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong +# +# The output format is a 5-tuple like: +# +# rule;destress;right-count;partial-count;wrong-count +# + +if (@ARGV != 0 && @ARGV != 1) { + die "Usage: count_rules.pl < scored_candidate_prons > rule_counts"; +} + + +while(<>) { + chop; + $line = $_; + my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line); + + my $key = $rulename . ";" . $destress; + + if (!defined $counts{$key}) { + $counts{$key} = [ 0, 0, 0 ]; # new anonymous array. + } + $ref = $counts{$key}; + if ($score eq "right") { + $$ref[0]++; + } elsif ($score eq "partial") { + $$ref[1]++; + } elsif ($score eq "wrong") { + $$ref[2]++; + } else { + die "Bad score $score\n"; + } +} + +while ( my ($key, $value) = each(%counts)) { + print $key . ";" . join(";", @$value) . "\n"; +} diff --git a/egs/wsj/s3/local/dict/filter_dict.pl b/egs/wsj/s3/local/dict/filter_dict.pl new file mode 100755 index 000000000..1210bb5e6 --- /dev/null +++ b/egs/wsj/s3/local/dict/filter_dict.pl @@ -0,0 +1,19 @@ +#!/usr/bin/perl + + +# This program reads and writes either a dictionary or just a list +# of words, and it removes any words containing ";" or "," as these +# are used in these programs. It will warn about these. +# It will die if the pronunciations have these symbols in. +while(<>) { + chop; + @A = split(" ", $_); + $word = shift @A; + + if ($word =~ m:[;,]:) { + print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n" ; + } else { + $_ =~ m:[;,]: && die "Phones cannot have ; or , in them."; + print $_ . "\n"; + } +} diff --git a/egs/wsj/s3/local/dict/find_acronyms.pl b/egs/wsj/s3/local/dict/find_acronyms.pl new file mode 100755 index 000000000..ed4655afa --- /dev/null +++ b/egs/wsj/s3/local/dict/find_acronyms.pl @@ -0,0 +1,95 @@ +#!/usr/bin/perl + +# Reads a dictionary, and prints out a list of words that seem to be pronounced +# as acronyms (not including plurals of acronyms, just acronyms). Uses +# the prons of the individual letters (A., B. and so on) to judge this. +# Note: this is somewhat dependent on the convention used in CMUduct, that +# the individual letters are spelled this way (e.g. "A."). + +$max_length = 6; # Max length of words that might be + # acronyms. + +while(<>) { # Read the dict. + chop; + @A = split(" ", $_); + $word = shift @A; + $pron = join(" ", @A); + if ($word =~ m/^([A-Z])\.$/ ) { + chop $word; # Remove trailing "." 
to get just the letter + $letter = $1; + if (!defined $letter_prons{$letter} ) { + $letter_prons{$letter} = [ ]; # new anonymous array + } + $arrayref = $letter_prons{$letter}; + push @$arrayref, $pron; + } elsif( length($word) <= $max_length ) { + $pronof{$word . "," . $pron} = 1; + $isword{$word} = 1; + #if (!defined $prons{$word} ) { + # $prons{$word} = [ ]; + #} + # push @{$prons{$word}}, $pron; + } +} + +sub get_letter_prons; + +foreach $word (keys %isword) { + my @letter_prons = get_letter_prons($word); + foreach $pron (@letter_prons) { + if (defined $pronof{$word.",".$pron}) { + print "$word $pron\n"; + } + } +} + + +sub get_letter_prons { + @acronym = split("", shift); # The letters in the word. + my @prons = ( "" ); + + while (@acronym > 0) { + $l = shift @acronym; + $n = 1; # num-repeats of letter $l. + while (@acronym > 0 && $acronym[0] eq $l) { + $n++; + shift @acronym; + } + my $arrayref = $letter_prons{$l}; + my @prons_of_block = (); + if ($n == 1) { # Just one repeat. + foreach $lpron ( @$arrayref ) { + push @prons_of_block, $lpron; # typically (always?) just one pron of a letter. + } + } elsif ($n == 2) { # Two repeats. Can be "double a" or "a a" + foreach $lpron ( @$arrayref ) { + push @prons_of_block, "D AH1 B AH0 L " . $lpron; + push @prons_of_block, $lpron . $lpron; + } + } elsif ($n == 3) { # can be "triple a" or "a a a" + foreach $lpron ( @$arrayref ) { + push @prons_of_block, "T R IH1 P AH0 L " . $lpron; + push @prons_of_block, $lpron . $lpron . $lpron; + } + } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times.. + # not sure really. + foreach $lpron ( @$arrayref ) { + $nlpron = ""; + for ($m = 0; $m < $n; $m++) { $nlpron = $nlpron . $lpron; } + push @prons_of_block, $nlpron; + } + } + my @new_prons = (); + foreach $pron (@prons) { + foreach $pron_of_block(@prons_of_block) { + if ($pron eq "") { + push @new_prons, $pron_of_block; + } else { + push @new_prons, $pron . " " . $pron_of_block; + } + } + } + @prons = @new_prons; + } + return @prons; +} diff --git a/egs/wsj/s3/local/dict/get_acronym_prons.pl b/egs/wsj/s3/local/dict/get_acronym_prons.pl new file mode 100755 index 000000000..3f9936818 --- /dev/null +++ b/egs/wsj/s3/local/dict/get_acronym_prons.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl + +# Reads a dictionary (for prons of letters), and an OOV list, +# and puts out candidate pronunciations of words in that list +# that could plausibly be acronyms. +# We judge that a word can plausibly be an acronym if it is +# a sequence of just letters (no non-letter characters such +# as "'"), or something like U.K., +# and the number of letters is four or less. +# +# If the text were not already pre-normalized, there would +# be other hints such as capitalization. + +# This program appends +# the prons of the individual letters (A., B. and so on) to work out +# the pron of the acronym. +# Note: this is somewhat dependent on the convention used in CMUduct, that +# the individual letters are spelled this way (e.g. "A."). [it seems +# to also have the separated versions. + +if (!(@ARGV == 1 || @ARGV == 2)) { + print "Usage: get_acronym_prons.pl dict [oovlist]"; +} + +$max_length = 4; # Max #letters in an acronym. (Longer + # acronyms tend to have "pseudo-pronunciations", e.g. think about UNICEF. + +$dict = shift @ARGV; +open(D, "<$dict") || die "Opening dictionary $dict"; + +while(<D>) { # Read the dict, to get the prons of the letters. 
+ chop; + @A = split(" ", $_); + $word = shift @A; + $pron = join(" ", @A); + if ($word =~ m/^([A-Z])\.$/ ) { + chop $word; # Remove trailing "." to get just the letter + $letter = $1; + if (!defined $letter_prons{$letter} ) { + $letter_prons{$letter} = [ ]; # new anonymous array + } + $arrayref = $letter_prons{$letter}; + push @$arrayref, $pron; + } elsif( length($word) <= $max_length ) { + $pronof{$word . "," . $pron} = 1; + $isword{$word} = 1; + #if (!defined $prons{$word} ) { + # $prons{$word} = [ ]; + #} + # push @{$prons{$word}}, $pron; + } +} + +sub get_letter_prons; + +while(<>) { # Read OOVs. + # For now, just do the simple cases without "." in + # between... things with "." in the OOV list seem to + # be mostly errors. + chop; + $word = $_; + if ($word =~ m/^[A-Z]{1,5}$/) { + foreach $pron ( get_letter_prons($word) ) { # E.g. UNPO + print "$word $pron\n"; + } + } elsif ($word =~ m:^(\w\.){1,4}\w\.?$:) { # E.g. U.K. Make the final "." optional. + $letters = $word; + $letters =~ s:\.::g; + foreach $pron ( get_letter_prons($letters) ) { + print "$word $pron\n"; + } + } +} + +sub get_letter_prons { + @acronym = split("", shift); # The letters in the word. + my @prons = ( "" ); + + while (@acronym > 0) { + $l = shift @acronym; + $n = 1; # num-repeats of letter $l. + while (@acronym > 0 && $acronym[0] eq $l) { + $n++; + shift @acronym; + } + my $arrayref = $letter_prons{$l}; + my @prons_of_block = (); + if ($n == 1) { # Just one repeat. + foreach $lpron ( @$arrayref ) { + push @prons_of_block, $lpron; # typically (always?) just one pron of a letter. + } + } elsif ($n == 2) { # Two repeats. Can be "double a" or "a a" + foreach $lpron ( @$arrayref ) { + push @prons_of_block, "D AH1 B AH0 L " . $lpron; + push @prons_of_block, $lpron . " " . $lpron; + } + } elsif ($n == 3) { # can be "triple a" or "a a a" + foreach $lpron ( @$arrayref ) { + push @prons_of_block, "T R IH1 P AH0 L " . $lpron; + push @prons_of_block, "$lpron $lpron $lpron"; + } + } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times.. + # not sure really. + foreach $lpron ( @$arrayref ) { + $nlpron = $lpron; + for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; } + push @prons_of_block, $nlpron; + } + } + my @new_prons = (); + foreach $pron (@prons) { + foreach $pron_of_block(@prons_of_block) { + if ($pron eq "") { + push @new_prons, $pron_of_block; + } else { + push @new_prons, $pron . " " . $pron_of_block; + } + } + } + @prons = @new_prons; + } + return @prons; +} diff --git a/egs/wsj/s3/local/dict/get_candidate_prons.pl b/egs/wsj/s3/local/dict/get_candidate_prons.pl new file mode 100755 index 000000000..b13efd203 --- /dev/null +++ b/egs/wsj/s3/local/dict/get_candidate_prons.pl @@ -0,0 +1,187 @@ +#!/usr/bin/perl + +# This script takes three command-line arguments (typically files, or "-"): +# the suffix rules (as output by get_rules.pl), the rule-hierarchy +# (from get_rule_hierarchy.pl), and the words that we want prons to be +# generated for (one per line). + +# The output consists of candidate generated pronunciations for those words, +# together with information about how we generated those pronunciations. +# This does not do pruning of the candidates using the restriction +# "you can't use a more general rule when a more specific one is applicable". +# That is done by limit_candidate_prons.pl. 
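+#
+# Typical usage (as in wsj_extend_dict.sh later in this patch), piping the
+# output through limit_candidate_prons.pl to do that pruning:
+#   get_candidate_prons.pl rules.with_scores dict oovs | \
+#     limit_candidate_prons.pl hierarchy > oovs.candidates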
+ +# Each line of the output consists of a 4-tuple, separated by ";", of the +# form: +# word;pron;base-word;base-pron;rule-name;destress[;rule-score] +# [the last field is only present if you supplied rules with score information]. +# where: +# - "word" is the input word that we queried for, e.g. WASTED +# - "pron" is the generated pronunciation, e.g. "W EY1 S T AH0 D" +# - rule-name is a 4-tuple separated by commas that describes the rule, e.g. +# "STED,STING,D,NG", +# - "base-word" is the base-word we're getting the pron from, +# e.g. WASTING +# - "base-pron" is the pron of the base-word, e.g. "W EY1 S T IH0 NG" +# - "destress" is either "yes" or "no" and corresponds to whether we destressed the +# base-word or not [de-stressing just corresponds to just taking any 2's down to 1's, +# although we may extend this in future]... +# - "rule-score" is a numeric score of the rule (this field is only present +# if there was score information in your rules. + + +(@ARGV == 2 || @ARGV == 3) || die "Usage: get_candidate_prons.pl rules base-dict [ words ]"; + +$min_prefix_len = 3; # this should probably match with get_rules.pl + +$rules = shift @ARGV; # Note: rules may be with destress "yes/no" indicators or without... + # if without, it's treated as if both "yes" and "no" are present. +$dict = shift @ARGV; + +open(R, "<$rules") || die "Opening rules file: $rules"; + +sub process_word; + +while(<R>) { + chop $_; + my ($rule, $destress, $rule_score) = split(";", $_); # We may have "destress" markings (yes|no), + # and scores, or we may have just rule, in which case + # $destress and $rule_score will be undefined. + + my @R = split(",", $rule, 4); # "my" means new instance of @R each + # time we do this loop -> important because we'll be creating + # a reference to @R below. + # Note: the last arg to SPLIT tells it how many fields max to get. + # This stops it from omitting empty trailing fields. + @R == 4 || die "Bad rule $_"; + $suffix = $R[0]; # Suffix of word we want pron for. + if (!defined $isrule{$rule}) { + $isrule{$rule} = 1; # make sure we do this only once for each rule + # (don't repeate for different stresses). + if (!defined $suffix2rule{$suffix}) { + # The syntax [ $x, $y, ... ] means a reference to a newly created array + # containing $x, $y, etc. \@R creates an array reference to R. + # so suffix2rule is a hash from suffix to ref to array of refs to + # 4-dimensional arrays. + $suffix2rule{$suffix} = [ \@R ]; + } else { + # Below, the syntax @{$suffix2rule{$suffix}} dereferences the array + # reference inside the hash; \@R pushes onto that array a new array + # reference pointing to @R. + push @{$suffix2rule{$suffix}}, \@R; + } + } + if (!defined $rule_score) { $rule_score = -1; } # -1 means we don't have the score info. + + # Now store information on which destress markings (yes|no) this rule + # is valid for, and the associated scores (if supplied) + # If just the rule is given (i.e. no destress marking specified), + # assume valid for both. + if (!defined $destress) { # treat as if both "yes" and "no" are valid. 
+ $rule_and_destress_to_rule_score{$rule.";yes"} = $rule_score; + $rule_and_destress_to_rule_score{$rule.";no"} = $rule_score; + } else { + $rule_and_destress_to_rule_score{$rule.";".$destress} = $rule_score; + } + +} + +open(D, "<$dict") || die "Opening base dictionary: $dict"; +while(<D>) { + @A = split(" ", $_); + $word = shift @A; + $pron = join(" ", @A); + if (!defined $word2prons{$word}) { + $word2prons{$word} = [ $pron ]; # Ref to new anonymous array containing just "pron". + } else { + push @{$word2prons{$word}}, $pron; # Push $pron onto array referred to (@$ref derefs array). + } +} +foreach $word (%word2prons) { + # Set up the hash "prefixcount", which says how many times a char-sequence + # is a prefix (not necessarily a strict prefix) of a word in the dict. + $len = length($word); + for ($l = 0; $l <= $len; $l++) { + $prefixcount{substr($word, 0, $l)}++; + } +} + +open(R, "<$rules") || die "Opening rules file: $rules"; + + +while(<>) { + chop; + m/^\S+$/ || die; + process_word($_); +} + +sub process_word { + my $word = shift @_; + $len = length($word); + # $owncount is used in evaluating whether a particular prefix is a prefix + # of some other word in the dict... if a word itself may be in the dict + # (usually because we're running this on the dict itself), we need to + # correct for this. + if (defined $word2prons{$word}) { $owncount = 1; } else { $owncount = 0; } + + for ($prefix_len = $min_prefix_len; $prefix_len <= $len; $prefix_len++) { + my $prefix = substr($word, 0, $prefix_len); + my $suffix = substr($word, $prefix_len); + if ($prefixcount{$prefix} - $owncount == 0) { + # This prefix is not a prefix of any word in the dict, so no point + # checking the rules below-- none of them can match. + next; + } + $rules_array_ref = $suffix2rule{$suffix}; + if (defined $rules_array_ref) { + foreach $R (@$rules_array_ref) { # @$rules_array_ref dereferences the array. + # $R is a refernce to a 4-dimensional array, whose elements we access with + # $$R[0], etc. + my $base_suffix = $$R[1]; + my $base_word = $prefix . $base_suffix; + my $base_prons_ref = $word2prons{$base_word}; + if (defined $base_prons_ref) { + my $psuffix = $$R[2]; + my $base_psuffix = $$R[3]; + if ($base_psuffix ne "") { + $base_psuffix = " " . $base_psuffix; + # Include " ", the space between phones, to prevent + # matching partial phones below. + } + my $base_psuffix_len = length($base_psuffix); + foreach $base_pron (@$base_prons_ref) { # @$base_prons_ref derefs + # that reference to an array. + my $base_pron_prefix_len = length($base_pron) - $base_psuffix_len; + # Note: these lengths are in characters, not phones. + if ($base_pron_prefix_len >= 0 && + substr($base_pron, $base_pron_prefix_len) eq $base_psuffix) { + # The suffix of the base_pron is what it should be. + my $pron_prefix = substr($base_pron, 0, $base_pron_prefix_len); + my $rule = join(",", @$R); # we'll output this.. + my $len = @R; + for ($destress = 0; $destress <= 1; $destress++) { # Two versions + # of each rule: with destressing and without. + # pron is the generated pron. + if ($destress) { $pron_prefix =~ s/2/1/g; } + my $pron; + if ($psuffix ne "") { $pron = $pron_prefix . " " . $psuffix; } + else { $pron = $pron_prefix; } + # Now print out the info about the generated pron. + my $destress_mark = ($destress ? 
"yes" : "no"); + my $rule_score = $rule_and_destress_to_rule_score{$rule.";".$destress_mark}; + if (defined $rule_score) { # Means that the (rule,destress) combination was + # seen [note: this if-statement may be pointless, as currently we don't + # do any pruning of rules]. + my @output = ($word, $pron, $base_word, $base_pron, $rule, $destress_mark); + if ($rule_score != -1) { push @output, $rule_score; } # If scores were supplied, + # we also output the score info. + print join(";", @output) . "\n"; + } + } + } + } + } + } + } + } +} diff --git a/egs/wsj/s3/local/dict/get_rule_hierarchy.pl b/egs/wsj/s3/local/dict/get_rule_hierarchy.pl new file mode 100755 index 000000000..35805b46b --- /dev/null +++ b/egs/wsj/s3/local/dict/get_rule_hierarchy.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl + +#This reads in rules, of the form put out by get_rules.pl, e.g.: +# ERT,,ER0 T, +# MENT,ING,M AH0 N T,IH0 NG +# S,TON,Z,T AH0 N +# ,ER,IH0 NG,IH0 NG ER0 +# ,'S,M AH0 N,M AH0 N Z +#TIONS,TIVE,SH AH0 N Z,T IH0 V + +# and it works out a hierarchy that says which rules are sub-cases +# of which rules: it outputs on each line a pair separated by ";", where +# each member of the pair is a rule, first one is the specialization, the +# second one being more general. +# E.g.: +# RED,RE,D,/ED,E,D, +# RED,RE,D,/D,,D, +# GING,GE,IH0 NG,/ING,I,IH0 NG, +# TOR,TING,T ER0,T IH0 NG/OR,OR,T ER0,T ER0 +# ERED,ER,D,/RED,R,D, +# ERED,ER,D,/ED,,D, + + + + +while(<>) { + chop; + $rule = $_; + $isrule{$rule} = 1; + push @rules, $rule; +} + +foreach my $rule (@rules) { + # Truncate the letters and phones in the rule, while we + # can, to get more general rules; if the more general rule + # exists, put out the pair. + @A = split(",", $rule); + @suffixa = split("", $A[0]); + @suffixb = split("", $A[1]); + @psuffixa = split(" ", $A[2]); + @psuffixb = split(" ", $A[3]); + for ($common_suffix_len = 0; $common_suffix_len < @suffixa && $common_suffix_len < @suffixb;) { + if ($suffixa[$common_suffix_len] eq $suffixb[$common_suffix_len]) { + $common_suffix_len++; + } else { + last; + } + } + for ($common_psuffix_len = 0; $common_psuffix_len < @psuffixa && $common_psuffix_len < @psuffixb;) { + if ($psuffixa[$common_psuffix_len] eq $psuffixb[$common_psuffix_len]) { + $common_psuffix_len++; + } else { + last; + } + } + # Get all combinations of pairs of integers <= (common_suffix_len, common_psuffix_len), + # except (0,0), and print out this rule together with the corresponding rule (if it exists). + for ($m = 0; $m <= $common_suffix_len; $m++) { + $sa = join("", @suffixa[$m...$#suffixa]); # @x[a..b] is array slice notation. + $sb = join("", @suffixb[$m...$#suffixb]); + for ($n = 0; $n <= $common_psuffix_len; $n++) { + if (!($m == 0 && $n == 0)) { + $psa = join(" ", @psuffixa[$n...$#psuffixa]); + $psb = join(" ", @psuffixb[$n...$#psuffixb]); + $more_general_rule = join(",", ($sa, $sb, $psa, $psb)); + if (defined $isrule{$more_general_rule}) { + print $rule . ";" . $more_general_rule . "\n"; + } + } + } + } +} + diff --git a/egs/wsj/s3/local/dict/get_rules.pl b/egs/wsj/s3/local/dict/get_rules.pl new file mode 100755 index 000000000..a5b57b088 --- /dev/null +++ b/egs/wsj/s3/local/dict/get_rules.pl @@ -0,0 +1,204 @@ +#!/usr/bin/perl + +# This program creates suggested suffix rules from a dictionary. 
+# It outputs quadruples of the form: +# suffix,base-suffix,psuffix,base-psuffix +# where "suffix" is the suffix of the letters of a word, "base-suffix" is +# the suffix of the letters of the base-word, "psuffix" is the suffix of the +# pronunciation of the word (a space-separated list of phonemes), and +# "base-psuffix" is the suffix of the pronunciation of the baseword. +# As far as this program is concerned, there is no distinction between +# "word" and "base-word". To simplify things slightly, what it does +# is return all tuples (a,b,c,d) [with a != b] such that there are +# at least $min_suffix_count instances in the dictionary of +# a (word-prefix, pron-prefix) pair where there exists (word,pron) +# pairs of the form +# ( word-prefix . a, pron-prefix . c) +# and +# ( word-prefix . b, pron-prefix . d) +# For example if (a,b,c,d) equals (USLY,US,S L IY0,S) +# then this quadruple will be output as long as there at least +# e.g. 30 instances of prefixes like (FAM, F EY1 M AH0) +# where there exist (word, pron) pairs like: +# FAMOUS, F EY1 M AH0 S +# FAMOUSLY F EY1 M AH0 S L IY0 +# +# There are some modifications to the picture above, for efficiency. +# If $disallow_empty_suffix != 0, this program will not output 4-tuples where +# the first element (the own-word suffix) is empty, as this would cause +# efficiency problems in get_candidate_prons.pl. If +# $ignore_prefix_stress != 0, this program will ignore stress markings +# while evaluating whether prefixes are the same. +# The minimum count for a quadruple to be output is $min_suffix_count +# (e.g. 30). +# +# The function of this program is not to evaluate the accuracy of these rules; +# it is mostly a pruning step, where we suggest rules that have large enough +# counts to be suitable for our later procedure where we evaluate their +# accuracy in predicting prons. + +$disallow_empty_suffix = 1; # Disallow rules where the suffix of the "own-word" is + # empty. This is for efficiency in later stages (e.g. get_candidate_prons.pl). +$min_prefix_len = 3; # this must match with get_candidate_prons.pl +$ignore_prefix_stress = 1; # or 0 to take account of stress in prefix. +$min_suffix_count = 20; + +# Takes in dictionary. + +print STDERR "Reading dict\n"; +while(<>) { + @A = split(" ", $_); + my $word = shift @A; + my $pron = join(" ", @A); + if (!defined $prons{$word}) { + $prons{$word} = $pron; + push @words, $word; + } else { + $prons{$word} = $prons{$word} . ";" . $pron; + } +} + +# Get common suffixes (e.g., count >100). Include empty suffix. + +print STDERR "Getting common suffix counts.\n"; +{ + foreach $word (@words) { + $len = length($word); + for ($x = $min_prefix_len; $x <= $len; $x++) { + $suffix_count{substr($word, $x)}++; + } + } + + foreach $suffix (keys %suffix_count) { + if ($suffix_count{$suffix} >= $min_suffix_count) { + $newsuffix_count{$suffix} = $suffix_count{$suffix}; + } + } + %suffix_count = %newsuffix_count; + undef %newsuffix_count; + + foreach $suffix ( sort { $suffix_count{$b} <=> $suffix_count{$a} } keys %suffix_count ) { + print STDERR "$suffix_count{$suffix} $suffix\n"; + } +} + +print STDERR "Getting common suffix pairs.\n"; + +{ + print STDERR " Getting map from prefix -> suffix-set.\n"; + + # Create map from prefix -> suffix-set. + foreach $word (@words) { + $len = length($word); + for ($x = $min_prefix_len; $x <= $len; $x++) { + $prefix = substr($word, 0, $x); + $suffix = substr($word, $x); + if (defined $suffix_count{$suffix}) { # Suffix is common... 
+ if (!defined $suffixes_of{$prefix}) { + $suffixes_of{$prefix} = [ $suffix ]; # Create a reference to a new array with + # one element. + } else { + push @{$suffixes_of{$prefix}}, $suffix; # Push $suffix onto array that the + # hash member is a reference . + } + } + } + } + my %suffix_set_count; + print STDERR " Getting map from suffix-set -> count.\n"; + while ( my ($key, $value) = each(%suffixes_of) ) { + my @suffixes = sort ( @$value ); + $suffix_set_count{join(";", @suffixes)}++; + } + print STDERR " Getting counts for suffix pairs.\n"; + while ( my ($suffix_set, $count) = each (%suffix_set_count) ) { + my @suffixes = split(";", $suffix_set); + # Consider pairs to be ordered. This is more convenient + # later on. + foreach $suffix_a (@suffixes) { + foreach $suffix_b (@suffixes) { + if ($suffix_a ne $suffix_b) { + $suffix_pair = $suffix_a . "," . $suffix_b; + $suffix_pair_count{$suffix_pair} += $count; + } + } + } + } + + # To save memory, only keep pairs above threshold in the hash. + while ( my ($suffix_pair, $count) = each (%suffix_pair_count) ) { + if ($count >= $min_suffix_count) { + $new_hash{$suffix_pair} = $count; + } + } + %suffix_pair_count = %new_hash; + undef %new_hash; + + # Print out the suffix pairs so the user can see. + foreach $suffix_pair ( + sort { $suffix_pair_count{$b} <=> $suffix_pair_count{$a} } keys %suffix_pair_count ) { + print STDERR "$suffix_pair_count{$suffix_pair} $suffix_pair\n"; + } +} + +print STDERR "Getting common suffix/suffix/psuffix/psuffix quadruples\n"; + +{ + while ( my ($prefix, $suffixes_ref) = each(%suffixes_of) ) { + # Note: suffixes_ref is a reference to an array. We dereference with + # @$suffixes_ref. + # Consider each pair of suffixes (in each order). + foreach my $suffix_a ( @$suffixes_ref ) { + foreach my $suffix_b ( @$suffixes_ref ) { + # could just used "defined" in next line, but this is for clarity. + $suffix_pair = $suffix_a.",".$suffix_b; + if ( $suffix_pair_count{$suffix_pair} >= $min_suffix_count ) { + foreach $pron_a_str (split(";", $prons{$prefix.$suffix_a})) { + @pron_a = split(" ", $pron_a_str); + foreach $pron_b_str (split(";", $prons{$prefix.$suffix_b})) { + @pron_b = split(" ", $pron_b_str); + $len_a = @pron_a; # evaluating array as scalar automatically gives length. + $len_b = @pron_b; + for (my $pos = 0; $pos <= $len_a && $pos <= $len_b; $pos++) { + # $pos is starting-pos of psuffix-pair. + $psuffix_a = join(" ", @pron_a[$pos...$#pron_a]); + $psuffix_b = join(" ", @pron_b[$pos...$#pron_b]); + $quadruple = $suffix_pair . "," . $psuffix_a . "," . $psuffix_b; + $quadruple_count{$quadruple}++; + + my $pron_a_pos = $pron_a[$pos], $pron_b_pos = $pron_b[$pos]; + if ($ignore_prefix_stress) { + $pron_a_pos =~ s/\d//; # e.g convert IH0 to IH. Only affects + $pron_b_pos =~ s/\d//; # whether we exit the loop below. + } + if ($pron_a_pos ne $pron_b_pos) { + # This is important: we don't consider a pron suffix-pair to be + # valid unless the pron prefix is the same. + last; + } + } + } + } + } + } + } + } + # To save memory, only keep pairs above threshold in the hash. + while ( my ($quadruple, $count) = each (%quadruple_count) ) { + if ($count >= $min_suffix_count) { + $new_hash{$quadruple} = $count; + } + } + %quadruple_count = %new_hash; + undef %new_hash; + + # Print out the quadruples for diagnostics. 
+ foreach $quadruple ( + sort { $quadruple_count{$b} <=> $quadruple_count{$a} } keys %quadruple_count ) { + print STDERR "$quadruple_count{$quadruple} $quadruple\n"; + } +} +# Now print out the quadruples; these are the output of this program. +foreach $quadruple (keys %quadruple_count) { + print $quadruple."\n"; +} diff --git a/egs/wsj/s3/local/dict/limit_candidate_prons.pl b/egs/wsj/s3/local/dict/limit_candidate_prons.pl new file mode 100755 index 000000000..ceff9fbad --- /dev/null +++ b/egs/wsj/s3/local/dict/limit_candidate_prons.pl @@ -0,0 +1,103 @@ +#!/usr/bin/perl + +# This program enforces the rule that +# if a "more specific" rule applies, we cannot use the more general rule. +# It takes in tuples generated by get_candidate_prons (one per line, separated +# by ";"), of the form: +# word;pron;base-word;base-pron;rule-name;de-stress[;rule-score] +# [note: we mean that the last element, the numeric score of the rule, is optional] +# and it outputs a (generally shorter) list +# of the same form. + + +# For each word: + # For each (base-word,base-pron): + # Eliminate "more-general" rules as follows: + # For each pair of rules applying to this (base-word, base-pron): + # If pair is in more-general hash, disallow more general one. + # Let the output be: for each (base-word, base-pron, rule): + # for (destress-prefix) in [yes, no], do: + # print out the word input, the rule-name, [destressed:yes|no], and the new pron. + + +if (@ARGV != 1 && @ARGV != 2) { + die "Usage: limit_candidate_prons.pl rule_hierarchy [candidate_prons] > limited_candidate_prons"; +} + +$hierarchy = shift @ARGV; +open(H, "<$hierarchy") || die "Opening rule hierarchy $hierarchy"; + +while(<H>) { + chop; + m:.+;.+: || die "Bad rule-hierarchy line $_"; + $hierarchy{$_} = 1; # Format is: if $rule1 is the string form of the more specific rule + # and $rule21 is that string form of the more general rule, then $hierarchy{$rule1.";".$rule2} + # is defined, else undefined. +} + + +sub process_word; + +undef $cur_word; +@cur_lines = (); + +while(<>) { + # input, output is: + # word;pron;base-word;base-pron;rule-name;destress;score + chop; + m:^([^;]+);: || die "Unexpected input: $_"; + $word = $1; + if (!defined $cur_word || $word eq $cur_word) { + if (!defined $cur_word) { $cur_word = $word; } + push @cur_lines, $_; + } else { + process_word(@cur_lines); # Process a series of suggested prons + # for a particular word. + $cur_word = $word; + @cur_lines = ( $_ ); + } +} +process_word(@cur_lines); + +sub process_word { + my %pair2rule_list; # hash from $baseword.";".$baseword to ref + # to array of [ line1, line2, ... ]. + my @cur_lines = @_; + foreach my $line (@cur_lines) { + my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line); + my $key = $baseword.";".$basepron; + if (defined $pair2rule_list{$key}) { + push @{$pair2rule_list{$key}}, $line; # @{...} derefs the array pointed to + # by the array ref inside {}. + } else { + $pair2rule_list{$key} = [ $line ]; # [ $x ] is new anonymous array with 1 elem ($x) + } + } + while ( my ($key, $value) = each(%pair2rule_list) ) { + my @lines = @$value; # array of lines that are for this (baseword,basepron). + my @stress, @rules; # Arrays of stress markers and rule names, indexed by + # same index that indexes @lines. 
+    for (my $n = 0; $n < @lines; $n++) {
+      my $line = $lines[$n];
+      my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
+      $stress[$n] = $destress;
+      $rules[$n] = $rulename;
+    }
+    for (my $m = 0; $m < @lines; $m++) {
+      my $ok = 1; # if stays 1, this line is OK.
+      for (my $n = 0; $n < @lines; $n++) {
+        if ($m != $n && $stress[$m] eq $stress[$n]) {
+          if (defined $hierarchy{$rules[$n].";".$rules[$m]}) {
+            # Note: this "hierarchy" entry is defined if $rules[$n] is a more
+            # specific instance of $rules[$m], thus invalidating $rules[$m].
+            $ok = 0;
+            last; # no point iterating further.
+          }
+        }
+      }
+      if ($ok != 0) {
+        print $lines[$m] . "\n";
+      }
+    }
+  }
+}
diff --git a/egs/wsj/s3/local/dict/reverse_candidates.pl b/egs/wsj/s3/local/dict/reverse_candidates.pl
new file mode 100755
index 000000000..d5c5effc2
--- /dev/null
+++ b/egs/wsj/s3/local/dict/reverse_candidates.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl
+
+# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl,
+# which is 7-tuples, one per line, of the form:
+
+# word;pron;base-word;base-pron;rule-name;de-stress;rule-score
+# (where rule-score is sometimes listed as optional, but this
+# program does expect it, since we don't anticipate it being used
+# without it).
+# This program assumes that all the words and prons and rules have
+# come from a reversed dictionary (reverse_dict.pl) where the order
+# of the characters in the words, and the phones in the prons, have
+# been reversed, and it un-reverses them.  That is, the characters
+# in "word" and "base-word", and the phones in "pron" and "base-pron"
+# are reversed; and the rule ("rule-name") is parsed as a 4-tuple,
+# like:
+#   suffix,base-suffix,psuffix,base-psuffix
+# so this program reverses the characters in "suffix" and "base-suffix"
+# and the phones (separated by spaces) in "psuffix" and "base-psuffix".
+
+sub reverse_str {
+  $str = shift;
+  return join("", reverse(split("", $str)));
+}
+sub reverse_pron {
+  $str = shift;
+  return join(" ", reverse(split(" ", $str)));
+}
+
+while(<>){
+  chop;
+  @A = split(";", $_);
+  @A == 7 || die "Bad input line $_: found " . scalar(@A) . " fields, expected 7.";
+
+  ($word,$pron,$baseword,$basepron,$rule,$destress,$score) = @A;
+  $word = reverse_str($word);
+  $pron = reverse_pron($pron);
+  $baseword = reverse_str($baseword);
+  $basepron = reverse_pron($basepron);
+  @R = split(",", $rule, 4);
+  @R == 4 || die "Bad rule $rule";
+
+  $R[0] = reverse_str($R[0]); # suffix.
+  $R[1] = reverse_str($R[1]); # base-suffix.
+  $R[2] = reverse_pron($R[2]); # psuffix.
+  $R[3] = reverse_pron($R[3]); # base-psuffix.
+  $rule = join(",", @R);
+  @A = ($word,$pron,$baseword,$basepron,$rule,$destress,$score);
+  print join(";", @A) . "\n";
+}
diff --git a/egs/wsj/s3/local/dict/reverse_dict.pl b/egs/wsj/s3/local/dict/reverse_dict.pl
new file mode 100755
index 000000000..75681711b
--- /dev/null
+++ b/egs/wsj/s3/local/dict/reverse_dict.pl
@@ -0,0 +1,14 @@
+#!/usr/bin/perl
+
+# Used in conjunction with get_rules.pl
+# example input line: XANTHE Z AE1 N DH
+# example output line: EHTNAX DH N AE1 Z
+
+while(<>){
+  @A = split(" ", $_);
+  $word = shift @A;
+  $word = join("", reverse(split("", $word))); # Reverse letters of word.
+  @A = reverse(@A);   # Reverse phones in pron.
+  unshift @A, $word;
+  print join(" ", @A) .
"\n"; +} diff --git a/egs/wsj/s3/local/dict/score_prons.pl b/egs/wsj/s3/local/dict/score_prons.pl new file mode 100755 index 000000000..fd5a004d8 --- /dev/null +++ b/egs/wsj/s3/local/dict/score_prons.pl @@ -0,0 +1,50 @@ +#!/usr/bin/perl + +# This program takes candidate prons from "get_candidate_prons.pl" or +# "limit_candidate_prons.pl", and a reference dictionary covering those words, +# and outputs the same format but with scoring information added (so we go from +# 6 to 7 fields). The scoring information says, for each generated pron, +# whether we have a match, a partial match, or no match, to some word in the +# dictionary. A partial match means it's correct except for stress. + +# The input is a 6-tuple on each line, like: +# word;pron;base-word;base-pron;rule-name;de-stress +# +# The output is the same except with one more field, the score, +# which may be "right", "wrong", "partial". + +if (@ARGV != 1 && @ARGV != 2) { + die "Usage: score_prons.pl reference_dict [candidate_prons] > scored_candidate_prons"; +} + +$dict = shift @ARGV; +open(D, "<$dict") || die "Opening dictionary $dict"; + +while(<D>) { # Set up some hashes that tell us when + # a (word,pron) pair is correct (and the same for + # prons with stress information removed). + chop; + @A = split(" ", $_); + $word = shift @A; + $pron = join(" ", @A); + $pron_nostress = $pron; + $pron_nostress =~ s:\d::g; + $word_and_pron{$word.";".$pron} = 1; + $word_and_pron_nostress{$word.";".$pron_nostress} = 1; +} + +while(<>) { + chop; + $line = $_; + my ($word, $pron, $baseword, $basepron, $rulename, $destress) = split(";", $line); + $pron_nostress = $pron; + $pron_nostress =~ s:\d::g; + if (defined $word_and_pron{$word.";".$pron}) { + $score = "right"; + } elsif (defined $word_and_pron_nostress{$word.";".$pron_nostress}) { + $score = "partial"; + } else { + $score = "wrong"; + } + print $line.";".$score."\n"; +} diff --git a/egs/wsj/s3/local/dict/score_rules.pl b/egs/wsj/s3/local/dict/score_rules.pl new file mode 100755 index 000000000..8d165f7f1 --- /dev/null +++ b/egs/wsj/s3/local/dict/score_rules.pl @@ -0,0 +1,52 @@ +#!/usr/bin/perl + +# This program takes the output of count_rules.pl, which is tuples +# of the form +# +# rule;destress;right-count;partial-count;wrong-count +# +# and outputs lines of the form +# +# rule;de-stress;score +# +# where the score, between 0 and 1 (1 better), is +# equal to: +# +# It forms a score between 0 and 1, of the form: +# ((#correct) + $partial_score * (#partial)) / (#correct + #partial + #wrong + $ballast) +# +# where $partial_score (e.g. 0.8) is the score we assign to a "partial" match, +# and $ballast is a small number, e.g. 1, that is treated like "extra" wrong scores, to penalize +# rules with few observations. +# +# It outputs all rules that at are at least the + +$ballast = 1; +$partial_score = 0.8; +$destress_penalty = 1.0e-05; # Give destressed rules a small +# penalty vs. 
their no-destress counterparts, so if we +# have to choose arbitrarily we won't destress (seems safer)> + +for ($n = 1; $n <= 4; $n++) { + if ($ARGV[0] eq "--ballast") { + shift @ARGV; + $ballast = shift @ARGV; + } + if ($ARGV[0] eq "--partial-score") { + shift @ARGV; + $partial_score = shift @ARGV; + ($partial_score >= 0.0 && $partial_score <= 1.0) || die "Invalid partial_score: $partial_score"; + } +} + +(@ARGV == 0 || @ARGV == 1) || die "Usage: score_rules.pl [--ballast ballast-count] [--partial-score partial-score] [input from count_rules.pl]"; + +while(<>) { + @A = split(";", $_); + @A == 5 || die "Bad input line; $_"; + ($rule,$destress,$right_count,$partial_count,$wrong_count) = @A; + $rule_score = ($right_count + $partial_score*$partial_count) / + ($right_count+$partial_count+$wrong_count+$ballast); + if ($destress eq "yes") { $rule_score -= $destress_penalty; } + print join(";", $rule, $destress, sprintf("%.5f", $rule_score)) . "\n"; +} diff --git a/egs/wsj/s3/local/dict/select_candidate_prons.pl b/egs/wsj/s3/local/dict/select_candidate_prons.pl new file mode 100755 index 000000000..d0018c98a --- /dev/null +++ b/egs/wsj/s3/local/dict/select_candidate_prons.pl @@ -0,0 +1,84 @@ +#!/usr/bin/perl + +# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl +# or reverse_candidates.pl, which is 7-tuples, one per line, of the form: +# +# word;pron;base-word;base-pron;rule-name;de-stress;rule-score +# +# and selects the most likely prons for the words based on rule +# score. It outputs in the same format as the input (thus, it is +# similar to limit_candidates.pl in its input and output format, +# except it has a different way of selecting the prons to put out). +# +# This script will select the $max_prons best pronunciations for +# each candidate word, subject to the constraint that no pron should +# have a rule score worse than $min_rule_score. +# It first merges the candidates by, if there are multiple candidates +# generating the same pron, selecting the candidate that had the +# best associated score. It then sorts the prons on score and +# selects the n best prons (but doesn't print out candidates with +# score beneath the threshold). + + +$max_prons = 4; +$min_rule_score = 0.35; + + +for ($n = 1; $n <= 3; $n++) { + if ($ARGV[0] eq "--max-prons") { + shift @ARGV; + $max_prons = shift @ARGV; + } + if ($ARGV[0] eq "--min-rule-score") { + shift @ARGV; + $min_rule_score = shift @ARGV; + } +} + +if (@ARGV != 0 && @ARGV != 1) { + die "Usage: select_candidates_prons.pl [candidate_prons] > selected_candidate_prons"; +} + +sub process_word; + +undef $cur_word; +@cur_lines = (); + +while(<>) { + # input, output is: + # word;pron;base-word;base-pron;rule-name;destress;score + chop; + m:^([^;]+);: || die "Unexpected input: $_"; + $word = $1; + if (!defined $cur_word || $word eq $cur_word) { + if (!defined $cur_word) { $cur_word = $word; } + push @cur_lines, $_; + } else { + process_word(@cur_lines); # Process a series of suggested prons + # for a particular word. + $cur_word = $word; + @cur_lines = ( $_ ); + } +} +process_word(@cur_lines); + + +sub process_word { + my %pron2rule_score; # hash from generated pron to rule score for that pron. + my %pron2line; # hash from generated pron to best line for that pron. 
+  my @cur_lines = @_;
+  foreach my $line (@cur_lines) {
+    my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
+    if (!defined $pron2rule_score{$pron} ||
+        $rule_score > $pron2rule_score{$pron}) {
+      $pron2rule_score{$pron} = $rule_score;
+      $pron2line{$pron} = $line;
+    }
+  }
+  my @prons = sort { $pron2rule_score{$b} <=> $pron2rule_score{$a} } keys %pron2rule_score;
+  for (my $n = 0; $n < @prons && $n < $max_prons &&
+       $pron2rule_score{$prons[$n]} >= $min_rule_score; $n++) {
+    print $pron2line{$prons[$n]} . "\n";
+  }
+}
+
diff --git a/egs/wsj/s3/local/wsj_extend_dict.sh b/egs/wsj/s3/local/wsj_extend_dict.sh
new file mode 100755
index 000000000..ce82840b2
--- /dev/null
+++ b/egs/wsj/s3/local/wsj_extend_dict.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+
+# This script builds a larger word-list and dictionary
+# than used for the LMs supplied with the WSJ corpus.
+# It uses a couple of strategies to fill in pronunciations for
+# words that are in the LM training data but not in CMUdict.  One is
+# to generate special prons for possible acronyms, which
+# just consist of the prons of the constituent letters.  The other
+# is designed to handle derivatives of known words
+# (e.g. deriving the pron of a plural from the pron of
+# the base-word), but in a more general, learned-from-data
+# way.
+# It makes use of scripts in local/dict/.
+
+if [ $# -ne 1 ]; then
+  echo "Usage: local/wsj_extend_dict.sh /foo/bar/WSJ/13-32.1/"
+  exit 1
+fi
+if [ "`basename $1`" != 13-32.1 ]; then
+  echo "Expecting the argument to this script to end in 13-32.1"
+  exit 1
+fi
+
+# e.g.
+#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
+export PATH=$PATH:`pwd`/local/dict/
+srcdir=$1
+mkdir -p data/local/dict_larger
+dir=data/local/dict_larger
+mincount=2 # Minimum count of an OOV we will try to generate a pron for.
+
+[ ! -f data/local/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
+
+# Remove comments from cmudict; print first field; remove
+# words like FOO(1) which are alternate prons: our dict format won't
+# include these markers.
+grep -v ';;;' data/local/cmudict/cmudict.0.7a |
+  perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
+
+cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
+
+echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
+
+# Convert to uppercase, remove XML-like markings.
+# For words ending in "." that are not in CMUdict, we assume that these
+# are periods that somehow remained in the data during data preparation,
+# and we replace the "." with "\n".  Note: we found this by looking at
+# oov.counts below (before adding this rule).
+
+
+touch $dir/cleaned.gz
+if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
+  echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
+else
+  gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
+    | awk '/^</{next}{print toupper($0)}' | perl -e '
+    open(F, "<$ARGV[0]")||die;
+    while(<F>){ chop; $isword{$_} = 1; }
+    while(<STDIN>) {
+      @A = split(" ", $_);
+      for ($n = 0; $n < @A; $n++) {
+        $a = $A[$n];
+        if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
+          # and have no other "." in them: treat as period.
+ print "$a"; + if ($n+1 < @A) { print "\n"; } + } else { print "$a "; } + } + print "\n"; + } + ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz +fi + +# get unigram counts +echo "Getting unigram counts" +gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \ + awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams + +cat $dir/unigrams | awk -v dict=$dir/dict.cmu \ + 'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \ + > $dir/oov.counts + +echo "Most frequent unseen unigrams d are: " +head $dir/oov.counts + +# Prune away singleton counts, and remove things with numbers in +# (which should have been normalized) and with no letters at all. + + +cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \ + | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist + +# Automatic rule-finding... + +# First make some prons for possible acronyms. +# Note: we don't do this for things like U.K or U.N, +# or A.B. (which doesn't exist anyway), +# as we consider this normalization/spelling errors. + +cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms + +mkdir $dir/f $dir/b # forward, backward directions of rules... + # forward is normal suffix + # rules, backward is reversed (prefix rules). These + # dirs contain stuff we create while making the rule-based + # extensions to the dictionary. + +# Remove ; and , from words, if they are present; these +# might crash our scripts, as they are used as separators there. +filter_dict.pl $dir/dict.cmu > $dir/f/dict +cat $dir/oovlist | filter_dict.pl > $dir/f/oovs +reverse_dict.pl $dir/f/dict > $dir/b/dict +reverse_dict.pl $dir/f/oovs > $dir/b/oovs + +# The next stage takes a few minutes. +# Note: the forward stage takes longer, as English is +# mostly a suffix-based language, and there are more rules +# that it finds. +for d in $dir/f $dir/b; do + ( + cd $d + cat dict | get_rules.pl 2>get_rules.log >rules + get_rule_hierarchy.pl rules >hierarchy + awk '{print $1}' dict | get_candidate_prons.pl rules dict | \ + limit_candidate_prons.pl hierarchy | \ + score_prons.pl dict | \ + count_rules.pl >rule.counts + # the sort command below is just for convenience of reading. + score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores + get_candidate_prons.pl rules.with_scores dict oovs | \ + limit_candidate_prons.pl hierarchy > oovs.candidates + ) & +done +wait + +# Merge the candidates. 
+reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates +select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \ + > $dir/dict.oovs + +cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged + +awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled +sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled + +echo "**Top OOVs we handled are:**"; +head $dir/oovlist.handled +echo "**Top OOVs we didn't handle are as follows (not: they are mostly misspellings):**"; +head $dir/oovlist.not_handled + +# add_counts.pl attaches to original counts to the list of handled/not-handled OOVs +add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts +add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts + +echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`" +echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`" +echo "Count of OOVs we didn't handle due to low count is" \ + `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts` + + +# The two files created just above are for humans to look at, as diagnostics. + +cat $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/dict.final +cat $dir/dict.final | awk '{print $1}' | uniq > $dir/wordlist.final + +echo "Created $dir/dict.final and $dir/wordlist.final" + + diff --git a/egs/wsj/s3/local/wsj_format_data.sh b/egs/wsj/s3/local/wsj_format_data.sh index 9074be8cc..f2b2b6373 100755 --- a/egs/wsj/s3/local/wsj_format_data.sh +++ b/egs/wsj/s3/local/wsj_format_data.sh @@ -92,7 +92,7 @@ echo "<SPOKEN_NOISE>" > data/lang/oov.txt # triphone system. extra_questions.txt is some pre-defined extra questions about # position and stress that split apart the categories we created in phonesets.txt. # in extra_questions.txt there is also a question about silence phones, since we -# didn't include that in our +# don't include them in our automatically generated clustering of phones. local/make_shared_phones.sh < data/lang/phones.txt > data/lang/phonesets_mono.txt grep -v SIL data/lang/phonesets_mono.txt > data/lang/phonesets_cluster.txt @@ -133,7 +133,7 @@ cp data/lang_test/phones_disambig.txt data/lang # Needed for MMI. # Create the lexicon FST with disambiguation symbols, and put it in lang_test. # There is an extra -# step where we create a loop "pass through" the disambiguation symbols +# step where we create a loop to "pass through" the disambiguation symbols # from G.fst. phone_disambig_symbol=`grep \#0 data/lang_test/phones_disambig.txt | awk '{print $2}'` word_disambig_symbol=`grep \#0 data/lang_test/words.txt | awk '{print $2}'` diff --git a/egs/wsj/s3/local/wsj_format_data_local.sh b/egs/wsj/s3/local/wsj_format_data_local.sh new file mode 100755 index 000000000..e5935a5d5 --- /dev/null +++ b/egs/wsj/s3/local/wsj_format_data_local.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# This script creates the appropriate lang_test directories for +# the locally created dictionary and lexicon. + + +# "bd" is big-dict. +# The "fgpr" LM is a locally estimated one (4-gram, pruned) +. path.sh || exit 1; +dict_srcdir=data/local/dict_larger_prep/ +lm_srcdir=data/local/lm/4gram-mincount +lang=data/lang_test_bd_fgpr +lang_unpruned=data/lang_test_bd_fg +mkdir -p $lang + +[ ! 
-f $dict_srcdir/lexicon.txt ] && \ + echo "First run wsj_prepare_local_dict.sh" && exit 1; +[ ! -f $lm_srcdir/lm_pr7.0.gz -o ! -f $lm_srcdir/lm_unpruned.gz ] && \ + echo "First run wsj_train_lms.sh" && exit 1; + + +# Get number of disambig symbols, and lexicon with disambig symbols. +ndisambig=`scripts/add_lex_disambig.pl $dict_srcdir/lexicon.txt $dict_srcdir/lexicon_disambig.txt` +ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST. +echo $ndisambig > $dict_srcdir/lex_ndisambig + +# (1) Put into $lang, phones.txt, silphones.csl, nonsilphones.csl, words.txt, +# oov.txt +cp data/local/phones.txt $lang + +silphones="SIL SPN NSN"; +# Generate colon-separated lists of silence and non-silence phones. +scripts/silphones.pl $lang/phones.txt "$silphones" $lang/silphones.csl $lang/nonsilphones.csl + +# Make word symbol table. +cat $dict_srcdir/lexicon.txt | awk '{print $1}' | sort | uniq | \ + awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} END{printf("#0 %d\n", NR+1);} ' \ + > $lang/words.txt + +# Create the basic L.fst without disambiguation symbols, for use +# in training. +scripts/make_lexicon_fst.pl $dict_srcdir/lexicon.txt 0.5 SIL | \ + fstcompile --isymbols=$lang/phones.txt --osymbols=$lang/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstarcsort --sort_type=olabel > $lang/L.fst + +# The file oov.txt contains a word that we will map any OOVs to during +# training. This won't be needed in a test directory, but for completion we +# do it. +echo "<SPOKEN_NOISE>" > data/lang/oov.txt + +# Note: we don't need phonesets.txt and extra_questions.txt, as they are +# only needed during training. So we don't bother creating them. +# Anyway they're the same as they would be in other lang directories. + +silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'` +nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'` +cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ + sed "s:SILENCEPHONES:$silphonelist:" > $lang/topo + + +# (3), +# In lang_test, create a phones.txt file that includes the disambiguation symbols. +# the --include-zero includes the #0 symbol we pass through from the grammar. +# Note: we previously echoed the # of disambiguation symbols to $dict_srcdir/lex_ndisambig. + +scripts/add_disambig.pl --include-zero $lang/phones.txt \ + `cat $dict_srcdir/lex_ndisambig` > $lang/phones_disambig.txt + +# Create the lexicon FST with disambiguation symbols, and put it in lang_test. +# There is an extra +# step where we create a loop to "pass through" the disambiguation symbols +# from G.fst. +phone_disambig_symbol=`grep \#0 $lang/phones_disambig.txt | awk '{print $2}'` +word_disambig_symbol=`grep \#0 $lang/words.txt | awk '{print $2}'` + +scripts/make_lexicon_fst.pl $dict_srcdir/lexicon_disambig.txt 0.5 SIL '#'$ndisambig | \ + fstcompile --isymbols=$lang/phones_disambig.txt --osymbols=$lang/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstarcsort --sort_type=olabel > $lang/L_disambig.fst || exit 1; + + +# Create L_align.fst, which is as L.fst but with alignment symbols (#1 and #2 at the +# beginning and end of words, on the input side)... useful if we +# ever need to e.g. create ctm's-- these are used to work out the +# word boundaries. 
+cat $dict_srcdir/lexicon.txt | \ + awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' | \ + scripts/make_lexicon_fst.pl - 0.5 SIL | \ + fstcompile --isymbols=$lang/phones_disambig.txt --osymbols=$lang/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstarcsort --sort_type=olabel > $lang/L_align.fst || exit 1; + +# Next, for each type of language model, create the corresponding FST +# and the corresponding lang_test directory. + +echo "Preparing language models for test" + +# Note: at this point, $lang=="data/lang_test_bd_fgpr", we put a pruned 4-gram model +# there. + +echo "Checking there are no OOVs" # there shouldn't be in this LM. +# If you have an LM with OOVs you'd have to put back the command +# "remove_oovs.pl" below, as it is used in wsj_format_data.sh. +gunzip -c $lm_srcdir/lm_pr7.0.gz | \ + scripts/find_arpa_oovs.pl $lang/words.txt | cmp - /dev/null || \ + exit 1; + + +# Removing these "invalid combinations" of <s> and </s> is not +# necessary because we produced these LMs ourselves, and they aren't +# broken. But we'll leave this in the script just in case it gets modified +# later. +# Note: ~1.5M N-grams. +gunzip -c $lm_srcdir/lm_pr7.0.gz | \ + grep -v '<s> <s>' | \ + grep -v '</s> <s>' | \ + grep -v '</s> </s>' | \ + arpa2fst - | fstprint | \ + scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ + --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon > $lang/G.fst || exit 1; + fstisstochastic $lang/G.fst + +mkdir -p $lang_unpruned +cp $lang/* $lang_unpruned +# Be careful: this time we dispense with the grep -v '<s> <s>' so this might +# not work for LMs generated from all toolkits. +gunzip -c $lm_srcdir/lm_unpruned.gz | \ + arpa2fst - | fstprint | \ + scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ + --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon > $lang_unpruned/G.fst || exit 1; + fstisstochastic $lang_unpruned/G.fst + + +# The commands below are just diagnostic tests. + mkdir -p tmpdir.g + awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ + < data/local/lexicon.txt >tmpdir.g/select_empty.fst.txt + fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt tmpdir.g/select_empty.fst.txt | \ + fstarcsort --sort_type=olabel | fstcompose - $lang/G.fst > tmpdir.g/empty_words.fst + fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && + echo "Language model has cycles with empty words" && exit 1 + rm -r tmpdir.g + + + + +echo "Succeeded in formatting data." diff --git a/egs/wsj/s3/local/wsj_prepare_local_dict.sh b/egs/wsj/s3/local/wsj_prepare_local_dict.sh new file mode 100755 index 000000000..27d29cebd --- /dev/null +++ b/egs/wsj/s3/local/wsj_prepare_local_dict.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# This script takes the dictionary prepared in wsj_extend_dict.sh +# (which puts its output in data/local/dict_larger), and puts +# in data/local/dict_larger_prep a file "lexicon.txt" that +# contains the begin/end markings and silence, etc. There +# is also a file phones_disambig.txt in the same directory. +# +# This script is similar to wsj_prepare_dict.sh + +# Call this script from one level above, e.g. from the s3/ directory. It puts +# its output in data/local/. + +dictin=data/local/dict_larger/dict.final +[ ! 
-f $dictin ] && echo No such file $dictin && exit 1;
+
+# run this from ../
+dir=data/local/dict_larger_prep
+mkdir -p $dir
+
+# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
+# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
+
+cat data/local/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
+ awk 'BEGIN{print "<eps> 0"; print "SIL 1"; print "SPN 2"; print "NSN 3"; N=4; }
+      {printf("%s %d\n", $1, N++); }
+      {printf("%s_B %d\n", $1, N++); }
+      {printf("%s_E %d\n", $1, N++); }
+      {printf("%s_S %d\n", $1, N++); } ' >$dir/phones.txt
+
+
+# First make a version of the lexicon without the silences etc., but with the
+# position-markers.  (Unlike wsj_prepare_dict.sh we don't need to strip the
+# CMU-style comments or the (1), (2) markers on alternative pronunciations here,
+# since dict.final from wsj_extend_dict.sh should already be clean.)
+
+cat $dictin | \
+ perl -ane '@A=split(" ",$_); $w = shift @A; @A>0||die;
+        if(@A==1) { print "$w $A[0]_S\n"; } else { print "$w $A[0]_B ";
+        for($n=1;$n<@A-1;$n++) { print "$A[$n] "; } print "$A[$n]_E\n"; } ' \
+  > $dir/lexicon_nosil.txt || exit 1;
+
+# Add the silences, noises etc.
+
+(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
+ cat - $dir/lexicon_nosil.txt > $dir/lexicon.txt || exit 1;
+
+echo "Local dictionary preparation succeeded; output is in $dir/lexicon.txt"
+
diff --git a/egs/wsj/s3/local/wsj_train_lms.sh b/egs/wsj/s3/local/wsj_train_lms.sh
index f264f0d19..59400fe45 100755
--- a/egs/wsj/s3/local/wsj_train_lms.sh
+++ b/egs/wsj/s3/local/wsj_train_lms.sh
@@ -1,227 +1,189 @@
 #!/bin/bash
-
-if [ $# -ne 1 -o `basename $1` != 13-32.1 ]; then
-  echo "Usage: local/wsj_train_lms.sh /foo/bar/13-32.1/"
+# This script trains LMs on the WSJ LM-training data.
+# It requires that you have already run wsj_extend_dict.sh,
+# to get the larger-size dictionary including all of CMUdict
+# plus any OOVs and possible acronyms that we could easily
+# derive pronunciations for.
+
+
+# This script takes no command-line arguments.
+
+dir=data/local/lm
+srcdir=data/local/dict_larger
+mkdir -p $dir
+export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
+( # First make sure the kaldi_lm toolkit is installed.
+ cd ../../../tools
+ if [ -d kaldi_lm ]; then
+   echo Not installing the kaldi_lm toolkit since it is already there.
+ else
+   echo Downloading and installing the kaldi_lm tools
+   if [ ! -f kaldi_lm.tar.gz ]; then
+     wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
+   fi
+   tar -xvzf kaldi_lm.tar.gz || exit 1;
+   cd kaldi_lm
+   make || exit 1;
+   echo Done making the kaldi_lm tools
+ fi
+) || exit 1;
+
+
+
+if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/wordlist.final ]; then
+  echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist";
+  echo "You need to run local/wsj_extend_dict.sh before running this script."
+  exit 1;
 fi
-srcdir=$1
-mkdir -p data/local/lm
-cd data/local/lm
-# e.g. srcdir=/mnt/matylda2/data/WSJ1/13-32.1/
-# Convert to uppercase, remove XML-like markings.
-echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
-gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
- | awk '/^</{next}{print toupper($0)}' | gzip -c > cleaned.gz
+# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
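# [Illustrative aside, not part of the patch; toy data.] If wordlist.final
# contained only the words THE and CAT, a cleaned training line such as
#   THE CAT MEOWED
# would come out of the awk command below as
#   THE CAT <UNK>
# (the trailing sed 's/ $//g' just strips the extra space the awk loop
# leaves at the end of each line).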
+echo "Getting training data with OOV words repalaced with <UNK> (train_nounk.gz)" +gunzip -c $srcdir/cleaned.gz | awk -v w=$srcdir/wordlist.final \ + 'BEGIN{while((getline<w)>0) v[$1]=1;} + {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \ + | gzip -c > $dir/train_nounk.gz + +# Get unigram counts (without bos/eos, but this doens't matter here, it's +# only to get the word-map, which treats them specially & doesn't need their +# counts). +# Add a 1-count for each word in word-list by including that in the data, +# so all words appear. +gunzip -c $dir/train_nounk.gz | cat - $srcdir/wordlist.final | \ + awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ + sort -nr > $dir/unigram.counts + +# Get "mapped" words-- a character encoding of the words that makes the common words very short. +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map + +gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;} + { for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz + +# To save disk space, remove the un-mapped training data. We could +# easily generate it again if needed. +rm $dir/train_nounk.gz + +train_lm.sh --arpa --lmtype 4gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 +# 10.3 million N-grams. + +prune_lm.sh --arpa 7.0 $dir/4gram-mincount +# 1.50 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 + + +exit 0 + +### Below here, this script is showing various commands that +## were run during LM tuning. + +train_lm.sh --arpa --lmtype 3gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 +# 7.8 million N-grams. + +prune_lm.sh --arpa 3.0 $dir/3gram-mincount/ +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740 +# 2.5 million N-grams. + +prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ +# 1.45 million N-grams. +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 + +train_lm.sh --arpa --lmtype 4gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 +# 10.3 million N-grams. + +prune_lm.sh --arpa 3.0 $dir/4gram-mincount +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294 +# 2.6 million N-grams. + +prune_lm.sh --arpa 4.0 $dir/4gram-mincount +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717 +# 2.15 million N-grams. + +prune_lm.sh --arpa 5.0 $dir/4gram-mincount +# 1.86 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023 + +prune_lm.sh --arpa 7.0 $dir/4gram-mincount +# 1.50 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 + +train_lm.sh --arpa --lmtype 3gram $dir +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866 +# 20.0 million N-grams + + + +################# +# You could finish the script here if you wanted. +# Below is to show how to do baselines with SRILM. +# You'd have to install the SRILM toolkit first. + +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. 
+mkdir -p $sdir
+gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
+gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
+(echo "<s>"; echo "</s>" ) | cat - $srcdir/wordlist.final > $sdir/wordlist.final.s
+
+# 3-gram:
+ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
+  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
+ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
+#file data/local/lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
+#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
+
+# Trying 4-gram:
+ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
+  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
+ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
+#file data/local/lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
+#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
+
+#3-gram with pruning:
+ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
+  -prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
+ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
+#file data/local/lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
+#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
+# Around 2.25M N-grams.
+# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
+# above, which gave 2.5 million N-grams and a perplexity of 156.
+
+# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
+# You can use -gt3min 0 and -gt4min 0 to stop this (this will be comparable to
+# the kaldi_lm experiments above without "-mincount").
+
+## From here on we show how to train with IRSTLM.
+## This is not really working at the moment.
+export IRSTLM=../../../tools/irstlm/
+
+idir=$dir/irstlm
+mkdir -p $idir
+gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
+  gzip -c > $idir/train.gz
+
+$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
+ cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
+{print $0;}}' > vocab.irstlm.20k
+
+
+$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
+  -n 3 -s improved-kneser-ney -b yes
+# Testing perplexity with SRILM tools:
+ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
+#data/local/lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
+#file data/local/lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
+#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
+
+# The perplexity is much worse than expected (175 rather than the ~141 we should
+# get, given that we used the -p option); adding -debug 3 to the command line
+# shows that the IRSTLM LM does not seem to sum to one properly, so it seems
+# that it produces an LM that isn't interpretable in the normal way as an ARPA
+# LM.
+
-# get unigram counts
-echo "Getting unigram counts"
-gunzip -c cleaned.gz | tr -s ' ' '\n' | sort | uniq -c | sort -nr > unigrams
-
-[ ! -f ../cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
-
-cat unigrams |
-perl -e ' open(F, "<../cmudict/cmudict.0.7a") || die "Cmudict not found";
-  while(<F>){ if (!
m/^;;/){ @A=split(" ", $_); $seen{$A[0]} = 1; } } - open(O, ">oov.counts"); - while(<>) { - ($count,$word) = split(" ", $_); - if (defined $seen{$word}) { print "$word\n"; } else { print O $_; } - } ' > vocab.intersect - -echo "Most frequent unseen unigrams are: " -head oov.counts - - - ' - awk 'BEGIN{while(getline<"unigrams")vc[$2]=$1;}{if ($1 in vc)print $1;}'|sort|uniq > vocab.intersect - - -. path.sh || exit 1; - -echo "Preparing train and test data" - -for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - mkdir -p data/$x - cp data/local/${x}_wav.scp data/$x/wav.scp - cp data/local/$x.txt data/$x/text - cp data/local/$x.spk2utt data/$x/spk2utt - cp data/local/$x.utt2spk data/$x/utt2spk - scripts/filter_scp.pl data/$x/spk2utt data/local/spk2gender.map > data/$x/spk2gender -done - -echo "Preparing word lists etc." - -# Create the "lang" directory for training... we'll copy this same setup -# to be used in test too, and also add the G.fst. -# Note: the lexicon.txt and lexicon_disambig.txt are not used directly by -# the training scripts, so we put these in data/local/. - -# TODO: make sure we properly handle the begin/end symbols in the lexicon, - -# lang_test will contain common things we'll put in lang_test_{bg,tgpr,tg} -mkdir -p data/lang data/lang_test - - -# (0), this is more data-preparation than data-formatting; -# add disambig symbols to the lexicon in data/local/lexicon.txt -# and produce data/local/lexicon_disambig.txt - -ndisambig=`scripts/add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt` -ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST. -echo $ndisambig > data/local/lex_ndisambig - - -# (1) Put into data/lang, phones.txt, silphones.csl, nonsilphones.csl, words.txt, -# oov.txt -cp data/local/phones.txt data/lang # we could get these from the lexicon, but prefer to - # do it like this so we get all the possible begin/middle/end versions of phones even - # if they don't appear. This is so if we extend the lexicon later, we could use the - # same phones.txt (which is "baked into" the model and can't be changed after you build it). - -silphones="SIL SPN NSN"; -# Generate colon-separated lists of silence and non-silence phones. -scripts/silphones.pl data/lang/phones.txt "$silphones" data/lang/silphones.csl data/lang/nonsilphones.csl - -cat data/local/lexicon.txt | awk '{print $1}' | sort | uniq | \ - awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} END{printf("#0 %d\n", NR+1);} ' \ - > data/lang/words.txt - -# Create the basic L.fst without disambiguation symbols, for use -# in training. -scripts/make_lexicon_fst.pl data/local/lexicon.txt 0.5 SIL | \ - fstcompile --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > data/lang/L.fst - -# The file oov.txt contains a word that we will map any OOVs to during -# training. -echo "<SPOKEN_NOISE>" > data/lang/oov.txt - -# (2) -# Create phonesets.txt and extra_questions.txt ... -# phonesets.txt is sets of phones that are shared when building the monophone system -# and when asking questions based on an automatic clustering of phones, for the -# triphone system. extra_questions.txt is some pre-defined extra questions about -# position and stress that split apart the categories we created in phonesets.txt. 
-# in extra_questions.txt there is also a question about silence phones, since we -# didn't include that in our - -local/make_shared_phones.sh < data/lang/phones.txt > data/lang/phonesets_mono.txt -grep -v SIL data/lang/phonesets_mono.txt > data/lang/phonesets_cluster.txt -local/make_extra_questions.sh < data/lang/phones.txt > data/lang/extra_questions.txt - -( # Creating the "roots file" for building the context-dependent systems... - # we share the roots across all the versions of each real phone. We also - # share the states of the 3 forms of silence. "not-shared" here means the - # states are distinct p.d.f.'s... normally we would automatically split on - # the HMM-state but we're not making silences context dependent. - echo 'not-shared not-split SIL SPN NSN'; - cat data/lang/phones.txt | grep -v eps | grep -v SIL | grep -v SPN | grep -v NSN | awk '{print $1}' | \ - perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_"; - $phone=$1; $stress=$2; $position=$3; - if($phone eq $curphone){ print " $phone$stress$position"; } - else { if(defined $curphone){ print "\n"; } $curphone=$phone; - print "shared split $phone$stress$position"; }} print "\n"; ' -) > data/lang/roots.txt - -silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'` -nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'` -cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ - sed "s:SILENCEPHONES:$silphonelist:" > data/lang/topo - -for f in phones.txt words.txt L.fst silphones.csl nonsilphones.csl topo; do - cp data/lang/$f data/lang_test -done - - -# (3), -# In lang_test, create a phones.txt file that includes the disambiguation symbols. -# the --include-zero includes the #0 symbol we pass through from the grammar. -# Note: we previously echoed the # of disambiguation symbols to data/local/lex_ndisambig. -scripts/add_disambig.pl --include-zero data/lang_test/phones.txt \ - `cat data/local/lex_ndisambig` > data/lang_test/phones_disambig.txt -cp data/lang_test/phones_disambig.txt data/lang # Needed for MMI. - - -# Create the lexicon FST with disambiguation symbols, and put it in lang_test. -# There is an extra -# step where we create a loop "pass through" the disambiguation symbols -# from G.fst. -phone_disambig_symbol=`grep \#0 data/lang_test/phones_disambig.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 data/lang_test/words.txt | awk '{print $2}'` - -scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt 0.5 SIL '#'$ndisambig | \ - fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > data/lang_test/L_disambig.fst || exit 1; - -# Copy into data/lang/ also, where it will be needed for discriminative training. -cp data/lang_test/L_disambig.fst data/lang/ - - -# Create L_align.fst, which is as L.fst but with alignment symbols (#1 and #2 at the -# beginning and end of words, on the input side)... useful if we -# ever need to e.g. create ctm's-- these are used to work out the -# word boundaries. 
-cat data/local/lexicon.txt | \
- awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' | \
- scripts/make_lexicon_fst.pl - 0.5 SIL | \
- fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
-  --keep_isymbols=false --keep_osymbols=false | \
- fstarcsort --sort_type=olabel > data/lang_test/L_align.fst
-
-# Next, for each type of language model, create the corresponding FST
-# and the corresponding lang_test directory.
-
-echo Preparing language models for test
-
-for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
-  test=data/lang_test_${lm_suffix}
-  mkdir -p $test
-  for f in phones.txt words.txt phones_disambig.txt L.fst L_disambig.fst \
-    silphones.csl nonsilphones.csl; do
-    cp data/lang_test/$f $test
-  done
-  gunzip -c data/local/lm_${lm_suffix}.arpa.gz | \
-    scripts/find_arpa_oovs.pl $test/words.txt > data/local/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history. Encountered some other similar
-  # things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt. These can cause
-  # determinization failures of CLG [ends up being epsilon cycles].
-  gunzip -c data/local/lm_${lm_suffix}.arpa.gz | \
-    grep -v '<s> <s>' | \
-    grep -v '</s> <s>' | \
-    grep -v '</s> </s>' | \
-    arpa2fst - | fstprint | \
-    scripts/remove_oovs.pl data/local/oovs_${lm_suffix}.txt | \
-    scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-      --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon > $test/G.fst
-  fstisstochastic $test/G.fst
-  # The output is like:
-  # 9.14233e-05 -0.259833
-  # we do expect the first of these 2 numbers to be close to zero (the second is
-  # nonzero because the backoff weights make the states sum to >1).
-  # Because of the <s> fiasco for these particular LMs, the first number is not
-  # as close to zero as it could be.
-
-  # Everything below is only for diagnostic.
-  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-  # this might cause determinization failure of CLG.
-  # #0 is treated as an empty word.
-  mkdir -p tmpdir.g
-  awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-    < data/local/lexicon.txt >tmpdir.g/select_empty.fst.txt
-  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \
-   fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst
-  fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
-    echo "Language model has cycles with empty words" && exit 1
-  rm -r tmpdir.g
-done
-
-echo "Succeeded in formatting data."
diff --git a/egs/wsj/s3/run.sh b/egs/wsj/s3/run.sh
index 6d18ee702..eaba5c3e9 100644
--- a/egs/wsj/s3/run.sh
+++ b/egs/wsj/s3/run.sh
@@ -36,6 +36,20 @@ local/wsj_prepare_dict.sh
 local/wsj_format_data.sh
 
+# We suggest running the next four commands in the background,
+# as they are not a precondition for the system building and
+# most of the tests: these commands build a dictionary
+# containing many of the OOVs in the WSJ LM training data,
+# and an LM trained directly on that data (i.e. not just
+# copying the ARPA files from the LDC disks).
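# [Illustrative sketch, not part of the patch; variable names are hypothetical.]
# If a later stage needs the big-dict outputs (e.g. the decode_bd_tgpr_eval92
# step further down), one way to synchronize with this background job is:
#   ( local/wsj_extend_dict.sh /mnt/matylda2/data/WSJ1/13-32.1 && \
#     local/wsj_prepare_local_dict.sh && \
#     local/wsj_train_lms.sh && \
#     local/wsj_format_data_local.sh ) &
#   bd_lm_job=$!        # remember the subshell's PID
#   ...                 # steps that don't need the big-dict LM
#   wait $bd_lm_job     # block before any decoding that uses the bd graphs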
+( + local/wsj_extend_dict.sh /mnt/matylda2/data/WSJ1/13-32.1 && \ + local/wsj_prepare_local_dict.sh && \ + local/wsj_train_lms.sh && \ + local/wsj_format_data_local.sh +) & + + # Now make MFCC features. # mfccdir should be some place with a largish disk where you # want to store MFCC features. @@ -68,7 +82,7 @@ steps/train_mono.sh --num-jobs 10 --cmd "$train_cmd" \ data/train_si84_2kshort data/lang exp/mono0a ( -scripts/mkgraph.sh --mono data/lang_test_tgpr exp/mono0a exp/mono0a/graph_tgpr +scripts/mkgraph.sh --mono data/lang_test_fgpr exp/mono0a exp/mono0a/graph_fgpr scripts/decode.sh --cmd "$decode_cmd" steps/decode_deltas.sh exp/mono0a/graph_tgpr data/test_dev93 exp/mono0a/decode_tgpr_dev93 scripts/decode.sh --cmd "$decode_cmd" steps/decode_deltas.sh exp/mono0a/graph_tgpr data/test_eval92 exp/mono0a/decode_tgpr_eval92 )& @@ -170,6 +184,11 @@ scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri3b/gra scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_tgpr data/lang_test_tg \ data/test_eval92 exp/tri3b/decode_tgpr_eval92 exp/tri3b/decode_tgpr_eval92_tg +# Trying the larger dictionary ("big-dict"/bd) + locally produced LM. +scripts/mkgraph.sh data/lang_test_bd_tgpr exp/tri3b exp/tri3b/graph_bd_tgpr +scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri3b/graph_bd_tgpr \ + data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92 + # From 3b system, align all si284 data. steps/align_lda_mllt_sat.sh --num-jobs 10 --cmd "$train_cmd" \ data/train_si284 data/lang exp/tri3b exp/tri3b_ali_si284 -- GitLab