From d13597fd56a26c04cc445a2bf020d43467ca73de Mon Sep 17 00:00:00 2001
From: Dan Povey <dpovey@gmail.com>
Date: Tue, 15 Nov 2011 07:32:53 +0000
Subject: [PATCH] Adding scripts for dictionary and LM preparation for WSJ.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@604 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
---
 egs/wsj/s3/local/dict/add_counts.pl           |  31 ++
 egs/wsj/s3/local/dict/count_rules.pl          |  44 ++
 egs/wsj/s3/local/dict/filter_dict.pl          |  19 +
 egs/wsj/s3/local/dict/find_acronyms.pl        |  95 ++++
 egs/wsj/s3/local/dict/get_acronym_prons.pl    | 123 ++++++
 egs/wsj/s3/local/dict/get_candidate_prons.pl  | 187 ++++++++
 egs/wsj/s3/local/dict/get_rule_hierarchy.pl   |  73 ++++
 egs/wsj/s3/local/dict/get_rules.pl            | 204 +++++++++
 .../s3/local/dict/limit_candidate_prons.pl    | 103 +++++
 egs/wsj/s3/local/dict/reverse_candidates.pl   |  50 +++
 egs/wsj/s3/local/dict/reverse_dict.pl         |  14 +
 egs/wsj/s3/local/dict/score_prons.pl          |  50 +++
 egs/wsj/s3/local/dict/score_rules.pl          |  52 +++
 .../s3/local/dict/select_candidate_prons.pl   |  84 ++++
 egs/wsj/s3/local/wsj_extend_dict.sh           | 168 ++++++++
 egs/wsj/s3/local/wsj_format_data.sh           |   4 +-
 egs/wsj/s3/local/wsj_format_data_local.sh     | 151 +++++++
 egs/wsj/s3/local/wsj_prepare_local_dict.sh    |  48 +++
 egs/wsj/s3/local/wsj_train_lms.sh             | 404 ++++++++----------
 egs/wsj/s3/run.sh                             |  21 +-
 20 files changed, 1701 insertions(+), 224 deletions(-)
 create mode 100755 egs/wsj/s3/local/dict/add_counts.pl
 create mode 100755 egs/wsj/s3/local/dict/count_rules.pl
 create mode 100755 egs/wsj/s3/local/dict/filter_dict.pl
 create mode 100755 egs/wsj/s3/local/dict/find_acronyms.pl
 create mode 100755 egs/wsj/s3/local/dict/get_acronym_prons.pl
 create mode 100755 egs/wsj/s3/local/dict/get_candidate_prons.pl
 create mode 100755 egs/wsj/s3/local/dict/get_rule_hierarchy.pl
 create mode 100755 egs/wsj/s3/local/dict/get_rules.pl
 create mode 100755 egs/wsj/s3/local/dict/limit_candidate_prons.pl
 create mode 100755 egs/wsj/s3/local/dict/reverse_candidates.pl
 create mode 100755 egs/wsj/s3/local/dict/reverse_dict.pl
 create mode 100755 egs/wsj/s3/local/dict/score_prons.pl
 create mode 100755 egs/wsj/s3/local/dict/score_rules.pl
 create mode 100755 egs/wsj/s3/local/dict/select_candidate_prons.pl
 create mode 100755 egs/wsj/s3/local/wsj_extend_dict.sh
 create mode 100755 egs/wsj/s3/local/wsj_format_data_local.sh
 create mode 100755 egs/wsj/s3/local/wsj_prepare_local_dict.sh

diff --git a/egs/wsj/s3/local/dict/add_counts.pl b/egs/wsj/s3/local/dict/add_counts.pl
new file mode 100755
index 000000000..409277c72
--- /dev/null
+++ b/egs/wsj/s3/local/dict/add_counts.pl
@@ -0,0 +1,31 @@
+#!/usr/bin/perl
+
+
+# Add counts to an oovlist.
+# Reads in counts as output by uniq -c, and
+# an oovlist, and prints out the counts of the oovlist.
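+# Illustrative example (hypothetical data): if the counts file has a line
+# "42 SPEARMINT" and the oovlist contains "SPEARMINT", the output line is
+# "<tab>42<tab>SPEARMINT".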
+
+(@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n";
+
+$counts = shift @ARGV;
+
+open(C, "<$counts") || die "Opening counts file $counts";
+
+while(<C>) {
+  @A = split(" ", $_);
+  @A == 2 || die "Bad line in counts file: $_";
+  ($count, $word) = @A;
+  $count =~ m:^\d+$: || die "Bad count $A[0]\n";
+  $counts{$word} = $count;
+}
+
+while(<>) {
+  chop;
+  $w = $_;
+  $w =~ m:\S+: || die "Bad word $w";
+  defined $counts{$w} || die "Word $w not present in counts file";
+  print "\t$counts{$w}\t$w\n";
+}
+    
+  
+
diff --git a/egs/wsj/s3/local/dict/count_rules.pl b/egs/wsj/s3/local/dict/count_rules.pl
new file mode 100755
index 000000000..2805e98c3
--- /dev/null
+++ b/egs/wsj/s3/local/dict/count_rules.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl
+
+# This program takes the output of score_prons.pl and collates
+# it for each (rule, destress) pair so that we get the
+# counts of right/partial/wrong for each pair.
+
+# The input is a 7-tuple on each line, like:
+# word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong
+#
+# The output format is a 5-tuple like:
+#
+# rule;destress;right-count;partial-count;wrong-count
+#
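+# For example (hypothetical counts), if the rule "STED,STING,D,NG" with
+# destress "no" was scored right 30 times, partial 4 times and wrong 6
+# times, the output line would be:
+# STED,STING,D,NG;no;30;4;6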
+
+if (@ARGV != 0 && @ARGV != 1) {
+  die "Usage: count_rules.pl < scored_candidate_prons > rule_counts";
+}
+
+
+while(<>) {
+  chop;
+  $line = $_;
+  my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line);
+  
+  my $key = $rulename . ";" . $destress;
+
+  if (!defined $counts{$key}) {
+    $counts{$key} = [ 0, 0, 0 ]; # new anonymous array.
+  }
+  $ref = $counts{$key};
+  if ($score eq "right") {
+    $$ref[0]++;
+  } elsif ($score eq "partial") {
+    $$ref[1]++;
+  } elsif ($score eq "wrong") {
+    $$ref[2]++;
+  } else {
+    die "Bad score $score\n";
+  }
+}
+
+while ( my ($key, $value) = each(%counts)) {
+  print $key . ";" . join(";", @$value) . "\n";
+}
diff --git a/egs/wsj/s3/local/dict/filter_dict.pl b/egs/wsj/s3/local/dict/filter_dict.pl
new file mode 100755
index 000000000..1210bb5e6
--- /dev/null
+++ b/egs/wsj/s3/local/dict/filter_dict.pl
@@ -0,0 +1,19 @@
+#!/usr/bin/perl
+
+
+# This program reads and writes either a dictionary or just a list
+# of words, and it removes any words containing ";" or "," as these
+# are used in these programs.  It will warn about these.
+# It will die if the pronunciations have these symbols in them.
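+#
+# For example (illustrative), an input line "A;B  EY1 B IY1" would be
+# omitted with a warning, since the word "A;B" contains ";".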
+while(<>) {
+  chop;
+  @A = split(" ", $_);
+  $word = shift @A;
+  
+  if ($word =~ m:[;,]:) {
+    print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n" ;
+  } else {
+    $_ =~ m:[;,]: && die "Phones cannot have ; or , in them.";
+    print $_ . "\n";
+  }
+}
diff --git a/egs/wsj/s3/local/dict/find_acronyms.pl b/egs/wsj/s3/local/dict/find_acronyms.pl
new file mode 100755
index 000000000..ed4655afa
--- /dev/null
+++ b/egs/wsj/s3/local/dict/find_acronyms.pl
@@ -0,0 +1,95 @@
+#!/usr/bin/perl
+
+# Reads a dictionary, and prints out a list of words that seem to be pronounced
+# as acronyms (not including plurals of acronyms, just acronyms).  Uses
+# the prons of the individual letters (A., B. and so on) to judge this.
+# Note: this is somewhat dependent on the convention used in CMUdict, that
+# the individual letters are spelled this way (e.g. "A.").
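+#
+# Illustrative example (the letter prons here are assumptions): if the dict
+# contains "A.  EY1", "B.  B IY1" and an entry "AB  EY1 B IY1", then
+# "AB  EY1 B IY1" would be printed as an acronym.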
+
+$max_length = 6; # Max length of words that might be
+ # acronyms.
+
+while(<>) { # Read the dict.
+  chop;
+  @A = split(" ", $_);
+  $word = shift @A;
+  $pron = join(" ", @A);
+  if ($word =~ m/^([A-Z])\.$/ ) {
+    chop $word; # Remove trailing "." to get just the letter
+    $letter = $1;
+    if (!defined $letter_prons{$letter} ) { 
+      $letter_prons{$letter} = [ ]; # new anonymous array
+    }
+    $arrayref = $letter_prons{$letter};
+    push @$arrayref, $pron;
+  } elsif( length($word) <= $max_length ) {
+    $pronof{$word . "," . $pron} = 1;
+    $isword{$word} = 1;
+    #if (!defined $prons{$word} ) {
+    #  $prons{$word} = [ ];
+    #}
+    #  push @{$prons{$word}}, $pron;
+  }
+}
+
+sub get_letter_prons;
+
+foreach $word (keys %isword) {
+  my @letter_prons = get_letter_prons($word);
+  foreach $pron (@letter_prons) {
+    if (defined $pronof{$word.",".$pron}) {
+      print "$word  $pron\n";
+    }
+  }
+}
+
+
+sub get_letter_prons {
+  @acronym = split("", shift); # The letters in the word.
+  my @prons = ( "" );
+  
+  while (@acronym > 0) {
+    $l = shift @acronym;
+    $n = 1; # num-repeats of letter $l.
+    while (@acronym > 0 && $acronym[0] eq $l) {
+      $n++;
+      shift @acronym;
+    }
+    my $arrayref = $letter_prons{$l};
+    my @prons_of_block = ();
+    if ($n == 1) { # Just one repeat.
+      foreach $lpron ( @$arrayref ) {
+        push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
+      }
+    } elsif ($n == 2) { # Two repeats.  Can be "double a" or "a a"
+      foreach $lpron ( @$arrayref ) {
+        push @prons_of_block, "D AH1 B AH0 L " . $lpron;
+        push @prons_of_block, $lpron . " " . $lpron;
+      }
+    } elsif ($n == 3) { # can be "triple a" or "a a a"
+      foreach $lpron ( @$arrayref ) {
+        push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
+        push @prons_of_block, $lpron . " " . $lpron . " " . $lpron;
+      }
+    } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
+      # not sure really.
+      foreach $lpron ( @$arrayref ) {
+        $nlpron = "";
+        for ($m = 0; $m < $n; $m++) { $nlpron = $nlpron . $lpron; }
+        push @prons_of_block, $nlpron;
+      }
+    }
+    my @new_prons = ();
+    foreach $pron (@prons) {
+      foreach $pron_of_block(@prons_of_block) {
+        if ($pron eq "") {
+          push @new_prons, $pron_of_block;
+        } else {
+          push @new_prons, $pron . " " . $pron_of_block;
+        }
+      }
+    }
+    @prons = @new_prons;
+  }
+  return @prons;
+}
diff --git a/egs/wsj/s3/local/dict/get_acronym_prons.pl b/egs/wsj/s3/local/dict/get_acronym_prons.pl
new file mode 100755
index 000000000..3f9936818
--- /dev/null
+++ b/egs/wsj/s3/local/dict/get_acronym_prons.pl
@@ -0,0 +1,123 @@
+#!/usr/bin/perl
+
+# Reads a dictionary (for prons of letters), and an OOV list,
+# and puts out candidate pronunciations of words in that list
+# that could plausibly be acronyms.
+# We judge that a word can plausibly be an acronym if it is
+# a sequence of just letters (no non-letter characters such
+# as "'"),  or something like U.K.,
+# and the number of letters is four or less.
+#
+# If the text were not already pre-normalized, there would
+# be other hints such as capitalization.
+
+# This program appends
+# the prons of the individual letters (A., B. and so on) to work out
+# the pron of the acronym.
+# Note: this is somewhat dependent on the convention used in CMUdict, that
+# the individual letters are spelled this way (e.g. "A.").  [It seems
+# to also have the separated versions.]
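+#
+# Illustrative example (the letter prons here are assumptions): given dict
+# entries "U.  Y UW1" and "K.  K EY1", the OOV "U.K." would yield the
+# candidate line:
+# U.K.  Y UW1 K EY1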
+
+if (!(@ARGV == 1 || @ARGV == 2)) { 
+  print "Usage: get_acronym_prons.pl dict [oovlist]";
+}
+
+$max_length = 4; # Max #letters in an acronym. (Longer 
+ # acronyms tend to have "pseudo-pronunciations", e.g. think about UNICEF.)
+
+$dict = shift @ARGV;
+open(D, "<$dict") || die "Opening dictionary $dict";
+
+while(<D>) { # Read the dict, to get the prons of the letters.
+  chop;
+  @A = split(" ", $_);
+  $word = shift @A;
+  $pron = join(" ", @A);
+  if ($word =~ m/^([A-Z])\.$/ ) {
+    chop $word; # Remove trailing "." to get just the letter
+    $letter = $1;
+    if (!defined $letter_prons{$letter} ) { 
+      $letter_prons{$letter} = [ ]; # new anonymous array
+    }
+    $arrayref = $letter_prons{$letter};
+    push @$arrayref, $pron;
+  } elsif( length($word) <= $max_length ) {
+    $pronof{$word . "," . $pron} = 1;
+    $isword{$word} = 1;
+    #if (!defined $prons{$word} ) {
+    #  $prons{$word} = [ ];
+    #}
+    #  push @{$prons{$word}}, $pron;
+  }
+}
+
+sub get_letter_prons;
+
+while(<>) { # Read OOVs.
+  # For now, just do the simple cases without "." in 
+  # between... things with "." in the OOV list seem to
+  # be mostly errors.
+  chop;
+  $word = $_;
+  if ($word =~ m/^[A-Z]{1,5}$/) {
+    foreach $pron ( get_letter_prons($word) ) { # E.g. UNPO
+      print "$word  $pron\n";
+    }
+  } elsif ($word =~ m:^(\w\.){1,4}\w\.?$:) { # E.g. U.K.  Make the final "." optional.
+    $letters = $word;
+    $letters =~ s:\.::g;
+    foreach $pron ( get_letter_prons($letters) ) { 
+      print "$word  $pron\n";
+    }
+  }
+}
+
+sub get_letter_prons {
+  @acronym = split("", shift); # The letters in the word.
+  my @prons = ( "" );
+  
+  while (@acronym > 0) {
+    $l = shift @acronym;
+    $n = 1; # num-repeats of letter $l.
+    while (@acronym > 0 && $acronym[0] eq $l) {
+      $n++;
+      shift @acronym;
+    }
+    my $arrayref = $letter_prons{$l};
+    my @prons_of_block = ();
+    if ($n == 1) { # Just one repeat.
+      foreach $lpron ( @$arrayref ) {
+        push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
+      }
+    } elsif ($n == 2) { # Two repeats.  Can be "double a" or "a a"
+      foreach $lpron ( @$arrayref ) {
+        push @prons_of_block, "D AH1 B AH0 L " . $lpron;
+        push @prons_of_block, $lpron . " " . $lpron;
+      }
+    } elsif ($n == 3) { # can be "triple a" or "a a a"
+      foreach $lpron ( @$arrayref ) {
+        push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
+        push @prons_of_block, "$lpron $lpron $lpron";
+      }
+    } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
+      # not sure really.
+      foreach $lpron ( @$arrayref ) {
+        $nlpron = $lpron;
+        for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; }
+        push @prons_of_block, $nlpron;
+      }
+    }
+    my @new_prons = ();
+    foreach $pron (@prons) {
+      foreach $pron_of_block(@prons_of_block) {
+        if ($pron eq "") {
+          push @new_prons, $pron_of_block;
+        } else {
+          push @new_prons, $pron . " " . $pron_of_block;
+        }
+      }
+    }
+    @prons = @new_prons;
+  }
+  return @prons;
+}
diff --git a/egs/wsj/s3/local/dict/get_candidate_prons.pl b/egs/wsj/s3/local/dict/get_candidate_prons.pl
new file mode 100755
index 000000000..b13efd203
--- /dev/null
+++ b/egs/wsj/s3/local/dict/get_candidate_prons.pl
@@ -0,0 +1,187 @@
+#!/usr/bin/perl
+
+# This script takes two or three command-line arguments (typically files, or "-"):
+# the suffix rules (as output by get_rules.pl), the base dictionary that we
+# generate prons from, and optionally the words that we want prons to be
+# generated for (one per line; if not supplied, they are read from stdin).
+
+# The output consists of candidate generated pronunciations for those words,
+# together with information about how we generated those pronunciations.
+# This does not do pruning of the candidates using the restriction
+# "you can't use a more general rule when a more specific one is applicable".
+# That is done by limit_candidate_prons.pl.
+
+# Each line of the output consists of a 6-tuple (or 7-tuple if rule scores
+# were supplied), separated by ";", of the form:
+# word;pron;base-word;base-pron;rule-name;destress[;rule-score]
+# [the last field is only present if you supplied rules with score information].
+# where:
+# - "word" is the input word that we queried for, e.g. WASTED
+# - "pron" is the generated pronunciation, e.g. "W EY1 S T AH0 D"
+# - rule-name is a 4-tuple separated by commas that describes the rule, e.g.
+#   "STED,STING,D,NG",
+# - "base-word" is the base-word we're getting the pron from,
+#   e.g. WASTING
+# - "base-pron" is the pron of the base-word, e.g. "W EY1 S T IH0 NG"
+# - "destress" is either "yes" or "no" and corresponds to whether we destressed the
+#   base-word or not [de-stressing just corresponds to taking any 2's down to 1's,
+#   although we may extend this in future]... 
+# - "rule-score" is a numeric score of the rule (this field is only present
+#   if there was score information in your rules).
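+#
+# Putting the example fields above together, one output line might look like
+# (the rule score shown here is a made-up illustration):
+# WASTED;W EY1 S T AH0 D;WASTING;W EY1 S T IH0 NG;STED,STING,D,NG;no;0.9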
+
+
+(@ARGV == 2  || @ARGV == 3) || die "Usage: get_candidate_prons.pl rules base-dict [ words ]";
+
+$min_prefix_len = 3;  # this should probably match with get_rules.pl
+
+$rules = shift @ARGV; # Note: rules may be with destress "yes/no" indicators or without...
+                      # if without, it's treated as if both "yes" and "no" are present.
+$dict = shift @ARGV;
+
+open(R, "<$rules") || die "Opening rules file: $rules";
+
+sub process_word;
+
+while(<R>) {
+  chop $_;
+  my ($rule, $destress, $rule_score) = split(";", $_); # We may have "destress" markings (yes|no),
+  # and scores, or we may have just rule, in which case
+  # $destress and $rule_score will be undefined.
+
+  my @R = split(",", $rule, 4); # "my" means new instance of @R each
+  # time we do this loop -> important because we'll be creating
+  # a reference to @R below.
+  # Note: the last arg to SPLIT tells it how many fields max to get.
+  # This stops it from omitting empty trailing fields.
+  @R == 4 || die "Bad rule $_";
+  $suffix = $R[0]; # Suffix of word we want pron for.
+  if (!defined $isrule{$rule}) {
+    $isrule{$rule} = 1; # make sure we do this only once for each rule 
+    # (don't repeat for different stresses).
+    if (!defined $suffix2rule{$suffix}) {
+      # The syntax [ $x, $y, ... ] means a reference to a newly created array
+      # containing $x, $y, etc.   \@R creates an array reference to R.
+      # so suffix2rule is a hash from suffix to ref to array of refs to 
+      # 4-dimensional arrays.
+      $suffix2rule{$suffix} = [ \@R ];
+    } else {
+      # Below, the syntax @{$suffix2rule{$suffix}} dereferences the array
+      # reference inside the hash; \@R pushes onto that array a new array
+      # reference pointing to @R.
+      push @{$suffix2rule{$suffix}}, \@R;
+    }
+  }
+  if (!defined $rule_score) { $rule_score = -1; } # -1 means we don't have the score info.
+  
+  # Now store information on which destress markings (yes|no) this rule
+  # is valid for, and the associated scores (if supplied)
+  # If just the rule is given (i.e. no destress marking specified),
+  # assume valid for both.
+  if (!defined $destress) { # treat as if both "yes" and "no" are valid.
+    $rule_and_destress_to_rule_score{$rule.";yes"} = $rule_score;
+    $rule_and_destress_to_rule_score{$rule.";no"} = $rule_score;
+  } else {
+    $rule_and_destress_to_rule_score{$rule.";".$destress} = $rule_score;
+  }
+
+}
+
+open(D, "<$dict") || die "Opening base dictionary: $dict";
+while(<D>) {
+  @A = split(" ", $_);
+  $word = shift @A;
+  $pron = join(" ", @A);
+  if (!defined $word2prons{$word}) {
+    $word2prons{$word} = [ $pron ]; # Ref to new anonymous array containing just "pron".
+  } else {
+    push @{$word2prons{$word}}, $pron; # Push $pron onto array referred to (@$ref derefs array).
+  }
+}
+foreach $word (keys %word2prons) {
+  # Set up the hash "prefixcount", which says how many times a char-sequence
+  # is a prefix (not necessarily a strict prefix) of a word in the dict.
+  $len = length($word);
+  for ($l = 0; $l <= $len; $l++) {
+    $prefixcount{substr($word, 0, $l)}++;
+  }
+}
+
+open(R, "<$rules") || die "Opening rules file: $rules";
+
+
+while(<>) {
+  chop;
+  m/^\S+$/ || die;
+  process_word($_);
+}
+
+sub process_word {
+  my $word = shift @_;
+  $len = length($word);
+  # $owncount is used in evaluating whether a particular prefix is a prefix
+  # of some other word in the dict... if a word itself may be in the dict
+  # (usually because we're running this on the dict itself), we need to
+  # correct for this.
+  if (defined $word2prons{$word}) { $owncount = 1; } else { $owncount = 0; }
+  
+  for ($prefix_len = $min_prefix_len; $prefix_len <= $len; $prefix_len++) {
+    my $prefix = substr($word, 0, $prefix_len);
+    my $suffix = substr($word, $prefix_len);
+    if ($prefixcount{$prefix} - $owncount == 0) {
+      # This prefix is not a prefix of any word in the dict, so no point
+      # checking the rules below-- none of them can match.
+      next;
+    }
+    $rules_array_ref = $suffix2rule{$suffix};
+    if (defined $rules_array_ref) {
+      foreach $R (@$rules_array_ref) { # @$rules_array_ref dereferences the array.
+        # $R is a reference to a 4-dimensional array, whose elements we access with
+        # $$R[0], etc.
+        my $base_suffix = $$R[1];
+        my $base_word = $prefix . $base_suffix;
+        my $base_prons_ref = $word2prons{$base_word};
+        if (defined $base_prons_ref) {
+          my $psuffix = $$R[2];
+          my $base_psuffix = $$R[3];
+          if ($base_psuffix ne "") { 
+            $base_psuffix = " " . $base_psuffix; 
+            # Include " ", the space between phones, to prevent
+            # matching partial phones below.
+          }
+          my $base_psuffix_len = length($base_psuffix);
+          foreach $base_pron (@$base_prons_ref) { # @$base_prons_ref derefs 
+            # that reference to an array.
+            my $base_pron_prefix_len = length($base_pron) - $base_psuffix_len;
+            # Note: these lengths are in characters, not phones.
+            if ($base_pron_prefix_len >= 0 && 
+                substr($base_pron, $base_pron_prefix_len) eq $base_psuffix) {
+              # The suffix of the base_pron is what it should be.
+              my $pron_prefix = substr($base_pron, 0, $base_pron_prefix_len);
+              my $rule = join(",", @$R); # we'll output this..
+              my $len = @R;
+              for ($destress = 0; $destress <= 1; $destress++) { # Two versions 
+                # of each rule: with destressing and without.
+                # pron is the generated pron.
+                if ($destress) {  $pron_prefix =~ s/2/1/g; }
+                my $pron;
+                if ($psuffix ne "") { $pron = $pron_prefix . " " . $psuffix; }
+                else { $pron = $pron_prefix; }
+                # Now print out the info about the generated pron.
+                my $destress_mark = ($destress ? "yes" : "no");
+                my $rule_score = $rule_and_destress_to_rule_score{$rule.";".$destress_mark};
+                if (defined $rule_score) { # Means that the (rule,destress) combination was
+                  # seen [note: this if-statement may be pointless, as currently we don't
+                  # do any pruning of rules].
+                  my @output = ($word, $pron, $base_word, $base_pron, $rule, $destress_mark);
+                  if ($rule_score != -1) { push @output, $rule_score; } # If scores were supplied,
+                  # we also output the score info.
+                  print join(";", @output) . "\n";
+                }
+              }
+            }  
+          }
+        }
+      }
+    }
+  }
+}  
diff --git a/egs/wsj/s3/local/dict/get_rule_hierarchy.pl b/egs/wsj/s3/local/dict/get_rule_hierarchy.pl
new file mode 100755
index 000000000..35805b46b
--- /dev/null
+++ b/egs/wsj/s3/local/dict/get_rule_hierarchy.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl
+
+#This reads in rules, of the form put out by get_rules.pl, e.g.:
+# ERT,,ER0 T,
+# MENT,ING,M AH0 N T,IH0 NG
+# S,TON,Z,T AH0 N
+# ,ER,IH0 NG,IH0 NG ER0
+# ,'S,M AH0 N,M AH0 N Z
+#TIONS,TIVE,SH AH0 N Z,T IH0 V
+
+# and it works out a hierarchy that says which rules are sub-cases
+# of which rules: it outputs on each line a pair separated by ";", where
+# each member of the pair is a rule, first one is the specialization, the
+# second one being more general.
+# E.g.:
+# RED,RE,D,;ED,E,D,
+# RED,RE,D,;D,,D,
+# GING,GE,IH0 NG,;ING,I,IH0 NG,
+# TOR,TING,T ER0,T IH0 NG;OR,OR,T ER0,T ER0
+# ERED,ER,D,;RED,R,D,
+# ERED,ER,D,;ED,,D,
+
+
+
+
+while(<>) {
+  chop;
+  $rule = $_;
+  $isrule{$rule} = 1;
+  push @rules, $rule;
+}
+
+foreach my $rule (@rules) {
+  # Truncate the letters and phones in the rule, while we
+  # can, to get more general rules; if the more general rule
+  # exists, put out the pair.
+  @A = split(",", $rule);
+  @suffixa = split("", $A[0]);
+  @suffixb = split("", $A[1]);
+  @psuffixa = split(" ", $A[2]);
+  @psuffixb = split(" ", $A[3]);
+  for ($common_suffix_len = 0; $common_suffix_len < @suffixa && $common_suffix_len < @suffixb;) {
+    if ($suffixa[$common_suffix_len] eq $suffixb[$common_suffix_len]) {
+      $common_suffix_len++;
+    } else {
+      last;
+    }
+  }
+  for ($common_psuffix_len = 0; $common_psuffix_len < @psuffixa && $common_psuffix_len < @psuffixb;) {
+    if ($psuffixa[$common_psuffix_len] eq $psuffixb[$common_psuffix_len]) {
+      $common_psuffix_len++;
+    } else {
+      last;
+    }
+  }
+  # Get all combinations of pairs of integers <= (common_suffix_len, common_psuffix_len),
+  # except (0,0), and print out this rule together with the corresponding rule (if it exists).
+  for ($m = 0; $m <= $common_suffix_len; $m++) {
+    $sa = join("", @suffixa[$m...$#suffixa]); # @x[a..b] is array slice notation.
+    $sb = join("", @suffixb[$m...$#suffixb]);
+    for ($n = 0; $n <= $common_psuffix_len; $n++) {
+      if (!($m == 0 && $n == 0)) {
+        $psa = join(" ", @psuffixa[$n...$#psuffixa]);
+        $psb = join(" ", @psuffixb[$n...$#psuffixb]);
+        $more_general_rule = join(",", ($sa, $sb, $psa, $psb));
+        if (defined $isrule{$more_general_rule}) {
+          print $rule . ";" . $more_general_rule . "\n";
+        }
+      }
+    }
+  }
+}
+
diff --git a/egs/wsj/s3/local/dict/get_rules.pl b/egs/wsj/s3/local/dict/get_rules.pl
new file mode 100755
index 000000000..a5b57b088
--- /dev/null
+++ b/egs/wsj/s3/local/dict/get_rules.pl
@@ -0,0 +1,204 @@
+#!/usr/bin/perl
+
+# This program creates suggested suffix rules from a dictionary.
+# It outputs quadruples of the form:
+# suffix,base-suffix,psuffix,base-psuffix
+# where "suffix" is the suffix of the letters of a word, "base-suffix" is
+# the suffix of the letters of the base-word, "psuffix" is the suffix of the
+# pronunciation of the word (a space-separated list of phonemes), and
+# "base-psuffix" is the suffix of the pronunciation of the baseword.
+# As far as this program is concerned, there is no distinction between
+# "word" and "base-word".  To simplify things slightly, what it does
+# is return all tuples (a,b,c,d) [with a != b] such that there are
+# at least $min_suffix_count instances in the dictionary of
+# a (word-prefix, pron-prefix) pair where there exists (word,pron)
+# pairs of the form
+# ( word-prefix . a,  pron-prefix . c)
+# and 
+# ( word-prefix . b, pron-prefix . d)
+# For example if (a,b,c,d) equals (USLY,US,S L IY0,S)
+# then this quadruple will be output as long as there at least
+# e.g. 30 instances of prefixes like (FAM, F EY1 M AH0)
+# where there exist (word, pron) pairs like:
+# FAMOUS, F EY1 M AH0 S
+# FAMOUSLY  F EY1 M AH0 S L IY0
+#
+# There are some modifications to the picture above, for efficiency.
+# If $disallow_empty_suffix != 0, this program will not output 4-tuples where
+# the first element (the own-word suffix) is empty, as this would cause
+# efficiency problems in get_candidate_prons.pl.  If 
+# $ignore_prefix_stress != 0, this program will ignore stress markings
+# while evaluating whether prefixes are the same.
+# The minimum count for a quadruple to be output is $min_suffix_count
+# (e.g. 30).
+#
+# The function of this program is not to evaluate the accuracy of these rules;
+# it is mostly a pruning step, where we suggest rules that have large enough
+# counts to be suitable for our later procedure where we evaluate their
+# accuracy in predicting prons.
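+#
+# So, continuing the example above, one line of the final output could be
+# exactly the quadruple:
+# USLY,US,S L IY0,S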
+
+$disallow_empty_suffix = 1; # Disallow rules where the suffix of the "own-word" is
+   # empty.  This is for efficiency in later stages (e.g. get_candidate_prons.pl).
+$min_prefix_len = 3;  # this must match with get_candidate_prons.pl
+$ignore_prefix_stress = 1; # or 0 to take account of stress in prefix.
+$min_suffix_count = 20;
+
+# Takes in dictionary.
+
+print STDERR "Reading dict\n";
+while(<>) {
+  @A = split(" ", $_);
+  my $word = shift @A;
+  my $pron = join(" ", @A);
+  if (!defined $prons{$word}) {
+    $prons{$word} = $pron;
+    push @words, $word;
+  } else {
+    $prons{$word} = $prons{$word} . ";" . $pron;
+  }
+}
+
+# Get common suffixes (e.g., count >100).  Include empty suffix.
+
+print STDERR "Getting common suffix counts.\n";
+{
+  foreach $word (@words) {
+    $len = length($word);
+    for ($x = $min_prefix_len; $x <= $len; $x++) {
+      $suffix_count{substr($word, $x)}++;
+    }
+  }
+
+  foreach $suffix (keys %suffix_count) {
+    if ($suffix_count{$suffix} >= $min_suffix_count) {
+      $newsuffix_count{$suffix} = $suffix_count{$suffix};
+    }
+  }
+  %suffix_count = %newsuffix_count;
+  undef %newsuffix_count;
+
+  foreach $suffix ( sort { $suffix_count{$b} <=> $suffix_count{$a} } keys %suffix_count ) {
+    print STDERR "$suffix_count{$suffix} $suffix\n";
+  }
+}
+
+print STDERR "Getting common suffix pairs.\n";
+
+{
+  print STDERR " Getting map from prefix -> suffix-set.\n";
+
+  # Create map from prefix -> suffix-set.
+  foreach $word (@words) {
+    $len = length($word);
+    for ($x = $min_prefix_len; $x <= $len; $x++) {
+      $prefix = substr($word, 0, $x);
+      $suffix = substr($word, $x);
+      if (defined $suffix_count{$suffix}) { # Suffix is common...
+        if (!defined $suffixes_of{$prefix}) {
+          $suffixes_of{$prefix} = [ $suffix ]; # Create a reference to a new array with
+          # one element.
+        } else {
+          push @{$suffixes_of{$prefix}}, $suffix; # Push $suffix onto the array that
+          # the hash member is a reference to.
+        }
+      }
+    }
+  }
+  my %suffix_set_count;
+  print STDERR " Getting map from suffix-set -> count.\n";
+  while ( my ($key, $value) = each(%suffixes_of) ) { 
+    my @suffixes = sort ( @$value );
+    $suffix_set_count{join(";", @suffixes)}++;
+  }
+  print STDERR " Getting counts for suffix pairs.\n";
+  while ( my ($suffix_set, $count) = each (%suffix_set_count) ) {
+    my @suffixes = split(";", $suffix_set);
+    # Consider pairs to be ordered.  This is more convenient
+    # later on.
+    foreach $suffix_a (@suffixes) {
+      foreach $suffix_b (@suffixes) {
+        if ($suffix_a ne $suffix_b) {
+          $suffix_pair = $suffix_a . "," . $suffix_b;
+          $suffix_pair_count{$suffix_pair} += $count;
+        }
+      }
+    }
+  }
+
+  # To save memory, only keep pairs above threshold in the hash.
+  while ( my ($suffix_pair, $count) = each (%suffix_pair_count) ) {
+    if ($count >= $min_suffix_count) {
+      $new_hash{$suffix_pair} = $count;
+    }
+  }
+  %suffix_pair_count = %new_hash;
+  undef %new_hash;
+
+  # Print out the suffix pairs so the user can see.
+  foreach $suffix_pair ( 
+      sort { $suffix_pair_count{$b} <=> $suffix_pair_count{$a} } keys %suffix_pair_count ) {
+    print STDERR "$suffix_pair_count{$suffix_pair} $suffix_pair\n";
+  }
+}
+
+print STDERR "Getting common suffix/suffix/psuffix/psuffix quadruples\n";
+
+{
+  while ( my ($prefix, $suffixes_ref) = each(%suffixes_of) ) {
+    # Note: suffixes_ref is a reference to an array.  We dereference with
+    # @$suffixes_ref.
+    # Consider each pair of suffixes (in each order).
+    foreach my $suffix_a ( @$suffixes_ref ) {
+      foreach my $suffix_b ( @$suffixes_ref ) {
+        # could just use "defined" in the next line, but this is for clarity.
+        $suffix_pair = $suffix_a.",".$suffix_b;
+        if ( $suffix_pair_count{$suffix_pair} >= $min_suffix_count ) {
+          foreach $pron_a_str (split(";", $prons{$prefix.$suffix_a})) {
+            @pron_a = split(" ", $pron_a_str);
+            foreach $pron_b_str (split(";", $prons{$prefix.$suffix_b})) {
+              @pron_b = split(" ", $pron_b_str);
+              $len_a = @pron_a; # evaluating array as scalar automatically gives length.
+              $len_b = @pron_b;
+              for (my $pos = 0; $pos <= $len_a && $pos <= $len_b; $pos++) {
+                # $pos is starting-pos of psuffix-pair. 
+                $psuffix_a = join(" ", @pron_a[$pos...$#pron_a]);
+                $psuffix_b = join(" ", @pron_b[$pos...$#pron_b]);
+                $quadruple = $suffix_pair . "," . $psuffix_a . "," . $psuffix_b;
+                $quadruple_count{$quadruple}++;
+                
+                my ($pron_a_pos, $pron_b_pos) = ($pron_a[$pos], $pron_b[$pos]);
+                if ($ignore_prefix_stress) {
+                  $pron_a_pos =~ s/\d//; # e.g. convert IH0 to IH.  Only affects
+                  $pron_b_pos =~ s/\d//; # whether we exit the loop below.
+                }
+                if ($pron_a_pos ne $pron_b_pos) {
+                  # This is important: we don't consider a pron suffix-pair to be
+                  # valid unless the pron prefix is the same.
+                  last;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  # To save memory, only keep pairs above threshold in the hash.
+  while ( my ($quadruple, $count) = each (%quadruple_count) ) {
+    if ($count >= $min_suffix_count) {
+      $new_hash{$quadruple} = $count;
+    }
+  }
+  %quadruple_count = %new_hash;
+  undef %new_hash;
+  
+  # Print out the quadruples for diagnostics.
+  foreach $quadruple ( 
+    sort { $quadruple_count{$b} <=> $quadruple_count{$a} } keys %quadruple_count ) {
+    print STDERR "$quadruple_count{$quadruple} $quadruple\n";
+  }
+}
+# Now print out the quadruples; these are the output of this program.
+foreach $quadruple (keys %quadruple_count) {
+  print $quadruple."\n";
+}
diff --git a/egs/wsj/s3/local/dict/limit_candidate_prons.pl b/egs/wsj/s3/local/dict/limit_candidate_prons.pl
new file mode 100755
index 000000000..ceff9fbad
--- /dev/null
+++ b/egs/wsj/s3/local/dict/limit_candidate_prons.pl
@@ -0,0 +1,103 @@
+#!/usr/bin/perl
+
+# This program enforces the rule that
+# if a "more specific" rule applies, we cannot use the more general rule.
+# It takes in tuples generated by get_candidate_prons (one per line, separated
+# by ";"), of the form:
+# word;pron;base-word;base-pron;rule-name;de-stress[;rule-score]
+# [note: we mean that the last element, the numeric score of the rule, is optional]
+# and it outputs a (generally shorter) list
+# of the same form.
+
+
+# For each word:
+  # For each (base-word,base-pron):
+  #  Eliminate "more-general" rules as follows:
+  #    For each pair of rules applying to this (base-word, base-pron):
+  #      If pair is in more-general hash, disallow more general one.
+  # Let the output be: for each (base-word, base-pron, rule):
+  # for (destress-prefix) in [yes, no], do:
+  #  print out the word input, the rule-name, [destressed:yes|no], and the new pron.
+
+
+if (@ARGV != 1 && @ARGV != 2) {
+  die "Usage: limit_candidate_prons.pl rule_hierarchy [candidate_prons] > limited_candidate_prons";
+}
+
+$hierarchy = shift @ARGV;
+open(H, "<$hierarchy") || die "Opening rule hierarchy $hierarchy";
+
+while(<H>) {
+  chop;
+  m:.+;.+: || die "Bad rule-hierarchy line $_";
+  $hierarchy{$_} = 1; # Format is: if $rule1 is the string form of the more specific rule
+  # and $rule2 is the string form of the more general rule, then $hierarchy{$rule1.";".$rule2}
+  # is defined, else undefined.
+}
+
+
+sub process_word;
+
+undef $cur_word;
+@cur_lines = ();
+
+while(<>) {
+  # input, output is:
+  # word;pron;base-word;base-pron;rule-name;destress;score
+  chop;
+  m:^([^;]+);: || die "Unexpected input: $_";
+  $word = $1;
+  if (!defined $cur_word || $word eq $cur_word) {
+    if (!defined $cur_word) { $cur_word = $word; }
+    push @cur_lines, $_;
+  } else {
+    process_word(@cur_lines); # Process a series of suggested prons
+    # for a particular word.
+    $cur_word = $word;
+    @cur_lines = ( $_ ); 
+  }
+}
+process_word(@cur_lines);
+  
+sub process_word {
+  my %pair2rule_list; # hash from $baseword.";".$basepron to ref
+  # to array of [ line1, line2, ... ].
+  my @cur_lines = @_;
+  foreach my $line (@cur_lines) {
+    my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
+    my $key = $baseword.";".$basepron;
+    if (defined $pair2rule_list{$key}) {
+      push @{$pair2rule_list{$key}}, $line; # @{...} derefs the array pointed to 
+      # by the array ref inside {}. 
+    } else {
+      $pair2rule_list{$key} = [ $line ]; # [ $x ] is new anonymous array with 1 elem ($x)
+    }
+  }
+  while ( my ($key, $value) = each(%pair2rule_list) ) {
+    my @lines = @$value; # array of lines that are for this (baseword,basepron).
+    my (@stress, @rules); # Arrays of stress markers and rule names, indexed by
+    # same index that indexes @lines.
+    for (my $n = 0; $n < @lines; $n++) {
+      my $line = $lines[$n];
+      my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
+      $stress[$n] = $destress;
+      $rules[$n] = $rulename;
+    }
+    for (my $m = 0; $m < @lines; $m++) {
+      my $ok = 1; # if stays 1, this line is OK.
+      for (my $n = 0; $n < @lines; $n++) {
+        if ($m != $n && $stress[$m] eq $stress[$n]) {
+          if (defined $hierarchy{$rules[$n].";".$rules[$m]}) {
+            # Note: this "hierarchy" variable is defined if $rules[$n] is a more
+            # specific instance of $rules[$m], thus invalidating $rules[$m].
+            $ok = 0;
+            last; # no point iterating further.
+          }
+        }
+      }
+      if ($ok != 0) {
+        print $lines[$m] . "\n";
+      }
+    }
+  }
+}
diff --git a/egs/wsj/s3/local/dict/reverse_candidates.pl b/egs/wsj/s3/local/dict/reverse_candidates.pl
new file mode 100755
index 000000000..d5c5effc2
--- /dev/null
+++ b/egs/wsj/s3/local/dict/reverse_candidates.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl
+
+# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl,
+# which is 7-tuples, one per line, of the form:
+
+# word;pron;base-word;base-pron;rule-name;de-stress;rule-score
+# (where rule-score is sometimes listed as optional, but this
+# program does expect it, since we don't anticipate it being used
+# without it).
+# This program assumes that all the words and prons and rules have
+# come from a reversed dictionary (reverse_dict.pl) where the order
+# of the characters in the words, and the phones in the prons, have
+# been reversed, and it un-reverses them.  That is, the characters
+# in "word" and "base-word", and the phones in "pron" and "base-pron"
+# are reversed; and the rule ("rule-name") is parsed as a 4-tuple,
+# like:
+# suffix,base-suffix,psuffix,base-psuffix
+# so this program reverses the characters in "suffix" and "base-suffix"
+# and the phones (separated by spaces) in "psuffix" and "base-psuffix".
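+#
+# Illustrative example: if an input line has word "DETSAW" and pron
+# "D AH0 T S EY1 W" (i.e. WASTED reversed), the corresponding output
+# fields would be "WASTED" and "W EY1 S T AH0 D".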
+
+sub reverse_str {
+  $str = shift;
+  return join("", reverse(split("", $str)));
+}
+sub reverse_pron {
+  $str = shift;
+  return join(" ", reverse(split(" ", $str)));
+}
+
+while(<>){ 
+  chop;
+  @A = split(";", $_);
+  @A == 7 || die "Bad input line $_: found " . scalar(@A) . " fields, expected 7.";
+
+  ($word,$pron,$baseword,$basepron,$rule,$destress,$score) = @A;
+  $word = reverse_str($word);
+  $pron = reverse_pron($pron);
+  $baseword = reverse_str($baseword);
+  $basepron = reverse_pron($basepron);
+  @R = split(",", $rule, 4);
+  @R == 4 || die "Bad rule $rule";
+
+  $R[0] = reverse_str($R[0]); # suffix.
+  $R[1] = reverse_str($R[1]); # base-suffix.
+  $R[2] = reverse_pron($R[2]); # psuffix.
+  $R[3] = reverse_pron($R[3]); # base-psuffix.
+  $rule = join(",", @R);
+  @A = ($word,$pron,$baseword,$basepron,$rule,$destress,$score);
+  print join(";", @A) . "\n";
+}
diff --git a/egs/wsj/s3/local/dict/reverse_dict.pl b/egs/wsj/s3/local/dict/reverse_dict.pl
new file mode 100755
index 000000000..75681711b
--- /dev/null
+++ b/egs/wsj/s3/local/dict/reverse_dict.pl
@@ -0,0 +1,14 @@
+#!/usr/bin/perl
+
+# Used in conjunction with get_rules.pl
+# example input line: XANTHE  Z AE1 N DH
+# example output line: EHTNAX DH N AE1 Z
+
+while(<>){ 
+  @A = split(" ", $_);
+  $word = shift @A;
+  $word = join("", reverse(split("", $word))); # Reverse letters of word.
+  @A = reverse(@A); # Reverse phones in pron.
+  unshift @A, $word;
+  print join(" ", @A) . "\n";
+}
diff --git a/egs/wsj/s3/local/dict/score_prons.pl b/egs/wsj/s3/local/dict/score_prons.pl
new file mode 100755
index 000000000..fd5a004d8
--- /dev/null
+++ b/egs/wsj/s3/local/dict/score_prons.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl
+
+# This program takes candidate prons from "get_candidate_prons.pl" or
+# "limit_candidate_prons.pl", and a reference dictionary covering those words,
+# and outputs the same format but with scoring information added (so we go from
+# 6 to 7 fields).  The scoring information says, for each generated pron,
+# whether we have a match, a partial match, or no match, to some word in the
+# dictionary.  A partial match means it's correct except for stress.
+
+# The input is a 6-tuple on each line, like:
+# word;pron;base-word;base-pron;rule-name;de-stress
+#
+# The output is the same except with one more field, the score,
+# which may be "right", "wrong", "partial".
+
+if (@ARGV != 1 && @ARGV != 2) {
+  die "Usage: score_prons.pl reference_dict [candidate_prons] > scored_candidate_prons";
+}
+
+$dict = shift @ARGV;
+open(D, "<$dict") || die "Opening dictionary $dict";
+
+while(<D>) { # Set up some hashes that tell us when
+  # a (word,pron) pair is correct (and the same for
+  # prons with stress information removed).
+  chop;
+  @A = split(" ", $_);
+  $word = shift @A;
+  $pron = join(" ", @A);
+  $pron_nostress = $pron;
+  $pron_nostress =~ s:\d::g;
+  $word_and_pron{$word.";".$pron} = 1;
+  $word_and_pron_nostress{$word.";".$pron_nostress} = 1;
+}
+
+while(<>) {
+  chop;
+  $line = $_;
+  my ($word, $pron, $baseword, $basepron, $rulename, $destress) = split(";", $line);
+  $pron_nostress = $pron;
+  $pron_nostress =~ s:\d::g;
+  if (defined $word_and_pron{$word.";".$pron}) {
+    $score = "right";
+  } elsif (defined $word_and_pron_nostress{$word.";".$pron_nostress}) {
+    $score = "partial";
+  } else {
+    $score = "wrong";
+  }
+  print $line.";".$score."\n";
+}
diff --git a/egs/wsj/s3/local/dict/score_rules.pl b/egs/wsj/s3/local/dict/score_rules.pl
new file mode 100755
index 000000000..8d165f7f1
--- /dev/null
+++ b/egs/wsj/s3/local/dict/score_rules.pl
@@ -0,0 +1,52 @@
+#!/usr/bin/perl
+
+# This program takes the output of count_rules.pl, which is tuples
+# of the form
+#
+# rule;destress;right-count;partial-count;wrong-count
+#
+# and outputs lines of the form
+#
+# rule;de-stress;score
+#
+# where the score, between 0 and 1 (higher is better), is
+# computed as:
+#
+# ((#correct) + $partial_score * (#partial)) / (#correct + #partial + #wrong + $ballast)
+#
+# where $partial_score (e.g. 0.8) is the score we assign to a "partial" match,
+# and $ballast is a small number, e.g. 1, that is treated like "extra" wrong scores, to penalize
+# rules with few observations.
+#
+# It outputs a score for every rule in the input.
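+#
+# Worked example (hypothetical counts): with the defaults partial_score=0.8
+# and ballast=1, the input line "STED,STING,D,NG;no;30;4;6" would get
+# (30 + 0.8*4) / (30 + 4 + 6 + 1) = 33.2 / 41 = 0.80976, so the output
+# line would be "STED,STING,D,NG;no;0.80976".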
+
+$ballast = 1;
+$partial_score = 0.8;
+$destress_penalty = 1.0e-05; # Give destressed rules a small
+# penalty vs. their no-destress counterparts, so if we
+# have to choose arbitrarily we won't destress (seems safer).
+
+for ($n = 1; $n <= 4; $n++) {
+  if ($ARGV[0] eq "--ballast") {
+    shift @ARGV;
+    $ballast = shift @ARGV;
+  }
+  if ($ARGV[0] eq "--partial-score") {
+    shift @ARGV;
+    $partial_score = shift @ARGV;
+    ($partial_score >= 0.0 && $partial_score <= 1.0) || die "Invalid partial_score: $partial_score";
+  }
+}
+
+(@ARGV == 0 || @ARGV == 1) || die "Usage: score_rules.pl [--ballast ballast-count] [--partial-score partial-score] [input from count_rules.pl]";
+
+while(<>) {
+  @A = split(";", $_);
+  @A == 5 || die "Bad input line; $_";
+  ($rule,$destress,$right_count,$partial_count,$wrong_count) = @A;
+  $rule_score = ($right_count + $partial_score*$partial_count) / 
+    ($right_count+$partial_count+$wrong_count+$ballast);
+  if ($destress eq "yes") { $rule_score -= $destress_penalty; }
+  print join(";", $rule, $destress, sprintf("%.5f", $rule_score)) . "\n";
+}
diff --git a/egs/wsj/s3/local/dict/select_candidate_prons.pl b/egs/wsj/s3/local/dict/select_candidate_prons.pl
new file mode 100755
index 000000000..d0018c98a
--- /dev/null
+++ b/egs/wsj/s3/local/dict/select_candidate_prons.pl
@@ -0,0 +1,84 @@
+#!/usr/bin/perl
+
+# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl
+# or reverse_candidates.pl, which is 7-tuples, one per line, of the form:
+#
+# word;pron;base-word;base-pron;rule-name;de-stress;rule-score
+#
+# and selects the most likely prons for the words based on rule
+# score.  It outputs in the same format as the input (thus, it is
+# similar to limit_candidate_prons.pl in its input and output format,
+# except it has a different way of selecting the prons to put out).
+#
+# This script will select the $max_prons best pronunciations for
+# each candidate word, subject to the constraint that no pron should
+# have a rule score worse than $min_rule_score.
+# It first merges the candidates by, if there are multiple candidates
+# generating the same pron, selecting the candidate that had the
+# best associated score.  It then sorts the prons on score and
+# selects the n best prons (but doesn't print out candidates with
+# score beneath the threshold).
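+#
+# Illustrative example (hypothetical scores): if two candidate lines for a
+# word both generate the pron "W EY1 S T AH0 D", with rule scores 0.6 and
+# 0.9, only the 0.9 line survives the merge; the surviving prons are then
+# sorted by score and at most $max_prons of them (each with score >=
+# $min_rule_score) are printed.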
+
+
+$max_prons = 4;
+$min_rule_score = 0.35;
+
+
+for ($n = 1; $n <= 3; $n++) {
+  if ($ARGV[0] eq "--max-prons") {
+    shift @ARGV;
+    $max_prons = shift @ARGV;
+  }
+  if ($ARGV[0] eq "--min-rule-score") {
+    shift @ARGV;
+    $min_rule_score = shift @ARGV;
+  }
+}
+
+if (@ARGV != 0 && @ARGV != 1) {
+  die "Usage: select_candidates_prons.pl [candidate_prons] > selected_candidate_prons";
+}
+
+sub process_word;
+
+undef $cur_word;
+@cur_lines = ();
+
+while(<>) {
+  # input, output is:
+  # word;pron;base-word;base-pron;rule-name;destress;score
+  chop;
+  m:^([^;]+);: || die "Unexpected input: $_";
+  $word = $1;
+  if (!defined $cur_word || $word eq $cur_word) {
+    if (!defined $cur_word) { $cur_word = $word; }
+    push @cur_lines, $_;
+  } else {
+    process_word(@cur_lines); # Process a series of suggested prons
+    # for a particular word.
+    $cur_word = $word;
+    @cur_lines = ( $_ ); 
+  }
+}
+process_word(@cur_lines);
+
+
+sub process_word {
+  my %pron2rule_score; # hash from generated pron to rule score for that pron.
+  my %pron2line; # hash from generated pron to best line for that pron.
+  my @cur_lines = @_;
+  foreach my $line (@cur_lines) {
+    my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
+    if (!defined $pron2rule_score{$pron} ||
+        $rule_score > $pron2rule_score{$pron}) {
+      $pron2rule_score{$pron} = $rule_score;
+      $pron2line{$pron} = $line;
+    }
+  }
+  my @prons = sort { $pron2rule_score{$b} <=> $pron2rule_score{$a} } keys %pron2rule_score;
+  for (my $n = 0; $n < @prons && $n < $max_prons &&
+       $pron2rule_score{$prons[$n]} >= $min_rule_score; $n++) {
+    print $pron2line{$prons[$n]} . "\n";
+  }
+}
+
diff --git a/egs/wsj/s3/local/wsj_extend_dict.sh b/egs/wsj/s3/local/wsj_extend_dict.sh
new file mode 100755
index 000000000..ce82840b2
--- /dev/null
+++ b/egs/wsj/s3/local/wsj_extend_dict.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+
+# This script builds a larger word-list and dictionary 
+# than used for the LMs supplied with the WSJ corpus.
+# It uses a couple of strategies to fill-in words in
+# the LM training data but not in CMUdict.  One is
+# to generate special prons for possible acronyms, that
+# just consist of the constituent letters.  The other
+# is designed to handle derivatives of known words
+# (e.g. deriving the pron of a plural from the pron of
+# the base-word), but in a more general, learned-from-data
+# way.
+# It makes use of scripts in local/dict/
+
+if [ $# -ne 1 ]; then
+  echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/"
+  exit 1
+fi
+if [ "`basename $1`" != 13-32.1 ]; then
+  echo "Expecting the argument to this script to end in 13-32.1"
+  exit 1
+fi
+
+# e.g.
+#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
+export PATH=$PATH:`pwd`/local/dict/
+srcdir=$1
+mkdir -p data/local/dict_larger
+dir=data/local/dict_larger
+mincount=2 # Minimum count of an OOV we will try to generate a pron for.
+
+[ ! -f data/local/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
+
+# Remove comments from cmudict; print first field; remove
+# words like FOO(1) which are alternate prons: our dict format won't
+# include these markers.
+grep -v ';;;' data/local/cmudict/cmudict.0.7a | 
+ perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
+
+cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
+
+echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
+
+# Convert to uppercase, remove XML-like markings.
+# For words ending in "." that are not in CMUdict, we assume that these
+# are periods that somehow remained in the data during data preparation,
+# and we replace the "." with "\n".  Note: we found this by looking at
+# oov.counts below (before adding this rule).
+
+
+touch $dir/cleaned.gz
+if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
+  echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
+else
+ gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
+  | awk '/^</{next}{print toupper($0)}' | perl -e '
+   open(F, "<$ARGV[0]")||die;
+   while(<F>){ chop; $isword{$_} = 1; }
+   while(<STDIN>) { 
+    @A = split(" ", $_); 
+    for ($n = 0; $n < @A; $n++) {
+      $a = $A[$n];
+      if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
+         # and have no other "." in them: treat as period.
+         print "$a";
+         if ($n+1 < @A) { print "\n"; }
+      } else { print "$a "; }
+    }
+    print "\n";
+  }
+ ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
+fi
+  
+# get unigram counts
+echo "Getting unigram counts"
+gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
+  awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
+
+cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
+  'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
+   > $dir/oov.counts
+
+echo "Most frequent unseen unigrams d are: "
+head $dir/oov.counts
+
+# Prune away singleton counts, and remove things with numbers in
+# (which should have been normalized) and with no letters at all.
+
+
+cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
+  | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
+
+# Automatic rule-finding...
+
+# First make some prons for possible acronyms.
+# Note: we don't do this for things like U.K or U.N,
+# or A.B. (which doesn't exist anyway), 
+# as we consider these to be normalization/spelling errors.
+
+cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu  > $dir/dict.acronyms
+
+mkdir $dir/f $dir/b # forward, backward directions of rules...
+  # forward is normal suffix
+  # rules, backward is reversed (prefix rules).  These
+  # dirs contain stuff we create while making the rule-based
+  # extensions to the dictionary.
+
+# Remove ; and , from words, if they are present; these
+# might crash our scripts, as they are used as separators there.
+filter_dict.pl $dir/dict.cmu > $dir/f/dict 
+cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
+reverse_dict.pl $dir/f/dict > $dir/b/dict
+reverse_dict.pl $dir/f/oovs > $dir/b/oovs
+
+# The next stage takes a few minutes.
+# Note: the forward stage takes longer, as English is
+# mostly a suffix-based language, and there are more rules
+# that it finds.
+for d in $dir/f $dir/b; do
+ (
+   cd $d
+   cat dict | get_rules.pl 2>get_rules.log >rules
+   get_rule_hierarchy.pl rules >hierarchy
+   awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
+     limit_candidate_prons.pl hierarchy | \
+     score_prons.pl dict | \
+     count_rules.pl >rule.counts
+   # the sort command below is just for convenience of reading.
+   score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
+   get_candidate_prons.pl rules.with_scores dict oovs | \
+     limit_candidate_prons.pl hierarchy > oovs.candidates
+ )  &   
+done 
+wait
+
+# Merge the candidates.
+reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
+select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s  %s\n", $1, $2);}' \
+  > $dir/dict.oovs
+
+cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
+
+awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
+sort $dir/oovlist | diff - $dir/oovlist.handled  | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
+
+echo "**Top OOVs we handled are:**"; 
+head $dir/oovlist.handled
+echo "**Top OOVs we didn't handle are as follows (not: they are mostly misspellings):**"; 
+head $dir/oovlist.not_handled
+
+# add_counts.pl attaches the original counts to the lists of handled/not-handled OOVs
+add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
+add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
+
+echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
+echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
+echo "Count of OOVs we didn't handle due to low count is" \
+    `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
+
+
+# The two files created just above are for humans to look at, as diagnostics.
+
+cat $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/dict.final
+cat $dir/dict.final | awk '{print $1}' | uniq > $dir/wordlist.final
+
+echo "Created $dir/dict.final and $dir/wordlist.final"
+
+
diff --git a/egs/wsj/s3/local/wsj_format_data.sh b/egs/wsj/s3/local/wsj_format_data.sh
index 9074be8cc..f2b2b6373 100755
--- a/egs/wsj/s3/local/wsj_format_data.sh
+++ b/egs/wsj/s3/local/wsj_format_data.sh
@@ -92,7 +92,7 @@ echo "<SPOKEN_NOISE>" > data/lang/oov.txt
 # triphone system.  extra_questions.txt is some pre-defined extra questions about
 # position and stress that split apart the categories we created in phonesets.txt.
 # in extra_questions.txt there is also a question about silence phones, since we 
-# didn't include that in our
+# don't include them in our automatically generated clustering of phones.
 
 local/make_shared_phones.sh < data/lang/phones.txt > data/lang/phonesets_mono.txt
 grep -v SIL data/lang/phonesets_mono.txt > data/lang/phonesets_cluster.txt
@@ -133,7 +133,7 @@ cp data/lang_test/phones_disambig.txt data/lang # Needed for MMI.
 
 # Create the lexicon FST with disambiguation symbols, and put it in lang_test.
 # There is an extra
-# step where we create a loop "pass through" the disambiguation symbols
+# step where we create a loop to "pass through" the disambiguation symbols
 # from G.fst.  
 phone_disambig_symbol=`grep \#0 data/lang_test/phones_disambig.txt | awk '{print $2}'`
 word_disambig_symbol=`grep \#0 data/lang_test/words.txt | awk '{print $2}'`
diff --git a/egs/wsj/s3/local/wsj_format_data_local.sh b/egs/wsj/s3/local/wsj_format_data_local.sh
new file mode 100755
index 000000000..e5935a5d5
--- /dev/null
+++ b/egs/wsj/s3/local/wsj_format_data_local.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+# This script creates the appropriate lang_test directories for
+# the locally created dictionary and lexicon.
+
+
+# "bd" is big-dict.
+# The "fgpr" LM is a locally estimated one (4-gram, pruned)
+. path.sh || exit 1;
+dict_srcdir=data/local/dict_larger_prep/
+lm_srcdir=data/local/lm/4gram-mincount
+lang=data/lang_test_bd_fgpr
+lang_unpruned=data/lang_test_bd_fg
+mkdir -p $lang
+
+[ ! -f $dict_srcdir/lexicon.txt ] && \
+   echo "First run wsj_prepare_local_dict.sh" && exit 1;
+[ ! -f $lm_srcdir/lm_pr7.0.gz -o ! -f $lm_srcdir/lm_unpruned.gz ] && \
+   echo "First run wsj_train_lms.sh" && exit 1;
+
+
+# Get number of disambig symbols, and lexicon with disambig symbols.
+ndisambig=`scripts/add_lex_disambig.pl $dict_srcdir/lexicon.txt $dict_srcdir/lexicon_disambig.txt`
+ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST.
+echo $ndisambig > $dict_srcdir/lex_ndisambig
+
+# (1) Put into $lang, phones.txt, silphones.csl, nonsilphones.csl, words.txt,
+#   oov.txt
+cp data/local/phones.txt $lang
+
+silphones="SIL SPN NSN";
+# Generate colon-separated lists of silence and non-silence phones.
+scripts/silphones.pl $lang/phones.txt "$silphones" $lang/silphones.csl $lang/nonsilphones.csl
+
+# Make word symbol table.
+cat $dict_srcdir/lexicon.txt | awk '{print $1}' | sort | uniq  | \
+ awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} END{printf("#0 %d\n", NR+1);} ' \
+  > $lang/words.txt
+
+# Create the basic L.fst without disambiguation symbols, for use
+# in training. 
+scripts/make_lexicon_fst.pl $dict_srcdir/lexicon.txt 0.5 SIL | \
+  fstcompile --isymbols=$lang/phones.txt --osymbols=$lang/words.txt \
+  --keep_isymbols=false --keep_osymbols=false | \
+   fstarcsort --sort_type=olabel > $lang/L.fst
+
+# The file oov.txt contains a word that we will map any OOVs to during
+# training.  This won't be needed in a test directory, but for completeness we
+# do it.
+echo "<SPOKEN_NOISE>" > data/lang/oov.txt
+
+# Note: we don't need phonesets.txt and extra_questions.txt, as they are
+# only needed during training.  So we don't bother creating them.
+# Anyway they're the same as they would be in other lang directories.
+
+silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'`
+nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'`
+cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \
+   sed "s:SILENCEPHONES:$silphonelist:" > $lang/topo
+
+
+# (3),
+# In lang_test, create a phones.txt file that includes the disambiguation symbols.
+# the --include-zero includes the #0 symbol we pass through from the grammar.
+# Note: we previously echoed the # of disambiguation symbols to $dict_srcdir/lex_ndisambig.
+
+scripts/add_disambig.pl --include-zero $lang/phones.txt \
+   `cat $dict_srcdir/lex_ndisambig` > $lang/phones_disambig.txt
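+# (phones_disambig.txt is expected to be phones.txt with extra entries
+# #0, #1, ..., #N appended at the end, their ids continuing on from the last
+# phone id.)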
+
+# Create the lexicon FST with disambiguation symbols, and put it in $lang.
+# There is an extra
+# step where we create a loop to "pass through" the disambiguation symbols
+# from G.fst.  
+phone_disambig_symbol=`grep \#0 $lang/phones_disambig.txt | awk '{print $2}'`
+word_disambig_symbol=`grep \#0 $lang/words.txt | awk '{print $2}'`
+
+scripts/make_lexicon_fst.pl $dict_srcdir/lexicon_disambig.txt 0.5 SIL '#'$ndisambig | \
+   fstcompile --isymbols=$lang/phones_disambig.txt --osymbols=$lang/words.txt \
+   --keep_isymbols=false --keep_osymbols=false |   \
+   fstaddselfloops  "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \
+   fstarcsort --sort_type=olabel > $lang/L_disambig.fst || exit 1;
+
+
+# Create L_align.fst, which is like L.fst but with alignment symbols (#1 and #2
+# at the beginning and end of words, on the input side)... useful if we
+# ever need to e.g. create ctm's-- these are used to work out the
+# word boundaries.
+cat $dict_srcdir/lexicon.txt | \
+ awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' | \
+ scripts/make_lexicon_fst.pl - 0.5 SIL | \
+ fstcompile --isymbols=$lang/phones_disambig.txt --osymbols=$lang/words.txt \
+  --keep_isymbols=false --keep_osymbols=false | \
+ fstarcsort --sort_type=olabel > $lang/L_align.fst || exit 1;
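+# (For example, the awk command above would turn a hypothetical lexicon entry
+#   PRICES P_B R AY1 S AH0 Z_E
+# into
+#   PRICES #1 P_B R AY1 S AH0 Z_E #2
+# before it gets compiled into L_align.fst.)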
+
+# Next, for each of the locally trained LMs (the pruned and the unpruned
+# 4-gram), create the corresponding G.fst and lang_test directory.
+
+echo "Preparing language models for test"
+
+# Note: at this point $lang == "data/lang_test_bd_fgpr"; we put the pruned
+# 4-gram model there.
+
+echo "Checking there are no OOVs" # there shouldn't be in this LM.
+# If you have an LM with OOVs you'd have to put back the command
+# "remove_oovs.pl" below, as it is used in wsj_format_data.sh.
+gunzip -c $lm_srcdir/lm_pr7.0.gz | \
+  scripts/find_arpa_oovs.pl $lang/words.txt | cmp - /dev/null || \
+ exit 1;   
+
+
+# Removing these "invalid combinations" of <s> and </s> is not 
+# necessary because we produced these LMs ourselves, and they aren't
+# broken.  But we'll leave this in the script just in case it gets modified
+# later.
+# Note: ~1.5M N-grams.
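+# (The greps below would remove hypothetical ARPA entries such as
+#   -3.456  <s> <s> IN
+#   -2.345  OF </s> <s>
+# which can never occur inside a valid utterance.)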
+gunzip -c $lm_srcdir/lm_pr7.0.gz | \
+  grep -v '<s> <s>' | \
+  grep -v '</s> <s>' | \
+  grep -v '</s> </s>' | \
+  arpa2fst - | fstprint | \
+    scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
+      --osymbols=$lang/words.txt  --keep_isymbols=false --keep_osymbols=false | \
+     fstrmepsilon > $lang/G.fst || exit 1;
+  fstisstochastic $lang/G.fst
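+# (In the pipeline above, eps2disambig.pl relabels the epsilon input labels on
+# the backoff arcs as #0, and s2eps.pl maps the <s> and </s> symbols to <eps>,
+# so that G.fst composes correctly with L_disambig.fst.)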
+
+mkdir -p $lang_unpruned
+cp $lang/* $lang_unpruned
+# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
+# not work for LMs generated from all toolkits.
+gunzip -c $lm_srcdir/lm_unpruned.gz | \
+  arpa2fst - | fstprint | \
+    scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
+      --osymbols=$lang/words.txt  --keep_isymbols=false --keep_osymbols=false | \
+     fstrmepsilon > $lang_unpruned/G.fst || exit 1;
+  fstisstochastic $lang_unpruned/G.fst
+
+
+# The commands below are just a diagnostic test: we check that G has no cycles
+# containing only empty words (e.g. <s>, </s>), as such cycles might cause
+# determinization failure of CLG.  #0 is treated as an empty word here.
+mkdir -p tmpdir.g
+awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
+  < $dict_srcdir/lexicon.txt  >tmpdir.g/select_empty.fst.txt
+fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt tmpdir.g/select_empty.fst.txt | \
+  fstarcsort --sort_type=olabel | fstcompose - $lang/G.fst > tmpdir.g/empty_words.fst
+fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && \
+  echo "Language model has cycles with empty words" && exit 1
+rm -r tmpdir.g
+
+
+
+
+echo "Succeeded in formatting data."
diff --git a/egs/wsj/s3/local/wsj_prepare_local_dict.sh b/egs/wsj/s3/local/wsj_prepare_local_dict.sh
new file mode 100755
index 000000000..27d29cebd
--- /dev/null
+++ b/egs/wsj/s3/local/wsj_prepare_local_dict.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# This script takes the dictionary prepared by wsj_extend_dict.sh
+# (which puts its output in data/local/dict_larger), and puts
+# in data/local/dict_larger_prep a file "lexicon.txt" that
+# contains the word-position markings, silence, etc.  It also
+# creates a file phones.txt in the same directory.
+#
+# This script is similar to wsj_prepare_dict.sh.
+
+# Call this script from one level above, e.g. from the s3/ directory.  It puts
+# its output in data/local/.
+
+dictin=data/local/dict_larger/dict.final
+[ ! -f $dictin ] && echo No such file $dictin && exit 1;
+
+# run this from ../
+dir=data/local/dict_larger_prep
+mkdir -p $dir
+
+# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
+# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
+
+cat data/local/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
+ awk 'BEGIN{print "<eps> 0"; print "SIL 1"; print "SPN 2"; print "NSN 3"; N=4; } 
+           {printf("%s %d\n", $1, N++); }
+           {printf("%s_B %d\n", $1, N++); }
+           {printf("%s_E %d\n", $1, N++); }
+           {printf("%s_S %d\n", $1, N++); } ' >$dir/phones.txt
+
+
+# First make a version of the lexicon without the silences etc., but with the
+# position markers (_B, _E, _S) added to the phones.
+
+cat $dictin | \
+ perl -ane '@A=split(" ",$_); $w = shift @A; @A>0||die;
+   if(@A==1) { print "$w $A[0]_S\n"; } else { print "$w $A[0]_B ";
+     for($n=1;$n<@A-1;$n++) { print "$A[$n] "; } print "$A[$n]_E\n"; } ' \
+  > $dir/lexicon_nosil.txt || exit 1;
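+# (For example, with hypothetical entries: "AH AA1" becomes "AH AA1_S", since a
+# single-phone pronunciation gets the _S marker, and "ABOARD AH0 B AO1 R D"
+# becomes "ABOARD AH0_B B AO1 R D_E", with _B on the first phone and _E on the last.)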
+
+# Add the silences, noises etc.
+
+(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
+ cat - $dir/lexicon_nosil.txt  > $dir/lexicon.txt || exit 1;
+
+echo "Local dictionary preparation succeeded; output is in $dir/lexicon.txt"
+
diff --git a/egs/wsj/s3/local/wsj_train_lms.sh b/egs/wsj/s3/local/wsj_train_lms.sh
index f264f0d19..59400fe45 100755
--- a/egs/wsj/s3/local/wsj_train_lms.sh
+++ b/egs/wsj/s3/local/wsj_train_lms.sh
@@ -1,227 +1,189 @@
 #!/bin/bash
 
-
-if [ $# -ne 1 -o `basename $1` != 13-32.1 ]; then
-  echo "Usage: local/wsj_train_lms.sh /foo/bar/13-32.1/"
+# This script trains LMs on the WSJ LM-training data.
+# It requires that you have already run wsj_extend_dict.sh,
+# to get the larger-size dictionary including all of CMUdict
+# plus any OOVs and possible acronyms that we could easily 
+# derive pronunciations for.
+
+
+# This script takes no command-line arguments
+
+dir=data/local/lm
+srcdir=data/local/dict_larger
+mkdir -p $dir
+export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
+( # First make sure the kaldi_lm toolkit is installed.
+ cd ../../../tools
+ if [ -d kaldi_lm ]; then
+   echo Not installing the kaldi_lm toolkit since it is already there.
+ else
+   echo Downloading and installing the kaldi_lm tools
+   if [ ! -f kaldi_lm.tar.gz ]; then
+     wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
+   fi
+   tar -xvzf kaldi_lm.tar.gz || exit 1;
+   cd kaldi_lm
+   make || exit 1;
+   echo Done making the kaldi_lm tools
+ fi
+) || exit 1;
+
+
+
+if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/wordlist.final ]; then
+  echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist";
+  echo "You need to run local/wsj_extend_dict.sh before running this script."
+  exit 1;
 fi
 
-srcdir=$1
-mkdir -p data/local/lm
-cd data/local/lm
-# e.g. srcdir=/mnt/matylda2/data/WSJ1/13-32.1/
-# Convert to uppercase, remove XML-like markings.
-echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
-gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
-  | awk '/^</{next}{print toupper($0)}'  | gzip -c > cleaned.gz
+# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
+echo "Getting training data with OOV words repalaced with <UNK> (train_nounk.gz)"
+gunzip -c $srcdir/cleaned.gz | awk -v w=$srcdir/wordlist.final \
+  'BEGIN{while((getline<w)>0) v[$1]=1;}
+  {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
+  | gzip -c > $dir/train_nounk.gz
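+# (E.g. if the hypothetical word ZYXWVU is not in wordlist.final, the line
+# "THE PRICE OF ZYXWVU ROSE" comes out as "THE PRICE OF <UNK> ROSE".)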
+
+# Get unigram counts (without bos/eos, but this doesn't matter here; it's
+# only to get the word-map, which treats them specially and doesn't need their
+# counts).
+# Add a 1-count for each word in word-list by including that in the data,
+# so all words appear.
+gunzip -c $dir/train_nounk.gz | cat - $srcdir/wordlist.final | \
+  awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
+ sort -nr > $dir/unigram.counts
+
+# Get "mapped" words-- a character encoding of the words that makes the common words very short.
+cat $dir/unigram.counts  | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map
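+# (word_map is expected to have two columns, "<word> <code>", with the most
+# frequent words getting the shortest codes; <s>, </s> and <UNK> are handled
+# specially by get_word_map.pl.)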
+
+gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
+  { for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz
+
+# To save disk space, remove the un-mapped training data.  We could
+# easily generate it again if needed.
+rm $dir/train_nounk.gz 
+
+train_lm.sh --arpa --lmtype 4gram-mincount $dir
+#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
+# 10.3 million N-grams.
+
+prune_lm.sh --arpa 7.0 $dir/4gram-mincount
+# 1.50 million N-grams
+# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
+
+
+exit 0
+
+### Below this point, the script shows various commands that
+## were run during LM tuning.
+
+train_lm.sh --arpa --lmtype 3gram-mincount $dir
+#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
+# 7.8 million N-grams.
+
+prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
+#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
+# 2.5 million N-grams.
+
+prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
+# 1.45 million N-grams.
+# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
+
+train_lm.sh --arpa --lmtype 4gram-mincount $dir
+#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
+# 10.3 million N-grams.
+
+prune_lm.sh --arpa 3.0 $dir/4gram-mincount
+#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
+# 2.6 million N-grams.
+
+prune_lm.sh --arpa 4.0 $dir/4gram-mincount
+# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
+# 2.15 million N-grams.
+
+prune_lm.sh --arpa 5.0 $dir/4gram-mincount
+# 1.86 million N-grams
+# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023
+
+prune_lm.sh --arpa 7.0 $dir/4gram-mincount
+# 1.50 million N-grams
+# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
+
+train_lm.sh --arpa --lmtype 3gram $dir
+# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
+# 20.0 million N-grams
+
+
+
+#################
+# You could finish the script here if you wanted.
+# Below we show how to build baseline LMs with SRILM;
+# you'd have to install the SRILM toolkit first.
+
+heldout_sent=10000 # Don't change this if you want the results to be comparable
+    # with the kaldi_lm results.
+sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
+mkdir -p $sdir
+gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
+gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
+(echo "<s>"; echo "</s>" ) | cat - $srcdir/wordlist.final > $sdir/wordlist.final.s
+
+# 3-gram:
+ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
+  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
+ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
+#file data/local/lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
+#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
+
+# Trying 4-gram:
+ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
+  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
+ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout 
+#file data/local/lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
+#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
+
+#3-gram with pruning:
+ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
+  -prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
+ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout 
+#file data/local/lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
+#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
+# Around 2.25M N-grams.
+# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
+# above, which gave 2.5 million N-grams and a perplexity of 156.
+
+# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
+# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
+# the kaldi_lm experiments above without "-mincount").
+
+## From here we show how to train with IRSTLM.
+# This is not really working at the moment.
+export IRSTLM=../../../tools/irstlm/
+
+idir=$dir/irstlm
+mkdir $idir
+gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
+  gzip -c > $idir/train.gz
+
+$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
+ cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
+{print $0;}}' > vocab.irstlm.20k
+
+
+$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz  -p yes \
+  -n 3 -s improved-kneser-ney -b yes
+# Testing perplexity with SRILM tools:
+ngram -lm $idir/lm_3gram.gz  -ppl $sdir/cleaned.heldout 
+#data/local/lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
+#file data/local/lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
+#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
+
+# The perplexity is very bad: 175, where it should be around 141 since we
+# used the -p option.  Adding -debug 3 to the command line shows that the
+# IRSTLM LM does not seem to sum to one properly, so it seems to produce an
+# LM that isn't interpretable in the normal way as an ARPA LM.
+
 
 
-# get unigram counts
-echo "Getting unigram counts"
-gunzip -c cleaned.gz | tr -s ' ' '\n' | sort | uniq -c | sort -nr > unigrams
-
-[ ! -f ../cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
-
-cat unigrams | \
-perl -e ' open(F, "<../cmudict/cmudict.0.7a") || die "Cmudict not found";
-  while(<F>){ if (! m/^;;/){  @A=split(" ", $_); $seen{$A[0]} = 1; } }
-  open(O, ">oov.counts");
-  while(<>) {
-    ($count,$word) = split(" ", $_);
-    if (defined $seen{$word}) { print "$word\n"; } else { print O $_; }
-  } ' > vocab.intersect
-
-echo "Most frequent unseen unigrams are: "
-head oov.counts
-
- 
-  '
-  awk 'BEGIN{while(getline<"unigrams")vc[$2]=$1;}{if ($1 in vc)print $1;}'|sort|uniq > vocab.intersect
-
-
-. path.sh || exit 1;
-
-echo "Preparing train and test data"
-
-for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do 
-  mkdir -p data/$x
-  cp data/local/${x}_wav.scp data/$x/wav.scp
-  cp data/local/$x.txt data/$x/text
-  cp data/local/$x.spk2utt data/$x/spk2utt
-  cp data/local/$x.utt2spk data/$x/utt2spk
-  scripts/filter_scp.pl data/$x/spk2utt data/local/spk2gender.map > data/$x/spk2gender
-done
-
-echo "Preparing word lists etc."
-
-# Create the "lang" directory for training... we'll copy this same setup
-# to be used in test too, and also add the G.fst.
-# Note: the lexicon.txt and lexicon_disambig.txt are not used directly by
-# the training scripts, so we put these in data/local/.
-
-# TODO: make sure we properly handle the begin/end symbols in the lexicon,
-
-# lang_test will contain common things we'll put in lang_test_{bg,tgpr,tg}
-mkdir -p data/lang data/lang_test
-
-
-# (0), this is more data-preparation than data-formatting;
-# add disambig symbols to the lexicon in data/local/lexicon.txt
-# and produce data/local/lexicon_disambig.txt
-
-ndisambig=`scripts/add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt`
-ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST.
-echo $ndisambig > data/local/lex_ndisambig
-
-
-# (1) Put into data/lang, phones.txt, silphones.csl, nonsilphones.csl, words.txt,
-#   oov.txt
-cp data/local/phones.txt data/lang # we could get these from the lexicon, but prefer to
- # do it like this so we get all the possible begin/middle/end versions of phones even
- # if they don't appear.  This is so if we extend the lexicon later, we could use the
- # same phones.txt (which is "baked into" the model and can't be changed after you build it).
-
-silphones="SIL SPN NSN";
-# Generate colon-separated lists of silence and non-silence phones.
-scripts/silphones.pl data/lang/phones.txt "$silphones" data/lang/silphones.csl data/lang/nonsilphones.csl
-
-cat data/local/lexicon.txt | awk '{print $1}' | sort | uniq  | \
- awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} END{printf("#0 %d\n", NR+1);} ' \
-  > data/lang/words.txt
-
-# Create the basic L.fst without disambiguation symbols, for use
-# in training. 
-scripts/make_lexicon_fst.pl data/local/lexicon.txt 0.5 SIL | \
-  fstcompile --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt \
-  --keep_isymbols=false --keep_osymbols=false | \
-   fstarcsort --sort_type=olabel > data/lang/L.fst
-
-# The file oov.txt contains a word that we will map any OOVs to during
-# training.
-echo "<SPOKEN_NOISE>" > data/lang/oov.txt
-
-# (2)
-# Create phonesets.txt and extra_questions.txt ...
-# phonesets.txt is sets of phones that are shared when building the monophone system
-# and when asking questions based on an automatic clustering of phones, for the
-# triphone system.  extra_questions.txt is some pre-defined extra questions about
-# position and stress that split apart the categories we created in phonesets.txt.
-# in extra_questions.txt there is also a question about silence phones, since we 
-# didn't include that in our
-
-local/make_shared_phones.sh < data/lang/phones.txt > data/lang/phonesets_mono.txt
-grep -v SIL data/lang/phonesets_mono.txt > data/lang/phonesets_cluster.txt
-local/make_extra_questions.sh < data/lang/phones.txt > data/lang/extra_questions.txt
-
-( # Creating the "roots file" for building the context-dependent systems...
-  # we share the roots across all the versions of each real phone.  We also
-  # share the states of the 3 forms of silence.  "not-shared" here means the
-  # states are distinct p.d.f.'s... normally we would automatically split on
-  # the HMM-state but we're not making silences context dependent.
-  echo 'not-shared not-split SIL SPN NSN';
-  cat data/lang/phones.txt | grep -v eps | grep -v SIL | grep -v SPN | grep -v NSN | awk '{print $1}' | \
-    perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_"; 
-            $phone=$1; $stress=$2; $position=$3;
-      if($phone eq $curphone){ print " $phone$stress$position"; }
-      else { if(defined $curphone){ print "\n"; } $curphone=$phone; 
-            print "shared split $phone$stress$position";  }} print "\n"; '
-) > data/lang/roots.txt
-
-silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'`
-nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'`
-cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \
-   sed "s:SILENCEPHONES:$silphonelist:" > data/lang/topo
-
-for f in phones.txt words.txt L.fst silphones.csl nonsilphones.csl topo; do
-  cp data/lang/$f data/lang_test
-done
-
-
-# (3),
-# In lang_test, create a phones.txt file that includes the disambiguation symbols.
-# the --include-zero includes the #0 symbol we pass through from the grammar.
-# Note: we previously echoed the # of disambiguation symbols to data/local/lex_ndisambig.
-scripts/add_disambig.pl --include-zero data/lang_test/phones.txt \
-   `cat data/local/lex_ndisambig` > data/lang_test/phones_disambig.txt
-cp data/lang_test/phones_disambig.txt data/lang # Needed for MMI.
-
-
-# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
-# There is an extra
-# step where we create a loop "pass through" the disambiguation symbols
-# from G.fst.  
-phone_disambig_symbol=`grep \#0 data/lang_test/phones_disambig.txt | awk '{print $2}'`
-word_disambig_symbol=`grep \#0 data/lang_test/words.txt | awk '{print $2}'`
-
-scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt 0.5 SIL '#'$ndisambig | \
-   fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
-   --keep_isymbols=false --keep_osymbols=false |   \
-   fstaddselfloops  "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \
-   fstarcsort --sort_type=olabel > data/lang_test/L_disambig.fst || exit 1;
-
-# Copy into data/lang/ also, where it will be needed for discriminative training.
-cp data/lang_test/L_disambig.fst data/lang/
-
-
-# Create L_align.fst, which is as L.fst but with alignment symbols (#1 and #2 at the
-# beginning and end of words, on the input side)... useful if we
-# ever need to e.g. create ctm's-- these are used to work out the
-# word boundaries.
-cat data/local/lexicon.txt | \
- awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' | \
- scripts/make_lexicon_fst.pl - 0.5 SIL | \
- fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
-  --keep_isymbols=false --keep_osymbols=false | \
- fstarcsort --sort_type=olabel > data/lang_test/L_align.fst
-
-# Next, for each type of language model, create the corresponding FST
-# and the corresponding lang_test directory.
-
-echo Preparing language models for test
-
-for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
-  test=data/lang_test_${lm_suffix}
-  mkdir -p $test
-  for f in phones.txt words.txt phones_disambig.txt L.fst L_disambig.fst \
-     silphones.csl nonsilphones.csl; do
-    cp data/lang_test/$f $test
-  done
-  gunzip -c data/local/lm_${lm_suffix}.arpa.gz | \
-   scripts/find_arpa_oovs.pl $test/words.txt  > data/local/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt.  These can cause 
-  # determinization failures of CLG [ends up being epsilon cycles].
-  gunzip -c data/local/lm_${lm_suffix}.arpa.gz | \
-    grep -v '<s> <s>' | \
-    grep -v '</s> <s>' | \
-    grep -v '</s> </s>' | \
-    arpa2fst - | fstprint | \
-    scripts/remove_oovs.pl data/local/oovs_${lm_suffix}.txt | \
-    scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-      --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon > $test/G.fst
-  fstisstochastic $test/G.fst
- # The output is like:
- # 9.14233e-05 -0.259833
- # we do expect the first of these 2 numbers to be close to zero (the second is
- # nonzero because the backoff weights make the states sum to >1).
- # Because of the <s> fiasco for these particular LMs, the first number is not
- # as close to zero as it could be.
-
-  # Everything below is only for diagnostic.
-  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-  # this might cause determinization failure of CLG.
-  # #0 is treated as an empty word.
-  mkdir -p tmpdir.g
-  awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-    < data/local/lexicon.txt  >tmpdir.g/select_empty.fst.txt
-  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \
-   fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst
-  fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && 
-    echo "Language model has cycles with empty words" && exit 1
-  rm -r tmpdir.g
-done
-
-echo "Succeeded in formatting data."
diff --git a/egs/wsj/s3/run.sh b/egs/wsj/s3/run.sh
index 6d18ee702..eaba5c3e9 100644
--- a/egs/wsj/s3/run.sh
+++ b/egs/wsj/s3/run.sh
@@ -36,6 +36,20 @@ local/wsj_prepare_dict.sh
 
 local/wsj_format_data.sh
 
+# We suggest running the next four commands in the background,
+# as they are not a precondition for the system building and
+# most of the tests: these commands build a dictionary
+# containing many of the OOVs in the WSJ LM training data,
+# and an LM trained directly on that data (i.e. not just
+# copied from the ARPA files on the LDC disks).
+(
+ local/wsj_extend_dict.sh /mnt/matylda2/data/WSJ1/13-32.1  && \
+ local/wsj_prepare_local_dict.sh && \
+ local/wsj_train_lms.sh && \
+ local/wsj_format_data_local.sh
+) &
+
+
 # Now make MFCC features.
 # mfccdir should be some place with a largish disk where you
 # want to store MFCC features.
@@ -68,7 +82,7 @@ steps/train_mono.sh --num-jobs 10 --cmd "$train_cmd" \
   data/train_si84_2kshort data/lang exp/mono0a
 
 (
-scripts/mkgraph.sh --mono data/lang_test_tgpr exp/mono0a exp/mono0a/graph_tgpr
+scripts/mkgraph.sh --mono data/lang_test_bd_fgpr exp/mono0a exp/mono0a/graph_bd_fgpr
 scripts/decode.sh --cmd "$decode_cmd" steps/decode_deltas.sh exp/mono0a/graph_tgpr data/test_dev93 exp/mono0a/decode_tgpr_dev93
 scripts/decode.sh --cmd "$decode_cmd" steps/decode_deltas.sh exp/mono0a/graph_tgpr data/test_eval92 exp/mono0a/decode_tgpr_eval92
 )&
@@ -170,6 +184,11 @@ scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri3b/gra
 scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_tgpr data/lang_test_tg \
   data/test_eval92 exp/tri3b/decode_tgpr_eval92 exp/tri3b/decode_tgpr_eval92_tg
 
+# Trying the larger dictionary ("big-dict"/bd) + locally produced LM.
+scripts/mkgraph.sh data/lang_test_bd_fgpr exp/tri3b exp/tri3b/graph_bd_fgpr
+scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri3b/graph_bd_fgpr \
+  data/test_eval92 exp/tri3b/decode_bd_fgpr_eval92
+
 # From 3b system, align all si284 data.
 steps/align_lda_mllt_sat.sh --num-jobs 10 --cmd "$train_cmd" \
   data/train_si284 data/lang exp/tri3b exp/tri3b_ali_si284
-- 
GitLab