#!/bin/bash

# Copyright 2012-2014  Guoguo Chen
# Apache 2.0.

# Begin configuration section.  
nj=8
cmd=run.pl
beam=-1             # Beam for proxy FST, -1 means no pruning
phone_beam=-1       # Beam for KxL2xE FST, -1 means no pruning
nbest=-1            # Use top n best proxy keywords in proxy FST, -1 means all
                    # proxies
phone_nbest=50      # Use top n best phone sequences in KxL2xE, -1 means all
                    # phone sequences
confusion_matrix=   # If supplied, use the corresponding E transducer
count_cutoff=1      # Minimum count for a phone pair to be considered in the
                    # confusion matrix; pairs with a lower count are ignored.
pron_probs=false    # If true, the lexicon looks like:
                    # Word Prob Phone1 Phone2...
# End configuration section.

[ -f ./path.sh ] && . ./path.sh; # source the path.
echo "$0 " "$@"
. parse_options.sh || exit 1;

if [ $# -ne 1 ]; then
  echo "Generate proxy keywords for IV/OOV keywords. Phone confusions will be"
  echo "used when generating the proxies if the confusion matrix is supplied."
  echo "If you are going to use the confusion matrix, please use the following"
  echo "format for the file \$confusion_matrix:"
  echo "  p1 p2 count1        // For substitution"
  echo "  p3 <eps> count2     // For deletion"
  echo "  <eps> p4 count3     // For insertion"
  echo ""
  echo "Proxies keywords are generated using:"
  echo "K x L2 x E x L1'"
  echo "where K is a keyword FST, L2 is a lexicon that contains pronunciations"
  echo "of keywords in K, E is an edit distance FST that contains the phone"
  echo "confusions and L1 is the original lexicon."
  echo ""
  echo "The script assumes that L1.lex, L2.lex, words.txt and keywords.txt have"
  echo "been prepared and stored in the directory <kws-data-dir>."
  echo ""
  echo "Usage: local/generate_example_kws.sh <kws-data-dir>"
  echo " e.g.: local/generate_example_kws.sh data/dev10h/kws_proxy/"
  exit 1;
fi

set -e 
set -o pipefail

kwsdatadir=$1
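
# Expected input formats (illustrative, inferred from how the files are used
# below): keywords.txt has one keyword per line as "<kw-id> <word1> <word2> ...";
# L1.lex and L2.lex have one pronunciation per line as
# "<word> [<prob>] <phone1> <phone2> ..." (the prob column only with
# --pron-probs true); words.txt is the word symbol table.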

# Checks some files.
for f in $kwsdatadir/L1.lex $kwsdatadir/L2.lex \
  $kwsdatadir/words.txt $kwsdatadir/keywords.txt; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1
done

# Gets phone symbols
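# phone_start is the first field of a lexicon entry that holds a phone:
# field 2 normally, field 3 when entries carry a pronunciation probability.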
phone_start=2
if $pron_probs; then
  phone_start=3
fi

pron_probs_param="";
if $pron_probs; then
  pron_probs_param="--pron-probs";
fi

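# Adds disambiguation symbols (#1, #2, ...) to the L1 pronunciations so that
# the lexicon FST can be determinized; disambig.txt then lists #0 .. #ndisambig.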
ndisambig=`utils/add_lex_disambig.pl \
  $pron_probs_param $kwsdatadir/L1.lex $kwsdatadir/L1_disambig.lex`
ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST.
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $kwsdatadir/disambig.txt

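# Builds the phone symbol table: <eps> gets id 0, followed by every phone that
# appears in L2.lex or L1.lex, followed by the disambiguation symbols.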
cat $kwsdatadir/L2.lex $kwsdatadir/L1.lex |\
  awk '{for(i='$phone_start'; i <= NF; i++) {print $i;}}' |\
  sort -u | sed '1i\<eps>' |\
  cat - $kwsdatadir/disambig.txt | awk 'BEGIN{x=0} {print $0"\t"x; x++;}' \
  > $kwsdatadir/phones.txt

# Compiles lexicon into FST
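# make_lexicon_fst.pl puts phones on the input side and words on the output
# side; fstinvert flips this so L2.fst maps words to phones, and the arcs are
# sorted on the (phone) output labels for the later composition with E.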
cat $kwsdatadir/L2.lex |\
  utils/make_lexicon_fst.pl $pron_probs_param - |\
  fstcompile --isymbols=$kwsdatadir/phones.txt \
  --osymbols=$kwsdatadir/words.txt - |\
  fstinvert | fstarcsort --sort_type=olabel > $kwsdatadir/L2.fst

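# Compiles L1 (with the disambiguation symbols, which make determinization
# possible): word-level #0 is passed through as a self-loop, the FST is
# determinized, and the disambiguation symbols are then stripped from both
# the input and the output side.  The result maps phone sequences to words.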
phone_disambig_symbol=`grep \#0 $kwsdatadir/phones.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 $kwsdatadir/words.txt | awk '{print $2}'`
phone_disambig_symbols=`grep \# $kwsdatadir/phones.txt |\
  awk '{print $2}' | tr "\n" " "`
word_disambig_symbols=`grep \# $kwsdatadir/words.txt |\
  awk '{print $2}' | tr "\n" " "`
cat $kwsdatadir/L1_disambig.lex |\
  utils/make_lexicon_fst.pl $pron_probs_param - |\
  fstcompile --isymbols=$kwsdatadir/phones.txt \
  --osymbols=$kwsdatadir/words.txt - |\
  fstaddselfloops "echo $phone_disambig_symbol |" \
  "echo $word_disambig_symbol |" |\
  fstdeterminize | fstrmsymbols "echo $phone_disambig_symbols|" |\
  fstrmsymbols --remove-from-output=true "echo $word_disambig_symbols|" |\
  fstarcsort --sort_type=ilabel > $kwsdatadir/L1.fst

# Compiles E.fst
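# Builds an edit-distance FST over the non-silence phones that allows
# substitutions, insertions and deletions.  If a confusion matrix is given,
# its counts are turned into costs by count_to_logprob.pl and used to weight
# the corresponding edits.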
confusion_matrix_param=""
if [ ! -z "$confusion_matrix" ]; then
  echo "$0: Using confusion matrix, normalizing"
  local/count_to_logprob.pl --cutoff $count_cutoff \
    $confusion_matrix $kwsdatadir/confusion.txt
  confusion_matrix_param="--confusion-matrix $kwsdatadir/confusion.txt"
fi
cat $kwsdatadir/phones.txt |\
  grep -v -E "<.*>" | grep -v "SIL" | awk '{print $1;}' |\
  local/build_edit_distance_fst.pl --boundary-off=true \
  $confusion_matrix_param - - |\
  fstcompile --isymbols=$kwsdatadir/phones.txt \
  --osymbols=$kwsdatadir/phones.txt - $kwsdatadir/E.fst

# Pre-composes L2 and E, for the sake of efficiency
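# (L2xE maps a keyword's words to confusable phone sequences; the ilabel sort
# lets it be composed efficiently with the keyword FSTs.)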
fstcompose $kwsdatadir/L2.fst $kwsdatadir/E.fst |\
  fstarcsort --sort_type=ilabel > $kwsdatadir/L2xE.fst

keywords=$kwsdatadir/keywords.int
# Prepares for parallelization
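# Maps the keywords to integer ids and shuffles them (sort -R) so that each
# parallel job gets a roughly even mix of keywords.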
cat $kwsdatadir/keywords.txt |\
  utils/sym2int.pl -f 2- $kwsdatadir/words.txt | sort -R > $keywords

nof_keywords=`cat $keywords|wc -l`
if [ $nj -gt $nof_keywords ]; then
  nj=$nof_keywords
  echo "$0: Too many number of jobs, using $nj instead"
fi

# Generates the proxy keywords
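# Each job takes a round-robin split of the keyword list and runs
# generate-proxy-keywords, which composes every keyword with L2xE and L1,
# pruning with --proxy-beam/--proxy-nbest at the proxy (word) level and
# --phone-beam/--phone-nbest at the phone level; the per-job results go to
# split/proxy.JOB.fsts.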
mkdir -p $kwsdatadir/split/log
$cmd JOB=1:$nj $kwsdatadir/split/log/proxy.JOB.log \
  split -n r/JOB/$nj $keywords \| \
  generate-proxy-keywords --verbose=1 \
  --proxy-beam=$beam --proxy-nbest=$nbest \
  --phone-beam=$phone_beam --phone-nbest=$phone_nbest \
  $kwsdatadir/L2xE.fst $kwsdatadir/L1.fst ark:- ark:$kwsdatadir/split/proxy.JOB.fsts

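# Merges the per-job proxy FSTs into a single archive.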
proxy_fsts=""
for j in `seq 1 $nj`; do
  proxy_fsts="$proxy_fsts $kwsdatadir/split/proxy.$j.fsts"
done
cat $proxy_fsts > $kwsdatadir/keywords.fsts