#!/bin/bash

# Copyright 2017 @Linagora Abdel HEBA OK

# Prepares the dictionary and auto-generates the pronunciations for the words
# that are in our vocabulary but not in the base pronunciation dictionary.
#
# Usage: prepare_dict.sh [options] <lm-dir> <g2p-model-dir> <dst-dir>

# Default option values.  They are presumably overridden from the command
# line (--stage, --nj, --cmd) by utils/parse_options.sh, which is sourced
# right below -- confirm against that helper.
stage=0     # first stage to run; stages with a lower number are skipped
nj=4        # number of parallel Sequitur G2P jobs, we would like to use
cmd=run.pl  # script used to launch the parallel jobs


. utils/parse_options.sh || exit 1;
. path.sh || exit 1

# Force byte-wise, locale-independent ordering in every sort call below.
export LC_ALL=C

# Exactly three positional arguments are required; otherwise print the usage
# text and abort.
# Example invocation arguments: data/lm /data/g2p data/local/dict
if (( $# != 3 )); then
  cat <<EOF
Usage: $0 [options] <lm-dir> <g2p-model-dir> <dst-dir>
Options:
  --cmd '<command>'    # script to launch jobs with, default: run.pl
  --nj <nj>            # number of jobs to run, default: 4.
EOF
  exit 1
fi

# Positional arguments.
lm_dir=$1          # directory holding the vocabulary file (meeting-vocab.txt)
g2p_model_dir=$2   # handed to local/g2p.sh as the G2P model location
dst_dir=$3         # output directory; every generated file is written below it


# Raw lexicon without the silence/noise entries.  It is either the sorted
# concatenation of $cmudict_plain with the G2P output (stage 2) or, when the
# G2P stages are skipped, a plain copy of $cmudict_plain.
lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt

cmudict_dir=$dst_dir/cmudict     # target of the CMUdict svn checkout
cmudict_plain=$dst_dir/fr.dict   # base pronunciation dictionary

# Quoted so that a destination containing whitespace is created as a single
# directory; "--" protects against a name starting with a dash.
mkdir -p -- "$dst_dir" || exit 1

# Stage 0: locate the vocabulary and make sure a base dictionary is available.
if [ "$stage" -le 0 ]; then
  vocab=$lm_dir/meeting-vocab.txt
  if [ ! -f "$vocab" ]; then
    echo "$0: vocabulary file not found at $vocab"
    exit 1
  fi

  echo "Downloading and preparing CMUdict"
  if [ ! -s "$cmudict_plain" ]; then
    # TODO: to be reworked.  The checkout fetches CMUdict into $cmudict_dir,
    # but the conversion that used to write $cmudict_plain is disabled below.
    svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict "$cmudict_dir" || exit 1
    # Nothing in this stage creates $cmudict_plain, so tell the user now
    # rather than letting the later stages work on a missing dictionary.
    if [ ! -s "$cmudict_plain" ]; then
      echo "$0: warning: $cmudict_plain is still missing or empty;" \
        "the CMUdict checkout in $cmudict_dir does not create it" >&2
    fi
  fi
  #echo "Removing the pronunciation variant markers ..."
  #grep -v ';;;' $cmudict_dir/cmudict.0.7a | \
  #  perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
  #  > $cmudict_plain || exit 1;
fi


# Prints the per-job file names "<prefix>.1" ... "<prefix>.<count>", one per
# line.  A plain loop is used instead of brace expansion because "{1}" (a
# single job) is not expanded by the shell and would be kept literally.
#   $1 - file name prefix
#   $2 - number of jobs
job_split_files() {
  local prefix=$1 count=$2 job
  for ((job = 1; job <= count; job++)); do
    printf '%s\n' "$prefix.$job"
  done
}

# Stage 1: run Sequitur G2P on the vocabulary words that have no entry in the
# base dictionary.
if [ "$stage" -le 1 ]; then
  # $vocab is otherwise only assigned in stage 0; set it here as well so that
  # starting with --stage 1 does not run on an empty file name.
  vocab=${vocab:-$lm_dir/meeting-vocab.txt}

  # Check if Sequitur G2P is installed.  $sequitur is not set in this script;
  # presumably it comes from path.sh (see the error message further down).
  if [ ! -f "$sequitur" ]; then
    if ! command -v swig >/dev/null 2>&1; then
      echo "Please install 'swig' and then run $KALDI_ROOT/tools/extras/install_sequitur.sh"
      exit 1
    else
      echo "Sequitur G2P not found- running $KALDI_ROOT/tools/extras/install_sequitur.sh"
      pushd "$KALDI_ROOT/tools" || exit 1
      extras/install_sequitur.sh || exit 1
      popd || exit 1
    fi
  fi
  [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P- check your path.sh"; exit 1; }

  g2p_dir=$dst_dir/g2p
  auto_vocab_prefix="$g2p_dir/vocab_autogen"
  auto_lexicon_prefix="$g2p_dir/lexicon_autogen"

  mkdir -p "$g2p_dir/log" || exit 1

  # Explicit per-job file lists.  They are used instead of the globs
  # "$auto_vocab_prefix.*" / "$auto_lexicon_prefix.*", which would also pick
  # up vocab_autogen.full and left-overs from an earlier run with a larger nj.
  auto_vocab_splits=()
  while IFS= read -r split_file; do
    auto_vocab_splits+=("$split_file")
  done < <(job_split_files "$auto_vocab_prefix" "$nj")
  auto_lexicon_splits=()
  while IFS= read -r split_file; do
    auto_lexicon_splits+=("$split_file")
  done < <(job_split_files "$auto_lexicon_prefix" "$nj")

  # Words of $vocab whose first field does not occur as a first field in
  # $cmudict_plain; the full sorted list is kept and also split across jobs.
  awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' "$cmudict_plain" "$vocab" |
    sort | tee "$g2p_dir/vocab_autogen.full" |
    utils/split_scp.pl - "${auto_vocab_splits[@]}" || exit 1
  echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..."
  # $cmd is left unquoted on purpose: it may hold a launcher plus its options.
  $cmd JOB=1:$nj "$g2p_dir/log/g2p.JOB.log" \
    local/g2p.sh "$auto_vocab_prefix.JOB" "$g2p_model_dir" "$auto_lexicon_prefix.JOB" || exit 1
  g2p_vocab_size=$(wc -l <"$g2p_dir/vocab_autogen.full")
  g2p_lex_size=$(cat "${auto_lexicon_splits[@]}" | wc -l)
  echo "$g2p_vocab_size"
  echo "$g2p_lex_size"
  # TODO: Fix problem.  The check compares line counts, i.e. it only passes
  # when the G2P output has exactly as many lines as there were input words.
  [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; }
  sort "${auto_vocab_splits[@]}" >"$dst_dir/vocab_autogen.txt"
  sort "${auto_lexicon_splits[@]}" >"$dst_dir/lexicon_autogen.txt"
  echo "$g2p_vocab_size pronunciations autogenerated OK"
fi

# Stage 2: merge the base dictionary with the auto-generated pronunciations.
if [ "$stage" -le 2 ]; then
  # $vocab is otherwise only assigned in stage 0; set it here as well so that
  # starting with --stage 1 or 2 does not end in an ambiguous redirect.
  vocab=${vocab:-$lm_dir/meeting-vocab.txt}

  echo "Combining the CMUdict pronunciations with the autogenerated ones ..."
  # Disabled variant that kept only the base-dictionary entries of in-vocabulary words:
  #awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\
  #  cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1
  # sort reads both files itself, so a missing input makes it fail and the
  # "|| exit 1" actually fires (a failing cat in front of a pipe would not).
  sort "$cmudict_plain" "$dst_dir/lexicon_autogen.txt" >"$lexicon_raw_nosil" || exit 1
  raw_lex_size=$(awk '{print $1}' "$lexicon_raw_nosil" | sort -u | wc -l)
  vocab_size=$(wc -l <"$vocab")
  # TODO: Fix problem.  The size check stays disabled: the whole of
  # $cmudict_plain is merged above, not only its in-vocabulary entries, so
  # the two sizes are not expected to agree.
  #[[ "$vocab_size" -eq "$raw_lex_size" ]] || {
  #  echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!";
  #  exit 1; }
  echo "Combined lexicon saved to '$lexicon_raw_nosil'"
fi

# The copy operation below is necessary, if we skip the g2p stages (e.g. using
# --stage 3): when no non-empty raw lexicon exists yet, the base dictionary is
# used as the raw lexicon.
if [[ ! -s "$lexicon_raw_nosil" ]]; then
  cp -- "$cmudict_plain" "$lexicon_raw_nosil" || exit 1
fi

# Prints the non-silence phone sets of the lexicon file $1: one line per base
# phone, listing the base phone and all of its digit-suffixed variants that
# occur in the pronunciations (fields 2..NF), each followed by a space.
# Lines are emitted in sorted base-phone order, so the output does not depend
# on Perl's hash ordering.  Dies with "Bad phone" on a phone that is not
# "non-digits followed by optional digits".
nonsilence_phone_sets() {
  awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' "$1" |
    sort -u |
    perl -e 'while(<>){
      chomp; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
      $phones_of{$1} .= "$_ "; }
      foreach $base (sort keys %phones_of) {print $phones_of{$base} . "\n"; } '
}

# Prints one line per distinct digit suffix found in the phone-set file $1,
# listing every phone that carries this suffix (phones without a suffix form
# their own line).  Lines are emitted in sorted suffix order.
stress_questions() {
  perl -e 'while(<>){ foreach $p (split(" ", $_)) {
    $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } }
    foreach $suffix (sort keys %q) {print "$q{$suffix}\n";}' "$1"
}

# Stage 3: write the phone lists and the extra clustering questions.
if [ "$stage" -le 3 ]; then
  silence_phones=$dst_dir/silence_phones.txt
  optional_silence=$dst_dir/optional_silence.txt
  nonsil_phones=$dst_dir/nonsilence_phones.txt
  extra_questions=$dst_dir/extra_questions.txt

  echo "Preparing phone lists and clustering questions"
  printf '%s\n' SIL SPN NSN LAU >"$silence_phones"
  # Reduced alternative without the noise and laughter phones:
  #(echo SIL; echo SPN;) > $silence_phones
  echo SIL >"$optional_silence"
  # nonsilence phones; on each line is a list of phones that correspond
  # really to the same base phone.
  nonsilence_phone_sets "$lexicon_raw_nosil" >"$nonsil_phones" || exit 1
  # A few extra questions that will be added to those obtained by automatically clustering
  # the "real" phones.  These ask about stress; there's also one for silence.
  awk '{printf("%s ", $1);} END{printf "\n";}' "$silence_phones" >"$extra_questions" || exit 1
  stress_questions "$nonsil_phones" >>"$extra_questions" || exit 1
  echo "$(wc -l <"$silence_phones") silence phones saved to: $silence_phones"
  echo "$(wc -l <"$optional_silence") optional silence saved to: $optional_silence"
  echo "$(wc -l <"$nonsil_phones") non-silence phones saved to: $nonsil_phones"
  echo "$(wc -l <"$extra_questions") extra triphone clustering-related questions saved to: $extra_questions"
fi

# Prints the lexicon entries ("<token> <phone>", one per line) for the
# non-speech tokens of the ESTER setup.
ester_noise_entries() {
  printf '%s\n' \
    '<unk> SPN' '<laugh> LAU' '<noise> NSN' '<top> NSN' \
    '<whisperedvoice> NSN' '<breath> SPN' '<blowshard> NSN' '<cough> SPN' \
    '<glottisblow> SPN' '<noisemouth> SPN' '<whistling> NSN'
}

# Stage 4: prepend the non-speech entries to the raw lexicon and write the
# sorted, de-duplicated result to lexicon.txt.
if [ "$stage" -le 4 ]; then
  # Alternative entry sets, kept for reference:
  # TCOF
  #(echo '!sil SIL'; echo '<spoken_noise> SPN'; echo '<UNK> SPN'; echo '<laugh> LAU'; echo '<noise> NSN') |\
  # ESTER without noise states
  #(echo '!sil SIL'; echo '<UNK> SPN') |\
  # ESTER (active).  sort reads the raw lexicon itself ("-" is the piped
  # entry list), so a missing lexicon makes the command fail instead of
  # silently producing a lexicon that holds only the non-speech entries.
  ester_noise_entries |
    sort -u - "$lexicon_raw_nosil" >"$dst_dir/lexicon.txt" || exit 1
  echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
fi

exit 0