prepare_dict.sh 5.59 KB
Newer Older
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
#!/bin/bash

# Copyright 2017 @Linagora Abdel HEBA OK

# Prepares the dictionary and auto-generates the pronunciations for the words,
# that are in our vocabulary but not in CMUdict
# Yup :)

stage=0
nj=4 # number of parallel Sequitur G2P jobs, we would like to use
cmd=run.pl


. utils/parse_options.sh || exit 1;
. path.sh || exit 1


if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <lm-dir> <g2p-model-dir> <dst-dir>"
  #echo "e.g.: data/lm /data/g2p data/local/dict"
  echo "Options:"
  echo "  --cmd '<command>'    # script to launch jobs with, default: run.pl"
  echo "  --nj <nj>            # number of jobs to run, default: 4."
  exit 1
fi

lm_dir=$1
g2p_model_dir=$2
dst_dir=$3

vocab=$lm_dir/meeting-vocab.txt
[ ! -f $vocab ] && echo "$0: vocabulary file not found at $vocab" && exit 1;

# this file is either a copy of the lexicon we download from openslr.org/11 or is
# created by the G2P steps below
lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt

cmudict_dir=$dst_dir/cmudict
cmudict_plain=$dst_dir/fr.dict

mkdir -p $dst_dir || exit 1;

if [ $stage -le 0 ]; then
  echo "Downloading and preparing CMUdict"
45
  if [ ! -s $cmudict_plain ]; then
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
  # a modifier
    svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1;
  fi
  #echo "Removing the pronunciation variant markers ..."
  #grep -v ';;;' $cmudict_dir/cmudict.0.7a | \
  #  perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
  #  > $cmudict_plain || exit 1;
fi


if [ $stage -le 1 ]; then
  # check if we have Sequitur G2P is installed
  if [ ! -f  "$sequitur" ]; then
    if ! which swig >&/dev/null; then
      echo "Please install 'swig' and then run $KALDI_ROOT/tools/extra/install_sequitur.sh"
      exit 1
    else
      echo "Sequitur G2P not found- running $KALDI_ROOT/tools/extra/install_sequitur.sh"
      pushd $KALDI_ROOT/tools
      extras/install_sequitur.sh || exit 1
      popd
    fi
  fi
  [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P- check your path.sh"; exit 1; }

  g2p_dir=$dst_dir/g2p
  auto_vocab_prefix="$g2p_dir/vocab_autogen"
  auto_lexicon_prefix="$g2p_dir/lexicon_autogen"

  mkdir -p $g2p_dir/log
  auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj)}")
  awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\
    sort | tee $g2p_dir/vocab_autogen.full |\
    utils/split_scp.pl - $auto_vocab_splits || exit 1
  echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..."
  $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \
    local/g2p.sh  $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1
  g2p_vocab_size=$(wc -l <$g2p_dir/vocab_autogen.full)
  g2p_lex_size=$(wc -l < <(cat $auto_lexicon_prefix.*))
85
  # TODO Fix problem
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
86 87 88 89 90 91 92 93 94 95 96 97
  [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; }
  sort <(cat $auto_vocab_prefix.*) >$dst_dir/vocab_autogen.txt
  sort <(cat $auto_lexicon_prefix.*) >$dst_dir/lexicon_autogen.txt
  echo "$(wc -l <$g2p_dir/vocab_autogen.full) pronunciations autogenerated OK"
fi

if [ $stage -le 2 ]; then
  echo "Combining the CMUdict pronunciations with the autogenerated ones ..."
  awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\
    cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1
  raw_lex_size=$(cat $lexicon_raw_nosil | awk '{print $1}' | sort -u | wc -l)
  vocab_size=$(wc -l <$vocab)
98
  # TODO Fixe problem
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
99 100 101 102 103 104 105 106
  [[ "$vocab_size" -eq "$raw_lex_size" ]] || {
    echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!";
    exit 1; }
  echo "Combined lexicon saved to '$lexicon_raw_nosil'"
fi

# The copy operation below is necessary, if we skip the g2p stages(e.g. using --stage 3)
if [[ ! -s "$lexicon_raw_nosil" ]]; then
107
  cp $cmudict_plain $lexicon_raw_nosil || exit 1
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
fi

if [ $stage -le 3 ]; then
  silence_phones=$dst_dir/silence_phones.txt
  optional_silence=$dst_dir/optional_silence.txt
  nonsil_phones=$dst_dir/nonsilence_phones.txt
  extra_questions=$dst_dir/extra_questions.txt

  echo "Preparing phone lists and clustering questions"
  (echo SIL; echo SPN; echo NSN; echo LAU;) > $silence_phones
  echo SIL > $optional_silence
  # nonsilence phones; on each line is a list of phones that correspond
  # really to the same base phone.
  awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\
    sort -u |\
    perl -e 'while(<>){
      chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
      $phones_of{$1} .= "$_ "; }
      foreach $list (values %phones_of) {print $list . "\n"; } ' \
      > $nonsil_phones || exit 1;
  # A few extra questions that will be added to those obtained by automatically clustering
  # the "real" phones.  These ask about stress; there's also one for silence.
  cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1;
  cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
    $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
    >> $extra_questions || exit 1;
  echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones"
  echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence"
  echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones"
  echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions"
fi

if [ $stage -le 4 ]; then
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
141
  (echo '!sil SIL'; echo '<spoken_noise> SPN'; echo '<UNK> SPN'; echo '<laugh> LAU'; echo '<noise> NSN') |\
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
142 143 144 145 146
  cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt
  echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
fi

exit 0