prepare_dict.sh 5.57 KB
Newer Older
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
#!/bin/bash

# Copyright 2017 @Linagora Abdel HEBA OK

# Prepares the dictionary and auto-generates pronunciations for the words
# that are in our vocabulary but not in CMUdict.
# Yup :)

# Option defaults. They must be assigned before utils/parse_options.sh is
# sourced so that command-line options (e.g. --nj, --cmd, --stage 3) can
# override them.
stage=0     # first stage to run; stages with a lower number are skipped
nj=4        # number of parallel Sequitur G2P jobs we would like to use
cmd=run.pl  # launcher used for the parallel G2P jobs


. utils/parse_options.sh || exit 1
. path.sh || exit 1


if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <lm-dir> <g2p-model-dir> <dst-dir>"
  #echo "e.g.: data/lm /data/g2p data/local/dict"
  echo "Options:"
  echo "  --cmd '<command>'    # script to launch jobs with, default: run.pl"
  echo "  --nj <nj>            # number of jobs to run, default: 4."
  exit 1
fi

lm_dir=$1
g2p_model_dir=$2
dst_dir=$3

# Word list that the final lexicon has to cover.
vocab=$lm_dir/meeting-vocab.txt
# Quoted on purpose: with an unquoted path containing whitespace the test
# itself errors out and the script would carry on without a vocabulary.
if [ ! -f "$vocab" ]; then
  echo "$0: vocabulary file not found at $vocab"
  exit 1
fi

# This file is either created by the G2P stages below or, when those stages
# are skipped, copied from $lm_dir/librispeech-lexicon.txt.
lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt

cmudict_dir=$dst_dir/cmudict     # checkout location used by stage 0
cmudict_plain=$dst_dir/fr.dict   # base dictionary read by stages 1 and 2

mkdir -p "$dst_dir" || exit 1

# Stage 0: fetch the CMUdict sources unless $cmudict_dir/fr.dict is already
# present and non-empty.
if [ "$stage" -le 0 ]; then
  echo "Downloading and preparing CMUdict"
  # NOTE(review): the original author marked this check as "to be modified".
  # It tests $cmudict_dir/fr.dict, whereas stages 1 and 2 read $cmudict_plain,
  # and the only code that would create $cmudict_plain is disabled below.
  # Confirm where fr.dict is supposed to come from.
  if [ ! -s "$cmudict_dir/fr.dict" ]; then
    svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict "$cmudict_dir" || exit 1
  fi
  # Disabled: derivation of $cmudict_plain from the checked-out CMUdict.
  #echo "Removing the pronunciation variant markers ..."
  #grep -v ';;;' $cmudict_dir/cmudict.0.7a | \
  #  perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
  #  > $cmudict_plain || exit 1;

  # Nothing above creates $cmudict_plain, so flag its absence right away
  # instead of letting a later stage fail with a less obvious message.
  if [ ! -s "$cmudict_plain" ]; then
    echo "$0: warning: $cmudict_plain is missing or empty; stages 1 and 2 read it" >&2
  fi
fi


# Prints "<prefix>.1" ... "<prefix>.<count>", one name per line.
# Arguments: $1 - file name prefix
#            $2 - number of names to print (nothing is printed for 0)
numbered_split_names() {
  local prefix=$1 count=$2 idx
  for ((idx = 1; idx <= count; idx++)); do
    printf '%s.%d\n' "$prefix" "$idx"
  done
}

# Prints the lines of a word list whose first field is not a headword of the
# given lexicon.
# Arguments: $1 - lexicon file ("word pronunciation ..." per line)
#            $2 - word list file
words_missing_from_lexicon() {
  # The lexicon pass is keyed on FILENAME rather than NR==FNR: with an empty
  # lexicon NR==FNR also holds for the word list and nothing would be printed.
  awk 'FILENAME == ARGV[1] { known[$1] = 1; next } !($1 in known)' "$1" "$2"
}

# Stage 1: generate pronunciations with Sequitur G2P for every vocabulary word
# that has no entry in $cmudict_plain. Writes $dst_dir/vocab_autogen.txt and
# $dst_dir/lexicon_autogen.txt.
if [ "$stage" -le 1 ]; then
  # Check that Sequitur G2P is installed ($sequitur is not set in this script;
  # presumably path.sh provides it).
  if [ ! -f "$sequitur" ]; then
    if ! command -v swig >/dev/null 2>&1; then
      echo "Please install 'swig' and then run $KALDI_ROOT/tools/extras/install_sequitur.sh"
      exit 1
    else
      echo "Sequitur G2P not found- running $KALDI_ROOT/tools/extras/install_sequitur.sh"
      # Do not run the installer from the wrong directory if pushd fails.
      pushd "$KALDI_ROOT/tools" || exit 1
      extras/install_sequitur.sh || exit 1
      popd || exit 1
    fi
  fi
  [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P- check your path.sh"; exit 1; }

  # Without the base dictionary the word filter below yields nothing useful
  # and the failure would only surface later as a size mismatch.
  [ -f "$cmudict_plain" ] || { echo "$0: dictionary not found at $cmudict_plain"; exit 1; }

  g2p_dir=$dst_dir/g2p
  auto_vocab_prefix="$g2p_dir/vocab_autogen"
  auto_lexicon_prefix="$g2p_dir/lexicon_autogen"

  mkdir -p "$g2p_dir/log" || exit 1

  # Explicit per-job file lists. A brace expansion built from `seq` leaves a
  # literal "{1}" when nj is 1, and globbing "$prefix.*" afterwards would also
  # match vocab_autogen.full and any stale split left by an earlier run.
  auto_vocab_splits=()
  while IFS= read -r split_name; do
    auto_vocab_splits+=("$split_name")
  done < <(numbered_split_names "$auto_vocab_prefix" "$nj")
  auto_lexicon_splits=()
  while IFS= read -r split_name; do
    auto_lexicon_splits+=("$split_name")
  done < <(numbered_split_names "$auto_lexicon_prefix" "$nj")

  words_missing_from_lexicon "$cmudict_plain" "$vocab" |
    sort | tee "$g2p_dir/vocab_autogen.full" |
    utils/split_scp.pl - "${auto_vocab_splits[@]}" || exit 1
  echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..."
  # $cmd is left unquoted on purpose: it may hold a launcher plus its options.
  $cmd JOB=1:"$nj" "$g2p_dir/log/g2p.JOB.log" \
    local/g2p.sh "$auto_vocab_prefix.JOB" "$g2p_model_dir" "$auto_lexicon_prefix.JOB" || exit 1
  # Every word handed to G2P must have produced exactly one lexicon line.
  g2p_vocab_size=$(wc -l <"$g2p_dir/vocab_autogen.full")
  g2p_lex_size=$(cat "${auto_lexicon_splits[@]}" | wc -l)
  [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; }
  sort "${auto_vocab_splits[@]}" >"$dst_dir/vocab_autogen.txt"
  sort "${auto_lexicon_splits[@]}" >"$dst_dir/lexicon_autogen.txt"
  echo "$g2p_vocab_size pronunciations autogenerated OK"
fi

# Prints the number of distinct first fields (headwords) in a lexicon file.
# Arguments: $1 - lexicon file ("word pronunciation ..." per line)
count_headwords() {
  awk '{print $1}' "$1" | sort -u | wc -l
}

# Stage 2: merge the dictionary entries of the vocabulary words with the
# autogenerated pronunciations into $lexicon_raw_nosil, then check that the
# number of headwords equals the number of vocabulary lines.
if [ "$stage" -le 2 ]; then
  echo "Combining the CMUdict pronunciations with the autogenerated ones ..."
  # Only the exit status of `sort` reaches the `|| exit 1` below, so a missing
  # dictionary would otherwise show up as a confusing size mismatch.
  [ -f "$cmudict_plain" ] || { echo "$0: dictionary not found at $cmudict_plain"; exit 1; }
  # Keep the dictionary lines whose headword is in the vocabulary, then append
  # the G2P output.
  awk 'NR==FNR{a[$1]=1; next} ($1 in a)' "$vocab" "$cmudict_plain" |
    cat - "$dst_dir/lexicon_autogen.txt" | sort >"$lexicon_raw_nosil" || exit 1
  raw_lex_size=$(count_headwords "$lexicon_raw_nosil")
  vocab_size=$(wc -l <"$vocab")
  [[ "$vocab_size" -eq "$raw_lex_size" ]] || {
    echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!";
    exit 1; }
  echo "Combined lexicon saved to '$lexicon_raw_nosil'"
fi

# The stages below read $lexicon_raw_nosil. When the G2P stages were skipped
# (e.g. with --stage 3) and that file is missing or empty, fall back to a copy
# of $lm_dir/librispeech-lexicon.txt.
if [[ ! -s "$lexicon_raw_nosil" ]]; then
  cp "$lm_dir/librispeech-lexicon.txt" "$lexicon_raw_nosil" || exit 1
fi

if [ $stage -le 3 ]; then
  silence_phones=$dst_dir/silence_phones.txt
  optional_silence=$dst_dir/optional_silence.txt
  nonsil_phones=$dst_dir/nonsilence_phones.txt
  extra_questions=$dst_dir/extra_questions.txt

  echo "Preparing phone lists and clustering questions"
  (echo SIL; echo SPN; echo NSN; echo LAU;) > $silence_phones
  echo SIL > $optional_silence
  # nonsilence phones; on each line is a list of phones that correspond
  # really to the same base phone.
  awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\
    sort -u |\
    perl -e 'while(<>){
      chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
      $phones_of{$1} .= "$_ "; }
      foreach $list (values %phones_of) {print $list . "\n"; } ' \
      > $nonsil_phones || exit 1;
  # A few extra questions that will be added to those obtained by automatically clustering
  # the "real" phones.  These ask about stress; there's also one for silence.
  cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1;
  cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
    $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
    >> $extra_questions || exit 1;
  echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones"
  echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence"
  echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones"
  echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions"
fi

# Prints the final lexicon: the given lexicon plus the fixed silence and noise
# entries, sorted with duplicate lines removed.
# Arguments: $1 - lexicon file without silence/noise entries
# Returns:   non-zero if the lexicon cannot be read or sorting fails
lexicon_with_special_entries() {
  if [ ! -r "$1" ]; then
    echo "$0: cannot read lexicon $1" >&2
    return 1
  fi
  printf '%s\n' '!sil SIL' '<spoken_noise> SPN' '<unk> SPN' '<laugh> LAU' '<noise> NSN' |
    cat - "$1" | sort -u
}

# Stage 4: write $dst_dir/lexicon.txt. The success message is only printed
# when the file was actually produced.
if [ "$stage" -le 4 ]; then
  lexicon_with_special_entries "$lexicon_raw_nosil" >"$dst_dir/lexicon.txt" || exit 1
  echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
fi

exit 0