format_lms.sh 1.49 KB
Newer Older
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
#!/bin/bash

# Copyright 2014 Vassil Panayotov
# Apache 2.0

# Prepares the test time language model(G) transducers
# (adapted from wsj/s5/local/wsj_format_data.sh)

. ./path.sh || exit 1;

# begin configuration section
src_dir=data/lang
# end configuration section

. utils/parse_options.sh || exit 1;

set -e

if [ $# -ne 1 ]; then
  echo "Usage: $0 <lm-dir>"
  echo "e.g.: $0 /export/a15/vpanayotov/data/lm"
  echo ", where:"
  echo "    <lm-dir> is the directory in which the language model is stored/downloaded"
  echo "Options:"
  echo "   --src-dir  <dir>           # source lang directory, default data/lang"
  exit 1
fi

lm_dir=$1

if [ ! -d $lm_dir ]; then
  echo "$0: expected source LM directory $lm_dir to exist"
  exit 1;
fi
if [ ! -f $src_dir/words.txt ]; then
  echo "$0: expected $src_dir/words.txt to exist."
  exit 1;
fi


tmpdir=data/local/lm_tmp.$$
trap "rm -r $tmpdir" EXIT

mkdir -p $tmpdir
45
#for lm_suffix in tgsmall tgmed tglarge fglarge; do
46 47
for lm_suffix in tglarge french-small; do
#for lm_suffix in linto1; do
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
48 49 50 51 52 53 54 55 56 57 58 59 60 61
  # tglarge is prepared by a separate command, called from run.sh; we don't
  # want to compile G.fst for tglarge, as it takes a while.
  test=${src_dir}_test_${lm_suffix}
  mkdir -p $test
  cp -r ${src_dir}/* $test
  gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
    arpa2fst --disambig-symbol=#0 \
             --read-symbol-table=$test/words.txt - $test/G.fst
  utils/validate_lang.pl --skip-determinization-check $test || exit 1;
done

echo "Succeeded in formatting data."

exit 0