Run_ESTER1_2.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    ". path.sh\n",
    ". cmd.sh\n",
    "idata_kaldi=data_last_ffmpeg\n",
    "exp_kaldi=exp_last_ffmpeg\n",
    "#model_tri_sat_ESTER=exp-ESTER-V4/monoAll_1_2/tri1_10K_100K_ESTER_All/tri1_15K_200K_ESTER_All/tri2_20K_300K_ESTER_All/tri1_SAT_50K_400K/tri1_SAT_70K_500K"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare ESTER 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "prepare /fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Phase1/data\n",
      "utils/data/get_utt2dur.sh: working out data_last_ffmpeg/data/utt2dur from data_last_ffmpeg/data/segments\n",
      "utils/data/get_utt2dur.sh: computed data_last_ffmpeg/data/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/data\n",
      "Successfully prepared data in data_last_ffmpeg/data..\n",
      "prepare /fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Phase2/DATA2\n",
      "utils/data/get_utt2dur.sh: working out data_last_ffmpeg/DATA2/utt2dur from data_last_ffmpeg/DATA2/segments\n",
      "utils/data/get_utt2dur.sh: computed data_last_ffmpeg/DATA2/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/DATA2\n",
      "Successfully prepared data in data_last_ffmpeg/DATA2..\n",
      "prepare /fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Eval2005/DATA\n",
      "utils/data/get_utt2dur.sh: working out data_last_ffmpeg/DATA/utt2dur from data_last_ffmpeg/DATA/segments\n",
      "utils/data/get_utt2dur.sh: computed data_last_ffmpeg/DATA/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/DATA\n",
      "Successfully prepared data in data_last_ffmpeg/DATA..\n",
      "prepare /fast/LINAGORA/Corpus/database/Corpus/ESTER2/corpus//train\n",
      "utils/data/get_utt2dur.sh: working out data_last_ffmpeg/train/utt2dur from data_last_ffmpeg/train/segments\n",
      "utils/data/get_utt2dur.sh: computed data_last_ffmpeg/train/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/train\n",
      "Successfully prepared data in data_last_ffmpeg/train..\n"
     ]
    }
   ],
   "source": [
    "data_ESTER2=/fast/LINAGORA/Corpus/database/Corpus/ESTER2/corpus/\n",
    "data_ESTER_Phase1=/fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Phase1\n",
    "data_ESTER_Phase2=/fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Phase2\n",
    "data_ESTER_EVAL=/fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Eval2005\n",
    "#idata_kaldi=data_last\n",
    "for part in $data_ESTER_Phase1/data $data_ESTER_Phase2/DATA2 $data_ESTER_EVAL/DATA $data_ESTER2/train; do\n",
    "  # use underscore-separated names in data directories.\n",
    "  echo \"prepare $part\"\n",
    "  #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part\n",
    "  # probleme event (URL:)\n",
    "  folder_name=$(basename $part)\n",
    "  local/data_prepESTER.sh $part $idata_kaldi/$folder_name\n",
    "done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data_last_ffmpeg/data exp_last_ffmpeg/make_mfcc/data mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/data\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for data\n",
      "steps/compute_cmvn_stats.sh data_last_ffmpeg/data exp_last_ffmpeg/make_mfcc/data mfcc\n",
      "Succeeded creating CMVN stats for data\n",
      "fix_data_dir.sh: kept all 35574 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/data/.backup\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data_last_ffmpeg/DATA2 exp_last_ffmpeg/make_mfcc/DATA2 mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/DATA2\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "It seems not all of the feature files were successfully processed (56956 != 57341);\n",
      "consider using utils/fix_data_dir.sh data_last_ffmpeg/DATA2\n",
      "Succeeded creating MFCC features for DATA2\n",
      "steps/compute_cmvn_stats.sh data_last_ffmpeg/DATA2 exp_last_ffmpeg/make_mfcc/DATA2 mfcc\n",
      "steps/compute_cmvn_stats.sh: warning: it seems not all of the speakers got cmvn stats (1555 != 1558);\n",
      "Succeeded creating CMVN stats for DATA2\n",
      "utils/fix_data_dir.sh: filtered /tmp/kaldi.A3Y6/speakers from 1558 to 1555 lines based on filter data_last_ffmpeg/DATA2/cmvn.scp.\n",
      "utils/fix_data_dir.sh: filtered data_last_ffmpeg/DATA2/spk2utt from 1558 to 1555 lines based on filter /tmp/kaldi.A3Y6/speakers.\n",
      "utils/fix_data_dir.sh: filtered data_last_ffmpeg/DATA2/spk2gender from 1558 to 1555 lines based on filter /tmp/kaldi.A3Y6/speakers.\n",
      "fix_data_dir.sh: kept 56956 utterances out of 56957\n",
      "utils/fix_data_dir.sh: filtered data_last_ffmpeg/DATA2/wav.scp from 126 to 124 lines based on filter /tmp/kaldi.A3Y6/recordings.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/DATA2/.backup\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data_last_ffmpeg/DATA exp_last_ffmpeg/make_mfcc/DATA mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/DATA\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for DATA\n",
      "steps/compute_cmvn_stats.sh data_last_ffmpeg/DATA exp_last_ffmpeg/make_mfcc/DATA mfcc\n",
      "Succeeded creating CMVN stats for DATA\n",
      "fix_data_dir.sh: kept all 10486 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/DATA/.backup\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data_last_ffmpeg/train exp_last_ffmpeg/make_mfcc/train mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/train\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "run.pl: 1 / 32 failed, log is in exp_last_ffmpeg/make_mfcc/train/make_mfcc_train.*.log\n",
      "steps/compute_cmvn_stats.sh data_last_ffmpeg/train exp_last_ffmpeg/make_mfcc/train mfcc\n",
      "make_cmvn.sh: no such file data_last_ffmpeg/train/feats.scp\n",
      "fix_data_dir.sh: kept all 87296 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/train/.backup\n",
      "utils/combine_data.sh data_last_ffmpeg/ESTER1 data_last_ffmpeg/data data_last_ffmpeg/DATA2\n",
      "utils/combine_data.sh [info]: not combining utt2uniq as it does not exist\n",
      "utils/combine_data.sh: combined segments\n",
      "utils/combine_data.sh: combined utt2spk\n",
      "utils/combine_data.sh [info]: not combining utt2lang as it does not exist\n",
      "utils/combine_data.sh: combined utt2dur\n",
      "utils/combine_data.sh: combined feats.scp\n",
      "utils/combine_data.sh: combined text\n",
      "utils/combine_data.sh: combined cmvn.scp\n",
      "utils/combine_data.sh [info]: not combining reco2file_and_channel as it does not exist\n",
      "utils/combine_data.sh: combined wav.scp\n",
      "utils/combine_data.sh: combined spk2gender\n",
      "fix_data_dir.sh: kept all 92530 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/ESTER1/.backup\n",
      "utils/combine_data.sh data_last_ffmpeg/ESTER1_2 data_last_ffmpeg/ESTER1 data_last_ffmpeg/train\n",
      "utils/combine_data.sh [info]: not combining utt2uniq as it does not exist\n",
      "utils/combine_data.sh: combined segments\n",
      "utils/combine_data.sh: combined utt2spk\n",
      "utils/combine_data.sh [info]: not combining utt2lang as it does not exist\n",
      "utils/combine_data.sh: combined utt2dur\n",
      "utils/combine_data.sh [info]: **not combining feats.scp as it does not exist everywhere**\n",
      "utils/combine_data.sh: combined text\n",
      "utils/combine_data.sh [info]: **not combining cmvn.scp as it does not exist everywhere**\n",
      "utils/combine_data.sh [info]: not combining reco2file_and_channel as it does not exist\n",
      "utils/combine_data.sh: combined wav.scp\n",
      "utils/combine_data.sh: combined spk2gender\n",
      "fix_data_dir.sh: kept all 179826 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/ESTER1_2/.backup\n"
     ]
    }
   ],
   "source": [
    "# Feature Extraction MFCC:\n",
    "#exp_kaldi=exp_last\n",
    "mfccdir=mfcc\n",
    "for part in data DATA2 DATA train; do\n",
    "    #MFCC features\n",
    "    steps/make_mfcc.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    #MFCC features + Pitch\n",
    "    #steps/make_mfcc_pitch.sh --cmd \"$train_cmd\" --nj 12 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    utils/fix_data_dir.sh $idata_kaldi/$part\n",
    "done\n",
    "# Combine data ESTER Phase 1 & Phase 2\n",
    "utils/combine_data.sh $idata_kaldi/ESTER1 $idata_kaldi/data $idata_kaldi/DATA2\n",
    "utils/combine_data.sh $idata_kaldi/ESTER1_2 $idata_kaldi/ESTER1 $idata_kaldi/train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build Language model using ESTER 1 & ESTER 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Splitting into 32 parts, to allow for parallel processing ...\n",
      "Checking the splits ...\n",
      "Performing text normalization (32 jobs) - check data_last_ffmpeg/local_ESTER12/lm/norm/tmp/txt_norm.JOB.log ...\n",
      "Finished OK\n",
      "Selecting the vocabulary (400000 words) ...\n",
      "Making the corpus and the vocabulary ...\n",
      "Word counts saved to 'data_last_ffmpeg/local_ESTER12/lm/word_counts.txt'\n",
      "Vocabulary saved as 'data_last_ffmpeg/local_ESTER12/lm/meeting-vocab.txt'\n",
      "All unique sentences (in sorted order) stored in 'data_last_ffmpeg/local_ESTER12/lm/meeting-lm-norm.txt.gz'\n",
      "Counting the total number word tokens in the corpus ...\n",
      "There are 2066518 tokens in the corpus\n",
      "Training a 3-gram LM ...\n",
      "This implementation assumes that you have a lot of free RAM(> 12GB) on your machine\n",
      "If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html\n",
      "6,4M\tdata_last_ffmpeg/local_ESTER12/lm/lm_tglarge.arpa.gz\n",
      "Creating a 'small' pruned 3-gram LM (threshold: 0.0000003) ...\n",
      "data_last_ffmpeg/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "4,1M\tdata_last_ffmpeg/local_ESTER12/lm/lm_tgsmall.arpa.gz\n",
      "Creating a 'medium' pruned 3-gram LM (threshold: 0.0000001) ...\n",
      "data_last_ffmpeg/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "5,4M\tdata_last_ffmpeg/local_ESTER12/lm/lm_tgmed.arpa.gz\n",
      "Training a 4-gram LM ...\n",
      "7,9M\tdata_last_ffmpeg/local_ESTER12/lm/lm_fglarge.arpa.gz\n"
     ]
    }
   ],
   "source": [
    "corpus_path=/fast/LINAGORA/Corpus/database\n",
    "LM_train_text=$corpus_path/Textall\n",
    "local/lm/train_lm.sh $LM_train_text \\\n",
    "$idata_kaldi/local_ESTER12/lm/norm/tmp $idata_kaldi/local_ESTER12/lm/norm/norm_texts $idata_kaldi/local_ESTER12/lm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/fast/LINAGORA/tools/LM/lm_tgsphinx.arpa.gz: line 35: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "data-ESTER-V4/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "BOW numerator for context \"\" is -3.2194e-05 < 0\n",
      "BOW numerator for context \"c' puisque\" is -0.000595476 < 0\n",
      "BOW numerator for context \"c' que\" is -0.000972837 < 0\n",
      "BOW numerator for context \"c' matin\" is -0.000964833 < 0\n",
      "BOW numerator for context \"c' monde\" is -0.000930619 < 0\n",
      "reading 74451 1-grams\n",
      "exp-ESTER-V4/eval_LM_ESTER12/mixed.gz: line 35: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "reading 18615477 2-grams\n",
      "reading 23653218 3-grams\n"
     ]
    }
   ],
   "source": [
    "# Evaluate LM\n",
    "testfile=$idata_kaldi/DATA/text_without_noise_tag\n",
    "# LM's\n",
    "LM_LIUM_LARGE=/fast/LINAGORA/tools/LM/lm_tgsphinx.arpa.gz\n",
    "LM_LIUM_SMALL=/fast/LINAGORA/tools/LM/lm_french-small.arpa.gz\n",
    "LM_ESTER_tglarge=$idata_kaldi/local_ESTER12/lm/lm_tglarge.arpa.gz\n",
    "LM_ESTER_fglarge=$idata_kaldi/local_ESTER12/lm/lm_fglarge.arpa.gz\n",
    "# dir eval language model\n",
    "dir_eval_lm=$exp_kaldi/eval_LM_ESTER12\n",
    "mkdir -p $dir_eval_lm\n",
    "\n",
    "# Get Text from kaldi text format\n",
    "cut -f2- -d' ' < $idata_kaldi/DATA/text | cut -d ' ' -f2- |\\\n",
    "sed -e 's/[ ]\\+/ /g' | sed -e 's/<[^ ][^ ]*>\\|!sil//g' > $testfile\n",
    "# Lium 3-gram all\n",
    "ngram -lm $LM_LIUM_LARGE -ppl $testfile -debug 2 > $dir_eval_lm/LM_tgLium_full.ppl\n",
    "# Lium 3-gram pruné\n",
    "ngram -lm $LM_LIUM_SMALL -ppl $testfile -debug 2 > $dir_eval_lm/LM_tgLium_small.ppl\n",
    "# ESTER 3-gram\n",
    "ngram -lm $LM_ESTER_tglarge -ppl $testfile -debug 2 > $dir_eval_lm/LM_tgESTER.ppl\n",
    "# ESTER 4-gram\n",
    "ngram -lm $LM_ESTER_fglarge -ppl $testfile -debug 2 > $dir_eval_lm/LM_fgESTER.ppl\n",
    "#===== Mixe languages\n",
    "# compute best lambda\n",
    "compute-best-mix $dir_eval_lm/LM_tgLium_full.ppl $dir_eval_lm/LM_tgESTER.ppl > $dir_eval_lm/best_mix\n",
    "# mixe languages\n",
    "ngram -lm $LM_LIUM_LARGE -mix-lm $LM_ESTER_tglarge -lambda 0.597881 -write-lm $dir_eval_lm/mixed.gz\n",
    "# compute perplexity\n",
    "ngram -lm $dir_eval_lm/mixed.gz -ppl $testfile -debug 2 > $dir_eval_lm/LM_mixed.ppl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 105023 \n",
      "  52878 aa\n",
      "      1 aaq\n",
      "  49016 ai\n",
      "  20328 an\n",
      "   6620 au\n",
      "  13761 bb\n",
      "   5354 ch\n",
      "  20387 dd\n",
      "  32294 ee\n",
      "  22574 ei\n",
      "   1816 eu\n",
      "  11273 ff\n",
      "   8086 gg\n",
      "   1043 gn\n",
      "  47329 ii\n",
      "   7160 in\n",
      "   8405 jj\n",
      "  29273 kk\n",
      "  31392 ll\n",
      "  21284 mm\n",
      "  21994 nn\n",
      "   2169 oe\n",
      "  10989 on\n",
      "  25795 oo\n",
      "   7341 ou\n",
      "  20469 pp\n",
      "  64183 rr\n",
      "  38533 ss\n",
      "  47040 tt\n",
      "    182 un\n",
      "  13874 uu\n",
      "   1225 uy\n",
      "  11965 vv\n",
      "   3746 ww\n",
      "  16421 yy\n",
      "  28250 zz\n",
      "Downloading and preparing CMUdict\n",
      "Autogenerating pronunciations for the words in data_last_ffmpeg/local_ESTER12/dict/g2p/vocab_autogen.* ...\n",
      "12125\n",
      "12125\n",
      "12125 pronunciations autogenerated OK\n",
      "Combining the CMUdict pronunciations with the autogenerated ones ...\n",
      "Combined lexicon saved to 'data_last_ffmpeg/local_ESTER12/dict/lexicon_raw_nosil.txt'\n",
      "Preparing phone lists and clustering questions\n",
      "4 silence phones saved to: data_last_ffmpeg/local_ESTER12/dict/silence_phones.txt\n",
      "1 optional silence saved to: data_last_ffmpeg/local_ESTER12/dict/optional_silence.txt\n",
      "36 non-silence phones saved to: data_last_ffmpeg/local_ESTER12/dict/nonsilence_phones.txt\n",
      "2 extra triphone clustering-related questions saved to: data_last_ffmpeg/local_ESTER12/dict/extra_questions.txt\n",
      "Lexicon text file saved as: data_last_ffmpeg/local_ESTER12/dict/lexicon.txt\n"
     ]
    }
   ],
   "source": [
    "#### Prepare dict: add words which doesn't exist in dictionnary + config files...\n",
    "# print number of phonem used in french\n",
    "dir_repos=/fast/LINAGORA/STT/Thesis_aheba\n",
    "#dir_repos=/home/lingora/Documents/Linagora/kaldi/egs/Linagora/Thesis_aheba\n",
    "cat $dir_repos/cmu_dict/fr.dict | awk '{$1=\"\";print $0}' | tr ' ' '\\n' | sort -b | uniq -c\n",
    "mkdir -p $idata_kaldi/local_ESTER12/dict/cmudict\n",
    "cp $dir_repos/cmu_dict/lexicon_new.dict $idata_kaldi/local_ESTER12/dict/fr.dict\n",
    "mkdir -p $idata_kaldi/local_ESTER12/lm/g2p\n",
    "cp $dir_repos/g2p/model-5 $idata_kaldi/local_ESTER12/lm/g2p\n",
    "\n",
    "local/prepare_dict.sh --stage 0 --nj 16 --cmd \"$train_cmd\" \\\n",
    "   $idata_kaldi/local_ESTER12/lm $idata_kaldi/local_ESTER12/lm/g2p $idata_kaldi/local_ESTER12/dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking data_last_ffmpeg/local_ESTER12/dict/silence_phones.txt ...\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/silence_phones.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/silence_phones.txt is OK\n",
      "\n",
      "Checking data_last_ffmpeg/local_ESTER12/dict/optional_silence.txt ...\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/optional_silence.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/optional_silence.txt is OK\n",
      "\n",
      "Checking data_last_ffmpeg/local_ESTER12/dict/nonsilence_phones.txt ...\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/nonsilence_phones.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/nonsilence_phones.txt is OK\n",
      "\n",
      "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n",
      "--> disjoint property is OK.\n",
      "\n",
      "Checking data_last_ffmpeg/local_ESTER12/dict/lexicon.txt\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/lexicon.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/lexicon.txt is OK\n",
      "\n",
      "Checking data_last_ffmpeg/local_ESTER12/dict/extra_questions.txt ...\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/extra_questions.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/extra_questions.txt is OK\n",
      "--> SUCCESS [validating dictionary directory data_last_ffmpeg/local_ESTER12/dict]\n",
      "\n",
      "**Creating data_last_ffmpeg/local_ESTER12/dict/lexiconp.txt from data_last_ffmpeg/local_ESTER12/dict/lexicon.txt\n",
      "fstaddselfloops data_last_ffmpeg/lang_ESTER12/phones/wdisambig_phones.int data_last_ffmpeg/lang_ESTER12/phones/wdisambig_words.int \n",
      "prepare_lang.sh: validating output directory\n",
      "utils/validate_lang.pl data_last_ffmpeg/lang_ESTER12\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones.txt ...\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data_last_ffmpeg/lang_ESTER12/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/context_indep.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/context_indep.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/context_indep.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/context_indep.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/context_indep.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/nonsilence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/nonsilence.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/nonsilence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/nonsilence.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/nonsilence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/silence.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/silence.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/optional_silence.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/optional_silence.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/disambig.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/disambig.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/disambig.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/roots.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/roots.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/roots.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/sets.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/sets.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/sets.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/extra_questions.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/extra_questions.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/extra_questions.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.txt is OK\n",
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
      "--> generating a 86 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
      "--> generating a 35 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data_last_ffmpeg/lang_ESTER12/oov.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/oov.int corresponds to data_last_ffmpeg/lang_ESTER12/oov.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/oov.{txt, int} are OK\n",
      "\n",
      "--> data_last_ffmpeg/lang_ESTER12/L.fst is olabel sorted\n",
      "--> data_last_ffmpeg/lang_ESTER12/L_disambig.fst is olabel sorted\n",
      "--> SUCCESS [validating lang directory data_last_ffmpeg/lang_ESTER12]\n"
     ]
    }
   ],
   "source": [
    "#### Prepare Lang ==> L.fst Vocabulary's automate finite state\n",
    "utils/prepare_lang.sh $idata_kaldi/local_ESTER12/dict \\\n",
    "   \"<unk>\" $idata_kaldi/local_ESTER12/lang_tmp $idata_kaldi/lang_ESTER12"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ln: impossible de créer le lien symbolique 'data-ESTER-V4/local_ESTER12/lm/lm_tgsphinx.arpa.gz': Le fichier existe\n",
      "ln: impossible de créer le lien symbolique 'data-ESTER-V4/local_ESTER12/lm/lm_french-small.arpa.gz': Le fichier existe\n",
      "arpa2fst --disambig-symbol=#0 --read-symbol-table=data-ESTER-V4/lang_ESTER12_test_fglarge/words.txt - data-ESTER-V4/lang_ESTER12_test_fglarge/G.fst \n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:90) Reading \\data\\ section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\1-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\2-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\3-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\4-grams: section.\n",
      "utils/validate_lang.pl data-ESTER-V4/lang_ESTER12_test_fglarge\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt is OK\n",
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
      "--> generating a 55 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
      "--> generating a 56 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/oov.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/oov.{txt, int} are OK\n",
      "\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/L.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/L_disambig.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/G.fst is ilabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/G.fst has 721774 states\n",
      "--> utils/lang/check_g_properties.pl successfully validated data-ESTER-V4/lang_ESTER12_test_fglarge/G.fst\n",
      "--> utils/lang/check_g_properties.pl succeeded.\n",
      "--> SUCCESS [validating lang directory data-ESTER-V4/lang_ESTER12_test_fglarge]\n",
      "arpa2fst --disambig-symbol=#0 --read-symbol-table=data-ESTER-V4/lang_ESTER12_test_tglarge/words.txt - data-ESTER-V4/lang_ESTER12_test_tglarge/G.fst \n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:90) Reading \\data\\ section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\1-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\2-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\3-grams: section.\n",
      "utils/validate_lang.pl data-ESTER-V4/lang_ESTER12_test_tglarge\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt is OK\n",
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
      "--> generating a 66 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
      "--> generating a 92 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/oov.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/oov.{txt, int} are OK\n",
      "\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/L.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/L_disambig.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/G.fst is ilabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/G.fst has 527277 states\n",
      "--> utils/lang/check_g_properties.pl successfully validated data-ESTER-V4/lang_ESTER12_test_tglarge/G.fst\n",
      "--> utils/lang/check_g_properties.pl succeeded.\n",
      "--> SUCCESS [validating lang directory data-ESTER-V4/lang_ESTER12_test_tglarge]\n",
      "arpa2fst --disambig-symbol=#0 --read-symbol-table=data-ESTER-V4/lang_ESTER12_test_tgsphinx/words.txt - data-ESTER-V4/lang_ESTER12_test_tgsphinx/G.fst \n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:90) Reading \\data\\ section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\1-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 8 [-5.2142\t-ce\t-0.2163] skipped: word '-ce' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 9 [-4.8723\t-ci\t-0.2944] skipped: word '-ci' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 10 [-4.2048\t-elle\t-0.3700] skipped: word '-elle' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 11 [-4.4310\t-elles\t-0.3539] skipped: word '-elles' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 12 [-4.9062\t-en\t-0.2800] skipped: word '-en' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 13 [-4.0463\t-il\t-0.4450] skipped: word '-il' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 14 [-4.1699\t-ils\t-0.4206] skipped: word '-ils' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 15 [-5.1270\t-je\t-0.2720] skipped: word '-je' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 16 [-4.9573\t-la\t-0.2149] skipped: word '-la' not in symbol table\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 17 [-3.6009\t-là\t-0.5456] skipped: word '-là' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 18 [-4.7289\t-le\t-0.2947] skipped: word '-le' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 19 [-5.0130\t-les\t-0.1999] skipped: word '-les' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 20 [-5.5490\t-lui\t-0.1649] skipped: word '-lui' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 21 [-4.7873\t-même\t-0.2425] skipped: word '-même' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 22 [-5.4825\t-mêmes\t-0.1766] skipped: word '-mêmes' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 23 [-5.0634\t-moi\t-0.3092] skipped: word '-moi' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 24 [-4.7140\t-nous\t-0.3254] skipped: word '-nous' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 25 [-4.3209\t-on\t-0.3717] skipped: word '-on' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 26 [-3.8887\t-t\t-1.4364] skipped: word '-t' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 27 [-4.3967\t-t-elle\t-0.2546] skipped: word '-t-elle' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 28 [-4.2829\t-t-il\t-0.2571] skipped: word '-t-il' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 29 [-5.3860\t-toi\t-0.1905] skipped: word '-toi' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 30 [-5.4007\t-tu\t-0.1668] skipped: word '-tu' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 31 [-4.6364\t-vous\t-0.3545] skipped: word '-vous' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 32 [-5.6027\t-y\t-0.1394] skipped: word '-y' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 38 [-5.8621\tà-côtés\t-0.1238] skipped: word 'à-côtés' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 39 [-5.7419\tà-coups\t-0.1460] skipped: word 'à-coups' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 40 [-5.7226\tà-peu-près\t-0.1351] skipped: word 'à-peu-près' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 41 [-5.8535\tà-propos\t-0.1315] skipped: word 'à-propos' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 42 [-5.3186\ta1\t-0.2572] skipped: word 'a1' not in symbol table\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\2-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\3-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:231) Of 9454889 parse warnings, 30 were reported. Run program with --max_warnings=-1 to see all warnings\n",
      "utils/validate_lang.pl data-ESTER-V4/lang_ESTER12_test_tgsphinx\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt is OK\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
      "--> generating a 11 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
      "--> generating a 46 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.{txt, int} are OK\n",
      "\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/L.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/L_disambig.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/G.fst is ilabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/G.fst has 13156697 states\n",
      "--> utils/lang/check_g_properties.pl successfully validated data-ESTER-V4/lang_ESTER12_test_tgsphinx/G.fst\n",
      "--> utils/lang/check_g_properties.pl succeeded.\n",
      "--> SUCCESS [validating lang directory data-ESTER-V4/lang_ESTER12_test_tgsphinx]\n",
      "arpa2fst --disambig-symbol=#0 --read-symbol-table=data-ESTER-V4/lang_ESTER12_test_french-small/words.txt - data-ESTER-V4/lang_ESTER12_test_french-small/G.fst \n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:90) Reading \\data\\ section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\1-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 12 [-5.538307\taalto] skipped: word 'aalto' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 13 [-5.120474\taaron\t-0.02769727] skipped: word 'aaron' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 15 [-5.731214\tabachidze] skipped: word 'abachidze' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 16 [-5.487072\tabadie\t-0.02312817] skipped: word 'abadie' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 17 [-5.749587\tabagnale\t-0.03421882] skipped: word 'abagnale' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 18 [-5.439013\tabaissant\t-0.1023452] skipped: word 'abaissant' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 19 [-5.298798\tabaisse\t-0.1515498] skipped: word 'abaisse' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 23 [-5.383688\tabaissée\t-0.1352918] skipped: word 'abaissée' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 24 [-5.447463\tabaissés\t-0.1639998] skipped: word 'abaissés' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 26 [-5.323689\tabandonna\t-0.07547411] skipped: word 'abandonna' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 32 [-5.20608\tabandonnera\t-0.08380523] skipped: word 'abandonnera' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 42 [-5.217123\tabasourdi\t-0.1560311] skipped: word 'abasourdi' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 46 [-5.573611\tabat-jour\t-0.04034681] skipped: word 'abat-jour' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 47 [-5.525448\tabats\t-0.06064676] skipped: word 'abats' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 51 [-5.45354\tabattements\t-0.1281941] skipped: word 'abattements' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 61 [-5.026776\tabb\t-0.07580301] skipped: word 'abb' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 62 [-5.444225\tabbado\t-0.02391137] skipped: word 'abbado' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 64 [-5.533504\tabbatiale\t-0.05947962] skipped: word 'abbatiale' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 66 [-5.472561\tabbayes\t-0.06715237] skipped: word 'abbayes' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 67 [-5.64184\tabbesses\t-0.01320701] skipped: word 'abbesses' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 71 [-5.262563\tabbott\t-0.03037171] skipped: word 'abbott' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 89 [-5.243713\tabdelkrim\t-0.003165964] skipped: word 'abdelkrim' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 97 [-5.60272\tabderazak\t-0.07333852] skipped: word 'abderazak' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 102 [-5.590033\tabderrezak\t-0.08256684] skipped: word 'abderrezak' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 106 [-5.726902\tabdic] skipped: word 'abdic' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 107 [-5.558577\tabdication\t-0.05181389] skipped: word 'abdication' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 109 [-5.369082\tabdiquer\t-0.1094639] skipped: word 'abdiquer' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 110 [-5.468573\tabdiqué\t-0.07636042] skipped: word 'abdiqué' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 119 [-5.446184\tabe\t-0.003954563] skipped: word 'abe' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 120 [-5.70366\tabeau\t-0.02909388] skipped: word 'abeau' not in symbol table\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\2-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\3-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:231) Of 292476 parse warnings, 30 were reported. Run program with --max_warnings=-1 to see all warnings\n",
      "utils/validate_lang.pl data-ESTER-V4/lang_ESTER12_test_french-small\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.{txt, int, csl} ...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt is OK\n",
      "\n",