Skip to content
Run_ESTER1_2.ipynb 93 KiB
Newer Older
{
 "cells": [
  {
   "cell_type": "code",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    ". path.sh\n",
    ". cmd.sh\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "idata_kaldi=data_last_ffmpeg\n",
    "exp_kaldi=exp_last_ffmpeg\n",
    "#model_tri_sat_ESTER=exp-ESTER-V4/monoAll_1_2/tri1_10K_100K_ESTER_All/tri1_15K_200K_ESTER_All/tri2_20K_300K_ESTER_All/tri1_SAT_50K_400K/tri1_SAT_70K_500K"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare ESTER 1 (Phase 1, Phase 2, Eval2005) & ESTER 2"
   ]
  },
  {
   "cell_type": "code",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "prepare /fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Phase1/data\n",
      "utils/data/get_utt2dur.sh: working out data_last_ffmpeg/data/utt2dur from data_last_ffmpeg/data/segments\n",
      "utils/data/get_utt2dur.sh: computed data_last_ffmpeg/data/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/data\n",
      "Successfully prepared data in data_last_ffmpeg/data..\n",
      "prepare /fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Phase2/DATA2\n",
      "utils/data/get_utt2dur.sh: working out data_last_ffmpeg/DATA2/utt2dur from data_last_ffmpeg/DATA2/segments\n",
      "utils/data/get_utt2dur.sh: computed data_last_ffmpeg/DATA2/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/DATA2\n",
      "Successfully prepared data in data_last_ffmpeg/DATA2..\n",
      "prepare /fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Eval2005/DATA\n",
      "utils/data/get_utt2dur.sh: working out data_last_ffmpeg/DATA/utt2dur from data_last_ffmpeg/DATA/segments\n",
      "utils/data/get_utt2dur.sh: computed data_last_ffmpeg/DATA/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/DATA\n",
      "Successfully prepared data in data_last_ffmpeg/DATA..\n",
      "prepare /fast/LINAGORA/Corpus/database/Corpus/ESTER2/corpus//train\n",
      "utils/data/get_utt2dur.sh: working out data_last_ffmpeg/train/utt2dur from data_last_ffmpeg/train/segments\n",
      "utils/data/get_utt2dur.sh: computed data_last_ffmpeg/train/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/train\n",
      "Successfully prepared data in data_last_ffmpeg/train..\n"
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "data_ESTER2=/fast/LINAGORA/Corpus/database/Corpus/ESTER2/corpus/\n",
    "data_ESTER_Phase1=/fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Phase1\n",
    "data_ESTER_Phase2=/fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Phase2\n",
    "data_ESTER_EVAL=/fast/LINAGORA/Corpus/database/Corpus/ESTER/DGA/Eval2005\n",
    "#idata_kaldi=data_last\n",
    "for part in $data_ESTER_Phase1/data $data_ESTER_Phase2/DATA2 $data_ESTER_EVAL/DATA $data_ESTER2/train; do\n",
    "  # use underscore-separated names in data directories.\n",
    "  echo \"prepare $part\"\n",
    "  #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part\n",
    "  # probleme event (URL:)\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "  folder_name=$(basename $part)\n",
    "  local/data_prepESTER.sh $part $idata_kaldi/$folder_name\n",
    "done"
   ]
  },
  {
   "cell_type": "code",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
   "execution_count": 19,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data_last_ffmpeg/data exp_last_ffmpeg/make_mfcc/data mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/data\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for data\n",
      "steps/compute_cmvn_stats.sh data_last_ffmpeg/data exp_last_ffmpeg/make_mfcc/data mfcc\n",
      "Succeeded creating CMVN stats for data\n",
      "fix_data_dir.sh: kept all 35574 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/data/.backup\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data_last_ffmpeg/DATA2 exp_last_ffmpeg/make_mfcc/DATA2 mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/DATA2\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "It seems not all of the feature files were successfully processed (56956 != 57341);\n",
      "consider using utils/fix_data_dir.sh data_last_ffmpeg/DATA2\n",
      "Succeeded creating MFCC features for DATA2\n",
      "steps/compute_cmvn_stats.sh data_last_ffmpeg/DATA2 exp_last_ffmpeg/make_mfcc/DATA2 mfcc\n",
      "steps/compute_cmvn_stats.sh: warning: it seems not all of the speakers got cmvn stats (1555 != 1558);\n",
      "Succeeded creating CMVN stats for DATA2\n",
      "utils/fix_data_dir.sh: filtered /tmp/kaldi.A3Y6/speakers from 1558 to 1555 lines based on filter data_last_ffmpeg/DATA2/cmvn.scp.\n",
      "utils/fix_data_dir.sh: filtered data_last_ffmpeg/DATA2/spk2utt from 1558 to 1555 lines based on filter /tmp/kaldi.A3Y6/speakers.\n",
      "utils/fix_data_dir.sh: filtered data_last_ffmpeg/DATA2/spk2gender from 1558 to 1555 lines based on filter /tmp/kaldi.A3Y6/speakers.\n",
      "fix_data_dir.sh: kept 56956 utterances out of 56957\n",
      "utils/fix_data_dir.sh: filtered data_last_ffmpeg/DATA2/wav.scp from 126 to 124 lines based on filter /tmp/kaldi.A3Y6/recordings.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/DATA2/.backup\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data_last_ffmpeg/DATA exp_last_ffmpeg/make_mfcc/DATA mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/DATA\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for DATA\n",
      "steps/compute_cmvn_stats.sh data_last_ffmpeg/DATA exp_last_ffmpeg/make_mfcc/DATA mfcc\n",
      "Succeeded creating CMVN stats for DATA\n",
      "fix_data_dir.sh: kept all 10486 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/DATA/.backup\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data_last_ffmpeg/train exp_last_ffmpeg/make_mfcc/train mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data_last_ffmpeg/train\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "run.pl: 1 / 32 failed, log is in exp_last_ffmpeg/make_mfcc/train/make_mfcc_train.*.log\n",
      "steps/compute_cmvn_stats.sh data_last_ffmpeg/train exp_last_ffmpeg/make_mfcc/train mfcc\n",
      "make_cmvn.sh: no such file data_last_ffmpeg/train/feats.scp\n",
      "fix_data_dir.sh: kept all 87296 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/train/.backup\n",
      "utils/combine_data.sh data_last_ffmpeg/ESTER1 data_last_ffmpeg/data data_last_ffmpeg/DATA2\n",
      "utils/combine_data.sh [info]: not combining utt2uniq as it does not exist\n",
      "utils/combine_data.sh: combined segments\n",
      "utils/combine_data.sh: combined utt2spk\n",
      "utils/combine_data.sh [info]: not combining utt2lang as it does not exist\n",
      "utils/combine_data.sh: combined utt2dur\n",
      "utils/combine_data.sh: combined feats.scp\n",
      "utils/combine_data.sh: combined text\n",
      "utils/combine_data.sh: combined cmvn.scp\n",
      "utils/combine_data.sh [info]: not combining reco2file_and_channel as it does not exist\n",
      "utils/combine_data.sh: combined wav.scp\n",
      "utils/combine_data.sh: combined spk2gender\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "fix_data_dir.sh: kept all 92530 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/ESTER1/.backup\n",
      "utils/combine_data.sh data_last_ffmpeg/ESTER1_2 data_last_ffmpeg/ESTER1 data_last_ffmpeg/train\n",
      "utils/combine_data.sh [info]: not combining utt2uniq as it does not exist\n",
      "utils/combine_data.sh: combined segments\n",
      "utils/combine_data.sh: combined utt2spk\n",
      "utils/combine_data.sh [info]: not combining utt2lang as it does not exist\n",
      "utils/combine_data.sh: combined utt2dur\n",
      "utils/combine_data.sh [info]: **not combining feats.scp as it does not exist everywhere**\n",
      "utils/combine_data.sh: combined text\n",
      "utils/combine_data.sh [info]: **not combining cmvn.scp as it does not exist everywhere**\n",
      "utils/combine_data.sh [info]: not combining reco2file_and_channel as it does not exist\n",
      "utils/combine_data.sh: combined wav.scp\n",
      "utils/combine_data.sh: combined spk2gender\n",
      "fix_data_dir.sh: kept all 179826 utterances.\n",
      "fix_data_dir.sh: old files are kept in data_last_ffmpeg/ESTER1_2/.backup\n"
     ]
    }
   ],
   "source": [
    "# Feature Extraction MFCC:\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "#exp_kaldi=exp_last\n",
    "mfccdir=mfcc\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "# Extract MFCC features and CMVN stats for every prepared data directory.\n",
    "# Each step is checked: compute_cmvn_stats.sh needs the feats.scp written by\n",
    "# make_mfcc.sh, so a failed extraction must not fall through silently\n",
    "# (otherwise combine_data.sh later drops feats.scp/cmvn.scp from the merged set).\n",
    "for part in data DATA2 DATA train; do\n",
    "    #MFCC features\n",
    "    if ! steps/make_mfcc.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir; then\n",
    "        echo \"ERROR: make_mfcc.sh failed for $part, check logs in $exp_kaldi/make_mfcc/$part\" >&2\n",
    "        continue\n",
    "    fi\n",
    "    #MFCC features + Pitch\n",
    "    #steps/make_mfcc_pitch.sh --cmd \"$train_cmd\" --nj 12 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    if ! steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir; then\n",
    "        echo \"ERROR: compute_cmvn_stats.sh failed for $part, check logs in $exp_kaldi/make_mfcc/$part\" >&2\n",
    "        continue\n",
    "    fi\n",
    "    # Drop utterances/speakers that did not get features or CMVN stats.\n",
    "    utils/fix_data_dir.sh $idata_kaldi/$part\n",
    "done\n",
    "# Combine ESTER Phase 1 & Phase 2 into ESTER1, then add the ESTER 2 train set to get ESTER1_2\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "utils/combine_data.sh $idata_kaldi/ESTER1 $idata_kaldi/data $idata_kaldi/DATA2\n",
    "utils/combine_data.sh $idata_kaldi/ESTER1_2 $idata_kaldi/ESTER1 $idata_kaldi/train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build Language model using ESTER 1 & ESTER 2"
   ]
  },
  {
   "cell_type": "code",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Splitting into 32 parts, to allow for parallel processing ...\n",
      "Checking the splits ...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Performing text normalization (32 jobs) - check data_last_ffmpeg/local_ESTER12/lm/norm/tmp/txt_norm.JOB.log ...\n",
      "Finished OK\n",
      "Selecting the vocabulary (400000 words) ...\n",
      "Making the corpus and the vocabulary ...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Word counts saved to 'data_last_ffmpeg/local_ESTER12/lm/word_counts.txt'\n",
      "Vocabulary saved as 'data_last_ffmpeg/local_ESTER12/lm/meeting-vocab.txt'\n",
      "All unique sentences (in sorted order) stored in 'data_last_ffmpeg/local_ESTER12/lm/meeting-lm-norm.txt.gz'\n",
      "Counting the total number word tokens in the corpus ...\n",
      "There are 2066518 tokens in the corpus\n",
      "Training a 3-gram LM ...\n",
      "This implementation assumes that you have a lot of free RAM(> 12GB) on your machine\n",
      "If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "6,4M\tdata_last_ffmpeg/local_ESTER12/lm/lm_tglarge.arpa.gz\n",
      "Creating a 'small' pruned 3-gram LM (threshold: 0.0000003) ...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "data_last_ffmpeg/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "4,1M\tdata_last_ffmpeg/local_ESTER12/lm/lm_tgsmall.arpa.gz\n",
      "Creating a 'medium' pruned 3-gram LM (threshold: 0.0000001) ...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "data_last_ffmpeg/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "5,4M\tdata_last_ffmpeg/local_ESTER12/lm/lm_tgmed.arpa.gz\n",
      "Training a 4-gram LM ...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "7,9M\tdata_last_ffmpeg/local_ESTER12/lm/lm_fglarge.arpa.gz\n"
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "corpus_path=/fast/LINAGORA/Corpus/database\n",
    "LM_train_text=$corpus_path/Textall\n",
    "local/lm/train_lm.sh $LM_train_text \\\n",
    "$idata_kaldi/local_ESTER12/lm/norm/tmp $idata_kaldi/local_ESTER12/lm/norm/norm_texts $idata_kaldi/local_ESTER12/lm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/fast/LINAGORA/tools/LM/lm_tgsphinx.arpa.gz: line 35: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "data-ESTER-V4/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "BOW numerator for context \"\" is -3.2194e-05 < 0\n",
      "BOW numerator for context \"c' puisque\" is -0.000595476 < 0\n",
      "BOW numerator for context \"c' que\" is -0.000972837 < 0\n",
      "BOW numerator for context \"c' matin\" is -0.000964833 < 0\n",
      "BOW numerator for context \"c' monde\" is -0.000930619 < 0\n",
      "reading 74451 1-grams\n",
      "exp-ESTER-V4/eval_LM_ESTER12/mixed.gz: line 35: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "reading 18615477 2-grams\n",
      "reading 23653218 3-grams\n"
     ]
    }
   ],
   "source": [
    "# Evaluate LM\n",
    "testfile=$idata_kaldi/DATA/text_without_noise_tag\n",
    "# LM's\n",
    "LM_LIUM_LARGE=/fast/LINAGORA/tools/LM/lm_tgsphinx.arpa.gz\n",
    "LM_LIUM_SMALL=/fast/LINAGORA/tools/LM/lm_french-small.arpa.gz\n",
    "LM_ESTER_tglarge=$idata_kaldi/local_ESTER12/lm/lm_tglarge.arpa.gz\n",
    "LM_ESTER_fglarge=$idata_kaldi/local_ESTER12/lm/lm_fglarge.arpa.gz\n",
    "# dir eval language model\n",
    "dir_eval_lm=$exp_kaldi/eval_LM_ESTER12\n",
    "mkdir -p $dir_eval_lm\n",
    "\n",
    "# Get Text from kaldi text format\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "cut -f2- -d' ' < $idata_kaldi/DATA/text | cut -d ' ' -f2- |\\\n",
    "sed -e 's/[ ]\\+/ /g' | sed -e 's/<[^ ][^ ]*>\\|!sil//g' > $testfile\n",
    "# Lium 3-gram all\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "ngram -lm $LM_LIUM_LARGE -ppl $testfile -debug 2 > $dir_eval_lm/LM_tgLium_full.ppl\n",
    "# Lium 3-gram pruned\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "ngram -lm $LM_LIUM_SMALL -ppl $testfile -debug 2 > $dir_eval_lm/LM_tgLium_small.ppl\n",
    "# ESTER 3-gram\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "ngram -lm $LM_ESTER_tglarge -ppl $testfile -debug 2 > $dir_eval_lm/LM_tgESTER.ppl\n",
    "# ESTER 4-gram\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "ngram -lm $LM_ESTER_fglarge -ppl $testfile -debug 2 > $dir_eval_lm/LM_fgESTER.ppl\n",
    "#===== Mix language models\n",
    "# compute best lambda\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "compute-best-mix $dir_eval_lm/LM_tgLium_full.ppl $dir_eval_lm/LM_tgESTER.ppl > $dir_eval_lm/best_mix\n",
    "# Take the interpolation weight of the first model (LIUM) from the\n",
    "# 'best lambda (...)' summary line instead of hard-coding it, so the weight\n",
    "# follows the data; fall back to the previously used value if parsing fails.\n",
    "best_lambda=$(sed -n 's/.*best lambda (\\([^ )]*\\).*/\\1/p' $dir_eval_lm/best_mix | tail -n 1)\n",
    "best_lambda=${best_lambda:-0.597881}\n",
    "echo \"Using interpolation weight lambda=$best_lambda\"\n",
    "# mix the language models\n",
    "ngram -lm $LM_LIUM_LARGE -mix-lm $LM_ESTER_tglarge -lambda $best_lambda -write-lm $dir_eval_lm/mixed.gz\n",
    "# compute perplexity\n",
    "ngram -lm $dir_eval_lm/mixed.gz -ppl $testfile -debug 2 > $dir_eval_lm/LM_mixed.ppl"
   ]
  },
  {
   "cell_type": "code",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 105023 \n",
      "  52878 aa\n",
      "      1 aaq\n",
      "  49016 ai\n",
      "  20328 an\n",
      "   6620 au\n",
      "  13761 bb\n",
      "   5354 ch\n",
      "  20387 dd\n",
      "  32294 ee\n",
      "  22574 ei\n",
      "   1816 eu\n",
      "  11273 ff\n",
      "   8086 gg\n",
      "   1043 gn\n",
      "  47329 ii\n",
      "   7160 in\n",
      "   8405 jj\n",
      "  29273 kk\n",
      "  31392 ll\n",
      "  21284 mm\n",
      "  21994 nn\n",
      "   2169 oe\n",
      "  10989 on\n",
      "  25795 oo\n",
      "   7341 ou\n",
      "  20469 pp\n",
      "  64183 rr\n",
      "  38533 ss\n",
      "  47040 tt\n",
      "    182 un\n",
      "  13874 uu\n",
      "   1225 uy\n",
      "  11965 vv\n",
      "   3746 ww\n",
      "  16421 yy\n",
      "  28250 zz\n",
      "Downloading and preparing CMUdict\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Autogenerating pronunciations for the words in data_last_ffmpeg/local_ESTER12/dict/g2p/vocab_autogen.* ...\n",
      "12125\n",
      "12125\n",
      "12125 pronunciations autogenerated OK\n",
      "Combining the CMUdict pronunciations with the autogenerated ones ...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Combined lexicon saved to 'data_last_ffmpeg/local_ESTER12/dict/lexicon_raw_nosil.txt'\n",
      "Preparing phone lists and clustering questions\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "4 silence phones saved to: data_last_ffmpeg/local_ESTER12/dict/silence_phones.txt\n",
      "1 optional silence saved to: data_last_ffmpeg/local_ESTER12/dict/optional_silence.txt\n",
      "36 non-silence phones saved to: data_last_ffmpeg/local_ESTER12/dict/nonsilence_phones.txt\n",
      "2 extra triphone clustering-related questions saved to: data_last_ffmpeg/local_ESTER12/dict/extra_questions.txt\n",
      "Lexicon text file saved as: data_last_ffmpeg/local_ESTER12/dict/lexicon.txt\n"
     ]
    }
   ],
   "source": [
    "#### Prepare dict: add words that don't exist in the dictionary + config files...\n",
    "# print the occurrence count of each phoneme used in the French dictionary\n",
    "dir_repos=/fast/LINAGORA/STT/Thesis_aheba\n",
    "#dir_repos=/home/lingora/Documents/Linagora/kaldi/egs/Linagora/Thesis_aheba\n",
    "cat $dir_repos/cmu_dict/fr.dict | awk '{$1=\"\";print $0}' | tr ' ' '\\n' | sort -b | uniq -c\n",
    "mkdir -p $idata_kaldi/local_ESTER12/dict/cmudict\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "cp $dir_repos/cmu_dict/lexicon_new.dict $idata_kaldi/local_ESTER12/dict/fr.dict\n",
    "mkdir -p $idata_kaldi/local_ESTER12/lm/g2p\n",
    "cp $dir_repos/g2p/model-5 $idata_kaldi/local_ESTER12/lm/g2p\n",
    "\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
    "local/prepare_dict.sh --stage 0 --nj 16 --cmd \"$train_cmd\" \\\n",
    "   $idata_kaldi/local_ESTER12/lm $idata_kaldi/local_ESTER12/lm/g2p $idata_kaldi/local_ESTER12/dict"
   ]
  },
  {
   "cell_type": "code",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Checking data_last_ffmpeg/local_ESTER12/dict/silence_phones.txt ...\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/silence_phones.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/silence_phones.txt is OK\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Checking data_last_ffmpeg/local_ESTER12/dict/optional_silence.txt ...\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/optional_silence.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/optional_silence.txt is OK\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Checking data_last_ffmpeg/local_ESTER12/dict/nonsilence_phones.txt ...\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/nonsilence_phones.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/nonsilence_phones.txt is OK\n",
      "\n",
      "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n",
      "--> disjoint property is OK.\n",
      "\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Checking data_last_ffmpeg/local_ESTER12/dict/lexicon.txt\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/lexicon.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/lexicon.txt is OK\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Checking data_last_ffmpeg/local_ESTER12/dict/extra_questions.txt ...\n",
      "--> reading data_last_ffmpeg/local_ESTER12/dict/extra_questions.txt\n",
      "--> data_last_ffmpeg/local_ESTER12/dict/extra_questions.txt is OK\n",
      "--> SUCCESS [validating dictionary directory data_last_ffmpeg/local_ESTER12/dict]\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "**Creating data_last_ffmpeg/local_ESTER12/dict/lexiconp.txt from data_last_ffmpeg/local_ESTER12/dict/lexicon.txt\n",
      "fstaddselfloops data_last_ffmpeg/lang_ESTER12/phones/wdisambig_phones.int data_last_ffmpeg/lang_ESTER12/phones/wdisambig_words.int \n",
      "prepare_lang.sh: validating output directory\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "utils/validate_lang.pl data_last_ffmpeg/lang_ESTER12\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones.txt ...\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "--> data_last_ffmpeg/lang_ESTER12/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Checking data_last_ffmpeg/lang_ESTER12/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/context_indep.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/context_indep.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/context_indep.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/context_indep.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/context_indep.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/nonsilence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/nonsilence.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/nonsilence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/nonsilence.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/nonsilence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/silence.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/silence.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/optional_silence.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/optional_silence.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/disambig.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/disambig.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.csl corresponds to data_last_ffmpeg/lang_ESTER12/phones/disambig.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/roots.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/roots.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/roots.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/sets.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/sets.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/sets.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/extra_questions.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/extra_questions.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/extra_questions.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data_last_ffmpeg/lang_ESTER12/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.int corresponds to data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "--> reading data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/disambig.txt is OK\n",
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "--> data_last_ffmpeg/lang_ESTER12/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "--> generating a 86 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "--> generating a 35 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "Checking data_last_ffmpeg/lang_ESTER12/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data_last_ffmpeg/lang_ESTER12/oov.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/oov.int corresponds to data_last_ffmpeg/lang_ESTER12/oov.txt\n",
      "--> data_last_ffmpeg/lang_ESTER12/oov.{txt, int} are OK\n",
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
      "--> data_last_ffmpeg/lang_ESTER12/L.fst is olabel sorted\n",
      "--> data_last_ffmpeg/lang_ESTER12/L_disambig.fst is olabel sorted\n",
      "--> SUCCESS [validating lang directory data_last_ffmpeg/lang_ESTER12]\n"
495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 
995 996 997 998 999 1000
     ]
    }
   ],
   "source": [
    "#### Prepare Lang ==> L.fst, the vocabulary's finite-state automaton\n",
    "utils/prepare_lang.sh $idata_kaldi/local_ESTER12/dict \\\n",
    "   \"<unk>\" $idata_kaldi/local_ESTER12/lang_tmp $idata_kaldi/lang_ESTER12"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ln: impossible de créer le lien symbolique 'data-ESTER-V4/local_ESTER12/lm/lm_tgsphinx.arpa.gz': Le fichier existe\n",
      "ln: impossible de créer le lien symbolique 'data-ESTER-V4/local_ESTER12/lm/lm_french-small.arpa.gz': Le fichier existe\n",
      "arpa2fst --disambig-symbol=#0 --read-symbol-table=data-ESTER-V4/lang_ESTER12_test_fglarge/words.txt - data-ESTER-V4/lang_ESTER12_test_fglarge/G.fst \n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:90) Reading \\data\\ section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\1-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\2-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\3-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\4-grams: section.\n",
      "utils/validate_lang.pl data-ESTER-V4/lang_ESTER12_test_fglarge\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.csl corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/disambig.txt is OK\n",
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
      "--> generating a 55 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
      "--> generating a 56 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_fglarge/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_fglarge/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/oov.int corresponds to data-ESTER-V4/lang_ESTER12_test_fglarge/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/oov.{txt, int} are OK\n",
      "\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/L.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/L_disambig.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/G.fst is ilabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_fglarge/G.fst has 721774 states\n",
      "--> utils/lang/check_g_properties.pl successfully validated data-ESTER-V4/lang_ESTER12_test_fglarge/G.fst\n",
      "--> utils/lang/check_g_properties.pl succeeded.\n",
      "--> SUCCESS [validating lang directory data-ESTER-V4/lang_ESTER12_test_fglarge]\n",
      "arpa2fst --disambig-symbol=#0 --read-symbol-table=data-ESTER-V4/lang_ESTER12_test_tglarge/words.txt - data-ESTER-V4/lang_ESTER12_test_tglarge/G.fst \n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:90) Reading \\data\\ section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\1-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\2-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\3-grams: section.\n",
      "utils/validate_lang.pl data-ESTER-V4/lang_ESTER12_test_tglarge\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/disambig.txt is OK\n",
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
      "--> generating a 66 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
      "--> generating a 92 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tglarge/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_tglarge/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/oov.int corresponds to data-ESTER-V4/lang_ESTER12_test_tglarge/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/oov.{txt, int} are OK\n",
      "\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/L.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/L_disambig.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/G.fst is ilabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tglarge/G.fst has 527277 states\n",
      "--> utils/lang/check_g_properties.pl successfully validated data-ESTER-V4/lang_ESTER12_test_tglarge/G.fst\n",
      "--> utils/lang/check_g_properties.pl succeeded.\n",
      "--> SUCCESS [validating lang directory data-ESTER-V4/lang_ESTER12_test_tglarge]\n",
      "arpa2fst --disambig-symbol=#0 --read-symbol-table=data-ESTER-V4/lang_ESTER12_test_tgsphinx/words.txt - data-ESTER-V4/lang_ESTER12_test_tgsphinx/G.fst \n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:90) Reading \\data\\ section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\1-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 8 [-5.2142\t-ce\t-0.2163] skipped: word '-ce' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 9 [-4.8723\t-ci\t-0.2944] skipped: word '-ci' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 10 [-4.2048\t-elle\t-0.3700] skipped: word '-elle' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 11 [-4.4310\t-elles\t-0.3539] skipped: word '-elles' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 12 [-4.9062\t-en\t-0.2800] skipped: word '-en' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 13 [-4.0463\t-il\t-0.4450] skipped: word '-il' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 14 [-4.1699\t-ils\t-0.4206] skipped: word '-ils' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 15 [-5.1270\t-je\t-0.2720] skipped: word '-je' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 16 [-4.9573\t-la\t-0.2149] skipped: word '-la' not in symbol table\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 17 [-3.6009\t-là\t-0.5456] skipped: word '-là' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 18 [-4.7289\t-le\t-0.2947] skipped: word '-le' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 19 [-5.0130\t-les\t-0.1999] skipped: word '-les' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 20 [-5.5490\t-lui\t-0.1649] skipped: word '-lui' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 21 [-4.7873\t-même\t-0.2425] skipped: word '-même' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 22 [-5.4825\t-mêmes\t-0.1766] skipped: word '-mêmes' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 23 [-5.0634\t-moi\t-0.3092] skipped: word '-moi' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 24 [-4.7140\t-nous\t-0.3254] skipped: word '-nous' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 25 [-4.3209\t-on\t-0.3717] skipped: word '-on' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 26 [-3.8887\t-t\t-1.4364] skipped: word '-t' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 27 [-4.3967\t-t-elle\t-0.2546] skipped: word '-t-elle' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 28 [-4.2829\t-t-il\t-0.2571] skipped: word '-t-il' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 29 [-5.3860\t-toi\t-0.1905] skipped: word '-toi' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 30 [-5.4007\t-tu\t-0.1668] skipped: word '-tu' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 31 [-4.6364\t-vous\t-0.3545] skipped: word '-vous' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 32 [-5.6027\t-y\t-0.1394] skipped: word '-y' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 38 [-5.8621\tà-côtés\t-0.1238] skipped: word 'à-côtés' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 39 [-5.7419\tà-coups\t-0.1460] skipped: word 'à-coups' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 40 [-5.7226\tà-peu-près\t-0.1351] skipped: word 'à-peu-près' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 41 [-5.8535\tà-propos\t-0.1315] skipped: word 'à-propos' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 42 [-5.3186\ta1\t-0.2572] skipped: word 'a1' not in symbol table\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\2-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\3-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:231) Of 9454889 parse warnings, 30 were reported. Run program with --max_warnings=-1 to see all warnings\n",
      "utils/validate_lang.pl data-ESTER-V4/lang_ESTER12_test_tgsphinx\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.csl corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/disambig.txt is OK\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
      "--> generating a 11 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
      "--> generating a 46 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.int corresponds to data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/oov.{txt, int} are OK\n",
      "\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/L.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/L_disambig.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/G.fst is ilabel sorted\n",
      "--> data-ESTER-V4/lang_ESTER12_test_tgsphinx/G.fst has 13156697 states\n",
      "--> utils/lang/check_g_properties.pl successfully validated data-ESTER-V4/lang_ESTER12_test_tgsphinx/G.fst\n",
      "--> utils/lang/check_g_properties.pl succeeded.\n",
      "--> SUCCESS [validating lang directory data-ESTER-V4/lang_ESTER12_test_tgsphinx]\n",
      "arpa2fst --disambig-symbol=#0 --read-symbol-table=data-ESTER-V4/lang_ESTER12_test_french-small/words.txt - data-ESTER-V4/lang_ESTER12_test_french-small/G.fst \n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:90) Reading \\data\\ section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\1-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 12 [-5.538307\taalto] skipped: word 'aalto' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 13 [-5.120474\taaron\t-0.02769727] skipped: word 'aaron' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 15 [-5.731214\tabachidze] skipped: word 'abachidze' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 16 [-5.487072\tabadie\t-0.02312817] skipped: word 'abadie' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 17 [-5.749587\tabagnale\t-0.03421882] skipped: word 'abagnale' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 18 [-5.439013\tabaissant\t-0.1023452] skipped: word 'abaissant' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 19 [-5.298798\tabaisse\t-0.1515498] skipped: word 'abaisse' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 23 [-5.383688\tabaissée\t-0.1352918] skipped: word 'abaissée' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 24 [-5.447463\tabaissés\t-0.1639998] skipped: word 'abaissés' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 26 [-5.323689\tabandonna\t-0.07547411] skipped: word 'abandonna' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 32 [-5.20608\tabandonnera\t-0.08380523] skipped: word 'abandonnera' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 42 [-5.217123\tabasourdi\t-0.1560311] skipped: word 'abasourdi' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 46 [-5.573611\tabat-jour\t-0.04034681] skipped: word 'abat-jour' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 47 [-5.525448\tabats\t-0.06064676] skipped: word 'abats' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 51 [-5.45354\tabattements\t-0.1281941] skipped: word 'abattements' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 61 [-5.026776\tabb\t-0.07580301] skipped: word 'abb' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 62 [-5.444225\tabbado\t-0.02391137] skipped: word 'abbado' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 64 [-5.533504\tabbatiale\t-0.05947962] skipped: word 'abbatiale' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 66 [-5.472561\tabbayes\t-0.06715237] skipped: word 'abbayes' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 67 [-5.64184\tabbesses\t-0.01320701] skipped: word 'abbesses' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 71 [-5.262563\tabbott\t-0.03037171] skipped: word 'abbott' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 89 [-5.243713\tabdelkrim\t-0.003165964] skipped: word 'abdelkrim' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 97 [-5.60272\tabderazak\t-0.07333852] skipped: word 'abderazak' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 102 [-5.590033\tabderrezak\t-0.08256684] skipped: word 'abderrezak' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 106 [-5.726902\tabdic] skipped: word 'abdic' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 107 [-5.558577\tabdication\t-0.05181389] skipped: word 'abdication' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 109 [-5.369082\tabdiquer\t-0.1094639] skipped: word 'abdiquer' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 110 [-5.468573\tabdiqué\t-0.07636042] skipped: word 'abdiqué' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 119 [-5.446184\tabe\t-0.003954563] skipped: word 'abe' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 120 [-5.70366\tabeau\t-0.02909388] skipped: word 'abeau' not in symbol table\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\2-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\3-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:231) Of 292476 parse warnings, 30 were reported. Run program with --max_warnings=-1 to see all warnings\n",
      "utils/validate_lang.pl data-ESTER-V4/lang_ESTER12_test_french-small\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones.txt ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.{txt, int, csl} ...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--> 20 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.{txt, int, csl} ...\n",
      "--> 18 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.csl corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.int corresponds to data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_ESTER12_test_french-small/phones/optional_silence.txt is OK\n",
      "\n",