{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#!/bin/bash\n",
    "\n",
    "# Copyright 2017 Abdel HEBA @Linagora\n",
     "# Remember to add utils/fix_data_dir.sh data/test to fix the utterance error\n",
    "# Running on Koios J=12\n",
    "#\n",
    ". cmd.sh\n",
    ". path.sh\n",
    "# link utils & steps\n",
    "#ln -s $KALDI_ROOT/egs/wsj/s5/utils\n",
    "#ln -s $KALDI_ROOT/egs/wsj/s5/steps"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "scrolled": false
   },
   "source": [
    "# Link SpeechDatabase & Kaldi directories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#ESTER PATH:\n",
    "#corpus_path=/home/abdelwah/Documents/STT/corpus/ESTER\n",
    "corpus_path=/fast/LINAGORA/Corpus/\n",
    "\n",
    "idata_kaldi=data-ESTER-V4\n",
    "exp_kaldi=exp-ESTER-V4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "prepare data\n",
      "utils/data/get_utt2dur.sh: working out data-ESTER-V4/data/utt2dur from data-ESTER-V4/data/segments\n",
      "utils/data/get_utt2dur.sh: computed data-ESTER-V4/data/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER-V4/data\n",
      "Successfully prepared data in data-ESTER-V4/data..\n",
      "prepare DATA2\n",
      "utils/data/get_utt2dur.sh: working out data-ESTER-V4/DATA2/utt2dur from data-ESTER-V4/DATA2/segments\n",
      "utils/data/get_utt2dur.sh: computed data-ESTER-V4/DATA2/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER-V4/DATA2\n",
      "Successfully prepared data in data-ESTER-V4/DATA2..\n",
      "prepare DATA\n",
      "utils/data/get_utt2dur.sh: working out data-ESTER-V4/DATA/utt2dur from data-ESTER-V4/DATA/segments\n",
      "utils/data/get_utt2dur.sh: computed data-ESTER-V4/DATA/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER-V4/DATA\n",
      "Successfully prepared data in data-ESTER-V4/DATA..\n"
     ]
    }
   ],
   "source": [
     "# Dependencies\n",
    "#pip3 install num2words --user\n",
    "#pip3 install unidecode --user\n",
    "# Prepare data input for kaldi processing\n",
    "data=$corpus_path/Corpus/ESTER/DGA/Phase1\n",
    "for part in data; do\n",
    "  # use underscore-separated names in data directories.\n",
    "  echo \"prepare $part\"\n",
    "  #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part\n",
    "  # probleme event (URL:)\n",
    "  local/data_prepESTER.sh $data/$part $idata_kaldi/$part\n",
    "done\n",
    "\n",
    "# Prepare data input for kaldi processing\n",
    "data_phase2=$corpus_path/Corpus/ESTER/DGA/Phase2\n",
    "for part in DATA2; do\n",
    "  # use underscore-separated names in data directories.\n",
    "  echo \"prepare $part\"\n",
    "  #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part\n",
    "  # probleme event (URL:)\n",
    "  local/data_prepESTER.sh $data_phase2/$part $idata_kaldi/$part\n",
    "done\n",
    "\n",
    "# ESTER DATA EVALUATION\n",
    "data_test=$corpus_path/Corpus/ESTER/DGA/Eval2005\n",
    "for part in DATA; do\n",
    "  # use underscore-separated names in data directories.\n",
    "  echo \"prepare $part\"\n",
    "  #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part\n",
    "  # probleme event (URL:)\n",
    "  local/data_prepESTER.sh $data_test/$part $idata_kaldi/$part\n",
    "done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SNR metric computed for each part of the corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#@ToDO : add snr evaluation for each Part of Corpus\n",
    "# Evaluate SNR for each segment of ESTER\n",
    "#evaluate_snr=eval-snr-ESTER/Eval2005\n",
    "#mkdir -p $evaluate_snr\n",
    "#for part in data; do\n",
    "#    echo \"Evaluate $part\"\n",
    "#    local/evaluation/evaluate_snr.sh $idata_kaldi/$part $evaluate_snr\n",
    "#done\n",
    "data=$corpus_path/Corpus/ESTER/DGA/Phase1\n",
    "data_phase2=$corpus_path/Corpus/ESTER/DGA/Phase2\n",
    "data_test=$corpus_path/Corpus/ESTER/DGA/Eval2005"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build Language model from Text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Splitting into 2 parts, to allow for parallel processing ...\n",
      "Checking the splits ...\n",
      "Performing text normalization (2 jobs) - check data-ESTER-V4/local/lm/norm/tmp/txt_norm.JOB.log ...\n",
      "Finished OK\n",
      "Selecting the vocabulary (400000 words) ...\n",
      "Making the corpus and the vocabulary ...\n",
      "Word counts saved to 'data-ESTER-V4/local/lm/word_counts.txt'\n",
      "Vocabulary saved as 'data-ESTER-V4/local/lm/meeting-vocab.txt'\n",
      "All unique sentences (in sorted order) stored in 'data-ESTER-V4/local/lm/meeting-lm-norm.txt.gz'\n",
      "Counting the total number word tokens in the corpus ...\n",
      "There are 992954 tokens in the corpus\n",
      "Training a 3-gram LM ...\n",
      "This implementation assumes that you have a lot of free RAM(> 12GB) on your machine\n",
      "If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html\n",
      "3,6M\tdata-ESTER-V4/local/lm/lm_tglarge.arpa.gz\n",
      "Creating a 'small' pruned 3-gram LM (threshold: 0.0000003) ...\n",
      "data-ESTER-V4/local/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "2,9M\tdata-ESTER-V4/local/lm/lm_tgsmall.arpa.gz\n",
      "Creating a 'medium' pruned 3-gram LM (threshold: 0.0000001) ...\n",
      "data-ESTER-V4/local/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "3,3M\tdata-ESTER-V4/local/lm/lm_tgmed.arpa.gz\n",
      "Training a 4-gram LM ...\n",
      "4,4M\tdata-ESTER-V4/local/lm/lm_fglarge.arpa.gz\n"
     ]
    }
   ],
   "source": [
    "# Build Language model\n",
     "# Add pronunciation model for all of phase 1 & 2\n",
    "LM_train_text=$corpus_path/Corpus/Textall\n",
    "local/lm/train_lm.sh $LM_train_text \\\n",
    "$idata_kaldi/local/lm/norm/tmp $idata_kaldi/local/lm/norm/norm_texts $idata_kaldi/local/lm\n",
    "# check characters:\n",
    "# awk '{for(i=1;i<=NF;i++)if(!a[$i]++)print $i\"\\n\"}' ORS= FS= $idata_kaldi/local/lm/meeting-vocab.txt | sort -b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/fast/LINAGORA/tools/LM/lm_tgsphinx.arpa.gz: line 35: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "data-ESTER-V4/local/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "BOW numerator for context \"\" is -3.84557e-05 < 0\n",
      "BOW numerator for context \"c' que\" is -0.00159251 < 0\n",
      "reading 65199 1-grams\n",
      "exp-ESTER-V4/eval_LM/mixed.gz: line 35: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "reading 18557872 2-grams\n",
      "reading 23633745 3-grams\n"
     ]
    }
   ],
   "source": [
    "# Evaluate LM\n",
    "testfile=$idata_kaldi/DATA/text_without_noise_tag\n",
    "# LM's\n",
    "LM_LIUM_LARGE=/fast/LINAGORA/tools/LM/lm_tgsphinx.arpa.gz\n",
    "LM_LIUM_SMALL=/fast/LINAGORA/tools/LM/lm_french-small.arpa.gz\n",
    "LM_ESTER_tglarge=$idata_kaldi/local/lm/lm_tglarge.arpa.gz\n",
    "LM_ESTER_fglarge=$idata_kaldi/local/lm/lm_fglarge.arpa.gz\n",
    "# dir eval language model\n",
    "dir_eval_lm=$exp_kaldi/eval_LM\n",
    "mkdir -p $dir_eval_lm\n",
    "\n",
    "# Get Text from kaldi text format\n",
    "#cut -f2- -d' ' < $idata_kaldi/DATA/text | cut -d ' ' -f2- |\\\n",
    "#sed -e 's/[ ]\\+/ /g' | sed -e 's/<[^ ][^ ]*>\\|!sil//g' > $testfile\n",
    "# Lium 3-gram all\n",
    "#ngram -lm $LM_LIUM_LARGE -ppl $testfile -debug 2 > $dir_eval_lm/LM_tgLium_full.ppl\n",
     "# LIUM 3-gram pruned\n",
    "#ngram -lm $LM_LIUM_SMALL -ppl $testfile -debug 2 > $dir_eval_lm/LM_tgLium_small.ppl\n",
    "# ESTER 3-gram\n",
    "#ngram -lm $LM_ESTER_tglarge -ppl $testfile -debug 2 > $dir_eval_lm/LM_tgESTER.ppl\n",
    "# ESTER 4-gram\n",
    "#ngram -lm $LM_ESTER_fglarge -ppl $testfile -debug 2 > $dir_eval_lm/LM_fgESTER.ppl\n",
     "#===== Mix language models\n",
    "# compute best lambda\n",
    "#compute-best-mix $dir_eval_lm/LM_tgLium_full.ppl $dir_eval_lm/LM_tgESTER.ppl > $dir_eval_lm/best_mix\n",
     "# mix the language models\n",
    "ngram -lm $LM_LIUM_LARGE -mix-lm $LM_ESTER_tglarge -lambda 0.714713 -write-lm $dir_eval_lm/mixed.gz\n",
    "# compute perplexity\n",
    "ngram -lm $dir_eval_lm/mixed.gz -ppl $testfile -debug 2 > $dir_eval_lm/LM_mixed.ppl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/fast/LINAGORA/tools/kaldi/tools/tmp2\n",
      "Assertions.cc\t  misc.pyc\t\tsetup.py\n",
      "Assertions.hh\t  mt.py\t\t\tSimpleGoodTuring.py\n",
      "bin\t\t  Multigram.cc\t\tSparseVector.pyx\n",
      "build\t\t  MultigramGraph.hh\tsymbols.py\n",
      "CHANGES\t\t  Multigram.hh\t\tsymbols.pyc\n",
      "EditDistance.cc   Obstack.hh\t\ttest-g2p.sh\n",
      "Estimation.cc\t  PriorityQueue.hh\ttest_LanguageModel.py\n",
      "Evaluation.py\t  Probability.hh\ttest_mGramCounts.py\n",
      "Evaluation.pyc\t  Python.hh\t\ttest_Minimization.py\n",
      "fsa.py\t\t  README\t\ttestProbability.cc\n",
      "g2p.py\t\t  ReferenceCounting.hh\ttest.py\n",
      "Graph.cc\t  SequenceModel.cc\ttest_SequenceModel.py\n",
      "Graph.hh\t  SequenceModel.hh\ttest_sequitur.py\n",
      "groupedCounts.py  SequenceModel.py\ttest_SparseVector.py\n",
      "IterMap.py\t  SequenceModel.pyc\ttool.py\n",
      "LanguageModel.py  sequitur.i\t\ttool.pyc\n",
      "lib\t\t  sequitur_.py\t\tTranslation.cc\n",
      "LICENSE\t\t  sequitur.py\t\tTypes.cc\n",
      "Makefile\t  sequitur_.pyc\t\tTypes.hh\n",
      "makeOvModel.py\t  sequitur.pyc\t\tUtility.cc\n",
      "mGramCounts.py\t  SequiturTool.py\tUtility.hh\n",
      "Minimization.py   SequiturTool.pyc\txmlwriter.py\n",
      "Minimization.pyc  sequitur_wrap.cpp\n",
      "misc.py\t\t  setup.cfg\n"
     ]
    }
   ],
   "source": [
    "# Some above  could be made:\n",
    "# Learning Grapheme to phonem from dictionary\n",
    "#local/g2p/train_g2p.sh cmu_dict data/local/lm\n",
    "#echo $SRILM_ROOT\n",
    "#ls $KALDI_ROOT/tools/sequitur"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 105015 \n",
      "  52871 aa\n",
      "      1 aaq\n",
      "  49015 ai\n",
      "  20328 an\n",
      "   6620 au\n",
      "  13761 bb\n",
      "   5354 ch\n",
      "  20386 dd\n",
      "  32293 ee\n",
      "  22574 ei\n",
      "   1816 eu\n",
      "  11272 ff\n",
      "   1043 gn\n",
      "   7160 in\n",
      "   8405 jj\n",
      "  29273 kk\n",
      "  31389 ll\n",
      "  21281 mm\n",
      "  21994 nn\n",
      "   2169 oe\n",
      "  10989 on\n",
      "  25794 oo\n",
      "   7339 ou\n",
      "  20469 pp\n",
      "  64179 rr\n",
      "  38531 ss\n",
      "  47034 tt\n",
      "    182 un\n",
      "  13873 uu\n",
      "   1225 uy\n",
      "   3746 ww\n",
      "  28250 zz\n",
      "Downloading and preparing CMUdict\n",
      "Autogenerating pronunciations for the words in data-ESTER-V4/local/dict/g2p/vocab_autogen.* ...\n",
      "2883\n",
      "2883\n",
      "2883 pronunciations autogenerated OK\n",
      "Combining the CMUdict pronunciations with the autogenerated ones ...\n",
      "Combined lexicon saved to 'data-ESTER-V4/local/dict/lexicon_raw_nosil.txt'\n",
      "Preparing phone lists and clustering questions\n",
      "4 silence phones saved to: data-ESTER-V4/local/dict/silence_phones.txt\n",
      "1 optional silence saved to: data-ESTER-V4/local/dict/optional_silence.txt\n",
      "36 non-silence phones saved to: data-ESTER-V4/local/dict/nonsilence_phones.txt\n",
      "2 extra triphone clustering-related questions saved to: data-ESTER-V4/local/dict/extra_questions.txt\n",
      "Lexicon text file saved as: data-ESTER-V4/local/dict/lexicon.txt\n"
     ]
    }
   ],
   "source": [
     "#### Prepare dict: add words which don't exist in the dictionary + config files...\n",
    "# print number of phonem used in french\n",
    "dir_repos=/fast/LINAGORA/STT/Thesis_aheba\n",
    "dir_repos=/home/lingora/Documents/Linagora/kaldi/egs/Linagora/Thesis_aheba\n",
    "cat $dir_repos/cmu_dict/fr.dict | awk '{$1=\"\";print $0}' | tr ' ' '\\n' | sort -b | uniq -c\n",
    "mkdir -p $idata_kaldi/local/dict/cmudict\n",
    "cp $dir_repos/cmu_dict/fr.dict $idata_kaldi/local/dict/fr.dict\n",
    "mkdir -p $idata_kaldi/local/lm/g2p\n",
    "cp $dir_repos/g2p/model-5 $idata_kaldi/local/lm/g2p\n",
    "\n",
    "local/prepare_dict.sh --stage 0 --nj 4 --cmd \"$train_cmd\" \\\n",
    "   $idata_kaldi/local/lm $idata_kaldi/local/lm/g2p $idata_kaldi/local/dict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare L.fst"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking data-ESTER-V4/local/dict/silence_phones.txt ...\n",
      "--> reading data-ESTER-V4/local/dict/silence_phones.txt\n",
      "--> data-ESTER-V4/local/dict/silence_phones.txt is OK\n",
      "Checking data-ESTER-V4/local/dict/optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/local/dict/optional_silence.txt\n",
      "--> data-ESTER-V4/local/dict/optional_silence.txt is OK\n",
      "Checking data-ESTER-V4/local/dict/nonsilence_phones.txt ...\n",
      "--> reading data-ESTER-V4/local/dict/nonsilence_phones.txt\n",
      "--> data-ESTER-V4/local/dict/nonsilence_phones.txt is OK\n",
      "\n",
      "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n",
      "--> disjoint property is OK.\n",
      "\n",
      "Checking data-ESTER-V4/local/dict/lexicon.txt\n",
      "--> reading data-ESTER-V4/local/dict/lexicon.txt\n",
      "--> data-ESTER-V4/local/dict/lexicon.txt is OK\n",
      "Checking data-ESTER-V4/local/dict/extra_questions.txt ...\n",
      "--> reading data-ESTER-V4/local/dict/extra_questions.txt\n",
      "--> data-ESTER-V4/local/dict/extra_questions.txt is OK\n",
      "--> SUCCESS [validating dictionary directory data-ESTER-V4/local/dict]\n",
      "**Creating data-ESTER-V4/local/dict/lexiconp.txt from data-ESTER-V4/local/dict/lexicon.txt\n",
      "fstaddselfloops data-ESTER-V4/lang/phones/wdisambig_phones.int data-ESTER-V4/lang/phones/wdisambig_words.int \n",
      "prepare_lang.sh: validating output directory\n",
      "utils/validate_lang.pl data-ESTER-V4/lang\n",
      "Checking data-ESTER-V4/lang/phones.txt ...\n",
      "--> data-ESTER-V4/lang/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang/phones/context_indep.int corresponds to data-ESTER-V4/lang/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang/phones/context_indep.csl corresponds to data-ESTER-V4/lang/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang/phones/nonsilence.int corresponds to data-ESTER-V4/lang/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang/phones/nonsilence.csl corresponds to data-ESTER-V4/lang/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang/phones/silence.txt\n",
      "--> data-ESTER-V4/lang/phones/silence.int corresponds to data-ESTER-V4/lang/phones/silence.txt\n",
      "--> data-ESTER-V4/lang/phones/silence.csl corresponds to data-ESTER-V4/lang/phones/silence.txt\n",
      "--> data-ESTER-V4/lang/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang/phones/optional_silence.int corresponds to data-ESTER-V4/lang/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang/phones/optional_silence.csl corresponds to data-ESTER-V4/lang/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/phones/disambig.{txt, int, csl} ...\n",
      "--> 14 entry/entries in data-ESTER-V4/lang/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang/phones/disambig.int corresponds to data-ESTER-V4/lang/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang/phones/disambig.csl corresponds to data-ESTER-V4/lang/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang/phones/roots.txt\n",
      "--> data-ESTER-V4/lang/phones/roots.int corresponds to data-ESTER-V4/lang/phones/roots.txt\n",
      "--> data-ESTER-V4/lang/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang/phones/sets.txt\n",
      "--> data-ESTER-V4/lang/phones/sets.int corresponds to data-ESTER-V4/lang/phones/sets.txt\n",
      "--> data-ESTER-V4/lang/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang/phones/extra_questions.int corresponds to data-ESTER-V4/lang/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang/phones/word_boundary.int corresponds to data-ESTER-V4/lang/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
      "--> data-ESTER-V4/lang/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data-ESTER-V4/lang/phones/disambig.txt is OK\n",
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> data-ESTER-V4/lang/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data-ESTER-V4/lang/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data-ESTER-V4/lang/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
      "--> data-ESTER-V4/lang/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
      "--> generating a 36 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
      "--> generating a 36 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang/oov.txt\n",
      "--> data-ESTER-V4/lang/oov.int corresponds to data-ESTER-V4/lang/oov.txt\n",
      "--> data-ESTER-V4/lang/oov.{txt, int} are OK\n",
      "--> data-ESTER-V4/lang/L.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang/L_disambig.fst is olabel sorted\n",
      "--> SUCCESS [validating lang directory data-ESTER-V4/lang]\n"
     ]
    }
   ],
   "source": [
    "#### Prepare Lang ==> L.fst Vocabulary's automate finite state\n",
    "utils/prepare_lang.sh $idata_kaldi/local/dict \\\n",
    "   \"<unk>\" $idata_kaldi/local/lang_tmp $idata_kaldi/lang"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare G.fst"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "arpa2fst --disambig-symbol=#0 --read-symbol-table=data-ESTER-V4/lang_test_mixed/words.txt - data-ESTER-V4/lang_test_mixed/G.fst \n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:90) Reading \\data\\ section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\1-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 8 [-5.360068\t-ce\t-0.1533034] skipped: word '-ce' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 9 [-5.018168\t-ci\t-0.2031254] skipped: word '-ci' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 10 [-4.350668\t-elle\t-0.2460782] skipped: word '-elle' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 11 [-4.576869\t-elles\t-0.2385286] skipped: word '-elles' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 12 [-5.052068\t-en\t-0.192779] skipped: word '-en' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 13 [-4.192168\t-il\t-0.2843367] skipped: word '-il' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 14 [-4.315768\t-ils\t-0.2745169] skipped: word '-ils' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 15 [-5.272868\t-je\t-0.1903227] skipped: word '-je' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 16 [-5.103168\t-la\t-0.1458162] skipped: word '-la' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 17 [-4.874768\t-le\t-0.2040312] skipped: word '-le' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 18 [-5.158868\t-les\t-0.1417429] skipped: word '-les' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 19 [-5.694868\t-lui\t-0.1158859] skipped: word '-lui' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 20 [-3.746768\t-là\t-0.3350702] skipped: word '-là' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 21 [-5.209268\t-moi\t-0.2134934] skipped: word '-moi' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 22 [-4.933168\t-même\t-0.1705472] skipped: word '-même' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 23 [-5.628368\t-mêmes\t-0.1271463] skipped: word '-mêmes' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 24 [-4.859869\t-nous\t-0.2231263] skipped: word '-nous' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 25 [-4.466768\t-on\t-0.2460155] skipped: word '-on' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 26 [-4.034568\t-t\t-0.4986984] skipped: word '-t' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 27 [-4.542568\t-t-elle\t-0.1772586] skipped: word '-t-elle' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 28 [-4.428768\t-t-il\t-0.1768332] skipped: word '-t-il' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 29 [-5.531868\t-toi\t-0.1358383] skipped: word '-toi' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 30 [-5.546568\t-tu\t-0.1201105] skipped: word '-tu' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 31 [-4.782269\t-vous\t-0.2394385] skipped: word '-vous' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 32 [-5.748569\t-y\t-0.1000024] skipped: word '-y' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 42 [-5.464468\ta1\t-0.1763418] skipped: word 'a1' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 43 [-5.834968\ta10\t-0.1255163] skipped: word 'a10' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 44 [-5.842168\ta104\t-0.09675514] skipped: word 'a104' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 45 [-6.147268\ta11\t-0.07661788] skipped: word 'a11' not in symbol table\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:192) line 46 [-5.948668\ta13\t-0.1366806] skipped: word 'a13' not in symbol table\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\2-grams: section.\n",
      "LOG (arpa2fst:Read():arpa-file-parser.cc:145) Reading \\3-grams: section.\n",
      "WARNING (arpa2fst:Read():arpa-file-parser.cc:231) Of 12719307 parse warnings, 30 were reported. Run program with --max_warnings=-1 to see all warnings\n",
      "utils/validate_lang.pl data-ESTER-V4/lang_test_mixed\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones.txt ...\n",
      "--> data-ESTER-V4/lang_test_mixed/phones.txt is OK\n",
      "\n",
      "Checking words.txt: #0 ...\n",
      "--> data-ESTER-V4/lang_test_mixed/words.txt is OK\n",
      "\n",
      "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> silence.txt and nonsilence.txt are disjoint\n",
      "--> silence.txt and disambig.txt are disjoint\n",
      "--> disambig.txt and nonsilence.txt are disjoint\n",
      "--> disjoint property is OK\n",
      "\n",
      "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> summation property is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones/context_indep.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_test_mixed/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/context_indep.int corresponds to data-ESTER-V4/lang_test_mixed/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/context_indep.csl corresponds to data-ESTER-V4/lang_test_mixed/phones/context_indep.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/context_indep.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones/nonsilence.{txt, int, csl} ...\n",
      "--> 144 entry/entries in data-ESTER-V4/lang_test_mixed/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/nonsilence.int corresponds to data-ESTER-V4/lang_test_mixed/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/nonsilence.csl corresponds to data-ESTER-V4/lang_test_mixed/phones/nonsilence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/nonsilence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones/silence.{txt, int, csl} ...\n",
      "--> 20 entry/entries in data-ESTER-V4/lang_test_mixed/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/silence.int corresponds to data-ESTER-V4/lang_test_mixed/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/silence.csl corresponds to data-ESTER-V4/lang_test_mixed/phones/silence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones/optional_silence.{txt, int, csl} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_test_mixed/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/optional_silence.int corresponds to data-ESTER-V4/lang_test_mixed/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/optional_silence.csl corresponds to data-ESTER-V4/lang_test_mixed/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/optional_silence.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones/disambig.{txt, int, csl} ...\n",
      "--> 14 entry/entries in data-ESTER-V4/lang_test_mixed/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/disambig.int corresponds to data-ESTER-V4/lang_test_mixed/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/disambig.csl corresponds to data-ESTER-V4/lang_test_mixed/phones/disambig.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/disambig.{txt, int, csl} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones/roots.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_test_mixed/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/roots.int corresponds to data-ESTER-V4/lang_test_mixed/phones/roots.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/roots.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones/sets.{txt, int} ...\n",
      "--> 40 entry/entries in data-ESTER-V4/lang_test_mixed/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/sets.int corresponds to data-ESTER-V4/lang_test_mixed/phones/sets.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/sets.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones/extra_questions.{txt, int} ...\n",
      "--> 11 entry/entries in data-ESTER-V4/lang_test_mixed/phones/extra_questions.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/extra_questions.int corresponds to data-ESTER-V4/lang_test_mixed/phones/extra_questions.txt\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--> data-ESTER-V4/lang_test_mixed/phones/extra_questions.{txt, int} are OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/phones/word_boundary.{txt, int} ...\n",
      "--> 164 entry/entries in data-ESTER-V4/lang_test_mixed/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/word_boundary.int corresponds to data-ESTER-V4/lang_test_mixed/phones/word_boundary.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/word_boundary.{txt, int} are OK\n",
      "\n",
      "Checking optional_silence.txt ...\n",
      "--> reading data-ESTER-V4/lang_test_mixed/phones/optional_silence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/optional_silence.txt is OK\n",
      "\n",
      "Checking disambiguation symbols: #0 and #1\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/disambig.txt has \"#0\" and \"#1\"\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/disambig.txt is OK\n",
      "\n",
      "Checking topo ...\n",
      "\n",
      "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/word_boundary.txt doesn't include disambiguation symbols\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/word_boundary.txt is OK\n",
      "\n",
      "Checking word-level disambiguation symbols...\n",
      "--> data-ESTER-V4/lang_test_mixed/phones/wdisambig.txt exists (newer prepare_lang.sh)\n",
      "Checking word_boundary.int and disambig.int\n",
      "--> generating a 12 word sequence\n",
      "--> resulting phone sequence from L.fst corresponds to the word sequence\n",
      "--> L.fst is OK\n",
      "--> generating a 91 word sequence\n",
      "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n",
      "--> L_disambig.fst is OK\n",
      "\n",
      "Checking data-ESTER-V4/lang_test_mixed/oov.{txt, int} ...\n",
      "--> 1 entry/entries in data-ESTER-V4/lang_test_mixed/oov.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/oov.int corresponds to data-ESTER-V4/lang_test_mixed/oov.txt\n",
      "--> data-ESTER-V4/lang_test_mixed/oov.{txt, int} are OK\n",
      "--> data-ESTER-V4/lang_test_mixed/L.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_test_mixed/L_disambig.fst is olabel sorted\n",
      "--> data-ESTER-V4/lang_test_mixed/G.fst is ilabel sorted\n",
      "--> data-ESTER-V4/lang_test_mixed/G.fst has 11264129 states\n",
      "--> utils/lang/check_g_properties.pl successfully validated data-ESTER-V4/lang_test_mixed/G.fst\n",
      "--> utils/lang/check_g_properties.pl succeeded.\n",
      "--> SUCCESS [validating lang directory data-ESTER-V4/lang_test_mixed]\n",
      "Succeeded in formatting data.\n"
     ]
    }
   ],
   "source": [
    "#### Prepare Contextual automate finite state using LM's ===> Build G.fst\n",
    "#### Copy the largest LMS built from LIUM and generate G.fst :\n",
    "# Large LM\n",
    "ln -s $LM_LIUM_LARGE $idata_kaldi/local/lm/\n",
    "# Pruned LM\n",
    "ln -s $LM_LIUM_SMALL $idata_kaldi/local/lm/\n",
    "# Mixed\n",
    "dir_eval_lm=$exp_kaldi/eval_LM\n",
    "#mv $dir_eval_lm/mixed.gz $dir_eval_lm/lm_mixed.arpa.gz\n",
    "mv $dir_eval_lm/lm_mixed.arpa.gz $idata_kaldi/local/lm/\n",
    "local/format_lms.sh --src-dir $idata_kaldi/lang $idata_kaldi/local/lm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Feature extraction\n",
    "Try to use & evaluate each Feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER-V4/data exp-ESTER-V4/make_mfcc/data mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER-V4/data\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for data\n",
      "steps/compute_cmvn_stats.sh data-ESTER-V4/data exp-ESTER-V4/make_mfcc/data mfcc\n",
      "Succeeded creating CMVN stats for data\n",
      "fix_data_dir.sh: kept all 35574 utterances.\n",
      "fix_data_dir.sh: old files are kept in data-ESTER-V4/data/.backup\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER-V4/DATA exp-ESTER-V4/make_mfcc/DATA mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER-V4/DATA\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for DATA\n",
      "steps/compute_cmvn_stats.sh data-ESTER-V4/DATA exp-ESTER-V4/make_mfcc/DATA mfcc\n",
      "Succeeded creating CMVN stats for DATA\n",
      "fix_data_dir.sh: kept all 10486 utterances.\n",
      "fix_data_dir.sh: old files are kept in data-ESTER-V4/DATA/.backup\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER-V4/DATA2 exp-ESTER-V4/make_mfcc/DATA2 mfcc\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER-V4/DATA2\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "It seems not all of the feature files were successfully processed (57340 != 57341);\n",
      "consider using utils/fix_data_dir.sh data-ESTER-V4/DATA2\n",
      "Succeeded creating MFCC features for DATA2\n",
      "steps/compute_cmvn_stats.sh data-ESTER-V4/DATA2 exp-ESTER-V4/make_mfcc/DATA2 mfcc\n",
      "Succeeded creating CMVN stats for DATA2\n",
      "fix_data_dir.sh: kept 57340 utterances out of 57341\n",
      "fix_data_dir.sh: old files are kept in data-ESTER-V4/DATA2/.backup\n",
      "utils/combine_data.sh data-ESTER-V4/ESTER_All data-ESTER-V4/data data-ESTER-V4/DATA2\n",
      "utils/combine_data.sh [info]: not combining utt2uniq as it does not exist\n",
      "utils/combine_data.sh: combined segments\n",
      "utils/combine_data.sh: combined utt2spk\n",
      "utils/combine_data.sh [info]: not combining utt2lang as it does not exist\n",
      "utils/combine_data.sh: combined utt2dur\n",
      "utils/combine_data.sh: combined feats.scp\n",
      "utils/combine_data.sh: combined text\n",
      "utils/combine_data.sh: combined cmvn.scp\n",
      "utils/combine_data.sh [info]: not combining reco2file_and_channel as it does not exist\n",
      "utils/combine_data.sh: combined wav.scp\n",
      "utils/combine_data.sh: combined spk2gender\n",
      "fix_data_dir.sh: kept all 92914 utterances.\n",
      "fix_data_dir.sh: old files are kept in data-ESTER-V4/ESTER_All/.backup\n"
     ]
    }
   ],
   "source": [
    "# Feature Extraction MFCC:\n",
    "mfccdir=mfcc\n",
    "for part in data DATA DATA2; do\n",
    "    #MFCC features\n",
    "    steps/make_mfcc.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    #MFCC features + Pitch\n",
    "    #steps/make_mfcc_pitch.sh --cmd \"$train_cmd\" --nj 12 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    utils/fix_data_dir.sh $idata_kaldi/$part\n",
    "done\n",
    "# Combine data ESTER Phase 1 & Phase 2\n",
    "utils/combine_data.sh $idata_kaldi/ESTER_All $idata_kaldi/data $idata_kaldi/DATA2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Feature Extraction PLP:\n",
    "plpdir=plp\n",
    "for part in data DATA DATA2; do\n",
    "    #PLP features\n",
    "    steps/make_plp.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_plp/$part $plpdir\n",
    "    #PLP features + Pitch\n",
    "    steps/make_plp_pitch.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_plp/$part $plpdir\n",
    "    steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_plp/$part $plpdir\n",
    "    #Fbank\n",
    "    #steps/make_fbank.sh --cmd \"$train_cmd\" --nj 4 $idata_kaldi/$part $exp_kaldi/make_fbank/$part $fbankdir\n",
    "    #steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_fbank/$part $fbankdir\n",
    "done\n",
    "# Combine data ESTER Phase 1 & Phase 2\n",
    "utils/combine_data.sh $idata_kaldi/ESTER_All $idata_kaldi/data $idata_kaldi/DATA2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Feature extraction Fbanks:\n",
    "fbankdir=fbank\n",
    "for part in data DATA DATA2; do\n",
    "    #Fbank\n",
    "    steps/make_fbank.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_fbank/$part $fbankdir\n",
    "    #Fbank + pitch\n",
    "    steps/make_fbank_pitch.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_fbank/$part $fbankdir\n",
    "    steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_fbank/$part $fbankdir\n",
    "done\n",
    "# Combine data ESTER Phase 1 & Phase 2\n",
    "utils/combine_data.sh $idata_kaldi/ESTER_All $idata_kaldi/data $idata_kaldi/DATA2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# obviously, if you use this, consider that you have some wrong segmentation in your clean data...\n",
    "#utils/fix_data_dir.sh $idata_kaldi/data\n",
    "#utils/fix_data_dir.sh $idata_kaldi/meeting_best_microsoft\n",
    "#utils/fix_data_dir.sh $idata_kaldi/meeting_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split Data for training phases\n",
    "Use --shortest for taking utt in accendent order (sorted by duration !)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "feat-to-len scp:data-ESTER-V4/data/feats.scp ark,t:data-ESTER-V4/data_1kshort/tmp.len \n",
      "sort: échec d'écriture: 'sortie standard': Relais brisé (pipe)\n",
      "sort: erreur d'écriture\n",
      "utils/subset_data_dir.sh: reducing #utt from 35574 to 1000\n",
      "feat-to-len scp:data-ESTER-V4/data/feats.scp ark,t:data-ESTER-V4/data_5kshort/tmp.len \n",
      "sort: échec d'écriture: 'sortie standard': Relais brisé (pipe)\n",
      "sort: erreur d'écriture\n",
      "utils/subset_data_dir.sh: reducing #utt from 35574 to 5000\n",
      "feat-to-len scp:data-ESTER-V4/data/feats.scp ark,t:data-ESTER-V4/data_10kshort/tmp.len \n",
      "sort: échec d'écriture: 'sortie standard': Relais brisé (pipe)\n",
      "sort: erreur d'écriture\n",
      "utils/subset_data_dir.sh: reducing #utt from 35574 to 10000\n",
      "feat-to-len scp:data-ESTER-V4/data/feats.scp ark,t:data-ESTER-V4/data_15kshort/tmp.len \n",
      "sort: échec d'écriture: 'sortie standard': Relais brisé (pipe)\n",
      "sort: erreur d'écriture\n",
      "utils/subset_data_dir.sh: reducing #utt from 35574 to 15000\n"
     ]
    }
   ],
   "source": [
    "# # Make some small data subsets for early system-build stages.  Note, there are 29k\n",
    "# # utterances in the train_clean_100 directory which has 100 hours of data.\n",
    "# # For the monophone stages we select the shortest utterances, which should make it\n",
    "# # easier to align the data from a flat start.\n",
    "utils/subset_data_dir.sh --shortest $idata_kaldi/data 1000 $idata_kaldi/data_1kshort\n",
    "utils/subset_data_dir.sh --shortest $idata_kaldi/data 5000 $idata_kaldi/data_5kshort\n",
    "utils/subset_data_dir.sh --shortest $idata_kaldi/data 10000 $idata_kaldi/data_10kshort\n",
    "utils/subset_data_dir.sh --shortest $idata_kaldi/data 15000 $idata_kaldi/data_15kshort\n",
    "#utils/subset_data_dir.sh --shortest $idata_kaldi/data 15000 $idata_kaldi/data_15kshort\n",
    "#utils/subset_data_dir.sh $idata_kaldi/data 20000 $idata_kaldi/data_20k\n",
    "#utils/subset_data_dir.sh $idata_kaldi/data 25000 $idata_kaldi/data_25k"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train different monophone system with different size\n",
    "Analyse number of gaussian and state used!!!\n",
    "and evaluate this first step"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "steps/train_mono.sh --boost-silence 1.25 --nj 32 --cmd run.pl --mem 64G data-ESTER-V2-noise/data_1kshort data-ESTER-V2-noise/lang exp-ESTER-V2-noise/mono1K\n",
      "filter_scps.pl: warning: some input lines were output to multiple files\n",
      "steps/train_mono.sh: Initializing monophone system.\n",
      "steps/train_mono.sh: Compiling training graphs\n",
      "steps/train_mono.sh: Aligning data equally (pass 0)\n",
      "steps/train_mono.sh: Pass 1\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 2\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 3\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 4\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 5\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 6\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 7\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 8\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 9\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 10\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 11\n",
      "steps/train_mono.sh: Pass 12\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 13\n",
      "steps/train_mono.sh: Pass 14\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 15\n",
      "steps/train_mono.sh: Pass 16\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 17\n",
      "steps/train_mono.sh: Pass 18\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 19\n",
      "steps/train_mono.sh: Pass 20\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 21\n",
      "steps/train_mono.sh: Pass 22\n",
      "steps/train_mono.sh: Pass 23\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 24\n",
      "steps/train_mono.sh: Pass 25\n",
      "steps/train_mono.sh: Pass 26\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 27\n",
      "steps/train_mono.sh: Pass 28\n",
      "steps/train_mono.sh: Pass 29\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 30\n",
      "steps/train_mono.sh: Pass 31\n",
      "steps/train_mono.sh: Pass 32\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 33\n",
      "steps/train_mono.sh: Pass 34\n",
      "steps/train_mono.sh: Pass 35\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 36\n",
      "steps/train_mono.sh: Pass 37\n",
      "steps/train_mono.sh: Pass 38\n",
      "steps/train_mono.sh: Aligning data\n",
      "steps/train_mono.sh: Pass 39\n",
      "steps/diagnostic/analyze_alignments.sh --cmd run.pl --mem 64G data-ESTER-V2-noise/lang exp-ESTER-V2-noise/mono1K\n",
      "analyze_phone_length_stats.py: WARNING: optional-silence SIL is seen only 41.9% of the time at utterance begin.  This may not be optimal.\n",
      "analyze_phone_length_stats.py: WARNING: optional-silence SIL is seen only 57.4% of the time at utterance end.  This may not be optimal.\n",
      "steps/diagnostic/analyze_alignments.sh: see stats in exp-ESTER-V2-noise/mono1K/log/analyze_alignments.log\n",
      "3 warnings in exp-ESTER-V2-noise/mono1K/log/init.log\n",
      "2 warnings in exp-ESTER-V2-noise/mono1K/log/analyze_alignments.log\n",
      "1070 warnings in exp-ESTER-V2-noise/mono1K/log/align.*.*.log\n",
      "122 warnings in exp-ESTER-V2-noise/mono1K/log/update.*.log\n",
      "exp-ESTER-V2-noise/mono1K: nj=32 align prob=-97.99 over 0.24h [retry=1.2%, fail=0.0%] states=128 gauss=1001\n",
      "steps/train_mono.sh: Done training monophone system in exp-ESTER-V2-noise/mono1K\n"
     ]
    }
   ],
   "source": [