Run_ESTER2.ipynb 6.61 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    ". path.sh\n",
    ". cmd.sh\n",
    "idata_kaldi=data-ESTER2-V1\n",
    "exp_kaldi=exp-ESTER2-V1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "prepare train\n",
      "utils/data/get_utt2dur.sh: working out data-ESTER2-V1/train/utt2dur from data-ESTER2-V1/train/segments\n",
      "utils/data/get_utt2dur.sh: computed data-ESTER2-V1/train/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/train\n",
      "Successfully prepared data in data-ESTER2-V1/train..\n",
      "prepare test\n",
      "utils/data/get_utt2dur.sh: working out data-ESTER2-V1/test/utt2dur from data-ESTER2-V1/test/segments\n",
      "utils/data/get_utt2dur.sh: computed data-ESTER2-V1/test/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/test\n",
      "Successfully prepared data in data-ESTER2-V1/test..\n",
      "prepare dev\n",
      "utils/data/get_utt2dur.sh: working out data-ESTER2-V1/dev/utt2dur from data-ESTER2-V1/dev/segments\n",
      "utils/data/get_utt2dur.sh: computed data-ESTER2-V1/dev/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/dev\n",
      "Successfully prepared data in data-ESTER2-V1/dev..\n"
     ]
    }
   ],
   "source": [
    "data=/fast/LINAGORA/Corpus/Corpus/ESTER2/corpus\n",
    "#idata_kaldi=data-ESTER-V4\n",
    "for part in train test dev; do\n",
    "  # use underscore-separated names in data directories.\n",
    "  echo \"prepare $part\"\n",
    "  #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part\n",
    "  # probleme event (URL:)\n",
    "  local/data_prepESTER.sh $data/$part $idata_kaldi/$part\n",
    "done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/train exp-ESTER2-V1/make_mfcc/train mfcc-ESTER2-V1\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/train\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for train\n",
      "steps/compute_cmvn_stats.sh data-ESTER2-V1/train exp-ESTER2-V1/make_mfcc/train mfcc-ESTER2-V1\n",
      "Succeeded creating CMVN stats for train\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/test exp-ESTER2-V1/make_mfcc/test mfcc-ESTER2-V1\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/test\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for test\n",
      "steps/compute_cmvn_stats.sh data-ESTER2-V1/test exp-ESTER2-V1/make_mfcc/test mfcc-ESTER2-V1\n",
      "Succeeded creating CMVN stats for test\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/dev exp-ESTER2-V1/make_mfcc/dev mfcc-ESTER2-V1\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/dev\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for dev\n",
      "steps/compute_cmvn_stats.sh data-ESTER2-V1/dev exp-ESTER2-V1/make_mfcc/dev mfcc-ESTER2-V1\n",
      "Succeeded creating CMVN stats for dev\n"
     ]
    }
   ],
   "source": [
    "exp_kaldi=exp-ESTER2-V1\n",
    "mfccdir=mfcc-ESTER2-V1\n",
    "for part in train test dev; do\n",
    "    #MFCC features\n",
    "    steps/make_mfcc.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    #MFCC features + Pitch\n",
    "    #steps/make_mfcc_pitch.sh --cmd \"$train_cmd\" --nj 12 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    #utils/fix_data_dir.sh $idata_kaldi/$part\n",
    "done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Splitting into 2 parts, to allow for parallel processing ...\n",
      "Checking the splits ...\n",
      "Performing text normalization (2 jobs) - check data-ESTER2-V1/local_ESTER12/lm/norm/tmp/txt_norm.JOB.log ...\n",
      "Finished OK\n",
      "Selecting the vocabulary (400000 words) ...\n",
      "Making the corpus and the vocabulary ...\n",
      "Word counts saved to 'data-ESTER2-V1/local_ESTER12/lm/word_counts.txt'\n",
      "Vocabulary saved as 'data-ESTER2-V1/local_ESTER12/lm/meeting-vocab.txt'\n",
      "All unique sentences (in sorted order) stored in 'data-ESTER2-V1/local_ESTER12/lm/meeting-lm-norm.txt.gz'\n",
      "Counting the total number word tokens in the corpus ...\n",
      "There are 1075067 tokens in the corpus\n",
      "Training a 3-gram LM ...\n",
      "This implementation assumes that you have a lot of free RAM(> 12GB) on your machine\n",
      "If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html\n",
      "3,8M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz\n",
      "Creating a 'small' pruned 3-gram LM (threshold: 0.0000003) ...\n",
      "data-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "3,0M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tgsmall.arpa.gz\n",
      "Creating a 'medium' pruned 3-gram LM (threshold: 0.0000001) ...\n",
      "data-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "3,5M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tgmed.arpa.gz\n",
      "Training a 4-gram LM ...\n",
      "4,5M\tdata-ESTER2-V1/local_ESTER12/lm/lm_fglarge.arpa.gz\n"
     ]
    }
   ],
   "source": [
    "LM_train_text=/fast/LINAGORA/Corpus/Corpus/ESTER2/corpus/train\n",
    "local/lm/train_lm.sh $LM_train_text \\\n",
    "$idata_kaldi/local_ESTER12/lm/norm/tmp $idata_kaldi/local_ESTER12/lm/norm/norm_texts $idata_kaldi/local_ESTER12/lm"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Bash",
   "language": "bash",
   "name": "bash"
  },
  "language_info": {
   "codemirror_mode": "shell",
   "file_extension": ".sh",
   "mimetype": "text/x-sh",
   "name": "bash"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}