#!/bin/bash

# Copyright 2017 Abdel HEBA @Linagora
# Remember to run utils/fix_data_dir.sh data/test to fix utterance errors
# Running on Koios J=12
#
. ./cmd.sh
. ./path.sh
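# cmd.sh defines $train_cmd / $decode_cmd (run.pl or queue.pl) and path.sh points
# PATH at the Kaldi steps/ and utils/ scripts used below.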


# format the data as Kaldi data directories
# Data preparation: TCOF - ESTER
# TCOF:
idata_kaldi=data-microsoft-mfcc
exp_kaldi=exp-microsoft-mfcc
data=/home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus
LM_train_text=/home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/train
# ESTER:
data=/home/lingora/Documents/Linagora/Data/ESTER/Corpus/ESTER/DGA/Phase1
idata_kaldi=data-ESTER-V2-noise
exp_kaldi=exp-ESTER-V2-noise
for part in data; do
  # use underscore-separated names in data directories.
  echo "prepare $part"
  #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part
  # known problem with event (URL:) entries
  local/data_prepESTER.sh $data/$part $idata_kaldi/$part
done
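# Each prepared directory is a standard Kaldi data dir (wav.scp, text, utt2spk,
# spk2utt and, when sub-utterance timing is available, a segments file).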

# Evaluate SNR for each segment
#evaluate_snr=eval-snr
#mkdir eval-snr
#for part in meeting_best_microsoft meeting_test; do
#    echo "Evaluate $part"
#    local/evaluation/evaluate_snr.sh $idata_kaldi/$part $evaluate_snr
#done

# Evaluate SNR for each segment of ESTER
evaluate_snr=eval-snr-ESTER/Eval2005
mkdir -p $evaluate_snr
for part in data; do
    echo "Evaluate $part"
    local/evaluation/evaluate_snr.sh $idata_kaldi/$part $evaluate_snr
done
###### OK up to here

# Learning Language model
# ## Optional text corpus normalization and LM training
# ## These scripts are here primarily as a documentation of the process that has been
# ## used to build the LM. Most users of this recipe will NOT need/want to run
# ## this step. The pre-built language models and the pronunciation lexicon, as
# ## well as some intermediate data(e.g. the normalized text used for LM training),
# ## are available for download at http://www.openslr.org/11/
# OK: train_lm
# TCOF
#LM_train_text=/home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/train
# ESTER
LM_train_text=/home/lingora/Documents/Linagora/Data/ESTER/Corpus/ESTER/DGA/Phase1/data
local/lm/train_lm.sh $LM_train_text \
$idata_kaldi/local/lm/norm/tmp $idata_kaldi/local/lm/norm/norm_texts $idata_kaldi/local/lm
# check characters:
# awk '{for(i=1;i<=NF;i++)if(!a[$i]++)print $i"\n"}' ORS= FS= $idata_kaldi/local/lm/meeting-vocab.txt | sort -b
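# (FS= makes every character a separate field, so the one-liner prints each distinct
#  character exactly once -- useful for spotting encoding/normalisation issues)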

# Learning grapheme-to-phoneme (G2P) conversion
## Optional G2P training scripts.
## As the LM training scripts above, this script is intended primarily to
## document our G2P model creation process
# OK: g2p done
#local/g2p/train_g2p.sh cmu_dict data/local/lm


# # when the "--stage 3" option is used below we skip the G2P steps and use the existing lexicon
# # if the lexicon has already been downloaded from Elyes's work then stage=3, else stage=0
# print the number of phonemes used in French
cat cmu_dict/fr.dict | awk '{$1="";print $0}' | tr ' ' '\n' | sort -b | uniq -c
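# (the awk drops the word column of the lexicon, tr puts one phone per line, and
#  uniq -c prints a count for every phone symbol)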
mkdir -p $idata_kaldi/local/dict/cmudict
cp cmu_dict/fr.dict $idata_kaldi/local/dict/fr.dict
mkdir -p $idata_kaldi/local/lm/g2p
cp g2p/model-5 $idata_kaldi/local/lm/g2p
###### Prepare dict: add words which don't exist in the dictionary + config files...

local/prepare_dict.sh --stage 0 --nj 4 --cmd "$train_cmd" \
   $idata_kaldi/local/lm $idata_kaldi/local/lm/g2p $idata_kaldi/local/dict
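# prepare_dict.sh assembles $idata_kaldi/local/dict (lexicon.txt plus the
# silence/nonsilence phone lists that utils/prepare_lang.sh expects); words from the
# LM vocabulary that are missing from fr.dict are presumably generated with the
# g2p/model-5 model copied above.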

###### OK
utils/prepare_lang.sh $idata_kaldi/local/dict \
   "<unk>" $idata_kaldi/local/lang_tmp $idata_kaldi/lang

export LC_ALL=fr_FR.UTF-8

###### OK
local/format_lms.sh --src-dir $idata_kaldi/lang $idata_kaldi/local/lm
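# format_lms.sh is expected to build the G.fst test directories (e.g.
# $idata_kaldi/lang_test_tglarge, $idata_kaldi/lang_test_french-small) from the ARPA
# LMs in $idata_kaldi/local/lm; the mkgraph.sh calls below rely on those directories.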

# # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs
 #utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \
 #  data/lang data/lang_test_tglarge
 #utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \
 #  data/lang data/lang_test_fglarge
#OK MFCC
mfccdir=mfcc
plpdir=plp
fbankdir=fbank
#for part in meeting_best_microsoft meeting_test; do
for part in data; do
    #MFCC features
    steps/make_mfcc.sh --cmd "$train_cmd" --nj 4 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir
    steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir
    #PLP features
    #steps/make_plp.sh --cmd "$train_cmd" --nj 4 $idata_kaldi/$part $exp_kaldi/make_plp/$part $plpdir
    #steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_plp/$part $plpdir
    #Fbank
    #steps/make_fbank.sh --cmd "$train_cmd" --nj 4 $idata_kaldi/$part $exp_kaldi/make_fbank/$part $fbankdir
    #steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_fbank/$part $fbankdir
done
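# fix_data_dir.sh drops utterances whose features, transcripts or speaker entries are
# missing or unsorted, so the training steps below do not fail on inconsistent data.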
utils/fix_data_dir.sh $idata_kaldi/data
utils/fix_data_dir.sh $idata_kaldi/meeting_best_microsoft
utils/fix_data_dir.sh $idata_kaldi/meeting_test
# # Make some small data subsets for the early system-build stages.
# # For the monophone stages we select the shortest utterances, which should make it
# # easier to align the data from a flat start.
utils/subset_data_dir.sh --shortest $idata_kaldi/data 1000 $idata_kaldi/data_1kshort
utils/subset_data_dir.sh --shortest $idata_kaldi/data 5000 $idata_kaldi/data_5kshort
utils/subset_data_dir.sh --shortest $idata_kaldi/data 10000 $idata_kaldi/data_10kshort
utils/subset_data_dir.sh --shortest $idata_kaldi/data 15000 $idata_kaldi/data_15kshort
utils/subset_data_dir.sh $idata_kaldi/data 20000 $idata_kaldi/data_20k
utils/subset_data_dir.sh $idata_kaldi/data 25000 $idata_kaldi/data_25k
# # train a monophone system
exp_mono=$exp_kaldi/mono10K
 steps/train_mono.sh --boost-silence 1.25 --nj 4 --cmd "$train_cmd" \
   $idata_kaldi/data_10kshort $idata_kaldi/lang $exp_mono
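# (--boost-silence 1.25 slightly boosts the likelihood of the silence phones during
#  alignment, which makes the flat-start monophone training more robust)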

# OK up to the monophone stage
# =================================================
# =================================================
# Evaluate PER for each meeting in the training set
# All evaluation output is written to exp/Evaluation
dir_evaluation=$exp_kaldi/Evaluation_selected
mkdir -p $dir_evaluation
for test in meeting_train meeting_test; do
    # Align the $test set
    steps/align_si.sh --boost-silence 1.25 --nj 4 --cmd "$train_cmd" \
    $idata_kaldi/$test $idata_kaldi/lang $exp_mono $dir_evaluation/mono_ali_$test
    find $data/$test -mindepth 1 -maxdepth 1 > $dir_evaluation/meeting_in_$test.txt
    # Evaluate PER for each meeting
    for meeting_dir in $(cat $dir_evaluation/meeting_in_$test.txt); do
        meeting=$(basename $meeting_dir)
        echo $meeting
        cat $idata_kaldi/$test/text | grep $meeting > $dir_evaluation/text_$meeting.tmp
        local/evaluation/evaluate_PER.sh $PWD/$dir_evaluation/text_$meeting.tmp $idata_kaldi/local/dict/lexicon.txt \
        $idata_kaldi/local/lm $idata_kaldi/lang/phones.txt $dir_evaluation/mono_ali_$test $dir_evaluation/evaluate_PER
        echo $meeting
        cat $dir_evaluation/evaluate_PER/PER.res
        rm $dir_evaluation/text_$meeting.tmp
    done
done
# =================================================
# ==================================================
# # decode using the monophone model

   #utils/mkgraph.sh --mono data-final/lang_test_french-small \
   #  exp-final/mono exp-final/mono/graph_french-small
   #utils/mkgraph.sh --mono data-valid/lang_g2p_test_mix \
   #  exp-valid/mono_g2p exp-valid/mono_g2p/graph_mix
   utils/mkgraph.sh --mono $idata_kaldi/lang_test_tglarge \
     $exp_mono $exp_mono/graph_tglarge
   utils/mkgraph.sh --mono $idata_kaldi/lang_test_french-small \
     $exp_mono $exp_mono/graph_french-small
   #utils/mkgraph.sh --mono data-valid/lang_100h_test_mix \
   #  exp-valid/mono_16h exp-valid/mono_16h/graph_mix
   #utils/mkgraph.sh --mono data-valid/lang_100h_test_tglarge \
   #  exp-valid/mono_16h exp-valid/mono_16h/graph_tglarge
    min_lmwt=7
    max_lmwt=17
    cmd=run.pl
    word_ins_penalty=0.0,0.5,1.0
    mkdir -p $dir_evaluation/evaluate_WER
    touch $dir_evaluation/evaluate_WER/WER_per_meeting.csv
    echo "Filename,%WER,%nbWER,ins,del,sub" > $dir_evaluation/evaluate_WER/WER_per_meeting.csv
   for test in meeting_test; do
        # Decode WER
       steps/decode.sh --nj 4 --cmd "$decode_cmd" $exp_mono/graph_french-small \
       $idata_kaldi/$test $exp_mono/decode_french-small_$test
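       # decode.sh also runs scoring by default; the best WER can then be read with
       # e.g. grep WER $exp_mono/decode_french-small_$test/wer_* | utils/best_wer.sh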
        # Evaluate WER for each meeting in $test
   #     symtab=$exp_mono/graph_tglarge/words.txt
   #     find $data/$test -mindepth 1 -maxdepth 1 -type d > $dir_evaluation/meeting_in_$test.txt
   #     for meeting_dir in $(cat $dir_evaluation/meeting_in_$test.txt); do
   #         meeting=$(basename $meeting_dir)
   #         mkdir -p $dir_evaluation/evaluate_WER/scoring_$test
   #         cat $idata_kaldi/$test/text | grep $meeting | sed 's:!sil::g' | sed 's:<noise>::g' |\
   #          sed 's:<spoken_noise>::g' | sed 's:<laugh>::g'> $dir_evaluation/evaluate_WER/scoring_$test/text_meeting.tmp
   #         for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
   #             $cmd LMWT=$min_lmwt:$max_lmwt $exp_mono/decode_tglarge_$test/log/score.LMWT.$wip.log \
   #             cat $exp_mono/decode_tglarge_$test/scoring/LMWT.$wip.tra \| \
   #             utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
   #             compute-wer --text --mode=present \
   #             ark:$dir_evaluation/evaluate_WER/scoring_$test/text_meeting.tmp  ark,p:- ">&" $dir_evaluation/evaluate_WER/scoring_$test/wer_LMWT_${wip}_$meeting;
   #         done
   #         cat $dir_evaluation/evaluate_WER/scoring_$test/wer*$meeting | utils/best_wer.sh | \
   #         awk -v name_meeting=$meeting 'BEGIN{OFS=","}{$1=name_meeting;print $1,$2,$4$5$6$7,$9,$11}' >> $dir_evaluation/evaluate_WER/WER_per_meeting.csv
   #         rm $dir_evaluation/evaluate_WER/scoring_$test/wer*
   #     done
   done

# Merge all results
cat exp-eval/Evaluation/evaluate_PER/PER.csv | sort -k1 | awk 'BEGIN{FS=",";OFS=","}{$1="";print $0}' \
 > exp-eval/Evaluation/evaluate_PER.csv
cat exp-eval/Evaluation/evaluate_WER/WER_per_meeting.csv | sort -k1 > exp-eval/Evaluation/Evaluation_wer.csv
paste -d , exp-eval/Evaluation/Evaluation_wer.csv exp-eval/Evaluation/evaluate_PER.csv > exp-eval/Evaluation/final_evaluation.csv
# Merge with Perplexity : ppl_only
cat exp-eval/Evaluation/ppl_only/3gfrench-smalldev_test.csv | awk 'BEGIN{FS=",";OFS=","}{$1="";print $0}' > exp-eval/Evaluation/evaluate_3gfrench-small.csv
cat exp-eval/Evaluation/ppl_only/3glmlarge_dev_test.csv | awk 'BEGIN{FS=",";OFS=","}{$1="";print $0}' > exp-eval/Evaluation/evaluate_3glmlarge_dev_test.csv
cat exp-eval/Evaluation/ppl_only/3gmixfrsmall_dev_test.csv | awk 'BEGIN{FS=",";OFS=","}{$1="";print $0}' > exp-eval/Evaluation/evaluate_3gmixfrsmall_dev_test.csv

paste -d , exp-eval/Evaluation/final_evaluation.csv exp-eval/Evaluation/evaluate_3gfrench-small.csv
paste -d , exp-eval/Evaluation/final_evaluation.csv exp-eval/Evaluation/lm_tg100h.csv \
 > exp-eval/Evaluation/Final-eval/final_evaluation_lm_tg_100h.csv


# =========================== TRIPHONE =======================
# Align data
 steps/align_si.sh --boost-silence 1.25 --nj 4 --cmd "$train_cmd" \
   $idata_kaldi/data_15kshort $idata_kaldi/lang $exp_mono $exp_kaldi/mono_ali15k_model10k

# # train a first delta + delta-delta triphone system on the 15k-utterance subset
 steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
     3000 15000 $idata_kaldi/data_15kshort $idata_kaldi/lang $exp_kaldi/mono_ali $exp_kaldi/tri1
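     # (3000 = number of tree leaves, 15000 = total number of Gaussians)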

 steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
     3000 15000 $idata_kaldi/meeting_best_microsoft $idata_kaldi/lang $exp_kaldi/mono_ali $exp_kaldi/tri1_selected
 steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
     2000 10000 $idata_kaldi/data_20kshort $idata_kaldi/lang_all exp-ESTER-all/mono_ali exp-ESTER-all/tri1_selected
# # decode using the tri1 model
# (
   utils/mkgraph.sh $idata_kaldi/lang_test_tglarge \
     $exp_kaldi/tri1_selected $exp_kaldi/tri1_selected/graph_tglarge
   utils/mkgraph.sh $idata_kaldi/lang_test_french-small \
     $exp_kaldi/tri1_selected $exp_kaldi/tri1_selected/graph_french-small
   for test in meeting_test; do
     steps/decode.sh --nj 2 --cmd "$decode_cmd" $exp_kaldi/tri1_selected/graph_tglarge \
       $idata_kaldi/$test $exp_kaldi/tri1_selected/decode_tglarge_$test
     #steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed,tgsmall,tglarge} \
     #  data/$test exp/tri1/decode_{tgsmall,tgmed}_$test
     #steps/lmrescore_const_arpa.sh \
     #  --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
     #  data/$test exp/tri1/decode_{tgsmall,tglarge}_$test
   done
# )&


# ================== LDA+MLLT transformation ============
utils/subset_data_dir.sh --shortest $idata_kaldi/data 27000 $idata_kaldi/data_27k
 steps/align_si.sh --nj 4 --cmd "$train_cmd" \
   $idata_kaldi/data_20k $idata_kaldi/lang $exp_kaldi/tri1 $exp_kaldi/tri1_ali


# # train an LDA+MLLT system.
 steps/train_lda_mllt.sh --cmd "$train_cmd" \
    --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
    $idata_kaldi/data_27k $idata_kaldi/lang $exp_kaldi/tri1_selected $exp_kaldi/tri2b_selected
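 # (--splice-opts gives +/-3 frames of context before the LDA; 2500 leaves, 15000 Gaussians)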

# # decode using the LDA+MLLT model
# (
   utils/mkgraph.sh $idata_kaldi/lang_test_tglarge \
     $exp_kaldi/tri2b_selected $exp_kaldi/tri2b_selected/graph_tglarge
   for test in meeting_test; do
     steps/decode.sh --nj 2 --cmd "$decode_cmd" $exp_kaldi/tri2b_selected/graph_tglarge \
       $idata_kaldi/$test $exp_kaldi/tri2b_selected/decode_tglarge_$test
     #steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
     #  data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test
     #steps/lmrescore_const_arpa.sh \
     #  --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
     #  data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test
   done
# )&

# # Align a 31k-utterance subset using the tri2b model
utils/subset_data_dir.sh --shortest $idata_kaldi/data 31000 $idata_kaldi/data_31k
 steps/align_si.sh  --nj 4 --cmd "$train_cmd" --use-graphs true \
   $idata_kaldi/data_31k $idata_kaldi/lang $exp_kaldi/tri2b_selected $exp_kaldi/tri2b_ali

# # Train tri3b, which is LDA+MLLT+SAT, on the 31k-utterance subset
# steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
#   data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b
 steps/train_sat.sh --cmd "$train_cmd" 4500 45000 \
   $idata_kaldi/data_31k $idata_kaldi/lang $exp_kaldi/tri2b_ali $exp_kaldi/tri3b
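# SAT training estimates per-speaker fMLLR transforms on top of LDA+MLLT features, so
# decoding with these models uses steps/decode_fmllr.sh below.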

  steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
   data-microsoft-mfcc/meeting_best_microsoft data-ESTER/lang_new exp-ESTER/tri3b exp-ESTER/tri3b_bis
# # decode using the tri3b model
# (
   utils/mkgraph.sh $idata_kaldi/lang_test_french-small \
     $exp_kaldi/tri3b $exp_kaldi/tri3b/graph_test_french-small
   for test in meeting_test; do
     steps/decode_fmllr.sh --nj 2 --cmd "$decode_cmd" \
      $exp_kaldi/tri3b/graph_test_french-small $idata_kaldi/$test \
       $exp_kaldi/tri3b/decode_tgsphinx_$test
#     steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
#       data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test
#     steps/lmrescore_const_arpa.sh \
#       --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
#       data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test
   done
# )&

# # align the entire train_clean_100 subset using the tri3b model
# steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
#   data/train_clean_100 data/lang_nosp \
#   exp/tri3b exp/tri3b_ali_clean_100

# # train another LDA+MLLT+SAT system on the entire 100 hour subset
# steps/train_sat.sh  --cmd "$train_cmd" 4200 40000 \
#   data/train_clean_100 data/lang_nosp \
#   exp/tri3b_ali_clean_100 exp/tri4b

# # decode using the tri4b model
# (
#   utils/mkgraph.sh data/lang_nosp_test_tgsmall \
#     exp/tri4b exp/tri4b/graph_nosp_tgsmall
#   for test in test_clean test_other dev_clean dev_other; do
#     steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
#       exp/tri4b/graph_nosp_tgsmall data/$test \
#       exp/tri4b/decode_nosp_tgsmall_$test
#     steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
#       data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test
#     steps/lmrescore_const_arpa.sh \
#       --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
#       data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test
#     steps/lmrescore_const_arpa.sh \
#       --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
#       data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test
#   done
# )&

# # Now we compute the pronunciation and silence probabilities from training data,
# # and re-create the lang directory.
# TODO: understand this step
 steps/get_prons.sh --cmd "$train_cmd" \
   $idata_kaldi/data_31k $idata_kaldi/lang $exp_kaldi/tri3b
 utils/dict_dir_add_pronprobs.sh --max-normalize true \
   $idata_kaldi/local/dict \
   $exp_kaldi/tri3b/pron_counts_nowb.txt $exp_kaldi/tri3b/sil_counts_nowb.txt \
   $exp_kaldi/tri3b/pron_bigram_counts_nowb.txt $idata_kaldi/local/dict_new

 utils/prepare_lang.sh $idata_kaldi/local/dict_new \
   "<unk>" $idata_kaldi/local/lang_tmp_new $idata_kaldi/lang_new
 local/format_lms.sh --src-dir $idata_kaldi/lang_new $idata_kaldi/local/lm

# utils/build_const_arpa_lm.sh \
#   data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
# utils/build_const_arpa_lm.sh \
#   data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge

# # decode using the tri3b model with pronunciation and silence probabilities
# (
   #utils/mkgraph.sh \
   #  $idata_kaldi/lang_new_test_tglarge $exp_kaldi/tri3b $exp_kaldi/tri3b/graph_tglarge
   #utils/mkgraph.sh \
   #  $idata_kaldi/lang_new_test_french-small $exp_kaldi/tri3b $exp_kaldi/tri3b/graph_french-small
   utils/mkgraph.sh \
     $idata_kaldi/lang_new_test_french-small $exp_kaldi/tri3b $exp_kaldi/tri3b/graph_french-small
   for test in meeting_test; do
     steps/decode_fmllr.sh --nj 4 --cmd "$decode_cmd" \
       $exp_kaldi/tri3b/graph_french-small $idata_kaldi/data_10kshort \
       $exp_kaldi/tri3b/decode_lang_new_french-small_data_10kshort
#     steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
#       data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test
#     steps/lmrescore_const_arpa.sh \
#       --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
#       data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test
#     steps/lmrescore_const_arpa.sh \
#       --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
#       data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test
   done
# )&

# # align train_clean_100 using the tri4b model
# steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
#   data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100

# # if you want at this point you can train and test NN model(s) on the 100 hour
# # subset
# local/nnet2/run_5a_clean_100.sh


  num_threads=4
  parallel_opts="--num-threads $num_threads"
  minibatch_size=128

 steps/nnet2/train_pnorm_fast.sh --stage -10 \
   --samples-per-iter 400000 \
   --parallel-opts "$parallel_opts" \
   --num-threads "$num_threads" \
   --minibatch-size "$minibatch_size" \
   --num-jobs-nnet 4  --mix-up 8000 \
   --initial-learning-rate 0.01 --final-learning-rate 0.001 \
   --num-hidden-layers 3 \
   --pnorm-input-dim 2000 --pnorm-output-dim 400 \
   --cmd "$decode_cmd" \
    $idata_kaldi/data $idata_kaldi/lang_new $exp_kaldi/tri3b $exp_kaldi/nn2
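   # (p-norm input dim 2000 over output dim 400 gives a group size of 5; --mix-up 8000
   #  adds extra softmax targets, roughly analogous to mixing up Gaussians in a GMM)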


for test in meeting_test; do
  #steps/nnet2/decode.sh --nj 2 --cmd "$decode_cmd" \
  #  --transform-dir $exp_kaldi/tri3b/decode_lang_new_tglarge_$test \
  #  $exp_kaldi/tri3b/graph_tglarge $idata_kaldi/$test $exp_kaldi/nn2/decode_tglarge_$test
  #  steps/nnet2/decode.sh --nj 2 --cmd "$decode_cmd" \
  #   --transform-dir $exp_kaldi/tri3b/decode_lang_new_french-small_$test \
  #   $exp_kaldi/tri3b/graph_french-small $idata_kaldi/$test $exp_kaldi/nn2/decode_french-small_$test
   steps/nnet2/decode.sh --nj 2 --cmd "$decode_cmd" \
     $exp_kaldi/tri3b/graph_tgmix $idata_kaldi/$test $exp_kaldi/nn2/decode_tgmix_$test
  #steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
  #  data/$test $dir/decode_{tgsmall,tgmed}_$test  || exit 1;
  #steps/lmrescore_const_arpa.sh \
  #  --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
  #  data/$test $dir/decode_{tgsmall,tglarge}_$test || exit 1;
  #steps/lmrescore_const_arpa.sh \
  #  --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
  #  data/$test $dir/decode_{tgsmall,fglarge}_$test || exit 1;
done


# local/download_and_untar.sh $data $data_url train-clean-360

# # now add the "clean-360" subset to the mix ...
# local/data_prepTCOF.sh \
#   $data/LibriSpeech/train-clean-360 data/train_clean_360
# steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \
#   exp/make_mfcc/train_clean_360 $mfccdir
# steps/compute_cmvn_stats.sh \
#   data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir

# # ... and then combine the two sets into a 460 hour one
# utils/combine_data.sh \
#   data/train_clean_460 data/train_clean_100 data/train_clean_360
# Phase1(30H) & Phase 2(50H): 90H
utils/combine_data.sh \
 $idata_kaldi/DATA_1_2 $idata_kaldi/data $idata_kaldi/DATA2
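# combine_data.sh concatenates and re-sorts wav.scp/text/utt2spk (etc.) from the source
# directories into the new DATA_1_2 directory.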

# # align the new, combined set, using the tri4b model
# steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
#   data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460

 steps/align_fmllr.sh --nj 4 --cmd "$train_cmd" \
   $idata_kaldi/DATA_1_2 $idata_kaldi/lang_new $exp_kaldi/tri3b $exp_kaldi/tri3b_ali_90

# # create a larger SAT model, trained on the 460 hours of data.
# steps/train_sat.sh  --cmd "$train_cmd" 5000 100000 \
#   data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b

 steps/train_sat.sh  --cmd "$train_cmd" 5000 100000 \
   $idata_kaldi/DATA_1_2 $idata_kaldi/lang_new $exp_kaldi/tri3b_ali_90 $exp_kaldi/tri5b

steps/train_sat.sh  --cmd "$train_cmd" 7000 150000 \
 $idata_kaldi/DATA_1_2 $idata_kaldi/lang_new $exp_kaldi/tri3b_ali_90 $exp_kaldi/tri5b

# # decode using the tri5b model
# (
#   utils/mkgraph.sh data/lang_test_tgsmall \
#     exp/tri5b exp/tri5b/graph_tgsmall
utils/mkgraph.sh $idata_kaldi/lang_new_test_french-small \
     $exp_kaldi/tri5b $exp_kaldi/tri5b/graph_tgsmall
steps/decode_fmllr.sh --nj 4 --cmd "$decode_cmd" \
       $exp_kaldi/tri5b/graph_tgsmall $idata_kaldi/DATA \
       $exp_kaldi/tri5b/decode_french-small_EVAL2005
#   for test in test_clean test_other dev_clean dev_other; do
#     steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
#       exp/tri5b/graph_tgsmall data/$test \
#       exp/tri5b/decode_tgsmall_$test
#     steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
#       data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test
#     steps/lmrescore_const_arpa.sh \
#       --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
#       data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test
#     steps/lmrescore_const_arpa.sh \
#       --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
#       data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test
#   done
# )&

# # train a NN model on the 460 hour set
# local/nnet2/run_6a_clean_460.sh
#local/nnet2/run_6a_clean_460.sh

  num_threads=4
  parallel_opts="-pe smp $num_threads"
  minibatch_size=128

  steps/nnet2/train_pnorm_fast.sh --stage -10 \
   --samples-per-iter 400000 \
   --num-epochs 7 --num-epochs-extra 3 \
   --parallel-opts "$parallel_opts" \
   --num-threads "$num_threads" \
   --minibatch-size "$minibatch_size" \
   --num-jobs-nnet 4  --mix-up 10000 \
   --initial-learning-rate 0.01 --final-learning-rate 0.001 \
   --num-hidden-layers 4 \
   --pnorm-input-dim 4000 --pnorm-output-dim 400 \
   --cmd "$decode_cmd" \
    $idata_kaldi/DATA_1_2 $idata_kaldi/lang_new $exp_kaldi/tri5b $exp_kaldi/nn90
# local/download_and_untar.sh $data $data_url train-other-500

# # prepare the 500 hour subset.
# local/data_prepTCOF.sh \
#   $data/LibriSpeech/train-other-500 data/train_other_500
# steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \
#   exp/make_mfcc/train_other_500 $mfccdir
# steps/compute_cmvn_stats.sh \
#   data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir

# # combine all the data
# utils/combine_data.sh \
#   data/train_960 data/train_clean_460 data/train_other_500

# steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
#   data/train_960 data/lang exp/tri5b exp/tri5b_ali_960

# # train a SAT model on the 960 hour mixed data.  Use the train_quick.sh script
# # as it is faster.
# steps/train_quick.sh --cmd "$train_cmd" \
#   7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b

# # decode using the tri6b model
# (
#   utils/mkgraph.sh data/lang_test_tgsmall \
#     exp/tri6b exp/tri6b/graph_tgsmall
#   for test in test_clean test_other dev_clean dev_other; do
#     steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
#       exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test
#     steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
#       data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test
#     steps/lmrescore_const_arpa.sh \
#       --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
#       data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test
#     steps/lmrescore_const_arpa.sh \
#       --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
#       data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test
#   done
# )&

# # this does some data-cleaning. The cleaned data should be useful when we add
# # the neural net and chain systems.
# local/run_cleanup_segmentation.sh

# # steps/cleanup/debug_lexicon.sh --remove-stress true  --nj 200 --cmd "$train_cmd" data/train_clean_100 \
# #    data/lang exp/tri6b data/local/dict/lexicon.txt exp/debug_lexicon_100h

# # #Perform rescoring of tri6b by means of faster-rnnlm
# # #Attention: with default settings requires 4 GB of memory per rescoring job, so commenting this out by default
# # wait && local/run_rnnlm.sh \
# #     --rnnlm-ver "faster-rnnlm" \
# #     --rnnlm-options "-hidden 150 -direct 1000 -direct-order 5" \
# #     --rnnlm-tag "h150-me5-1000" $data data/local/lm

# # #Perform rescoring of tri6b by means of faster-rnnlm using noise contrastive estimation
# # #Note, that could be extremely slow without CUDA
# # #We use smaller direct layer size so that it could be stored in GPU memory (~2Gb)
# # #Surprisingly, the bottleneck here is validation rather than learning
# # #Therefore you can use smaller validation dataset to speed up training
# # wait && local/run_rnnlm.sh \
# #     --rnnlm-ver "faster-rnnlm" \
# #     --rnnlm-options "-hidden 150 -direct 400 -direct-order 3 --nce 20" \
# #     --rnnlm-tag "h150-me3-400-nce20" $data data/local/lm


# # train nnet3 tdnn models on the entire data with data-cleaning (xent and chain)
# local/chain/run_tdnn.sh # set "--stage 11" if you have already run local/nnet3/run_tdnn.sh

# # The nnet3 TDNN recipe:
# # local/nnet3/run_tdnn.sh # set "--stage 11" if you have already run local/chain/run_tdnn.sh

# # # train models on cleaned-up data
# # # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh
# # local/run_data_cleaning.sh

# # # The following is the current online-nnet2 recipe, with "multi-splice".
# # local/online/run_nnet2_ms.sh

# # # The following is the discriminative-training continuation of the above.
# # local/online/run_nnet2_ms_disc.sh

# # ## The following is an older version of the online-nnet2 recipe, without "multi-splice".  It's faster
# # ## to train but slightly worse.
# # # local/online/run_nnet2.sh

# # Wait for decodings in the background
# wait