Commit 7fd54997 authored by Abdelwahab HEBA

French ASR system

parent 37514a2a
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances 'queue.pl' to run.pl (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
#export train_cmd="queue.pl --mem 2G"
#export decode_cmd="queue.pl --mem 4G"
#export mkgraph_cmd="queue.pl --mem 8G"
# Local machine
export train_cmd="run.pl --mem 2G"
export decode_cmd="run.pl --mem 4G"
export mkgraph_cmd="run.pl --mem 8G"
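# If you later move this recipe to a cluster, the same variables can point at a
# scheduler wrapper instead of run.pl. A hedged sketch for Slurm is shown below;
# conf/slurm.conf and its contents are site-specific and not part of this recipe.
#export train_cmd="slurm.pl --mem 2G --config conf/slurm.conf"
#export decode_cmd="slurm.pl --mem 4G --config conf/slurm.conf"
#export mkgraph_cmd="slurm.pl --mem 8G --config conf/slurm.conf"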
# empty config, just use the defaults.
--use-energy=false # only non-default option.
# config for high-resolution MFCC features, intended for neural network training
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so
# there might be some information at the low end.
--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
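# For reference, a hi-res config like this is typically passed to the feature
# scripts via --mfcc-config; the data/exp/destination paths below are placeholders
# for illustration only, not part of this recipe.
#steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 --mfcc-config conf/mfcc_hires.conf \
#  data/train_960_hires exp/make_hires/train_960 mfcc_hires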
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
## This config is given by conf/make_pitch_online.sh to the program compute-and-process-kaldi-pitch-feats,
## and is copied by steps/online/nnet2/prepare_online_decoding.sh and similar scripts, to be given
## to programs like online2-wav-nnet2-latgen-faster.
## The program compute-and-process-kaldi-pitch-feats will use it to compute pitch features that
## are the same as those that will be generated in online decoding; this enables us to train
## in a way that's compatible with online decoding.
##
## most of these options relate to the post-processing rather than the pitch
## extraction itself.
--add-raw-log-pitch=true ## this is intended for input to neural nets, so our
## approach is "throw everything in and see what
## sticks".
--normalization-left-context=75
--normalization-right-context=50 # We're removing some of the right-context
# for the normalization. Would normally be 75.
#
# Note: our changes to the (left,right) context
# from the defaults of (75,75) to (75,50) will
# almost certainly worsen results, but will
# reduce latency.
--frames-per-chunk=10 ## relates to offline simulation of online decoding; 1
## would be equivalent to getting in samples one by
## one.
--simulate-first-pass-online=true ## this makes the online-pitch-extraction code
## output the 'first-pass' features, which
## are less accurate than the final ones, and
## which are the only features the neural-net
## decoding would ever see (since we can't
## afford to do lattice rescoring in the
## neural-net code).
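## For a quick offline check, a config like this can be fed directly to the
## binary named above; the wav.scp and output paths are illustrative only.
#compute-and-process-kaldi-pitch-feats --config=conf/online_pitch.conf \
#  scp:data/train/wav.scp ark,t:pitch_feats.txt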
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_k20=true
option allow_k20=true
option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'
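# Worked example (comment only): with the configuration above, a call like
#   queue.pl --mem 4G --num-threads 2 --gpu 1 ...
# expands into the qsub options
#   -l mem_free=4G,ram_free=4G -pe smp 2 -l gpu=1 -q g.q
# appended to the base "command" line, since each "option name=*" rule
# substitutes the requested value for $0.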
#num2Words
pip install num2words
python setup.py install
python setup.py test
from num2words import num2words
num2words(42, lang='fr')
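# Expected result (shown for illustration; handy when normalizing digits in the
# French transcripts and LM text):
# >>> num2words(42, lang='fr')
# 'quarante-deux'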
export KALDI_ROOT=`pwd`/../../..
export PATH=$PWD/tools/festival/nsw/bin:$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
# we use this both in the (optional) LM training and the G2P-related scripts
PYTHON='python2.7'
### Below are the paths used by the optional parts of the recipe
# We only need the Festival stuff below for the optional text normalization (for LM-training) step
FEST_ROOT=tools/festival
NSW_PATH=${FEST_ROOT}/festival/bin:${FEST_ROOT}/nsw/bin
export PATH=$PATH:$NSW_PATH
# SRILM is needed for LM model building
SRILM_ROOT=$KALDI_ROOT/tools/srilm
SRILM_PATH=$SRILM_ROOT/bin:$SRILM_ROOT/bin/i686-m64
export PATH=$PATH:$SRILM_PATH
# Sequitur G2P executable
sequitur=$KALDI_ROOT/tools/sequitur/g2p.py
sequitur_path="$(dirname $sequitur)/lib/$PYTHON/site-packages"
# Directory under which the LM training corpus should be extracted
LM_CORPUS_ROOT=./lm-corpus
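# Optional sanity check (an illustrative sketch, not part of the original path.sh):
# warn early if SRILM is missing, since the LM-building steps depend on it.
#if ! command -v ngram-count >/dev/null 2>&1; then
#  echo >&2 "Warning: SRILM's ngram-count not found in PATH; LM training will fail."
#fi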
#!/bin/bash
# data dir
data=/data/tcof/
. ./cmd.sh
. ./path.sh
# you might not want to do this for interactive shells.
set -e
# format the data as Kaldi data directories
for part in train dev test ; do
# use underscore-separated names in data directories.
local/data_prep.sh $data/LibriSpeech/$part data/$part
done
## Optional text corpus normalization and LM training
## These scripts are here primarily as documentation of the process that has been
## used to build the LM. Most users of this recipe will NOT need/want to run
## this step. The pre-built language models and the pronunciation lexicon, as
## well as some intermediate data (e.g. the normalized text used for LM training),
## are available for download at http://www.openslr.org/11/
#local/lm/train_lm.sh $LM_CORPUS_ROOT \
# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm
## Optional G2P training scripts.
## Like the LM training scripts above, this script is intended primarily to
## document our G2P model creation process.
#local/g2p/train_g2p.sh data/local/dict/cmudict data/local/lm
# when "--stage 3" option is used below we skip the G2P steps, and use the
# lexicon we have already downloaded from openslr.org/11/
local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \
data/local/lm data/local/lm data/local/dict_nosp
utils/prepare_lang.sh data/local/dict_nosp \
"<UNK>" data/local/lang_tmp_nosp data/lang_nosp
local/format_lms.sh --src-dir data/lang_nosp data/local/lm
# Create ConstArpaLm format language model for full 3-gram and 4-gram LMs
utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \
data/lang_nosp data/lang_nosp_test_tglarge
utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \
data/lang_nosp data/lang_nosp_test_fglarge
mfccdir=mfcc
# spread the mfccs over various machines, as this data-set is quite large.
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
mfcc=$(basename $mfccdir) # in case it was an absolute pathname (unlikely), get the basename.
utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
$mfccdir/storage
fi
for part in dev_clean test_clean dev_other test_other train_clean_100; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/$part exp/make_mfcc/$part $mfccdir
steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir
done
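# Quick sanity check (illustrative, not part of the original recipe): with the
# default mfcc.conf above, the extracted features should be 13-dimensional.
#feat-to-dim scp:data/train_clean_100/feats.scp -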
# Make some small data subsets for early system-build stages. Note: there are 29k
# utterances in the train_clean_100 directory, which contains 100 hours of data.
# For the monophone stages we select the shortest utterances, which should make it
# easier to align the data from a flat start.
utils/subset_data_dir.sh --shortest data/train_clean_100 2000 data/train_2kshort
utils/subset_data_dir.sh data/train_clean_100 5000 data/train_5k
utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k
# train a monophone system
steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
data/train_2kshort data/lang_nosp exp/mono
# decode using the monophone model
(
utils/mkgraph.sh --mono data/lang_nosp_test_tgsmall \
exp/mono exp/mono/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \
data/$test exp/mono/decode_nosp_tgsmall_$test
done
)&
steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k
# train a first delta + delta-delta triphone system on a subset of 5000 utterances
steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1
# decode using the tri1 model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri1 exp/tri1/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \
data/$test exp/tri1/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test
done
)&
steps/align_si.sh --nj 10 --cmd "$train_cmd" \
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k
# train an LDA+MLLT system.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" 2500 15000 \
data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b
# decode using the LDA+MLLT model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri2b exp/tri2b/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \
data/$test exp/tri2b/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test
done
)&
# Align a 10k utts subset using the tri2b model
steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k
# Train tri3b, which is LDA+MLLT+SAT on 10k utts
steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b
# decode using the tri3b model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri3b exp/tri3b/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri3b/graph_nosp_tgsmall data/$test \
exp/tri3b/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test
done
)&
# align the entire train_clean_100 subset using the tri3b model
steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
data/train_clean_100 data/lang_nosp \
exp/tri3b exp/tri3b_ali_clean_100
# train another LDA+MLLT+SAT system on the entire 100 hour subset
steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
data/train_clean_100 data/lang_nosp \
exp/tri3b_ali_clean_100 exp/tri4b
# decode using the tri4b model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri4b exp/tri4b/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri4b/graph_nosp_tgsmall data/$test \
exp/tri4b/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test
done
)&
# Now we compute the pronunciation and silence probabilities from training data,
# and re-create the lang directory.
steps/get_prons.sh --cmd "$train_cmd" \
data/train_clean_100 data/lang_nosp exp/tri4b
utils/dict_dir_add_pronprobs.sh --max-normalize true \
data/local/dict_nosp \
exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \
exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict
utils/prepare_lang.sh data/local/dict \
"<UNK>" data/local/lang_tmp data/lang
local/format_lms.sh --src-dir data/lang data/local/lm
utils/build_const_arpa_lm.sh \
data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
utils/build_const_arpa_lm.sh \
data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge
# decode using the tri4b model with pronunciation and silence probabilities
(
utils/mkgraph.sh \
data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri4b/graph_tgsmall data/$test \
exp/tri4b/decode_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test
done
)&
# align train_clean_100 using the tri4b model
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100
# if you want, at this point you can train and test NN model(s) on the 100 hour
# subset
local/nnet2/run_5a_clean_100.sh
local/download_and_untar.sh $data $data_url train-clean-360
# now add the "clean-360" subset to the mix ...
local/data_prep.sh \
$data/LibriSpeech/train-clean-360 data/train_clean_360
steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \
exp/make_mfcc/train_clean_360 $mfccdir
steps/compute_cmvn_stats.sh \
data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir
# ... and then combine the two sets into a 460 hour one
utils/combine_data.sh \
data/train_clean_460 data/train_clean_100 data/train_clean_360
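# Optional consistency check on the combined directory (illustrative, not part
# of the original recipe):
#utils/validate_data_dir.sh data/train_clean_460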
# align the new, combined set, using the tri4b model
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460
# create a larger SAT model, trained on the 460 hours of data.
steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b
# decode using the tri5b model
(
utils/mkgraph.sh data/lang_test_tgsmall \
exp/tri5b exp/tri5b/graph_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri5b/graph_tgsmall data/$test \
exp/tri5b/decode_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test
done
)&
# train a NN model on the 460 hour set
local/nnet2/run_6a_clean_460.sh
local/download_and_untar.sh $data $data_url train-other-500
# prepare the 500 hour subset.
local/data_prep.sh \
$data/LibriSpeech/train-other-500 data/train_other_500
steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \
exp/make_mfcc/train_other_500 $mfccdir
steps/compute_cmvn_stats.sh \
data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir
# combine all the data
utils/combine_data.sh \
data/train_960 data/train_clean_460 data/train_other_500
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_960 data/lang exp/tri5b exp/tri5b_ali_960
# train a SAT model on the 960 hour mixed data. Use the train_quick.sh script
# as it is faster.
steps/train_quick.sh --cmd "$train_cmd" \
7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b
# decode using the tri6b model
(
utils/mkgraph.sh data/lang_test_tgsmall \
exp/tri6b exp/tri6b/graph_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test
done
)&
# this does some data-cleaning. The cleaned data should be useful when we add
# the neural net and chain systems.
local/run_cleanup_segmentation.sh
# steps/cleanup/debug_lexicon.sh --remove-stress true --nj 200 --cmd "$train_cmd" data/train_clean_100 \
# data/lang exp/tri6b data/local/dict/lexicon.txt exp/debug_lexicon_100h
# #Perform rescoring of tri6b by means of faster-rnnlm
# #Attention: with default settings this requires 4 GB of memory per rescoring job, so it is commented out by default
# wait && local/run_rnnlm.sh \
# --rnnlm-ver "faster-rnnlm" \
# --rnnlm-options "-hidden 150 -direct 1000 -direct-order 5" \
# --rnnlm-tag "h150-me5-1000" $data data/local/lm
# #Perform rescoring of tri6b by means of faster-rnnlm using noise contrastive estimation
# #Note that this could be extremely slow without CUDA
# #We use a smaller direct layer size so that it can be stored in GPU memory (~2Gb)
# #Surprisingly, the bottleneck here is validation rather than learning
# #Therefore you can use a smaller validation dataset to speed up training
# wait && local/run_rnnlm.sh \
# --rnnlm-ver "faster-rnnlm" \
# --rnnlm-options "-hidden 150 -direct 400 -direct-order 3 --nce 20" \
# --rnnlm-tag "h150-me3-400-nce20" $data data/local/lm
# train nnet3 tdnn models on the entire data with data-cleaning (xent and chain)
local/chain/run_tdnn.sh # set "--stage 11" if you have already run local/nnet3/run_tdnn.sh
# The nnet3 TDNN recipe:
# local/nnet3/run_tdnn.sh # set "--stage 11" if you have already run local/chain/run_tdnn.sh
# # train models on cleaned-up data
# # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh
# local/run_data_cleaning.sh
# # The following is the current online-nnet2 recipe, with "multi-splice".
# local/online/run_nnet2_ms.sh
# # The following is the discriminative-training continuation of the above.
# local/online/run_nnet2_ms_disc.sh
# ## The following is an older version of the online-nnet2 recipe, without "multi-splice". It's faster
# ## to train but slightly worse.
# # local/online/run_nnet2.sh
# Wait for decodings in the background
wait
../../wsj/s5/steps
../../wsj/s5/utils