Commit 2957260d authored by Abdelwahab HEBA's avatar Abdelwahab HEBA
Browse files

add Jupyter scripts and update format_lms

parent c7831348
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
". path.sh\n",
". cmd.sh\n",
"idata_kaldi=data-ESTER2-V1\n",
"exp_kaldi=exp-ESTER2-V1"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"prepare train\n",
"utils/data/get_utt2dur.sh: working out data-ESTER2-V1/train/utt2dur from data-ESTER2-V1/train/segments\n",
"utils/data/get_utt2dur.sh: computed data-ESTER2-V1/train/utt2dur\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/train\n",
"Successfully prepared data in data-ESTER2-V1/train..\n",
"prepare test\n",
"utils/data/get_utt2dur.sh: working out data-ESTER2-V1/test/utt2dur from data-ESTER2-V1/test/segments\n",
"utils/data/get_utt2dur.sh: computed data-ESTER2-V1/test/utt2dur\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/test\n",
"Successfully prepared data in data-ESTER2-V1/test..\n",
"prepare dev\n",
"utils/data/get_utt2dur.sh: working out data-ESTER2-V1/dev/utt2dur from data-ESTER2-V1/dev/segments\n",
"utils/data/get_utt2dur.sh: computed data-ESTER2-V1/dev/utt2dur\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/dev\n",
"Successfully prepared data in data-ESTER2-V1/dev..\n"
]
}
],
"source": [
"data=/fast/LINAGORA/Corpus/Corpus/ESTER2/corpus\n",
"#idata_kaldi=data-ESTER-V4\n",
"for part in train test dev; do\n",
" # use underscore-separated names in data directories.\n",
" echo \"prepare $part\"\n",
" #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part\n",
" # probleme event (URL:)\n",
" local/data_prepESTER.sh $data/$part $idata_kaldi/$part\n",
"done"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/train exp-ESTER2-V1/make_mfcc/train mfcc-ESTER2-V1\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/train\n",
"steps/make_mfcc.sh [info]: segments file exists: using that.\n",
"Succeeded creating MFCC features for train\n",
"steps/compute_cmvn_stats.sh data-ESTER2-V1/train exp-ESTER2-V1/make_mfcc/train mfcc-ESTER2-V1\n",
"Succeeded creating CMVN stats for train\n",
"steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/test exp-ESTER2-V1/make_mfcc/test mfcc-ESTER2-V1\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/test\n",
"steps/make_mfcc.sh [info]: segments file exists: using that.\n",
"Succeeded creating MFCC features for test\n",
"steps/compute_cmvn_stats.sh data-ESTER2-V1/test exp-ESTER2-V1/make_mfcc/test mfcc-ESTER2-V1\n",
"Succeeded creating CMVN stats for test\n",
"steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/dev exp-ESTER2-V1/make_mfcc/dev mfcc-ESTER2-V1\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/dev\n",
"steps/make_mfcc.sh [info]: segments file exists: using that.\n",
"Succeeded creating MFCC features for dev\n",
"steps/compute_cmvn_stats.sh data-ESTER2-V1/dev exp-ESTER2-V1/make_mfcc/dev mfcc-ESTER2-V1\n",
"Succeeded creating CMVN stats for dev\n"
]
}
],
"source": [
"exp_kaldi=exp-ESTER2-V1\n",
"mfccdir=mfcc-ESTER2-V1\n",
"for part in train test dev; do\n",
" #MFCC features\n",
" steps/make_mfcc.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
" #MFCC features + Pitch\n",
" #steps/make_mfcc_pitch.sh --cmd \"$train_cmd\" --nj 12 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
" steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
" #utils/fix_data_dir.sh $idata_kaldi/$part\n",
"done"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Splitting into 2 parts, to allow for parallel processing ...\n",
"Checking the splits ...\n",
"Performing text normalization (2 jobs) - check data-ESTER2-V1/local_ESTER12/lm/norm/tmp/txt_norm.JOB.log ...\n",
"Finished OK\n",
"Selecting the vocabulary (400000 words) ...\n",
"Making the corpus and the vocabulary ...\n",
"Word counts saved to 'data-ESTER2-V1/local_ESTER12/lm/word_counts.txt'\n",
"Vocabulary saved as 'data-ESTER2-V1/local_ESTER12/lm/meeting-vocab.txt'\n",
"All unique sentences (in sorted order) stored in 'data-ESTER2-V1/local_ESTER12/lm/meeting-lm-norm.txt.gz'\n",
"Counting the total number word tokens in the corpus ...\n",
"There are 1075067 tokens in the corpus\n",
"Training a 3-gram LM ...\n",
"This implementation assumes that you have a lot of free RAM(> 12GB) on your machine\n",
"If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html\n",
"3,8M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz\n",
"Creating a 'small' pruned 3-gram LM (threshold: 0.0000003) ...\n",
"data-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
"3,0M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tgsmall.arpa.gz\n",
"Creating a 'medium' pruned 3-gram LM (threshold: 0.0000001) ...\n",
"data-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
"3,5M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tgmed.arpa.gz\n",
"Training a 4-gram LM ...\n",
"4,5M\tdata-ESTER2-V1/local_ESTER12/lm/lm_fglarge.arpa.gz\n"
]
}
],
"source": [
"LM_train_text=/fast/LINAGORA/Corpus/Corpus/ESTER2/corpus/train\n",
"local/lm/train_lm.sh $LM_train_text \\\n",
"$idata_kaldi/local_ESTER12/lm/norm/tmp $idata_kaldi/local_ESTER12/lm/norm/norm_texts $idata_kaldi/local_ESTER12/lm"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Bash",
"language": "bash",
"name": "bash"
},
"language_info": {
"codemirror_mode": "shell",
"file_extension": ".sh",
"mimetype": "text/x-sh",
"name": "bash"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
beam=13.0 # beam for decoding. Was 13.0 in the scripts.
first_beam=9.0 # beam for 1st-pass decoding in SAT.
......@@ -43,7 +43,9 @@ trap "rm -r $tmpdir" EXIT
mkdir -p $tmpdir
#for lm_suffix in tgsmall tgmed tglarge fglarge; do
for lm_suffix in tglarge french-small; do
for lm_suffix in fglarge tglarge tgsphinx french-small mixed; do
#for lm_suffix in fglarge; do
#for lm_suffix in mixed; do
#for lm_suffix in linto1; do
# tglarge is prepared by a separate command, called from run.sh; we don't
# want to compile G.fst for tglarge, as it takes a while.
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment