Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
". path.sh\n",
". cmd.sh\n",
"idata_kaldi=data-ESTER2-V1\n",
"exp_kaldi=exp-ESTER2-V1"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"prepare train\n",
"utils/data/get_utt2dur.sh: working out data-ESTER2-V1/train/utt2dur from data-ESTER2-V1/train/segments\n",
"utils/data/get_utt2dur.sh: computed data-ESTER2-V1/train/utt2dur\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/train\n",
"Successfully prepared data in data-ESTER2-V1/train..\n",
"prepare test\n",
"utils/data/get_utt2dur.sh: working out data-ESTER2-V1/test/utt2dur from data-ESTER2-V1/test/segments\n",
"utils/data/get_utt2dur.sh: computed data-ESTER2-V1/test/utt2dur\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/test\n",
"Successfully prepared data in data-ESTER2-V1/test..\n",
"prepare dev\n",
"utils/data/get_utt2dur.sh: working out data-ESTER2-V1/dev/utt2dur from data-ESTER2-V1/dev/segments\n",
"utils/data/get_utt2dur.sh: computed data-ESTER2-V1/dev/utt2dur\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/dev\n",
"Successfully prepared data in data-ESTER2-V1/dev..\n"
]
}
],
"source": [
"data=/fast/LINAGORA/Corpus/Corpus/ESTER2/corpus\n",
"#idata_kaldi=data-ESTER-V4\n",
"for part in train test dev; do\n",
" # use underscore-separated names in data directories.\n",
" echo \"prepare $part\"\n",
" #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part\n",
" # problem with event (URL:)\n",
" local/data_prepESTER.sh $data/$part $idata_kaldi/$part\n",
"done"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/train exp-ESTER2-V1/make_mfcc/train mfcc-ESTER2-V1\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/train\n",
"steps/make_mfcc.sh [info]: segments file exists: using that.\n",
"Succeeded creating MFCC features for train\n",
"steps/compute_cmvn_stats.sh data-ESTER2-V1/train exp-ESTER2-V1/make_mfcc/train mfcc-ESTER2-V1\n",
"Succeeded creating CMVN stats for train\n",
"steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/test exp-ESTER2-V1/make_mfcc/test mfcc-ESTER2-V1\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/test\n",
"steps/make_mfcc.sh [info]: segments file exists: using that.\n",
"Succeeded creating MFCC features for test\n",
"steps/compute_cmvn_stats.sh data-ESTER2-V1/test exp-ESTER2-V1/make_mfcc/test mfcc-ESTER2-V1\n",
"Succeeded creating CMVN stats for test\n",
"steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/dev exp-ESTER2-V1/make_mfcc/dev mfcc-ESTER2-V1\n",
"utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/dev\n",
"steps/make_mfcc.sh [info]: segments file exists: using that.\n",
"Succeeded creating MFCC features for dev\n",
"steps/compute_cmvn_stats.sh data-ESTER2-V1/dev exp-ESTER2-V1/make_mfcc/dev mfcc-ESTER2-V1\n",
"Succeeded creating CMVN stats for dev\n"
]
}
],
"source": [
"exp_kaldi=exp-ESTER2-V1\n",
"mfccdir=mfcc-ESTER2-V1\n",
"for part in train test dev; do\n",
" # MFCC features (active)\n",
" steps/make_mfcc.sh --cmd \"$train_cmd\" --nj 32 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
" # Alternative (disabled): MFCC features + pitch\n",
" #steps/make_mfcc_pitch.sh --cmd \"$train_cmd\" --nj 12 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
" steps/compute_cmvn_stats.sh $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
" #utils/fix_data_dir.sh $idata_kaldi/$part\n",
"done"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Splitting into 2 parts, to allow for parallel processing ...\n",
"Checking the splits ...\n",
"Performing text normalization (2 jobs) - check data-ESTER2-V1/local_ESTER12/lm/norm/tmp/txt_norm.JOB.log ...\n",
"Finished OK\n",
"Selecting the vocabulary (400000 words) ...\n",
"Making the corpus and the vocabulary ...\n",
"Word counts saved to 'data-ESTER2-V1/local_ESTER12/lm/word_counts.txt'\n",
"Vocabulary saved as 'data-ESTER2-V1/local_ESTER12/lm/meeting-vocab.txt'\n",
"All unique sentences (in sorted order) stored in 'data-ESTER2-V1/local_ESTER12/lm/meeting-lm-norm.txt.gz'\n",
"Counting the total number word tokens in the corpus ...\n",
"There are 1075067 tokens in the corpus\n",
"Training a 3-gram LM ...\n",
"This implementation assumes that you have a lot of free RAM(> 12GB) on your machine\n",
"If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html\n",
"3,8M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz\n",
"Creating a 'small' pruned 3-gram LM (threshold: 0.0000003) ...\n",
"data-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
"3,0M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tgsmall.arpa.gz\n",
"Creating a 'medium' pruned 3-gram LM (threshold: 0.0000001) ...\n",
"data-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
"3,5M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tgmed.arpa.gz\n",
"Training a 4-gram LM ...\n",
"4,5M\tdata-ESTER2-V1/local_ESTER12/lm/lm_fglarge.arpa.gz\n"
]
}
],
"source": [
"LM_train_text=/fast/LINAGORA/Corpus/Corpus/ESTER2/corpus/train\n",
"local/lm/train_lm.sh $LM_train_text \\\n",
"$idata_kaldi/local_ESTER12/lm/norm/tmp $idata_kaldi/local_ESTER12/lm/norm/norm_texts $idata_kaldi/local_ESTER12/lm"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Bash",
"language": "bash",
"name": "bash"
},
"language_info": {
"codemirror_mode": "shell",
"file_extension": ".sh",
"mimetype": "text/x-sh",
"name": "bash"
}
},
"nbformat": 4,
"nbformat_minor": 2
}