Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
LINAGORA
L
LGS
Labs
kaldi-modelgen
Commits
e4b07c98
Commit
e4b07c98
authored
Mar 20, 2017
by
Abdelwahab HEBA
Browse files
add script for evaluate PER,perplexity,snr and WER
parent
cabbb34c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
238 additions
and
0 deletions
+238
-0
local/evaluation/evaluate_PER.sh
local/evaluation/evaluate_PER.sh
+63
-0
local/evaluation/evaluate_perplexity.sh
local/evaluation/evaluate_perplexity.sh
+62
-0
local/evaluation/evaluate_snr.sh
local/evaluation/evaluate_snr.sh
+27
-0
local/evaluation/evaluation.sh
local/evaluation/evaluation.sh
+86
-0
No files found.
local/evaluation/evaluate_PER.sh
0 → 100755
View file @
e4b07c98
#!/usr/bin/env bash
# Abdel HEBA @Linagora 2017
#
# Compute a Phone Error Rate (PER) for one transcription file by comparing
#   - a reference phone sequence, built by replacing every word of the
#     transcription with its pronunciation (lexicon first, G2P as fallback),
#   - with the phone sequence read from the alignments in <exp_dir>,
# and append one CSV row to <out_dir>/PER.res.
#
# Usage:
#   evaluate_PER.sh <text> <lexicon> <G2P_dir> <phones.txt> <exp_dir> <out_dir>
#
#   text        transcription, one line per utterance: "<utt-id> word word ..."
#   lexicon     pronunciation lexicon, one entry per line: "<word> phone ..."
#   G2P_dir     passed as-is to local/g2p.sh for words missing in the lexicon
#   phones.txt  phone table passed to show-alignments
#   exp_dir     directory where final.mdl and all ali.*.gz are saved
#   out_dir     output directory (created when missing)

. path.sh
. cmd.sh

if [ "$#" -ne 6 ]; then
  echo "Usage: $0 <text> <lexicon> <G2P_dir> <phones.txt> <exp_dir> <out_dir>" >&2
  exit 1
fi

text=$1
lexicon=$2
G2P_dir=$3
phone=$4
exp_dir=$5
out_dir=$6

mkdir -p -- "$out_dir" || exit 1
# The CSV header is written once, when the result file does not exist yet.
if [ ! -f "$out_dir/PER.res" ]; then
  echo "Filename,%PER,%nbPER,ins,del,sub" > "$out_dir/PER.res"
fi

# Task 0: Test if Grapheme to phonem is correct to predict lexicon word
# Task 1: replace all words in Text with their phonetics
phone_transcription=$out_dir/truth_transcription.tmp
phone_hyp=$out_dir/phone_hypothesis.tmp
vocab_tmp=$out_dir/vocab.tmp
result_phone=$out_dir/res.phone
added_vocab=$out_dir/added_vocab.tmp

# Start from a clean slate. Every scratch file is removed by its own name
# (the previous version tested/removed $phone_hyp for each of them, so
# vocab.tmp, res.phone and added_vocab.tmp were never reset).
rm -f -- "$phone_transcription" "$phone_hyp" "$vocab_tmp" \
  "$result_phone" "$added_vocab"
touch "$phone_transcription" "$added_vocab"

# The transcription is read on fd 3 so that commands run inside the loop
# cannot swallow the remaining lines through stdin.
while IFS= read -r -u 3 line || [ -n "$line" ]; do
  read -r -a fields <<< "$line"
  [ "${#fields[@]}" -gt 0 ] || continue   # skip blank lines

  # Build phonetic transcription for each utterance: "<utt-id> phone phone ..."
  seg=${fields[0]}
  for word in "${fields[@]:1}"; do
    # First matching lexicon entry only: with several pronunciation variants
    # the reference must contain one of them, not their concatenation.
    # The awk output keeps a leading space, which serves as the separator.
    phonetic_of_word=$(awk -v find_word="$word" \
      '$1 == find_word {$1=""; print $0; exit}' "$lexicon")

    # If word doesn't exist in lexicon then generate phonetisation from G2P model
    if [ -z "$phonetic_of_word" ]; then
      printf '%s\n' "$word" > "$vocab_tmp"
      local/g2p.sh "$vocab_tmp" "$G2P_dir" "$result_phone"
      phonetic_of_word=$(awk 'NF {$1=""; print $0; exit}' "$result_phone")
      # Save Added word
      printf '%s\n' "$word$phonetic_of_word" >> "$added_vocab"
    fi
    seg=$seg$phonetic_of_word
  done

  # Save phonetics for each utterance
  printf '%s\n' "$seg" >> "$phone_transcription"
done 3< "$text"

# Task 2: extract phone alignement from acoustic model
# Keep every second non-empty line of the show-alignments output
# (presumably the phone-label line of each utterance -- confirm against the
# show-alignments output format), strip the word-position suffixes
# _I/_S/_B/_E and squeeze whitespace.
# NOTE(review): the suffix removal applies to the whole line, utterance id
# included.
# Disabled silence/noise filtering, kept for reference:
#   sed s/SIL//g | sed s/SPN//g | sed s/NSN//g
show-alignments "$phone" "$exp_dir/final.mdl" "ark:gunzip -c $exp_dir/ali.*.gz|" \
  | awk '$0 != "" && ++n % 2 == 0' \
  | sed -e 's/_I//g' -e 's/_S//g' -e 's/_B//g' -e 's/_E//g' \
        -e 's/\s\s*/ /g' \
  > "$phone_hyp"

# Task 3: compare both phonetics between truth and learned one from acoustic model
# The "%WER" summary line of compute-wer is reshaped into one CSV row whose
# first column is the base name of the transcription file.
compute-wer --text --mode=present "ark:$phone_transcription" "ark:$phone_hyp" \
  | awk -v meeting="$(basename -- "$text")" \
      'BEGIN{OFS=","} $1 == "%WER" {$1=meeting;print $1,$2,$4$5$6$7,$9,$11}' \
  >> "$out_dir/PER.res"
\ No newline at end of file
local/evaluation/evaluate_perplexity.sh
0 → 100755
View file @
e4b07c98
#!/bin/bash
# Copyright 2017 Abdel HEBA @Linagora
# Need to be called after training mono
#
# Compute the perplexity of every meeting found under <data>/train,
# <data>/test and <data>/dev against a fixed ARPA language model.
#
# Usage: evaluate_perplexity.sh <data>
#
# Files written in the current directory:
#   dir_texts.txt                     all meeting directories
#   dir_texts_{train,test,dev}.txt    meeting directories per subset
#   $prep_dir/<meeting>.txt           raw "ngram -ppl" output per meeting
#   res_ppl.txt                       sorted list of the files above
#   out.csv                           second argument given to parse_perplexity.py

. ../../path.sh
. ../../cmd.sh

if [ -z "$1" ]; then
  echo "Usage: $0 <data>" >&2
  exit 1
fi

data=$1
in_list=dir_texts.txt
out_ppl=res_ppl.txt
out_csv=out.csv
data_train=$data/train
data_test=$data/test
data_dev=$data/dev
norm_dir=norm_dir
prep_dir=perplexity_results_3glmlarge
lm_model=~/Documents/Linagora/Data/Dict_FR/cmudict/sphinxfr/lm_tgsphinx.arpa.gz
#lm_model=data/local/lm/lm_french-small.arpa.gz

# ====== Evaluate Perplexity for each meeting =========
# The global list is filled with "tee -a" below, so it has to be emptied
# first; otherwise every re-run would add each meeting once more.
: > "$in_list"

# for training meeting
find "$data_train" -mindepth 1 -maxdepth 1 -type d \
  | tee -a "$in_list" > dir_texts_train.txt
# for test meeting
find "$data_test" -mindepth 1 -maxdepth 1 -type d \
  | tee -a "$in_list" > dir_texts_test.txt
# for dev meeting
find "$data_dev" -mindepth 1 -maxdepth 1 -type d \
  | tee -a "$in_list" > dir_texts_dev.txt

# Save clean text from each meeting
../lm/normalize_text.sh "$in_list" "$norm_dir"

echo "Compute perplexity"
mkdir -p -- "$prep_dir"
# The list is read on fd 3 so that the commands inside the loop keep the
# script's own stdin.
while IFS= read -r -u 3 b || [ -n "$b" ]; do
  [ -n "$b" ] || continue
  id=$(basename -- "$b")
  echo "compute perplexity for $id"
  ngram -ppl "$norm_dir/$id.txt" -lm "$lm_model" > "$prep_dir/$id.txt"
done 3< "$in_list"

find "$prep_dir" -type f | sort > "$out_ppl"
# Was "$out.csv": $out is never defined, so the result went to ".csv"
# instead of the file named by $out_csv.
python3 parse_perplexity.py "$out_ppl" "$out_csv"
# Feature extraction for every subset named in dirdevtest.txt: PLP features,
# then CMVN statistics, then a consistency fix of the data directory.
# The disabled line below (presumably the step that fills data-valid/<part>)
# is kept for reference.
# for part in $(cat dirdevtest.txt); do local/data_prep.sh $data/$part data-valid/$part; done
plpdir=plp
steps_dir=../../steps
utils_dir=../../utils
for subset in $(< dirdevtest.txt); do
  subset_data=data-valid/$subset
  subset_log=exp-valid/make_plp/$subset
  $steps_dir/make_plp.sh --cmd "$train_cmd" --nj 5 $subset_data $subset_log $plpdir
  $steps_dir/compute_cmvn_stats.sh $subset_data $subset_log $plpdir
  $utils_dir/fix_data_dir.sh $subset_data
done
# split text for each meeting
text=text
segments=segments

#######################################
# Split a transcription file into one file per meeting.
# Globals:
#   segments, text (read) - file names looked up inside the source directory
# Arguments:
#   $1 - source directory (optional, default: data-valid/dev)
# Outputs (current directory):
#   segmeeting.txt         columns 1 and 2 of the segments file
#   meeting.txt            column 2 of the segments file, adjacent
#                          duplicates removed
#   segpermeeting.txt      scratch list: segment ids of the meeting being
#                          processed
#   <meeting>_meeting.txt  lines of the text file whose first column is a
#                          segment id of that meeting, in segments order
#######################################
split_text_per_meeting() {
  local src_dir=${1:-data-valid/dev}
  local seg_file=$src_dir/$segments
  local text_file=$src_dir/$text
  local meeting_id seg_id out_file

  awk '{print $1,$2}' "$seg_file" > segmeeting.txt
  awk '{print $2}' "$seg_file" | uniq > meeting.txt

  while IFS= read -r meeting_id; do
    [ -n "$meeting_id" ] || continue
    # Was "$i_meeting.txt": that expands the undefined variable i_meeting,
    # so every meeting ended up appended to a single file called ".txt".
    out_file=${meeting_id}_meeting.txt
    # Truncate first so that a re-run does not duplicate the lines.
    : > "$out_file"
    awk -v v="$meeting_id" '$2 == v {print $1}' "$seg_file" > segpermeeting.txt
    while IFS= read -r seg_id; do
      awk -v a="$seg_id" '$1 == a {print $0}' "$text_file" >> "$out_file"
    done < segpermeeting.txt
  done < meeting.txt
}

split_text_per_meeting
local/evaluation/evaluate_snr.sh
0 → 100755
View file @
e4b07c98
#!/usr/bin/env sh
# Compute one SNR value per segment.
#
# Usage: evaluate_snr.sh <data_dir> <out_dir>
#   data_dir  directory holding "segments" (columns: segment name, file name,
#             start, end) and "wav.scp" (the audio path is read from its
#             third column)
#   out_dir   receives the scratch files tmp.wav / tmp16k.wav and the result
#             file Eval.txt ("<segment name> <5th field of the last
#             snr_calculator.exe output line>"), appended to on every run
#
# "./" is required here: a POSIX sh resolves a slash-less name given to "."
# through PATH only, and a failing "." aborts a non-interactive shell.
. ./path.sh
. ./cmd.sh

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <data_dir> <out_dir>" >&2
  exit 1
fi

data_dir=$1
out_dir=$2
mkdir -p -- "$out_dir" || exit 1

# Open Segments file and split the audio segment then compute SNR
while read -r name_seg name_file deb_seg end_seg _rest; do
  [ -n "$name_seg" ] || continue   # skip blank lines

  echo "$name_seg $name_file $deb_seg $end_seg${_rest:+ $_rest}"
  echo "$name_file"

  duration_seg=$(awk -v s="$deb_seg" -v e="$end_seg" 'BEGIN { print e - s }')

  # Exact match on the first column, first hit only; a plain grep would also
  # return recordings whose id merely contains $name_file.
  audio_file=$(awk -v id="$name_file" '$1 == id { print $3; exit }' \
    "$data_dir/wav.scp")
  if [ -z "$audio_file" ]; then
    echo "$0: no wav.scp entry for $name_file, skipping $name_seg" >&2
    continue
  fi

  echo "$audio_file"
  echo "$deb_seg"
  echo "$duration_seg"

  #ffmpeg -ss $deb_seg -t $duration_seg -i $audio_file $2/tmp.wav
  # Cut the segment out of the recording, then convert it to 16 kHz mono.
  sox "$audio_file" "$out_dir/tmp.wav" trim "$deb_seg" "00:$duration_seg"
  sox "$out_dir/tmp.wav" -t wav -r 16000 -c 1 "$out_dir/tmp16k.wav"

  snr_calculator.exe -num_chans 1 -sf 16000 -sig_thresh 0.8 \
      -noise_thresh 0.2 -frame_dur 10 -window_dur 20 \
      -input "$out_dir/tmp16k.wav" \
    | tail -1 \
    | awk -v name_segment="$name_seg" '{print name_segment,$5}' \
    >> "$out_dir/Eval.txt"

  rm -f -- "$out_dir/tmp.wav" "$out_dir/tmp16k.wav"
done < "$data_dir/segments"
\ No newline at end of file
local/evaluation/evaluation.sh
0 → 100755
View file @
e4b07c98
#!/usr/bin/env bash
# Train the acoustic model on 16 h of training data;
# the alignment is already done, compute the PER
# ===========================================

# $train_cmd is used below but was never defined in this script. Pick it up
# from ./cmd.sh (as the sibling evaluation scripts do) unless the caller
# already provides it in the environment.
if [ -z "${train_cmd:-}" ] && [ -f ./cmd.sh ]; then
  . ./cmd.sh
fi

# Directory where the per-meeting transcription is staged for
# evaluate_PER.sh. The file used to be written to the current directory but
# read and removed under this path; the same location is now used throughout.
text_dir=/data/Thesis_aheba

# Align the test and dev meetings
steps/align_si.sh --boost-silence 1.25 --nj 5 --cmd "$train_cmd" \
  data-sphinx/dev data-sphinx/lang exp-sphinx/mono exp-sphinx/mono_ali_dev
find /data/Corpus/dev -mindepth 1 -maxdepth 1 > meeting_dev.txt

steps/align_si.sh --boost-silence 1.25 --nj 12 --cmd "$train_cmd" \
  data-sphinx/test data-sphinx/lang exp-sphinx/mono exp-sphinx/mono_ali_test
# Overwrite, like the dev list above (">>" accumulated duplicates on re-runs).
find /data/Corpus/test -mindepth 1 -maxdepth 1 > meeting_test.txt

# Compute the PER for the dev and test meetings
# NOTE(review): only the test meetings are processed here, although
# evaluation_dev/PER.res is read further down -- confirm where it is produced.
# The list is read on fd 3 so the commands inside the loop keep their stdin.
while IFS= read -r -u 3 meeting_dir || [ -n "$meeting_dir" ]; do
  [ -n "$meeting_dir" ] || continue
  meeting=$(basename -- "$meeting_dir")
  echo "$meeting"
  # Transcription lines of this meeting; evaluate_PER.sh uses the base name
  # of this file as the row label.
  grep -- "$meeting" data-sphinx/test/text > "$text_dir/$meeting"
  local/evaluation/evaluate_PER.sh "$text_dir/$meeting" \
    data-sphinx/local/dict/lexicon.txt data-sphinx/local/lm \
    data-sphinx/lang/phones.txt exp-sphinx/mono_ali_test evaluation_test
  echo "$meeting"
  cat evaluation_test/PER.res
  rm -f -- "$text_dir/$meeting"
done 3< meeting_test.txt

{
  echo "%PER"
  cat evaluation_dev/PER.res
  cat evaluation_test/PER.res
} > PER.res

# NOTE(review): PER.res rows are comma-separated, but awk splits on
# whitespace here, so $1 is the whole row for any line without a space and
# the output line becomes empty. Left unchanged pending a decision on the
# expected PER.csv layout (header rows included).
sort -k1 PER.res | awk '{$1="";print $0}' > PER.csv

# concat perplexity with PER
paste file_ppl.csv PER.csv > file_ppl_PER.csv
# Evaluate WER need scoring files
data_dev=/home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev
find "$data_dev" -mindepth 1 -maxdepth 1 -type d > meeting_dev.txt
data_test=/home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test
find "$data_test" -mindepth 1 -maxdepth 1 -type d > meeting_test.txt

Evaluation_dir=Evaluation/WER
lang_or_graph=$Evaluation_dir/graph_mix
min_lmwt=7
max_lmwt=17
cmd=run.pl
word_ins_penalty=0.0,0.5,1.0
symtab=$lang_or_graph/words.txt

#######################################
# Score every meeting of one subset and append its best WER line to
# $Evaluation_dir/WER_per_meeting.csv.
# Globals:
#   cmd, min_lmwt, max_lmwt, word_ins_penalty, symtab, Evaluation_dir (read)
# Arguments:
#   $1 - file listing the meeting directories, one per line
#   $2 - reference transcription (e.g. data/dev/text)
#   $3 - decode directory containing scoring/LMWT.<wip>.tra
# Outputs:
#   one "<meeting> %WER ..." line per meeting appended to the CSV;
#   $3/wer* files are created and removed again for every meeting
#######################################
score_meetings() {
  local meeting_list=$1
  local ref_text=$2
  local dir=$3
  local meeting_dir meeting wip
  local -a wips

  IFS=, read -r -a wips <<< "$word_ins_penalty"

  # The list is read on fd 3 so the commands inside the loop keep their stdin.
  while IFS= read -r -u 3 meeting_dir || [ -n "$meeting_dir" ]; do
    [ -n "$meeting_dir" ] || continue
    meeting=$(basename -- "$meeting_dir")

    # Reference text of this meeting, noise markers removed.
    grep -- "$meeting" "$ref_text" \
      | sed -e 's:<noise>::g' -e 's:<spoken_noise>::g' -e 's:<laugh>::g' \
      > "$dir/scoring/text_meeting.tmp"

    for wip in "${wips[@]}"; do
      # Was "$wip_$meeting": that parses as the undefined variable "wip_",
      # so all penalties wrote to the same wer_LMWT_<meeting> file and only
      # the last one survived. Braces keep one result file per penalty.
      $cmd LMWT="$min_lmwt:$max_lmwt" "$dir/scoring/log/score.LMWT.$wip.log" \
        cat "$dir/scoring/LMWT.$wip.tra" \| \
        utils/int2sym.pl -f 2- "$symtab" \| sed 's:\<unk\>::g' \| \
        compute-wer --text --mode=present \
        "ark:$dir/scoring/text_meeting.tmp" ark,p:- ">&" \
        "$dir/wer_LMWT_${wip}_${meeting}"
    done

    cat "$dir"/wer*"$meeting" | utils/best_wer.sh \
      | awk -v name_meeting="$meeting" '{$1=name_meeting" %WER";print $0}' \
      >> "$Evaluation_dir/WER_per_meeting.csv"
    rm -f -- "$dir"/wer*
  done 3< "$meeting_list"
}

# After decoding and scoring of the dev and test data
touch "$Evaluation_dir/WER_per_meeting.csv"
#echo "%WER" > $Evaluation_dir/WER_per_meeting.csv

# ========================= DEV =====================
score_meetings meeting_dev.txt data/dev/text "$Evaluation_dir/decode_mix_dev"

# ======================= TEST =======================
score_meetings meeting_test.txt data/test/text "$Evaluation_dir/decode_mix_test"

# Concatenate with the evaluation file of the associated LM:
# header, then the WER lines sorted by meeting with the meeting name blanked
# and commas turned into "|" so they do not clash with the CSV separator
# used by paste below.
{
  echo "%WER"
  sort -k1 "$Evaluation_dir/WER_per_meeting.csv" \
    | awk '{$1="";print $0}' \
    | sed 's/,/|/g'
} > Evaluation/WER.csv

paste -d , Evaluation/3glmmix_dev_test_ppl_per.csv Evaluation/WER.csv \
  > Evaluation/3glmfrench-small_dev_test_ppl_per_wer.csv
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment