Commit e4b07c98 authored by Abdelwahab HEBA
Browse files

Add scripts to evaluate PER, perplexity, SNR and WER

parent cabbb34c
#!/usr/bin/env bash
# Abdel HEBA @Linagora 2017
#
# Compare a reference phonetic transcription (built from the text with the
# lexicon, falling back to a G2P model) with the phone sequences read from
# the alignments of an acoustic model, and append the resulting error-rate
# row to <out_dir>/PER.res.
#
# Usage: $0 <text> <lexicon> <G2P_dir> <phones.txt> <exp_dir> <out_dir>
#   <text>        transcription, one "<utt-id> word word ..." line per utterance
#   <lexicon>     "<word> phone phone ..." lines
#   <G2P_dir>     directory handed to local/g2p.sh for out-of-lexicon words
#   <phones.txt>  phone table handed to show-alignments
#   <exp_dir>     directory holding final.mdl and all ali.*.gz
#   <out_dir>     output directory (created if missing)
. path.sh
. cmd.sh

if [ "$#" -lt 6 ]; then
  echo "Usage: $0 <text> <lexicon> <G2P_dir> <phones.txt> <exp_dir> <out_dir>" >&2
  exit 1
fi

text=$1
lexicon=$2
G2P_dir=$3
phone=$4
exp_dir=$5
out_dir=$6

mkdir -p "$out_dir" || exit 1
# Write the CSV header whenever the result file does not exist yet, not only
# when the directory itself had to be created.
if [ ! -f "$out_dir/PER.res" ]; then
  echo "Filename,%PER,%nbPER,ins,del,sub" > "$out_dir/PER.res"
fi

# Task 0: Test if Grapheme to phonem is correct to predict lexicon word
# Task 1: replace all words in Text with their phonetics
phone_transcription=$out_dir/truth_transcription.tmp
phone_hyp=$out_dir/phone_hypothesis.tmp
vocab_tmp=$out_dir/vocab.tmp
result_phone=$out_dir/res.phone
added_vocab=$out_dir/added_vocab.tmp
# Start from a clean slate: every work file is removed (previously only
# $phone_hyp was tested four times, so $added_vocab kept growing over runs).
rm -f -- "$phone_transcription" "$phone_hyp" "$vocab_tmp" "$result_phone" "$added_vocab"
touch "$phone_transcription" "$added_vocab"

# Reading into an array avoids pathname expansion on the words of the line.
while read -r -a fields || [ "${#fields[@]}" -gt 0 ]; do
  # Skip blank lines: they carry no utterance id.
  [ "${#fields[@]}" -gt 0 ] || continue

  # Build phonetic transcription for each utterance; starts with the utt-id.
  seg=${fields[0]}
  for word in "${fields[@]:1}"; do
    # The leading field is blanked, so the result starts with a space and can
    # be appended directly. Only the FIRST lexicon entry is used: a word with
    # several pronunciation variants must not contribute all of them.
    # The "" concatenations force a string (not numeric) comparison.
    phonetic_of_word=$(awk -v find_word="$word" \
      '($1 "") == (find_word "") { $1 = ""; print $0; exit }' "$lexicon")

    # If word doesn't exist in lexicon then generate phonetisation from G2P model
    if [ -z "$phonetic_of_word" ]; then
      printf '%s\n' "$word" > "$vocab_tmp"
      # stdin is detached so the helper can never consume the text being read.
      local/g2p.sh "$vocab_tmp" "$G2P_dir" "$result_phone" < /dev/null
      # Lines of the G2P result are joined, as the unquoted echo used to do.
      phonetic_of_word=$(awk '{ $1 = ""; printf "%s", $0 }' "$result_phone")
      if [ -z "$phonetic_of_word" ]; then
        echo "$0: no pronunciation found for '$word'" >&2
      fi
      # Save Added word
      printf '%s%s\n' "$word" "$phonetic_of_word" >> "$added_vocab"
    fi
    seg=$seg$phonetic_of_word
  done

  # Save phonetics for each utterance
  printf '%s\n' "$seg" >> "$phone_transcription"
done < "$text"

# Task 2: extract phone alignement from acoustic model
# Extract phones: drop empty lines, keep every second remaining line
# (presumably the phone line of each utterance - verify against the
# show-alignments output), then strip the _I/_S/_B/_E markers.
# The markers are removed from the phone fields only; the first field is the
# key compute-wer matches on and must stay untouched.
# (Disabled filter kept for reference: sed s/SIL//g | sed s/SPN//g | sed s/NSN//g)
show-alignments "$phone" "$exp_dir/final.mdl" "ark:gunzip -c $exp_dir/ali.*.gz|" \
  | awk '$0!=""' | awk 'NR%2==0' \
  | awk '{
      out = $1
      for (i = 2; i <= NF; i++) {
        p = $i
        gsub(/_I/, "", p); gsub(/_S/, "", p); gsub(/_B/, "", p); gsub(/_E/, "", p)
        if (p != "") out = out " " p
      }
      print out
    }' > "$phone_hyp"

# Task 3: compare both phonetics between truth and learned one from acoustic model
# The %WER line is reshaped into: <text basename>,<rate>,<errors/total,ins>,<del>,<sub>
compute-wer --text --mode=present "ark:$phone_transcription" "ark:$phone_hyp" \
  | awk -v meeting="$(basename "$text")" 'BEGIN{OFS=","} $1 == "%WER" {$1=meeting;print $1,$2,$4$5$6$7,$9,$11}' \
  >> "$out_dir/PER.res"
\ No newline at end of file
#!/bin/bash
# Copyright 2017 Abdel HEBA @Linagora
# Need to be called after training mono
#
# Run `ngram -ppl` with one language model on the normalised text of every
# meeting directory found directly under <data>/train, <data>/test and
# <data>/dev, then hand the list of result files to parse_perplexity.py.
#
# Usage: $0 <data> [<lm.arpa.gz>]
#   <data>        corpus root containing train/, test/ and dev/
#   <lm.arpa.gz>  optional language model; defaults to the path below
. ../../path.sh
. ../../cmd.sh

if [ -z "$1" ]; then
  echo "Usage: $0 <data> [<lm.arpa.gz>]" >&2
  exit 1
fi

data=$1
in_list=dir_texts.txt
out_ppl=res_ppl.txt
out_csv=out.csv
data_train=$data/train
data_test=$data/test
data_dev=$data/dev
norm_dir=norm_dir
prep_dir=perplexity_results_3glmlarge
lm_model=${2:-$HOME/Documents/Linagora/Data/Dict_FR/cmudict/sphinxfr/lm_tgsphinx.arpa.gz}
#lm_model=data/local/lm/lm_french-small.arpa.gz

# ====== Evaluate Perplexity for each meeting =========
# The combined list is filled with `tee -a`, so it has to be emptied first;
# otherwise every rerun would append the same directories again.
: > "$in_list"
# for training meeting
find "$data_train" -mindepth 1 -maxdepth 1 -type d \
  | tee -a "$in_list" > dir_texts_train.txt
# for test meeting
find "$data_test" -mindepth 1 -maxdepth 1 -type d \
  | tee -a "$in_list" > dir_texts_test.txt
# for dev meeting
find "$data_dev" -mindepth 1 -maxdepth 1 -type d \
  | tee -a "$in_list" > dir_texts_dev.txt

# Save clean text from each meeting
../lm/normalize_text.sh "$in_list" "$norm_dir"

echo "Compute perplexity"
mkdir -p "$prep_dir"
# The list is read on fd 3 so that the commands in the loop keep their stdin.
while IFS= read -r b <&3; do
  [ -n "$b" ] || continue
  id=$(basename "$b")
  echo "compute perplexity for $id"
  ngram -ppl "$norm_dir/$id.txt" -lm "$lm_model" > "$prep_dir/$id.txt"
done 3< "$in_list"

find "$prep_dir" -type f | sort > "$out_ppl"
# Was "$out.csv": $out is unset, so the output went to a file named ".csv".
python3 parse_perplexity.py "$out_ppl" "$out_csv"
# Disabled data preparation step, kept for reference:
# for part in $(cat dirdevtest.txt); do local/data_prep.sh $data/$part data-valid/$part; done

# For every subset named in dirdevtest.txt run make_plp.sh, then
# compute_cmvn_stats.sh, then fix_data_dir.sh on data-valid/<subset>.
plpdir=plp
for subset in $(< dirdevtest.txt); do
  subset_data=data-valid/$subset
  subset_exp=exp-valid/make_plp/$subset
  ../../steps/make_plp.sh --cmd "$train_cmd" --nj 5 $subset_data $subset_exp $plpdir
  ../../steps/compute_cmvn_stats.sh $subset_data $subset_exp $plpdir
  ../../utils/fix_data_dir.sh $subset_data
done
# split text for each meeting
# Reads data-valid/dev/segments (utt-id in column 1, meeting in column 2) and
# data-valid/dev/text (utt-id in column 1) and writes:
#   segmeeting.txt       "<utt-id> <meeting>" pairs
#   meeting.txt          one distinct meeting per line
#   <meeting>_meeting.txt  the text lines of that meeting, in segments order
text=text
segments=segments
dev_dir=data-valid/dev

awk '{print $1,$2}' "$dev_dir/$segments" > segmeeting.txt
# sort -u (not bare uniq): duplicates are removed even when the segments of a
# meeting are not contiguous.
awk '{print $2}' "$dev_dir/$segments" | sort -u > meeting.txt

while IFS= read -r i; do
  [ -n "$i" ] || continue
  awk -v v="$i" '$2 == v {print $1}' "$dev_dir/$segments" > segpermeeting.txt
  # One pass instead of one awk run per segment: index the text by utt-id,
  # then emit the lines requested by segpermeeting.txt.
  # Was "$i_meeting.txt", which expands the unset variable $i_meeting and
  # sent every meeting to the single file ".txt".
  awk 'NR == FNR { line[$1] = $0; next } ($1 in line) { print line[$1] }' \
    "$dev_dir/$text" segpermeeting.txt > "${i}_meeting.txt"
done < meeting.txt
#!/usr/bin/env sh
# Cut every segment of a data directory out of its recording, resample it to
# 16 kHz mono and run snr_calculator.exe on it.
#
# Usage: $0 <data-dir> <out-dir>
#   <data-dir>  directory containing `segments`
#               (<segment> <recording> <start> <end>) and `wav.scp`
#   <out-dir>   working directory; tmp.wav and tmp16k.wav are created and
#               deleted for each segment, and one "<segment> <value>" line
#               per segment is appended to Eval.txt (the value is field 5 of
#               the last line printed by snr_calculator.exe - presumably the
#               SNR; confirm against the tool's output format)
. path.sh
. cmd.sh

if [ "$#" -lt 2 ]; then
  echo "Usage: $0 <data-dir> <out-dir>" >&2
  exit 1
fi
data_dir=$1
out_dir=$2
mkdir -p "$out_dir" || exit 1

# Open Segments file and split the audio segment then compute SNR
while read -r name_seg name_file deb_seg end_seg rest; do
  [ -n "$name_seg" ] || continue
  echo "$name_seg $name_file $deb_seg $end_seg${rest:+ $rest}"
  echo "$name_file"
  duration_seg=$(awk -v b="$deb_seg" -v e="$end_seg" 'BEGIN { print e - b }')

  # Look the recording up by exact key; a plain grep also matches recordings
  # whose name merely contains this one and then returns several paths.
  audio_file=$(awk -v rec="$name_file" \
    '($1 "") == (rec "") { print $3; exit }' "$data_dir/wav.scp")
  if [ -z "$audio_file" ]; then
    # Fallback for wav.scp files whose key differs from the recording name:
    # first line containing the name, as the substring search used to do.
    audio_file=$(grep -F -- "$name_file" "$data_dir/wav.scp" | awk 'NR == 1 { print $3 }')
  fi
  if [ -z "$audio_file" ]; then
    echo "$0: no audio file for recording '$name_file', skipping $name_seg" >&2
    continue
  fi
  echo "$audio_file"
  echo "$deb_seg"
  echo "$duration_seg"

  #ffmpeg -ss $deb_seg -t $duration_seg -i $audio_file $2/tmp.wav
  # stdin is detached from the tools so they cannot eat the segments list.
  if sox "$audio_file" "$out_dir/tmp.wav" trim "$deb_seg" "00:$duration_seg" < /dev/null \
    && sox "$out_dir/tmp.wav" -t wav -r 16000 -c 1 "$out_dir/tmp16k.wav" < /dev/null; then
    snr_calculator.exe -num_chans 1 -sf 16000 -sig_thresh 0.8 -noise_thresh 0.2 -frame_dur 10 -window_dur 20 -input "$out_dir/tmp16k.wav" < /dev/null | tail -1 \
      | awk -v name_segment="$name_seg" '{print name_segment,$5}' >> "$out_dir/Eval.txt"
  else
    echo "$0: sox failed on segment $name_seg" >&2
  fi
  rm -f "$out_dir/tmp.wav" "$out_dir/tmp16k.wav"
done < "$data_dir/segments"
\ No newline at end of file
#!/usr/bin/env bash
# Train the acoustic model on 16 h of training data;
# the alignment is already done, compute the PER.
# ===========================================
# NOTE(review): $train_cmd is used below but never set here; it is presumably
# defined by cmd.sh, so the environment files are sourced when present.
[ -f ./path.sh ] && . ./path.sh
[ -f ./cmd.sh ] && . ./cmd.sh

# Align the test and dev meetings
steps/align_si.sh --boost-silence 1.25 --nj 5 --cmd "$train_cmd" \
  data-sphinx/dev data-sphinx/lang exp-sphinx/mono exp-sphinx/mono_ali_dev
find /data/Corpus/dev -mindepth 1 -maxdepth 1 > meeting_dev.txt
steps/align_si.sh --boost-silence 1.25 --nj 12 --cmd "$train_cmd" \
  data-sphinx/test data-sphinx/lang exp-sphinx/mono exp-sphinx/mono_ali_test
# Overwrite (was ">>"): appending listed every meeting once more per rerun.
find /data/Corpus/test -mindepth 1 -maxdepth 1 > meeting_test.txt

# Compute the PER for the dev and test meetings
# NOTE(review): only the test meetings are evaluated here, although
# evaluation_dev/PER.res is consumed below - confirm how it is produced.
#
# The per-meeting text is written to, read from and deleted at ONE path
# (previously written to ./<meeting> but read and removed under
# /data/Thesis_aheba/). The file is still named after the meeting because
# evaluate_PER.sh appears to label its result row with the text's basename.
tmp_text_dir=$(mktemp -d) || exit 1
while IFS= read -r meeting_dir <&3; do
  [ -n "$meeting_dir" ] || continue
  meeting=$(basename "$meeting_dir")
  echo "$meeting"
  meeting_text=$tmp_text_dir/$meeting
  grep -F -- "$meeting" data-sphinx/test/text > "$meeting_text"
  local/evaluation/evaluate_PER.sh "$meeting_text" data-sphinx/local/dict/lexicon.txt data-sphinx/local/lm data-sphinx/lang/phones.txt exp-sphinx/mono_ali_test evaluation_test
  echo "$meeting"
  cat evaluation_test/PER.res
  rm -f -- "$meeting_text"
done 3< meeting_test.txt
rmdir -- "$tmp_text_dir"

echo "%PER" > PER.res
cat evaluation_dev/PER.res >> PER.res
cat evaluation_test/PER.res >> PER.res
# PER.res rows are comma-separated, so they must be split on ",": with the
# default whitespace splitting the whole row is field 1 and blanking it left
# only empty lines in PER.csv. Rows are sorted by file name, the embedded
# "Filename,..." header rows are dropped and the file-name column is removed.
{
  echo "%PER"
  tail -n +2 PER.res | grep -v '^Filename,' | sort -t, -k1,1 | cut -d, -f2-
} > PER.csv
# concat perplexity with PER
paste file_ppl.csv PER.csv > file_ppl_PER.csv
# Evaluate WER need scoring files
data_dev=/home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev
find "$data_dev" -mindepth 1 -maxdepth 1 -type d > meeting_dev.txt
data_test=/home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test
find "$data_test" -mindepth 1 -maxdepth 1 -type d > meeting_test.txt
Evaluation_dir=Evaluation/WER
lang_or_graph=$Evaluation_dir/graph_mix
min_lmwt=7
max_lmwt=17
cmd=run.pl
word_ins_penalty=0.0,0.5,1.0
symtab=$lang_or_graph/words.txt

#######################################
# Re-score one decoded set meeting by meeting and record the best WER line
# of each meeting.
# Globals:   cmd, min_lmwt, max_lmwt, word_ins_penalty, symtab,
#            Evaluation_dir (all read)
# Arguments: $1 - decode directory holding scoring/LMWT.<wip>.tra
#            $2 - reference text file of the set
#            $3 - file listing one meeting directory per line
# Outputs:   appends "<meeting> %WER ..." to
#            $Evaluation_dir/WER_per_meeting.csv; overwrites
#            <decode dir>/scoring/text_meeting.tmp
#######################################
score_meetings() {
  local decode_dir=$1 ref_text=$2 meeting_list=$3
  local meeting_dir meeting wip

  # The list is read on fd 3 so the scoring commands keep their own stdin.
  while IFS= read -r meeting_dir <&3; do
    [ -n "$meeting_dir" ] || continue
    meeting=$(basename "$meeting_dir")

    # Reference text of this meeting, without the noise markers.
    grep -F -- "$meeting" "$ref_text" \
      | sed -e 's:<noise>::g' -e 's:<spoken_noise>::g' -e 's:<laugh>::g' \
      > "$decode_dir/scoring/text_meeting.tmp"

    for wip in ${word_ins_penalty//,/ }; do
      # "${wip}_" is required: "$wip_$meeting" expands the unset variable
      # $wip_, so every penalty wrote to the same wer_LMWT_<meeting> file and
      # only the last penalty was ever scored.
      $cmd LMWT=$min_lmwt:$max_lmwt "$decode_dir/scoring/log/score.LMWT.$wip.log" \
        cat "$decode_dir/scoring/LMWT.$wip.tra" \| \
        utils/int2sym.pl -f 2- "$symtab" \| sed 's:\<unk\>::g' \| \
        compute-wer --text --mode=present \
        "ark:$decode_dir/scoring/text_meeting.tmp" ark,p:- ">&" "$decode_dir/wer_LMWT_${wip}_$meeting"
    done

    cat "$decode_dir"/wer_*_"$meeting" | utils/best_wer.sh \
      | awk -v name_meeting="$meeting" '{$1=name_meeting" %WER";print $0}' \
      >> "$Evaluation_dir/WER_per_meeting.csv"
    # Remove only the files created above; "rm $dir/wer*" also deleted the
    # wer_* files already present in the decode directory.
    rm -f -- "$decode_dir"/wer_*_"$meeting"
  done 3< "$meeting_list"
}

# After decoding and scoring of the dev and test data
# Start from an empty result file so that reruns do not pile up duplicates.
: > "$Evaluation_dir/WER_per_meeting.csv"
#echo "%WER" > $Evaluation_dir/WER_per_meeting.csv
# ========================= DEV =====================
dir=$Evaluation_dir/decode_mix_dev
score_meetings "$dir" data/dev/text meeting_dev.txt
# ======================= TEST =======================
dir=$Evaluation_dir/decode_mix_test
score_meetings "$dir" data/test/text meeting_test.txt

# Concatenate with the evaluation file of the associated LM
echo "%WER" > Evaluation/WER.csv
# Sort by meeting, drop the meeting column and turn the commas of the WER
# line into "|" so they do not clash with the "," delimiter used by paste.
sort -k1 Evaluation/WER/WER_per_meeting.csv | awk '{$1="";print $0}' | sed 's/,/|/g' >> Evaluation/WER.csv
# NOTE(review): the input is named 3glmmix but the output 3glmfrench-small -
# confirm which language model these figures belong to.
paste -d , Evaluation/3glmmix_dev_test_ppl_per.csv Evaluation/WER.csv > Evaluation/3glmfrench-small_dev_test_ppl_per_wer.csv
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment