data_prepTCOF.sh 2.74 KB
Newer Older
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
1 2
#!/bin/bash

Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
3
# Copyright 2016 Linagora (author: Abdel HEBA) | DONE
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
4 5 6 7 8
# see research.linagora.com OpenPaas Project and https://hubl.in for meetings
# GPL

# Load path.sh into the current shell (presumably the Kaldi environment
# setup -- confirm against path.sh itself).
. path.sh

9
#LANG=en_US.ISO-8859-15
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
10

Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
11 12 13 14 15 16
# Print a usage reminder unless exactly two arguments were given.
# The exit below is commented out, so execution continues either way.
case "$#" in
    2)
        ;;
    *)
        echo "Usage: $0 <src-dir> <dst-dir>"
        echo "e.g: $0 /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/train data/train"
        #exit 1
        ;;
esac

Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
17 18
 # Positional arguments: <src-dir> (input corpus) and <dst-dir> (output data dir).
 src="${1}"
 dst="${2}"
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42

# All utterances are compressed WAV files; sox is used to read the signal
# in binary format.

# Succeed when command $1 can be resolved by the shell.
# Uses the `command -v` builtin rather than the external `which`, which is
# not guaranteed to be installed and would then trigger a false warning.
have_cmd() {
    command -v "$1" >/dev/null 2>&1
}

# Warn when sox is missing (the exit is commented out, so the script goes on).
if ! have_cmd sox; then
    echo "Please install 'sox' on All worker nodes"
    echo "apt-get install sox"
    #exit 1
fi


# TODO: think about how to handle the train/test split part...?

#echo "=== Starting initial Tcof Data preparation ..."

#echo "--- Making test/train data split ..."

# Delete path $1 when it is an existing regular file; otherwise do nothing.
# The path is quoted so names containing whitespace or glob characters work.
remove_if_file() {
    if [[ -f "$1" ]]; then
        rm -- "$1"
    fi
}

# Make sure the destination directory exists.
mkdir -p -- "$dst" #|| exit 1;

# Report a missing source directory (the exit is commented out, so the
# script carries on regardless).
if [ ! -d "$src" ]; then
    echo "$0: no such directory $src" #&& exit  1;
fi

# Data files located under $dst; these variables are used further down.
wav_scp=$dst/wav.scp
trans=$dst/text
utt2spk=$dst/utt2spk
spk2gender=$dst/spk2gender
utt2dur=$dst/utt2dur
segments=$dst/segments

# Remove any of those files that already exist (presumably leftovers from a
# previous run) before new output is produced.
for stale in "$wav_scp" "$trans" "$utt2spk" "$spk2gender" "$utt2dur" "$segments"; do
    remove_if_file "$stale"
done
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
44

45
# For each meeting
46
# Print the regular files located directly inside directory $1 whose name
# ends in ".trs", one path per line, sorted.
# The original filtered with `grep "*.trs"`; as a regex the leading `*` is a
# literal asterisk, so ordinary "foo.trs" paths never matched. `find -name`
# applies the intended glob instead.
list_trs_files() {
    find "$1" -mindepth 1 -maxdepth 1 -type f -name '*.trs' | sort
}

# Run the transcript parser once per .trs file, appending its stdout and
# stderr to log.txt. The list is read line by line on fd 3 so that paths
# containing spaces stay intact and the parser keeps the script's stdin.
while IFS= read -r -u 3 meeting_dir; do
    meeting=$(basename -- "$meeting_dir")
    # Earlier invocation, kept for reference:
    #python3 local/parse_AudioDB.py --data-prep --input-dir $meeting_dir --output-dir $dst >> log.txt 2>&1
    python3 local/parseTcofSync.py "$meeting_dir" "$dst" >> log.txt 2>&1
done 3< <(list_trs_files "$src")

Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
56 57
# Sort all files
# text
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
58
#export LC_ALL=C
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
59 60 61 62 63 64 65 66
# Sort file $1 in place with `sort -k1`, going through the temporary file
# "$1.txt" (same temporary name the script has always used).
# As before, a missing input still results in an empty file at $1: sort
# reports the error, the redirect leaves an empty temporary, and that
# temporary is moved into place.
# Returns 1 without touching anything when $1 is empty.
sort_in_place() {
    local file=$1
    local tmp=$file.txt
    if [[ -z "$file" ]]; then
        echo "sort_in_place: empty file name" >&2
        return 1
    fi
    sort -k1 -- "$file" > "$tmp"
    mv -f -- "$tmp" "$file"
}

# text
sort_in_place "$trans"
# segments
sort_in_place "$segments"
# wav
sort_in_place "$wav_scp"
# spk2gender
sort_in_place "$spk2gender"
# utt2spk
sort_in_place "$utt2spk"
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
78

79
# convert utt2spk to spk2utt
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
80 81 82
spk2utt=$dst/spk2utt
# Convert utt2spk to spk2utt and sort the result in one pass. Piping
# straight into sort replaces the former write / sort-to-temp / rm / mv
# sequence and leaves the same file contents; the paths are quoted so
# directories containing whitespace work.
utils/utt2spk_to_spk2utt.pl < "$utt2spk" | sort -k1 > "$spk2utt" #|| exit 1
87
# Check transcripts and utterances
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
88 89 90
 # Print the number of lines in file $1. The redirect target is quoted;
 # unquoted, a path containing whitespace fails with "ambiguous redirect".
 count_lines() {
     wc -l < "$1"
 }

 # Compare the number of transcript lines with the number of utt2spk lines
 # and report a mismatch (the exit is commented out, so the script goes on).
 ntrans=$(count_lines "$trans")
 nutt2spk=$(count_lines "$utt2spk")
 if ! [ "$ntrans" -eq "$nutt2spk" ]; then
    echo "Inconsistent #transcripts($ntrans) and # utt2spk($nutt2spk)" #&& exit 1;
 fi
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
92

93
# compute segment's duration
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
94
 # Run utils/data/get_utt2dur.sh on the destination directory, sending its
 # stdout to stderr. $dst is quoted so a path with whitespace stays one argument.
 utils/data/get_utt2dur.sh "$dst" >&2 #|| exit 1
95
# Validate Kaldi inputs
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
96 97 98
 # Run the data-directory validator (features are not checked: --no-feats)
 # and only announce success when it actually succeeds. Previously the
 # success message was printed even after a failed validation. The script
 # still does not exit on failure, matching the commented-out exits.
 if utils/validate_data_dir.sh --no-feats "$dst"; then #|| exit 1;
     echo "Successfully prepared data in $dst.."
 else
     echo "$0: validation of $dst failed" >&2
 fi
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
99 100

#exit 0