#!/bin/bash

# data_prepACSYNT.sh
#
# Builds the data files (wav.scp, text, utt2spk, segments, ...) for the
# ACSYNT corpus.
#
# Usage: data_prepACSYNT.sh <src-dir> <dst-dir>
#   <src-dir>  directory holding the ACSYNT corpus
#   <dst-dir>  directory that receives the generated data files
#
# Copyright 2016 Linagora (author: Abdel HEBA) | DONE
# see research.linagora.com OpenPaas Project and https://hubl.in for meetings
# GPL

source path.sh

#LANG=en_US.ISO-8859-15

# A wrong argument count only prints the usage text: the exit below is
# disabled, so execution continues with whatever $1 and $2 contain.
# NOTE(review): the exit looks deliberately commented out - confirm before
# re-enabling it.
if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <src-dir> <dst-dir>"
    echo "e.g: $0 /home/lingora/Documents/Linagora/Data/ACSYNT data/ACSYNT"
    #exit 1
fi

src=$1
dst=$2

# All utterances are compressed WAV files; sox is used to read the signal in
# binary format, so stop right away when it is not available.
# `command -v` is the shell builtin lookup and does not depend on an
# external `which` binary being installed.
if ! command -v sox >/dev/null 2>&1; then
    echo "Please install 'sox' on All worker nodes"
    echo "apt-get install sox"
    exit 1
fi

# Create the output directory, then make sure the corpus directory exists.
mkdir -p "$dst" || exit 1

# The quotes matter: with an empty, unquoted $src the test collapses to
# `[ ! -d ]`, which is false, so the check would be skipped silently.
if [ ! -d "$src" ]; then
    echo "$0: no such directory $src"
    exit 1
fi

# Paths of the data files produced under $dst. These variables are used by
# the rest of the script.
wav_scp=$dst/wav.scp
trans=$dst/text
utt2spk=$dst/utt2spk
#spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender
utt2dur=$dst/utt2dur
segments=$dst/segments

# Delete any copy left over from a previous run. Only regular files are
# removed; paths are quoted so a $dst containing spaces or glob characters
# is handled correctly.
for stale_file in "$wav_scp" "$trans" "$utt2spk" "$utt2dur" "$segments"; do
    if [[ -f "$stale_file" ]]; then
        rm -- "$stale_file"
    fi
done

# Format ACSYNT DIR to ACSYNT/meeting | ACSYNT/prepared_speech | ACSYNT/story
# The formatted copy is written to $src/ACSYNT_Final; a copy left by an
# earlier run is removed first so prep_ACSYNT.sh starts from scratch.
if [ -d "$src/ACSYNT_Final" ]; then
    # ${src:?} aborts instead of expanding to /ACSYNT_Final if src is empty.
    rm -r -- "${src:?}/ACSYNT_Final"
fi
local/ACSYNT_Parse/prep_ACSYNT.sh "$src" "$src/ACSYNT_Final"
# Print the path of the audio file expected next to a TextGrid file: the
# same path with its .TextGrid suffix replaced by .wav. Only the suffix is
# touched, so "TextGrid" appearing elsewhere in the path is left alone.
# NOTE(review): assumes the audio extension is lower-case .wav - confirm
# against the corpus.
wav_for_textgrid() {
    printf '%s\n' "${1%.TextGrid}.wav"
}

# For each meeting: every directory directly under $src is visited in sorted
# order and each *.TextGrid file it contains is handed to ParseACSYNT.py,
# whose output (stdout and stderr) is appended to log.txt.
# NOTE(review): $src/ACSYNT_Final is itself a direct sub-directory of $src
# and is therefore visited too - confirm this is intended.
# The directory list is read on fd 3 so the commands in the loop body keep
# the script's own stdin.
while IFS= read -r meeting_dir <&3; do
    #echo $meeting_dir
    for meeting_part_TextGrid in "$meeting_dir"/*.TextGrid; do
        # A glob without matches stays literal; skip that placeholder.
        [ -e "$meeting_part_TextGrid" ] || continue
        wav_file=$(wav_for_textgrid "$meeting_part_TextGrid")
        # Warn when either file of the pair is missing. Parsing is still
        # attempted because the exit is disabled.
        if [ ! -f "$meeting_part_TextGrid" ] || [ ! -f "$wav_file" ]; then
            echo " Missing $meeting_part_TextGrid or $wav_file file " #&& exit 1
        fi
        python local/ACSYNT_Parse/ParseACSYNT.py "$meeting_part_TextGrid" "$dst" >> log.txt 2>&1
    done
done 3< <(find "$src" -mindepth 1 -maxdepth 1 -type d | sort)

# Sort all files

# Sort the lines of FILE ($1) and replace FILE with the sorted copy.
# `sort -k1` uses a key running from the first field to the end of the
# line. The sorted output is staged in FILE.txt and then moved over FILE.
# If FILE does not exist, sort reports the error and FILE ends up as an
# empty file.
# Returns 1 without touching anything when no path is given.
sort_in_place() {
    local data_file=$1
    if [[ -z "$data_file" ]]; then
        echo "sort_in_place: missing file argument" >&2
        return 1
    fi
    sort -k1 -- "$data_file" > "$data_file.txt"
    mv -f -- "$data_file.txt" "$data_file"
}

# The sort order follows the locale in effect here.
# NOTE(review): presumably path.sh exports LC_ALL=C - verify, since the
# explicit export below is disabled.
#export LC_ALL=C
# text
sort_in_place "$trans"
#segments
sort_in_place "$segments"
# wav
sort_in_place "$wav_scp"
# spk2gender
#sort_in_place "$spk2gender"
# utt2spk
sort_in_place "$utt2spk"

# convert utt2spk to spk2utt
# A failure of the converter is not fatal: the exit is disabled.
spk2utt=$dst/spk2utt
utils/utt2spk_to_spk2utt.pl <"$utt2spk" >"$spk2utt" #|| exit 1

# Sort spk2utt: the sorted copy is staged in spk2utt.txt and then moved over
# the original (mv replaces the target, so no separate rm is needed).
sort -k1 -- "$spk2utt" > "$spk2utt.txt"
mv -f -- "$spk2utt.txt" "$spk2utt"
# Check transcripts and utterances: both files must have the same number of
# lines. A mismatch (or an unreadable file) is only reported; the exit is
# disabled.
ntrans=$(wc -l <"$trans")
nutt2spk=$(wc -l <"$utt2spk")
if ! [ "$ntrans" -eq "$nutt2spk" ]; then
    echo "Inconsistent #transcripts($ntrans) and # utt2spk($nutt2spk)" #&& exit 1;
fi

# compute segment's duration (the helper's stdout is sent to stderr)
utils/data/get_utt2dur.sh "$dst" 1>&2 #|| exit 1

# Validate Kaldi inputs. Success is announced only when the validation
# actually passed; a failure is reported on stderr. The script still does
# not exit with an error in that case, as before.
if utils/validate_data_dir.sh --no-feats "$dst"; then
    echo "Successfully prepared data in $dst.."
else
    echo "$0: validation of $dst failed" >&2
fi

#exit 0