Commit 81931ac7 authored by Abdelwahab HEBA
Browse files

Prepare Data Done

parent 7fd54997
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
thibault1_lev sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/thibault1_lev/thibault1_lev.wav -t wav -r 16000 -c 1 -
thibaut1_der sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/thibaut1_der/thibaut1_der.wav -t wav -r 16000 -c 1 -
thomas1_pel sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/thomas1_pel/thomas1_pel.wav -t wav -r 16000 -c 1 -
thomas_allan_cp_proinf sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/thomas_allan_cp_proinf/thomas_allan_cp_proinf.wav -t wav -r 16000 -c 1 -
tidiane1_ecc sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/tidiane1_ecc/tidiane1_ecc.wav -t wav -r 16000 -c 1 -
tromboniste sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/tromboniste/tromboniste.wav -t wav -r 16000 -c 1 -
tscha_cha_reu_ass_08 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/tscha_cha_reu_ass_08/tscha_cha_reu_ass_08.wav -t wav -r 16000 -c 1 -
tunisie_mun_08 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/tunisie_mun_08/tunisie_mun_08.wav -t wav -r 16000 -c 1 -
vacances_seb_13 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/vacances_seb_13/vacances_seb_13.wav -t wav -r 16000 -c 1 -
valentin1_lan sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentin1_lan/valentin1_lan.wav -t wav -r 16000 -c 1 -
valentin_manel_ce1_proinf sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentin_manel_ce1_proinf/valentin_manel_ce1_proinf.wav -t wav -r 16000 -c 1 -
valentine1_bah sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentine1_bah/valentine1_bah.wav -t wav -r 16000 -c 1 -
valentine1_sow sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentine1_sow/valentine1_sow.wav -t wav -r 16000 -c 1 -
valentine2_sow sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentine2_sow/valentine2_sow.wav -t wav -r 16000 -c 1 -
valentine3_sow sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentine3_sow/valentine3_sow.wav -t wav -r 16000 -c 1 -
valentine4_sow sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentine4_sow/valentine4_sow.wav -t wav -r 16000 -c 1 -
valentine5_sow sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentine5_sow/valentine5_sow.wav -t wav -r 16000 -c 1 -
valentine6_sow sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentine6_sow/valentine6_sow.wav -t wav -r 16000 -c 1 -
valentine_camille_ce2_proinf sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/valentine_camille_ce2_proinf/valentine_camille_ce2_proinf.wav -t wav -r 16000 -c 1 -
vampiretatouage_sd sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/vampiretatouage_sd/vampiretatouage_sd.wav -t wav -r 16000 -c 1 -
victoire1_duc sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/victoire1_duc/victoire1_duc.wav -t wav -r 16000 -c 1 -
vin_car_07 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/vin_car_07/vin_car_07.wav -t wav -r 16000 -c 1 -
vincent1_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/vincent1_can/vincent1_can.wav -t wav -r 16000 -c 1 -
vincent2_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/dev/vincent2_can/vincent2_can.wav -t wav -r 16000 -c 1 -
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
vincent10_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent10_can/vincent10_can.wav -t wav -r 16000 -c 1 -
vincent11_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent11_can/vincent11_can.wav -t wav -r 16000 -c 1 -
vincent12_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent12_can/vincent12_can.wav -t wav -r 16000 -c 1 -
vincent13_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent13_can/vincent13_can.wav -t wav -r 16000 -c 1 -
vincent14_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent14_can/vincent14_can.wav -t wav -r 16000 -c 1 -
vincent15_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent15_can/vincent15_can.wav -t wav -r 16000 -c 1 -
vincent16_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent16_can/vincent16_can.wav -t wav -r 16000 -c 1 -
vincent17_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent17_can/vincent17_can.wav -t wav -r 16000 -c 1 -
vincent18_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent18_can/vincent18_can.wav -t wav -r 16000 -c 1 -
vincent19_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent19_can/vincent19_can.wav -t wav -r 16000 -c 1 -
vincent20_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent20_can/vincent20_can.wav -t wav -r 16000 -c 1 -
vincent21_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent21_can/vincent21_can.wav -t wav -r 16000 -c 1 -
vincent22_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent22_can/vincent22_can.wav -t wav -r 16000 -c 1 -
vincent3_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent3_can/vincent3_can.wav -t wav -r 16000 -c 1 -
vincent4_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent4_can/vincent4_can.wav -t wav -r 16000 -c 1 -
vincent5_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent5_can/vincent5_can.wav -t wav -r 16000 -c 1 -
vincent6_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent6_can/vincent6_can.wav -t wav -r 16000 -c 1 -
vincent7_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent7_can/vincent7_can.wav -t wav -r 16000 -c 1 -
vincent8_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent8_can/vincent8_can.wav -t wav -r 16000 -c 1 -
vincent9_can sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent9_can/vincent9_can.wav -t wav -r 16000 -c 1 -
vincent_yvelise_ce2_proinf sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/vincent_yvelise_ce2_proinf/vincent_yvelise_ce2_proinf.wav -t wav -r 16000 -c 1 -
voyage_con_15 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/voyage_con_15/voyage_con_15.wav -t wav -r 16000 -c 1 -
voyage_gou_13 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/voyage_gou_13/voyage_gou_13.wav -t wav -r 16000 -c 1 -
voyage_hab_14 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/voyage_hab_14/voyage_hab_14.wav -t wav -r 16000 -c 1 -
voyage_jus_14 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/voyage_jus_14/voyage_jus_14.wav -t wav -r 16000 -c 1 -
voyage_leo_13 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/voyage_leo_13/voyage_leo_13.wav -t wav -r 16000 -c 1 -
voyages_ric_06 sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/voyages_ric_06/voyages_ric_06.wav -t wav -r 16000 -c 1 -
walid1_mat sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/walid1_mat/walid1_mat.wav -t wav -r 16000 -c 1 -
walid_logan_ce2_proinf sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/walid_logan_ce2_proinf/walid_logan_ce2_proinf.wav -t wav -r 16000 -c 1 -
xavier_thomas_cm2_proinf sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/xavier_thomas_cm2_proinf/xavier_thomas_cm2_proinf.wav -t wav -r 16000 -c 1 -
xaviere1_leg sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/xaviere1_leg/xaviere1_leg.wav -t wav -r 16000 -c 1 -
yaelle_aurelia_cp_proinf sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/yaelle_aurelia_cp_proinf/yaelle_aurelia_cp_proinf.wav -t wav -r 16000 -c 1 -
yaelle_elena_cp_proinf sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/yaelle_elena_cp_proinf/yaelle_elena_cp_proinf.wav -t wav -r 16000 -c 1 -
youssef1_tor sox /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/test/youssef1_tor/youssef1_tor.wav -t wav -r 16000 -c 1 -
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#!/bin/bash
# Copyright 2016 Linagora (author: Abdel HEBA)
# see research.linagora.com OpenPaas Project and https://hubl.in for meetings
# GPL
# Prepare the data files of one corpus split: every meeting directory found
# directly under <src-dir> is handed to local/parseTcof.py, which appends its
# entries to the files in <dst-dir>.
source path.sh  # presumably defines $PYTHON used below -- TODO confirm

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <src-dir> <dst-dir>"
  echo "e.g: $0 /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/train data/train"
  # Without both arguments $src/$dst would be empty and the rm/find calls
  # below would act on the wrong locations, so stop here.
  exit 1
fi

src=$1
dst=$2

# All utterances are stored as wav files; sox is used to read the signal.
# This is only a warning: sox is not needed to run this script itself.
if ! command -v sox >/dev/null 2>&1; then
  echo "Please install 'sox' on All worker nodes"
  echo "apt-get install sox"
fi

# TODO: think about the train/test split step.
#echo "=== Starting initial Tcof Data preparation ..."
#echo "--- Making test/train data split ..."

if [ ! -d "$src" ]; then
  echo "$0: no such directory $src"
  exit 1
fi
mkdir -p "$dst" || exit 1

# Start from a clean slate: the parser appends to its output files, so stale
# files from a previous run would otherwise accumulate duplicate entries.
# 'segments' is included because the parser appends to it as well.
wav_scp=$dst/wav.scp
trans=$dst/text
utt2spk=$dst/utt2spk
spk2gender=$dst/spk2gender
utt2dur=$dst/utt2dur
segments=$dst/segments
rm -f "$wav_scp" "$trans" "$utt2spk" "$spk2gender" "$utt2dur" "$segments"

# To be reviewed:
# cat lexicon/lexicon | awk '{print $1}' | egrep "_|-|'" | egrep -v '^-|-$|\)$' > lexicon/lex

# The directory list is read on file descriptor 3 so that paths containing
# spaces survive intact and the parser's stdin is left untouched.
while IFS= read -r -u 3 meeting_dir; do
  meeting=$(basename "$meeting_dir")

  # Both the transcription and the audio are required; skip the meeting when
  # either one is missing instead of running the parser on it anyway.
  if [ ! -f "$meeting_dir/$meeting.trs" ] || [ ! -f "$meeting_dir/$meeting.wav" ]; then
    echo " Missing $meeting.trs or $meeting.wav file "
    continue
  fi

  # Parser output and errors are both collected in log.txt.
  $PYTHON local/parseTcof.py "$meeting_dir/$meeting.trs" "$dst" >> log.txt 2>&1
done 3< <(find "$src" -mindepth 1 -maxdepth 1 -type d | sort)

# Post-processing still to be enabled:
#spk2utt=$dst/spk2utt
# utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt #|| exit 1
# ntrans=$(wc -l <$trans)
# nutt2spk=$(wc -l <$utt2spk)
# ! [ "$ntrans" -eq "$nutt2spk" ] && \
# echo "Inconsistent #transcripts($ntrans) and # utt2spk($nutt2spk)" #&& exit 1;
# utils/data/get_utt2dur.sh $dst 1>&2 #|| exit 1
# utils/validate_data_dir.sh --no-feats $dst #|| exit 1;
# echo "Successfully prepared data in $dst.."
#exit 0
#!/bin/bash
# Copyright 2016 Linagora (author: Abdel HEBA)
# see research.linagora.com OpenPaas Project and https://hubl.in for meetings
# GPL
# Prepare the data files of one corpus split: every meeting directory found
# directly under <src-dir> is handed to local/parseTcof.py, which appends its
# entries to the files in <dst-dir>.
source path.sh  # presumably defines $PYTHON used below -- TODO confirm

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <src-dir> <dst-dir>"
  echo "e.g: $0 /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/train data/train"
  # Without both arguments $src/$dst would be empty and the rm/find calls
  # below would act on the wrong locations, so stop here.
  exit 1
fi

src=$1
dst=$2

# All utterances are stored as wav files; sox is used to read the signal.
# This is only a warning: sox is not needed to run this script itself.
if ! command -v sox >/dev/null 2>&1; then
  echo "Please install 'sox' on All worker nodes"
  echo "apt-get install sox"
fi

# TODO: think about the train/test split step.
#echo "=== Starting initial Tcof Data preparation ..."
#echo "--- Making test/train data split ..."

if [ ! -d "$src" ]; then
  echo "$0: no such directory $src"
  exit 1
fi
mkdir -p "$dst" || exit 1

# Start from a clean slate: the parser appends to its output files, so stale
# files from a previous run would otherwise accumulate duplicate entries.
# 'segments' is included because the parser appends to it as well.
wav_scp=$dst/wav.scp
trans=$dst/text
utt2spk=$dst/utt2spk
spk2gender=$dst/spk2gender
utt2dur=$dst/utt2dur
segments=$dst/segments
rm -f "$wav_scp" "$trans" "$utt2spk" "$spk2gender" "$utt2dur" "$segments"

# To be reviewed:
# cat lexicon/lexicon | awk '{print $1}' | egrep "_|-|'" | egrep -v '^-|-$|\)$' > lexicon/lex

# The directory list is read on file descriptor 3 so that paths containing
# spaces survive intact and the parser's stdin is left untouched.
while IFS= read -r -u 3 meeting_dir; do
  meeting=$(basename "$meeting_dir")

  # Both the transcription and the audio are required; skip the meeting when
  # either one is missing instead of running the parser on it anyway.
  if [ ! -f "$meeting_dir/$meeting.trs" ] || [ ! -f "$meeting_dir/$meeting.wav" ]; then
    echo " Missing $meeting.trs or $meeting.wav file "
    continue
  fi

  # Parser stdout goes to file_ok.txt, its errors to log.txt.
  $PYTHON local/parseTcof.py "$meeting_dir/$meeting.trs" "$dst" >> file_ok.txt 2>> log.txt
done 3< <(find "$src" -mindepth 1 -maxdepth 1 -type d | sort)

# Post-processing still to be enabled:
#spk2utt=$dst/spk2utt
# utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt #|| exit 1
# ntrans=$(wc -l <$trans)
# nutt2spk=$(wc -l <$utt2spk)
# ! [ "$ntrans" -eq "$nutt2spk" ] && \
# echo "Inconsistent #transcripts($ntrans) and # utt2spk($nutt2spk)" #&& exit 1;
# utils/data/get_utt2dur.sh $dst 1>&2 #|| exit 1
# utils/validate_data_dir.sh --no-feats $dst #|| exit 1;
# echo "Successfully prepared data in $dst.."
#exit 0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from xml.dom import minidom
from unicodedata import normalize
from sys import argv
import re
import os.path
def transformation_text(text):
    """Clean one transcribed utterance by stripping annotation marks.

    A line containing ``###`` or ``(`` is flagged for removal and returned
    untouched. Any other line is echoed to stdout and then normalized:
    ``+``, runs of ``*``, ``/`` and ``<``/``>`` are dropped, ``-``/``_``/``.``
    become spaces, a spelled-out ``O K`` becomes ``ok``, ``{...}`` spans and
    tokens touching the currency sign marker are removed, whitespace is
    collapsed and ``4x4`` is written out in words.

    Args:
        text: raw utterance text.

    Returns:
        A ``(keep, text)`` tuple. ``keep`` is False when the line should be
        discarded by the caller (``text`` is then the unchanged input),
        True otherwise (``text`` is then the cleaned line).
    """
    # Marker meanings noted by the original author: "***" / "*" stand for
    # unintelligible syllables, "$$$" for a cut in the recording, "{...}" for
    # a comment, and "(" relates to truncated words ("amorces").
    if "###" in text or "(" in text:
        # Single-argument print(...) behaves like the print statement here.
        print("Supprimer Ligne")
        return False, text

    print(text)
    text = re.sub(r"(\+|[*]+|///|/|<|>)", "", text.strip())
    text = re.sub(r"-|_|\.", " ", text.strip())
    text = re.sub(r"(O K | O K|^O K$)", " ok ", text)
    # Exclude "}" from the span body so that one stray closing brace later in
    # the line cannot make the match swallow the words in between.
    text = re.sub(r"\{[^{}]+\}", " ", text.strip())
    text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
    text = re.sub(r" +", " ", text.strip())
    # The line is stripped at this point, so a pattern requiring a space on
    # both sides would miss "4x4" at the very start or end; anchor on
    # "space or line edge" instead.
    text = re.sub(r"(?<![^ ])4x4(?![^ ])", "quatre fois quatre", text)
    return True, text
if __name__ == "__main__":
    # Usage: <script> <meeting.trs> <output-dir>
    #
    # Parses one .trs transcription file together with the .xml metadata file
    # that sits next to it, then appends the meeting's entries to the
    # segments, utt2spk, text, wav.scp and spk2gender files of <output-dir>.
    # This is Python 2 code: decoded XML strings are turned into UTF-8 byte
    # strings with .encode() before being formatted and written.
    file_trs = argv[1]
    outdir = argv[2]

    # splitext() removes only the extension; split('.')[0] would cut the path
    # at its first dot and break on "./dir/x.trs" or dotted directory names.
    base_path = os.path.splitext(file_trs)[0]
    meeting = os.path.basename(base_path)
    print(base_path)

    # --- Speakers declared in the transcription (id and name) ---
    trsdoc = minidom.parse(file_trs)
    speaker_id = []
    namespk = []
    for spk in trsdoc.getElementsByTagName('Speaker'):
        id_spk = normalize('NFKD', spk.attributes['id'].value).encode('utf-8', 'ignore')
        name_spk = normalize('NFKD', spk.attributes['name'].value).encode('utf-8', 'ignore')
        # Spaces are removed so that names can be matched against the
        # identifiers of the metadata file, which get the same treatment.
        speaker_id.append(id_spk.replace(" ", ""))
        namespk.append(name_spk.lower().replace(" ", ""))

    # --- Gender of each speaker, read from the metadata file ---
    xmldoc = minidom.parse(base_path + '.xml')
    locuteur = xmldoc.getElementsByTagName('locuteur')
    sexe = xmldoc.getElementsByTagName('sexe')
    speaker_gender = []
    print(namespk)
    print(speaker_id)
    # NOTE(review): <sexe> elements are paired with identified <locuteur>
    # elements by position; the index advances once per identified locuteur.
    # Confirm this matches the metadata layout.
    sexe_idx = 0
    for loc in locuteur:
        if not loc.hasAttribute('identifiant'):
            continue
        name_loc = normalize('NFKD', loc.attributes['identifiant'].value)
        name_loc = name_loc.encode('utf-8', 'ignore').replace(" ", "")
        print(name_loc)
        sexe_nodes = sexe[sexe_idx].childNodes
        sexe_idx += 1
        # u"" keeps the result a unicode string even when there is no text
        # node at all, which normalize() requires.
        gender_loc = u"".join(t.nodeValue for t in sexe_nodes
                              if t.nodeType == t.TEXT_NODE)
        gender_loc = normalize('NFKD', gender_loc).encode('utf-8', 'ignore')
        gender_loc = gender_loc.strip().lower()
        # A speaker whose gender is not given defaults to 'm'.
        # .index() raises ValueError when the metadata names a speaker that
        # the transcription does not declare.
        speaker_gender.append(
            [speaker_id[namespk.index(name_loc.lower())], gender_loc or 'm'])
    print(speaker_gender)

    # --- One segment per non-empty <Turn> ---
    # Lines are collected first and written at the very end, so a file that
    # fails half-way through parsing leaves no partial entries behind.
    segment_lines = []
    utt2spk_lines = []
    text_lines = []
    spk2gender_lines = []
    gendered = set()  # speaker ids already written to spk2gender
    # NOTE(review): the counter advances only for emitted segments, giving
    # consecutive ids; confirm against the intended numbering.
    count = 1
    for turn in trsdoc.getElementsByTagName('Turn'):
        spkr = normalize('NFKD', turn.attributes['speaker'].value).encode('utf-8', 'ignore')
        startTime = normalize('NFKD', turn.attributes['startTime'].value).encode('utf-8', 'ignore')
        endTime = normalize('NFKD', turn.attributes['endTime'].value).encode('utf-8', 'ignore')
        field_text = u"".join(t.nodeValue for t in turn.childNodes
                              if t.nodeType == t.TEXT_NODE)
        # Collapse every run of whitespace into single spaces, without the
        # stray leading space a manual concatenation loop would add.
        text = " ".join(field_text.encode('utf-8', 'ignore').split())
        # TODO: apply transformation_text() to clean the text.
        seg_id = '%s_seg-%07d' % (meeting, count)
        # Only the first number after 'spk' is used as the speaker number.
        spkr_id = '%s_spk-%03d' % (meeting, int(spkr.split('spk')[1]))
        if not text:
            continue
        segment_lines.append('%s %s %s %s\n' % (seg_id, meeting, startTime, endTime))
        utt2spk_lines.append('%s %s\n' % (seg_id, spkr_id))
        text_lines.append('%s %s\n' % (seg_id, text))
        # spk2gender maps a speaker (not a segment) to a gender, so write one
        # line per speaker, and only for speakers that own a segment.
        if spkr_id not in gendered:
            for known_id, gender in speaker_gender:
                if known_id == spkr:
                    spk2gender_lines.append('%s %s\n' % (spkr_id, gender))
                    gendered.add(spkr_id)
                    break
        count += 1

    # The trailing "|" marks the wav.scp entry as a command whose output is
    # to be read, rather than as a plain file name.
    wav_line = '%s sox %s -t wav -r 16000 -c 1 - |\n' % (meeting, base_path + '.wav')

    outputs = (
        ('segments', segment_lines),
        ('utt2spk', utt2spk_lines),
        ('text', text_lines),
        ('wav.scp', [wav_line]),
        ('spk2gender', spk2gender_lines),
    )
    for out_name, out_lines in outputs:
        # Append mode: one call of this script handles one meeting, and the
        # files accumulate entries across meetings. "with" guarantees every
        # handle, spk2gender included, gets closed.
        with open(os.path.join(outdir, out_name), 'a') as out_file:
            out_file.writelines(out_lines)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from xml.dom import minidom
from unicodedata import normalize
from sys import argv
import re
import os.path
def transformation_text(text):
    """Clean one transcribed utterance by stripping annotation marks.

    A line containing ``###`` or ``(`` is flagged for removal and returned
    untouched. Any other line is echoed to stdout and then normalized:
    ``+``, runs of ``*``, ``/`` and ``<``/``>`` are dropped, ``-``/``_``/``.``
    become spaces, a spelled-out ``O K`` becomes ``ok``, ``{...}`` spans and
    tokens touching the currency sign marker are removed, whitespace is
    collapsed and ``4x4`` is written out in words.

    Args:
        text: raw utterance text.

    Returns:
        A ``(keep, text)`` tuple. ``keep`` is False when the line should be
        discarded by the caller (``text`` is then the unchanged input),
        True otherwise (``text`` is then the cleaned line).
    """
    # Marker meanings noted by the original author: "***" / "*" stand for
    # unintelligible syllables, "$$$" for a cut in the recording, "{...}" for
    # a comment, and "(" relates to truncated words ("amorces").
    if "###" in text or "(" in text:
        # Single-argument print(...) behaves like the print statement here.
        print("Supprimer Ligne")
        return False, text

    print(text)
    text = re.sub(r"(\+|[*]+|///|/|<|>)", "", text.strip())
    text = re.sub(r"-|_|\.", " ", text.strip())
    text = re.sub(r"(O K | O K|^O K$)", " ok ", text)
    # Exclude "}" from the span body so that one stray closing brace later in
    # the line cannot make the match swallow the words in between.
    text = re.sub(r"\{[^{}]+\}", " ", text.strip())
    text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
    text = re.sub(r" +", " ", text.strip())
    # The line is stripped at this point, so a pattern requiring a space on
    # both sides would miss "4x4" at the very start or end; anchor on
    # "space or line edge" instead.
    text = re.sub(r"(?<![^ ])4x4(?![^ ])", "quatre fois quatre", text)
    return True, text
if __name__ == "__main__":
    # Usage: <script> <meeting.trs> <output-dir>
    #
    # Parses one .trs transcription file together with the .xml metadata file
    # that sits next to it, then appends the meeting's entries to the
    # segments, utt2spk, text, wav.scp and spk2gender files of <output-dir>.
    # This is Python 2 code: decoded XML strings are turned into UTF-8 byte
    # strings with .encode() before being formatted and written.
    file_trs = argv[1]
    outdir = argv[2]

    # splitext() removes only the extension; split('.')[0] would cut the path
    # at its first dot and break on "./dir/x.trs" or dotted directory names.
    base_path = os.path.splitext(file_trs)[0]
    meeting = os.path.basename(base_path)
    print(base_path)

    # --- Speakers declared in the transcription (id and name) ---
    trsdoc = minidom.parse(file_trs)
    speaker_id = []
    namespk = []
    for spk in trsdoc.getElementsByTagName('Speaker'):
        id_spk = normalize('NFKD', spk.attributes['id'].value).encode('utf-8', 'ignore')
        name_spk = normalize('NFKD', spk.attributes['name'].value).encode('utf-8', 'ignore')
        # Spaces are removed so that a name written with different spacing in
        # the metadata file still matches its transcription counterpart.
        speaker_id.append(id_spk.replace(" ", ""))
        namespk.append(name_spk.lower().replace(" ", ""))

    # --- Gender of each speaker, read from the metadata file ---
    xmldoc = minidom.parse(base_path + '.xml')
    locuteur = xmldoc.getElementsByTagName('locuteur')
    sexe = xmldoc.getElementsByTagName('sexe')
    speaker_gender = []
    # NOTE(review): <sexe> elements are paired with identified <locuteur>
    # elements by position; the index advances once per identified locuteur.
    # Confirm this matches the metadata layout.
    sexe_idx = 0
    for loc in locuteur:
        if not loc.hasAttribute('identifiant'):
            continue
        name_loc = normalize('NFKD', loc.attributes['identifiant'].value)
        name_loc = name_loc.encode('utf-8', 'ignore').replace(" ", "")
        # Inspect the <sexe> element of THIS speaker; testing only the first
        # element for emptiness would apply its state to every speaker.
        sexe_nodes = sexe[sexe_idx].childNodes
        sexe_idx += 1
        # u"" keeps the result a unicode string even when there is no text
        # node at all, which normalize() requires.
        gender_loc = u"".join(t.nodeValue for t in sexe_nodes
                              if t.nodeType == t.TEXT_NODE)
        gender_loc = normalize('NFKD', gender_loc).encode('utf-8', 'ignore')
        gender_loc = gender_loc.strip().lower()
        # A speaker whose gender is not given defaults to 'm'.
        # .index() raises ValueError when the metadata names a speaker that
        # the transcription does not declare.
        speaker_gender.append(
            [speaker_id[namespk.index(name_loc.lower())], gender_loc or 'm'])

    # --- One segment per non-empty <Turn> ---
    # Lines are collected first and written at the very end, so a file that
    # fails half-way through parsing leaves no partial entries behind.
    segment_lines = []
    utt2spk_lines = []
    text_lines = []
    spk2gender_lines = []
    gendered = set()  # speaker ids already written to spk2gender
    # NOTE(review): the counter advances only for emitted segments, giving
    # consecutive ids; confirm against the intended numbering.
    count = 1
    for turn in trsdoc.getElementsByTagName('Turn'):
        spkr = normalize('NFKD', turn.attributes['speaker'].value).encode('utf-8', 'ignore')
        startTime = normalize('NFKD', turn.attributes['startTime'].value).encode('utf-8', 'ignore')
        endTime = normalize('NFKD', turn.attributes['endTime'].value).encode('utf-8', 'ignore')
        field_text = u"".join(t.nodeValue for t in turn.childNodes
                              if t.nodeType == t.TEXT_NODE)
        # Collapse every run of whitespace into single spaces, without the
        # stray leading space a manual concatenation loop would add.
        text = " ".join(field_text.encode('utf-8', 'ignore').split())
        # TODO: apply transformation_text() to clean the text.
        seg_id = '%s_seg-%07d' % (meeting, count)
        # Only the first number after 'spk' is used as the speaker number.
        spkr_id = '%s_spk-%03d' % (meeting, int(spkr.split('spk')[1]))
        if not text:
            continue
        segment_lines.append('%s %s %s %s\n' % (seg_id, meeting, startTime, endTime))
        utt2spk_lines.append('%s %s\n' % (seg_id, spkr_id))
        text_lines.append('%s %s\n' % (seg_id, text))
        # spk2gender maps a speaker (not a segment) to a gender, so write one
        # line per speaker, and only for speakers that own a segment.
        if spkr_id not in gendered:
            for known_id, gender in speaker_gender:
                if known_id == spkr:
                    spk2gender_lines.append('%s %s\n' % (spkr_id, gender))
                    gendered.add(spkr_id)
                    break
        count += 1

    # The trailing "|" marks the wav.scp entry as a command whose output is
    # to be read, rather than as a plain file name.
    wav_line = '%s sox %s -t wav -r 16000 -c 1 - |\n' % (meeting, base_path + '.wav')

    outputs = (
        ('segments', segment_lines),
        ('utt2spk', utt2spk_lines),
        ('text', text_lines),
        ('wav.scp', [wav_line]),
        ('spk2gender', spk2gender_lines),
    )
    for out_name, out_lines in outputs:
        # Append mode: one call of this script handles one meeting, and the
        # files accumulate entries across meetings. "with" guarantees every
        # handle, spk2gender included, gets closed.
        with open(os.path.join(outdir, out_name), 'a') as out_file:
            out_file.writelines(out_lines)
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment