Commit 3704eaca authored by Abdelwahab HEBA
Browse files

Use python3 and fix encoding problem with LC_ALL in path.sh

parent 63f86c6d
......@@ -59,7 +59,7 @@ for meeting_dir in $(find $src -mindepth 1 -maxdepth 1 -type d | sort); do
# echo "Unexpected gender: '$reader_gender'"
#exit 1;
#fi
$PYTHON local/parseTcofSync.py $meeting_dir/$meeting.trs $dst >> log.txt 2>&1
python3 local/parseTcofSync.py $meeting_dir/$meeting.trs $dst >> log.txt 2>&1
done
......@@ -73,30 +73,30 @@ cat $segments | sort -k1 > $segments.txt
rm $segments
mv $segments.txt $segments
# wav
cat $wav_scp | sort -k1 > $wav_scp.txt
rm $wav_scp
mv $wav_scp.txt $wav_scp
cat $wav_scp | sort -k1 > $wav_scp.txt
rm $wav_scp
mv $wav_scp.txt $wav_scp
# # spk2gender
cat $spk2gender | sort -k1 > $spk2gender.txt
rm $spk2gender
mv $spk2gender.txt $spk2gender
cat $spk2gender | sort -k1 > $spk2gender.txt
rm $spk2gender
mv $spk2gender.txt $spk2gender
# # utt2spk
cat $utt2spk | sort -k1 > $utt2spk.txt
rm $utt2spk
mv $utt2spk.txt $utt2spk
cat $utt2spk | sort -k1 > $utt2spk.txt
rm $utt2spk
mv $utt2spk.txt $utt2spk
spk2utt=$dst/spk2utt
utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt #|| exit 1
# spk2utt
cat $spk2utt | sort -k1 > $spk2utt.txt
rm $spk2utt
mv $spk2utt.txt $spk2utt
cat $spk2utt | sort -k1 > $spk2utt.txt
rm $spk2utt
mv $spk2utt.txt $spk2utt
ntrans=$(wc -l <$trans)
nutt2spk=$(wc -l <$utt2spk)
! [ "$ntrans" -eq "$nutt2spk" ] && \
echo "Inconsistent #transcripts($ntrans) and # utt2spk($nutt2spk)" #&& exit 1;
echo "Inconsistent #transcripts($ntrans) and # utt2spk($nutt2spk)" #&& exit 1;
utils/data/get_utt2dur.sh $dst 1>&2 #|| exit 1
......
......@@ -5,12 +5,12 @@ from xml.etree import ElementTree as ET
from unicodedata import normalize
from sys import argv
from num2words import num2words
from unidecode import unidecode
import re
import os.path
import sys
# ( in text
# ) in text
def transformation_text(text):
bool=True
#print text
......@@ -34,7 +34,6 @@ def transformation_text(text):
# 4x4
# Remove noise sound (BIP) over Name of places and person
#text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
text=re.sub(r"(¤.+¤)",'<NOISE>',text)
if len(re.findall(r"\dx\d",text))>0:
text=re.sub(r"x"," ",text)
if len(re.findall("\d+h\d+",text))>0:
......@@ -47,7 +46,7 @@ def transformation_text(text):
# remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable
text=re.sub(r'=\w+=','',text)
text=re.sub(r'=','',text)
# Comment Transcriber
text=re.sub(r'\{.+\}','',text)
text=re.sub(r'\(.+\}','',text)
......@@ -62,7 +61,8 @@ def transformation_text(text):
text=re.sub(r'\.',' ',text)
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
text=re.sub(r"\?|/|\!|<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$","",text)
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text=re.sub(r"\?|/|\!|<|>","",text)
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
text=re.sub(r"(\+)", "!SIL", text)
......@@ -79,6 +79,8 @@ def transformation_text(text):
choosen_word = choosen_word.replace('/', '')
text = text.replace(unchoosen_text, choosen_word)
#print "Apres************"+text
# Remove noise sound (BIP) over Name of places and person
text=re.sub(r"(¤.+¤)",'<NOISE>',text)
# replace unknown syllable
text=re.sub(r"\*+","<SPOKEN_NOISE>",text)
# cut of recording : OK
......@@ -96,18 +98,27 @@ def transformation_text(text):
#print "********************************* NUM2WORD"
for num in num_list:
num_in_word = num2words(int(num), lang='fr')
num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text = text.replace(str(num), " " + str(num_in_word) + " ")
#print text
# replace n successive spaces with one space. : OK
text=re.sub(r"\s{2,}"," ",text)
text = re.sub("^ ", '', text)
# change bounding | to < and > : OK
balise=set(re.findall(r"\|\w+_?\w+\|",text))
if len(balise)>0:
print(balise)
for b in balise:
new_balise='<'+b[1:len(b)-1]+'>'
text=text.replace(b,new_balise)
print(text)
# c'est l'essaim ....
text=text.lower()
return bool,text
if __name__=="__main__":
# Inputs
file_trs=argv[1]
#print(file_trs)
#print file_trs
outdir=argv[2]
basename=os.path.basename(file_trs.split('.')[0])
......@@ -128,9 +139,10 @@ if __name__=="__main__":
namespk=[]
for spk in trsdoc.iter('Speaker'):
id_spk=spk.get('id')
name_spk=spk.get('name')
if isinstance(name_spk,unicode):
name_spk=normalize('NFKD', name_spk).encode('ascii', 'ignore')
name_spk=unidecode(spk.get('name'))
#if isinstance(name_spk,str):
#print(type(name_spk))
#name_spk=normalize('NFKD', name_spk).encode('ascii', 'ignore')
speaker_id.append(id_spk.replace(" ",""))
namespk.append(name_spk.lower().replace(" ",""))
#Read MetaData To get Gender of Speaker (Gender and Name)
......@@ -142,8 +154,7 @@ if __name__=="__main__":
for loc in metadoc.iter('locuteur'):
if loc.attrib!=dict({}):
name_loc=loc.get('identifiant')
if isinstance(name_loc, unicode):
name_loc = normalize('NFKD', name_loc).encode('ascii', 'ignore')
name_loc = unidecode(name_loc)
name_loc=name_loc.replace(" ","")
#print name_loc
#print name_loc
......@@ -178,6 +189,8 @@ if __name__=="__main__":
Spk_that_contribute_to_meeting=set([])
start_utt=0
end_utt=0
sourceEncoding = "iso-8859-1"
targetEncoding = "utf-8"
for Element in trsdoc.iter():
if Element.tag=="Turn" and Element.get('speaker') is None:
has_attrib_speaker=False
......@@ -195,11 +208,10 @@ if __name__=="__main__":
# File text
# File speaker_gender
if bool and text!="":
#print seg_id+'\t'+spkr_id+'\t'+startTime+'\t'+endTime+'\t'+text
print >> segments_file, '%s %s %s %s' % (seg_id, basename, start_utt, endTime)
segments_file.write(seg_id+" "+basename+" "+str(start_utt)+" "+str(endTime)+"\n")
start_utt=endTime
print >> utt2spk_file, '%s %s' % (seg_id, spkr_id)
print >> text_file, '%s %s' % (seg_id, text.encode('utf-8'))
utt2spk_file.write(seg_id+" "+spkr_id+"\n")
text_file.write(seg_id+" "+text+"\n")
#for spk_tuple in speaker_gender:
# if spk_tuple[0]==spkr:
# print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1])
......@@ -226,20 +238,11 @@ if __name__=="__main__":
spkr_id=str(basename)+'_spk-%03d' % int(spkr.split('spk')[1])
bool, text = transformation_text(text)
end_utt=Time_start_current_sync
# File wav.scp
# File utt2spk
# File text
# File speaker_gender
if bool and text!="":
#print seg_id+'\t'+spkr_id+'\t'+startTime+'\t'+endTime+'\t'+text
print >> segments_file, '%s %s %s %s' % (seg_id, basename, start_utt, end_utt)
segments_file.write(seg_id+" "+basename+" "+str(start_utt)+" "+str(end_utt)+"\n")
start_utt=Time_start_current_sync
print >> utt2spk_file, '%s %s' % (seg_id, spkr_id)
print >> text_file, '%s %s' % (seg_id, text.encode('utf-8'))
#for spk_tuple in speaker_gender:
# if spk_tuple[0]==spkr:
# print >> spk2gender,'%s %s' % (spkr_id, spk_tuple[1])
# break
utt2spk_file.write(seg_id+" "+spkr_id+"\n")
text_file.write(seg_id+" "+text+"\n")
text=Element.tail.replace('\n', '')
count=count+1
elif Element.tag=="Comment" and has_attrib_speaker and not Element.tail is None:
......@@ -247,13 +250,13 @@ if __name__=="__main__":
elif Element.tag=="Event" and has_attrib_speaker and not Element.tail is None :
if Element.get('type')=='noise':
if Element.get('desc')=='rire':
text=text+" <LAUGH> "+Element.tail.replace('\n', '')
text=text+" |LAUGH| "+Element.tail.replace('\n', '')
else:
text=text+" <NOISE> "+Element.tail.replace('\n', '')
text=text+" |NOISE| "+Element.tail.replace('\n', '')
elif Element.get('type')=='pronounce':
text=text+" <SPOKEN_NOISE> "+Element.tail.replace('\n', '')
text=text+" |SPOKEN_NOISE| "+Element.tail.replace('\n', '')
else:
text=text+" <NOISE> "+Element.tail.replace('\n', '')
text=text+" |NOISE| "+Element.tail.replace('\n', '')
elif Element.tag=="Who" and has_attrib_speaker and not Element.tail is None:
text=text+" "+Element.tail.replace('\n', '')
#else:
......@@ -272,27 +275,15 @@ if __name__=="__main__":
int(spkr.split('spk')[1]), int(Turn_count), int(count))
spkr_id = str(basename) + '_spk-%03d' % int(spkr.split('spk')[1])
bool, text = transformation_text(text)
#print bool
#print text
# File wav.scp
# File text
# File speaker_gender
if bool and text != "":
# print seg_id+'\t'+spkr_id+'\t'+startTime+'\t'+endTime+'\t'+text
print >> segments_file, '%s %s %s %s' % (seg_id, basename, start_utt, endTime)
print >> utt2spk_file, '%s %s' % (seg_id, spkr_id)
print >> text_file, '%s %s' % (seg_id, text.encode('utf-8'))
#for spk_tuple in speaker_gender:
# if spk_tuple[0] == spkr:
# print >> spk2gender, '%s %s' % (seg_id, spk_tuple[1])
# break
#print speaker_gender
segments_file.write(seg_id+" "+basename+" "+str(start_utt)+" "+str(endTime)+"\n")
utt2spk_file.write(seg_id+" "+spkr_id+"\n")
text_file.write(seg_id+" "+text+"\n")
for spk in speaker_gender:
if spk[0] in Spk_that_contribute_to_meeting:
spk_id = str(basename)+'_spk-%03d' % int(spk[0].split('spk')[1])
print >> spk2gender, '%s %s' % (spk_id, spk[1])
print >> wav_scp, '%s sox %s -t wav -r 16000 -c 1 - |' % (basename, os.path.dirname(file_trs) + '/' + basename + '.wav')
# print >> wav_scp, '%s sox %s -t wav -r 16000 -c 1 -' % (file_name, os.path.dirname(file_trs)+'/'+file_name+'.wav')
spk2gender.write(spk_id+" "+spk[1]+"\n")
wav_scp.write(basename+" sox "+os.path.dirname(file_trs) + '/' + basename + '.wav'+" -t wav -r 16000 -c 1 - |\n")
segments_file.close()
utt2spk_file.close()
text_file.close()
......
#!/usr/bin/env bash
export KALDI_ROOT=`pwd`/../../..
export PATH=$PWD/tools/festival/nsw/bin:$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
LANG=en_US.UTF-8
LANGUAGE=en_US.UTF-8
LC_ALL=en_US.UTF-8
LANG=fr_Fr.UTF-8
LANGUAGE=fr_FR.UTF-8
LC_ALL=fr_FR.UTF-8
# we use this both in the (optional) LM training and the G2P-related scripts
# we use this both in data preparation (normalization step) and, optionally, in the LM training and the G2P-related scripts
PYTHON='python2.7'
PYTHON3='python3'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment