Commit dc5d4274 authored by Abdelwahab HEBA's avatar Abdelwahab HEBA
Browse files

Fix #3 #4: improve normalization & G2P generation for each word that is not in our vocabulary

parent 296570b5
......@@ -56929,6 +56929,7 @@ lÄnder ll aa nn dd ei
lÄnder(2) ll aa nn dd eu rr
là ll aa
là-bas ll aa bb aa
là-b ll aa bb
là-dedans ll aa dd ee dd an
là-dessus ll aa dd ee ss uu
là-haut ll aa au
......@@ -100350,6 +100351,7 @@ voiliers(2) vv ww aa ll yy ei zz
voilure vv ww aa ll uu rr
voilure(2) vv ww aa ll uu rr ee
voilà vv ww aa ll aa
voilà-t-il vv ww aa ll aa tt ii ll
voilé vv ww aa ll ei
voilée vv ww aa ll ei
voilées vv ww aa ll ei
......@@ -50,7 +50,7 @@ for trs_file in $(find $src -type f -name "*.trs" | sort); do
#echo $meeting_dir
#echo $dst
#python3 local/parse_AudioDB.py --data-prep --input-dir $meeting_dir --output-dir $dst >> log.txt 2>&1
python3 local/parseESTERSync.py $trs_file $dst >> log.txt 2>&1
python3 local/parseESTERSyncV2.py $trs_file $dst >> log.txt 2>&1
done
# Sort all files
......
......@@ -6,7 +6,7 @@
# Auto-generates pronunciations using Sequitur G2P
. path.sh || exit 1
export LC_ALL=C
[ -z "$PYTHON" ] && PYTHON=python2.7
if [ $# -ne 3 ]; then
......@@ -38,14 +38,19 @@ g2p_exceptions="HH HH" # more such entries can be added, separated by "\n"
[ ! -d $sequitur_path ] && echo "Can't find '$sequitur_path' - please fix your Sequitur installation" && exit 1
[ ! -f $sequitur_model ] && echo "Can't find the Sequitur model file: $sequitur_model" && exit 1
PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
--model=$sequitur_model --apply $vocab \
>${out_lexicon}.tmp || exit 1
#PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
# --model=$sequitur_model --apply $vocab \
# >${out_lexicon}.tmp || exit 1
#awk 'NR==FNR{p[$1]=$0; next;} {if ($1 in p) print p[$1]; else print}' \
# <(echo -e $g2p_exceptions) ${out_lexicon}.tmp >$out_lexicon || exit 1
awk 'NR==FNR{p[$1]=$0; next;} {if ($1 in p) print p[$1]; else print}' \
<(echo -e $g2p_exceptions) ${out_lexicon}.tmp >$out_lexicon || exit 1
PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
--model=$sequitur_model --apply $vocab > ${out_lexicon}.tmp || exit 1
awk '{$1="";print $0}' ${out_lexicon}.tmp > ${out_lexicon}.tmp1
paste -d "" $vocab ${out_lexicon}.tmp1 > $out_lexicon
rm ${out_lexicon}.tmp
rm ${out_lexicon}.tmp1
exit 0
......@@ -23,12 +23,16 @@ mkdir -p $out_root
processed=0
for b in $(cat $in_list); do
id=$(basename $b)
echo "Start processing $id at $(date '+%T %F')"
in_file=$b/$id.trs
# Tcof
#in_file=$b/$id.trs
#id=$(basename $b)
#echo "Start processing $id at $(date '+%T %F')"
# ESTER
in_file=$b
[[ -f "$in_file" ]] || { echo "WARNING: $in_file does not exists"; continue; }
#python3 local/parse_AudioDB.py $b
python3 local/lm/parseText.py $in_file |\
#python3 local/lm/parseText.py $in_file |\
python3 local/lm/parseESTERSyncV2_text.py $in_file |\
$PYTHON local/lm/pre_filter.py /dev/stdin $out_root/$id.txt
#$PYTHON local/lm/pre_filter.py /dev/stdin $out_root/corpus_train.txt
processed=$((processed + 1))
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from xml.etree import ElementTree as ET
from sys import argv
from num2words import num2words
from unidecode import unidecode
import re
import os.path
def transformation_text(text):
    """Normalize one French transcript fragment for ASR corpus preparation.

    Pipeline: ligature expansion, removal of Transcriber markup ({...},
    (...), '*' unknown syllables, stray quotes), punctuation stripping,
    hour notation ("20h30" -> "20 heure 30"), percent expansion,
    digit-to-word conversion via num2words, clitic splitting on
    apostrophes ("c'est" -> "c' est"), whitespace collapsing and
    lowercasing.  Returns the cleaned text.
    """
    # Expand ligatures so downstream G2P sees plain letter sequences.
    text = re.sub("æ", "ae", text)
    text = re.sub("œ", "oe", text)
    # Strip caret / underscore transcription artefacts.
    text = re.sub(r"\^+", "", text)
    text = re.sub(r"\_+", "", text)
    # "4x4" -> "4 4".
    # NOTE(review): this removes EVERY "x" in the string once a \dx\d
    # pattern is present, not only the one between digits -- confirm.
    if len(re.findall(r"\dx\d", text)) > 0:
        text = re.sub(r"x", " ", text)
    # Hour notation: "20h30" -> "20 heure 30".
    if len(re.findall(r"\d+h\d+", text)) > 0:
        heures = re.findall(r"\d+h\d+", text)
        for h in heures:
            split_h = h.split('h')
            text_rep = split_h[0] + ' heure ' + split_h[1]
            text = text.replace(h, text_rep)
    text = re.sub(r',', ' ', text)
    # Non-standard liaison marker.
    text = re.sub(r'=', '', text)
    # Transcriber comments {...}.  The second pattern matches "(...}" --
    # likely a historical typo for r"\(.+\)" but preserved for parity.
    text = re.sub(r'\{.+\}', '', text)
    text = re.sub(r'\(.+\}', '', text)
    # Undecidable variants heard, e.g. "on (n') en".
    text = re.sub(r"\(.+\)|\(\)", "", text)
    # BUGFIX: dots were unescaped before ("(O.K.)" matched O<any>K<any>,
    # so e.g. "OXKY" became "ok").  Match the literal spelling only.
    text = re.sub(r'O\.K\.', 'ok', text)
    text = re.sub(r'O\.K', 'ok', text)
    # Drop sentence punctuation.
    text = re.sub(r'\.|,|;', '', text)
    text = re.sub(r":|\?|/|\!|#+", "", text)
    text = re.sub(r"%", "pour cent", text)
    # Unknown-syllable stars.
    text = re.sub(r"\*+", "", text)
    # Stray double quotes.
    text = re.sub(r"\"+", "", text)
    # "t 'avais" -> "t avais", then split clitics: "c'est" -> "c' est".
    text = re.sub(r"[ ]\'", " ", text)
    text = re.sub(r"\'", "\' ", text)
    # Spell out bare numbers ("12" -> "douze").
    num_list = re.findall(r"\d+", text)
    if len(num_list) > 0:
        for num in num_list:
            num_in_word = num2words(int(num), lang='fr')
            text = re.sub(r"(^|[ ])" + str(num) + "([ ]|$)",
                          " " + str(num_in_word) + " ", text)
    # Alphanumeric codes such as "A43" -> "A quarante-trois".
    num_list = re.findall(r"\w+?-?\d+", text)
    if len(num_list) > 0:
        for s in num_list:
            split_between_char_int = re.findall(r'(\w+?)-?(\d+)', s)
            num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
            text = re.sub(r"(^|[ ])" + str(s) + "([ ]|$)",
                          " " + str(split_between_char_int[0][0]) + " " +
                          str(num_in_word) + " ", text)
    # Collapse runs of whitespace and trim both ends.
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r" $", "", text)
    text = re.sub("^ ", '', text)
    text = text.lower()
    # Drop dangling hyphens (" -mot", "mot-").
    text = re.sub("[ ]-|-$", "", text)
    return text
if __name__=="__main__":
    # Command line: argv[1] is a Transcriber .trs file.  In this
    # text-extraction variant the Kaldi file writers are commented out and
    # normalized transcript lines are printed to stdout instead.
    # Inputs
    file_trs=argv[1]
    #outdir=argv[2]
    basename=os.path.basename(file_trs.split('.')[0])
    # Read Trans File
    tree_trs = ET.parse(file_trs)
    trsdoc= tree_trs.getroot()
    #============================ Read MetaData =======================================
    #============================ Topic section (ID,DESC) =======================================
    for topic in trsdoc.iter('Topic'):
        topic_id=unidecode(topic.get('id'))
        topic_desc=unidecode(topic.get('desc'))
        #print(str(basename)+" "+topic_id+" "+topic_desc+"\n")
        #topic_file.write(str(basename)+" "+topic_id+" "+topic_desc+"\n")
    #============================ Speaker section (ID,GENDER) ===================================
    speaker_id=[]
    #namespk=[]
    speaker_gender=[]
    for spk in trsdoc.iter('Speaker'):
        id_spk=spk.get('id')
        #name_spk=unidecode(spk.get('name'))
        # NOTE(review): findall('type') looks for CHILD ELEMENTS named
        # 'type', but in Transcriber files the gender is the 'type'
        # ATTRIBUTE of <Speaker>, so the else branch may never run and
        # every speaker would default to "m" -- confirm against the data.
        if spk.findall('type')==[]:
            gender="m"
        else:
            gender=unidecode(spk.get('type'))
            if gender =="female":
                gender="f"
            else:
                gender="m"
        #if isinstance(name_spk,str):
        #print(type(name_spk))
        #name_spk=normalize('NFKD', name_spk).encode('ascii', 'ignore')
        speaker_id.append(id_spk.replace(" ",""))
        speaker_gender.append([id_spk.replace(" ",""),gender.lower()])
        #namespk.append(name_spk.lower().replace(" ",""))
    #============================ Catch Transcription Segment and Topic Section ==================
    text=""
    Turn_count=0
    count=0
    has_attrib_speaker=False
    # set for uniq add
    Spk_that_contribute_to_meeting=set([])
    start_utt=0
    end_utt=0
    #Not used
    section_start_time=0
    section_end_time=0
    section_type=""
    section_topic=""
    nb_section=0
    spkr="spk1"
    # Walk every element in document order; `text` accumulates the tail
    # text of Sync/Event/Comment nodes and is flushed (normalized then
    # printed) whenever a Section / Turn / Sync boundary is reached.
    for Element in trsdoc.iter():
        #OK validation
        #print("Print lekbirr "+str(Element.tail))
        #print("Print lekbirr "+str(Element.tail))
        if Element.tag=="Section":
            # Flush the pending utterance of the previous section.
            if nb_section>0:
                text = transformation_text(text)
                # File wav.scp
                # File utt2spk
                # File text
                # File speaker_gender
                if text!=""and has_attrib_speaker:
                    Spk_that_contribute_to_meeting.add(spkr)
                    #print("SAVED BY SECTION "+seg_id+" "+text)
                    seg_id = str(basename) + '_%s-%03d_Section%02d_Topic-%s_Turn-%03d_seg-%07d' % (
                    str(re.sub('\d+','',spkr)),int(re.sub('[a-zA-Z]','',spkr)),int(nb_section),str(section_topic), int(Turn_count), int(count))
                    spkr_id = str(basename)+'_%s-%03d' % (str(re.sub('\d+','',spkr)),int(re.sub('[a-zA-Z]','',spkr)))
                    #segments_file.write(seg_id+" "+basename+" "+str(start_utt)+" "+str(section_end_time)+"\n")
                    start_utt=section_end_time
                    #utt2spk_file.write(seg_id+" "+spkr_id+"\n")
                    #text_file.write(seg_id+" "+text+"\n")
                    print(text)
                    text=""
            # New section
            section_start_time=Element.get('startTime')
            section_end_time=Element.get('endTime')
            section_type=unidecode(Element.get('type'))
            #if Element.findall('topic')==[]:
            #    section_topic="None"
            #else:
            section_topic=unidecode(str(Element.get('topic')))
            if section_topic=="":
                section_topic="None"
            Turn_count=0
            count=0
            nb_section+=1
        elif Element.tag=="Turn":
            # if the turn is the spoken turn , not musical segment or noise
            #print(str(Element.tag))
            #print(Element.attrib)
            #print(Element.get("speaker"))
            if not "speaker" in Element.attrib:
                #print("no speaker attribute")
                has_attrib_speaker=False
            else:
                if Element.get('speaker')=="":
                    #print("empty speaker attribute: "+Element.get('speaker'))
                    has_attrib_speaker=False
                else:
                    #print(Element.get('speaker'))
                    # If the latest Utterance of previous Speaker is the latest one of his Turn speech
                    if Turn_count>0:
                        seg_id = str(basename) + '_%s-%03d_Section%02d_Topic-%s_Turn-%03d_seg-%07d' % (
                        str(re.sub('\d+','',spkr)),int(re.sub('[a-zA-Z]','',spkr)),int(nb_section),str(section_topic), int(Turn_count), int(count))
                        spkr_id = str(basename)+'_%s-%03d' % (str(re.sub('\d+','',spkr)),int(re.sub('[a-zA-Z]','',spkr)))
                        text = transformation_text(text)
                        # File wav.scp
                        # File utt2spk
                        # File text
                        # File speaker_gender
                        # NOTE(review): `bool` is the Python BUILTIN here
                        # (always truthy), a leftover from a commented-out
                        # validation flag -- the condition reduces to
                        # text != "".
                        if bool and text!="":
                            #print("SAVED BY TURN "+seg_id+" "+text)
                            Spk_that_contribute_to_meeting.add(spkr)
                            #segments_file.write(seg_id+" "+basename+" "+str(start_utt)+" "+str(endTime)+"\n")
                            start_utt=endTime
                            #utt2spk_file.write(seg_id+" "+spkr_id+"\n")
                            #text_file.write(seg_id+" "+text+"\n")
                            print(text)
                            text=""
                        count = 0
                    # Get id_spkr
                    #print(Element.get('speaker'))
                    spkr=Element.get('speaker')
                    #print file_trs
                    has_attrib_speaker=True
                    spkr=spkr.split()[0]
                    #print spkr
                    # Get StartSegment
                    startTime = Element.get('startTime')
                    # Get EndSegment
                    endTime = Element.get('endTime')
                    # count sync for computing start and end utterance
                    Turn_count = Turn_count+1
        elif has_attrib_speaker:
            #print("entering has_attrib_speaker branch, element.tail not null")
            #print(str(Element.tag))
            #print(str(Element.tail))
            if Element.tag=="Sync" or Element.tag=="Background":
                #print("entering Sync+Background: "+ text +"| next is "+ Element.tail)
                #print(Element.tag+" "+Element.tail)
                Time_start_current_sync=Element.get('time')
                #if count>0:
                #print("save after Turn")
                #print(str(basename))
                #print(str(section_topic))
                #print(str(spkr))
                #print(str(int(Turn_count)))
                #print(str(int(count)))
                #print text
                ### Save Files For Kaldi ###
                seg_id = str(basename) + '_%s-%03d_Section%02d_Topic-%s_Turn-%03d_seg-%07d' % (
                str(re.sub('\d+','',spkr)),int(re.sub('[a-zA-Z]','',spkr)),int(nb_section),str(section_topic), int(Turn_count), int(count))
                spkr_id = str(basename)+'_%s-%03d' % (str(re.sub('\d+','',spkr)),int(re.sub('[a-zA-Z]','',spkr)))
                text = transformation_text(text)
                #print("Sync or Background: wizzz "+text)
                end_utt=Time_start_current_sync
                if text!="":
                    #print("SAVED BY SYNC or BACKGROUND "+seg_id+" "+text)
                    Spk_that_contribute_to_meeting.add(spkr)
                    #segments_file.write(seg_id+" "+basename+" "+str(start_utt)+" "+str(end_utt)+"\n")
                    #utt2spk_file.write(seg_id+" "+spkr_id+"\n")
                    #text_file.write(seg_id+" "+text+"\n")
                    print(text)
                    text=""
                    count+=1
                start_utt=Time_start_current_sync
                # Begin accumulating the text that follows this Sync mark.
                text=Element.tail.replace('\n', '')
                #print(count)
                #count+=1
            elif Element.tag=="Comment" or Element.tag=="Background":
                # NOTE(review): "Background" is already caught by the Sync
                # branch above, so that half of this test is unreachable.
                text=text+" "+Element.tail.replace('\n', '')
            elif Element.tag=="Event":
                # Keep the speech text that follows noise/event marks;
                # the event itself is dropped, only its tail is kept.
                # if Element.get('type')=='noise':
                # ===== Breathing noises
                if Element.get('desc')=='r' or Element.get('desc')=='i' or Element.get('desc')=='e' or Element.get('desc')=='n':
                    text=text+" "+Element.tail.replace('\n', '')
                elif Element.get('desc')=='pf':
                    text=text+" "+Element.tail.replace('\n', '')
                # ===== Mouth noises
                elif Element.get('desc')=='tx':
                    text=text+" "+Element.tail.replace('\n', '')
                elif Element.get('desc')=='bg':
                    text=text+" "+Element.tail.replace('\n', '')
                elif Element.get('desc')=='bb':
                    text=text+" "+Element.tail.replace('\n', '')
                elif Element.get('desc')=='rire':
                    text=text+" "+Element.tail.replace('\n', '')
                elif Element.get('desc')=='sif':
                    text=text+" "+Element.tail.replace('\n', '')
                elif Element.get('desc')=='ch' or Element.get('desc')=='ch-':
                    text=text+" "+Element.tail.replace('\n', '')
                # ====== Noises external to the speech act
                elif Element.get('desc')=='b' or Element.get('desc')=='pap' or Element.get('desc')=='mic' or Element.get('desc')=='conv':
                    text=text+" "+Element.tail.replace('\n', '')
                elif Element.get('desc')=='top':
                    text=text+" "+Element.tail.replace('\n', '')
                # "pi" intelligible, "pif" inaudible -- see Transcriber doc
                #elif Element.get('type')=='pronounce':
                #    text=text+" "+Element.tail.replace('\n', '')
                # desc="EN"
                #if Element.get('type')=='language':
                #    text=text+" "+Element.tail.replace('\n', '')
            #if Element.tag=="Who":
            else:
                text=text+" "+Element.tail.replace('\n', '')
    # After the walk, flush the very last pending utterance (uses the
    # loop variable `Element` left over from the final iteration).
    if count > 0 and has_attrib_speaker and not Element.tail is None:
        #print text
        ### Save Files For Kaldi ###
        seg_id = str(basename) + '_%s-%03d_Section%02d_Topic-%s_Turn-%03d_seg-%07d' % (
        str(re.sub('\d+','',spkr)),int(re.sub('[a-zA-Z]','',spkr)),int(nb_section),str(section_topic), int(Turn_count), int(count))
        #seg_id = str(basename) + '_spk-%03d_Turn-%03d_seg-%07d' % (
        #int(spkr.split('spk')[1]), int(Turn_count), int(count))
        spkr_id = str(basename)+'_%s-%03d' % (str(re.sub('\d+','',spkr)),int(re.sub('[a-zA-Z]','',spkr)))
        text = transformation_text(text)
        # NOTE(review): same builtin-`bool` condition as above.
        if bool and text != "":
            #print("Last SAVE"+seg_id+" "+text)
            #segments_file.write(seg_id+" "+basename+" "+str(start_utt)+" "+str(endTime)+"\n")
            #utt2spk_file.write(seg_id+" "+spkr_id+"\n")
            #text_file.write(seg_id+" "+text+"\n")
            print(text)
    # Gender file edition
    #print(Spk_that_contribute_to_meeting)
    #print(len(Spk_that_contribute_to_meeting))
    #print(speaker_gender)
    #print(len(speaker_gender))
    #for spk in speaker_gender:
    #    if spk[0] in Spk_that_contribute_to_meeting:
    #        spk_id = str(basename)+'_%s-%03d' % (str(re.sub('\d+','',spk[0])),int(re.sub('[a-zA-Z]','',spk[0])))
    #        spk2gender.write(spk_id+" "+spk[1]+"\n")
    #wav_scp.write(basename+" sox "+os.path.dirname(file_trs) + '/' + basename + '.wav'+" -t wav -r 16000 -c 1 - |\n")
    #segments_file.close()
    #utt2spk_file.close()
    #text_file.close()
    #wav_scp.close()
    #topic_file.close()
    #spk2gender.close()
......@@ -11,7 +11,7 @@
stage=1
# how many words we want in the LM's vocabulary
vocab_size=200000
vocab_size=400000
# LM pruning threshold for the 'small' trigram model
prune_thresh_small=0.0000003
......@@ -46,7 +46,9 @@ if [ "$stage" -le 1 ]; then
mkdir -p $tmp_dir
echo "Splitting into $normjobs parts, to allow for parallel processing ..."
split_files=$(eval "echo $split_prefix-{$(seq -s',' $normjobs)}")
find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\
# Tcof
#find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\
find $corpus_dir -mindepth 1 -maxdepth 1 -type f -name "*.trs" | sort |\
tee $tmp_dir/all_texts.txt |\
utils/split_scp.pl - $split_files
echo "Checking the splits ..."
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from xml.etree import ElementTree as ET
from sys import argv
from num2words import num2words
from unidecode import unidecode
import re
import os.path
def transformation_text(text):
    """Normalize one French transcript fragment for ASR corpus preparation.

    Pipeline: ligature expansion, removal of Transcriber markup ({...},
    (...), '*' unknown syllables, stray quotes), punctuation stripping,
    hour notation ("20h30" -> "20 heure 30"), percent expansion,
    digit-to-word conversion via num2words, clitic splitting on
    apostrophes ("c'est" -> "c' est"), whitespace collapsing and
    lowercasing.  Returns the cleaned text.

    Fixes: literal "O.K." matching (dots were unescaped regex wildcards)
    and removal of leftover debug print() calls that polluted stdout.
    """
    # Expand ligatures so downstream G2P sees plain letter sequences.
    text = re.sub("æ", "ae", text)
    text = re.sub("œ", "oe", text)
    # Strip caret / underscore transcription artefacts.
    text = re.sub(r"\^+", "", text)
    text = re.sub(r"\_+", "", text)
    # "4x4" -> "4 4".
    # NOTE(review): this removes EVERY "x" in the string once a \dx\d
    # pattern is present, not only the one between digits -- confirm.
    if len(re.findall(r"\dx\d", text)) > 0:
        text = re.sub(r"x", " ", text)
    # Hour notation: "20h30" -> "20 heure 30".
    if len(re.findall(r"\d+h\d+", text)) > 0:
        heures = re.findall(r"\d+h\d+", text)
        for h in heures:
            split_h = h.split('h')
            text_rep = split_h[0] + ' heure ' + split_h[1]
            text = text.replace(h, text_rep)
    text = re.sub(r',', ' ', text)
    # Non-standard liaison marker.
    text = re.sub(r'=', '', text)
    # Transcriber comments {...}.  The second pattern matches "(...}" --
    # likely a historical typo for r"\(.+\)" but preserved for parity.
    text = re.sub(r'\{.+\}', '', text)
    text = re.sub(r'\(.+\}', '', text)
    # Undecidable variants heard, e.g. "on (n') en".
    text = re.sub(r"\(.+\)|\(\)", "", text)
    # BUGFIX: dots were unescaped before ("(O.K.)" matched O<any>K<any>,
    # so e.g. "OXKY" became "ok").  Match the literal spelling only.
    text = re.sub(r'O\.K\.', 'ok', text)
    text = re.sub(r'O\.K', 'ok', text)
    # Drop sentence punctuation.
    text = re.sub(r'\.|,|;', '', text)
    text = re.sub(r":|\?|/|\!|#+", "", text)
    text = re.sub(r"%", "pour cent", text)
    # Unknown-syllable stars.
    text = re.sub(r"\*+", "", text)
    # Stray double quotes.
    text = re.sub(r"\"+", "", text)
    # "t 'avais" -> "t avais", then split clitics: "c'est" -> "c' est".
    text = re.sub(r"[ ]\'", " ", text)
    text = re.sub(r"\'", "\' ", text)
    # Spell out bare numbers ("12" -> "douze").
    num_list = re.findall(r"\d+", text)
    if len(num_list) > 0:
        for num in num_list:
            num_in_word = num2words(int(num), lang='fr')
            text = re.sub(r"(^|[ ])" + str(num) + "([ ]|$)",
                          " " + str(num_in_word) + " ", text)
    # Alphanumeric codes such as "A43" -> "A quarante-trois".
    # (Debug prints of text / num_list / the split tuple removed here.)
    num_list = re.findall(r"\w+?-?\d+", text)
    if len(num_list) > 0:
        for s in num_list:
            split_between_char_int = re.findall(r'(\w+?)-?(\d+)', s)
            num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
            text = re.sub(r"(^|[ ])" + str(s) + "([ ]|$)",
                          " " + str(split_between_char_int[0][0]) + " " +
                          str(num_in_word) + " ", text)
    # Collapse runs of whitespace and trim both ends.
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r" $", "", text)
    text = re.sub("^ ", '', text)
    text = text.lower()
    # Drop dangling hyphens (" -mot", "mot-").
    text = re.sub("[ ]-|-$", "", text)
    return text
if __name__=="__main__":
# Inputs
file_trs=argv[1]
print(file_trs)
outdir=argv[2]
basename=os.path.basename(file_trs.split('.')[0])
#print file_trs.split('.')[0]
# Output File needed for kaldi input
segments_file = open(outdir + '/segments', 'a')
utt2spk_file = open(outdir + '/utt2spk', 'a')
text_file = open(outdir + '/text', 'a')
wav_scp = open(outdir + '/wav.scp', 'a')
spk2gender= open(outdir + '/spk2gender', 'a')
topic_file = open(outdir+'/topic','a')
# Read Trans File
tree_trs = ET.parse(file_trs)
trsdoc= tree_trs.getroot()
#============================ Read MetaData =======================================
#============================ Topic section (ID,DESC) =======================================
for topic in trsdoc.iter('Topic'):
topic_id=unidecode(topic.get('id'))