Commit fce0ce72 authored by Abdelwahab HEBA
Browse files

enhance text normalization

parent 3704eaca
......@@ -13,17 +13,6 @@ import os.path
def transformation_text(text):
bool=True
#print text
#or "(" in text
# Remove Line when : ### | $$$ | Particular Pronunciation | Amorce | BIP | Syllabe incompréhensible
#len(re.findall(r"\w+-[^\w+]|\w+-$",text))
#if "###" in text or len(re.findall(r"\[.+\]",text))>0 or len(re.findall(r"[\w|�|�|�|�|�|�|�|�|�|�|�|]+-[^\w|�|�|�|�|�|�|�|�|�|�|�|]+|[\w|�|�|�|�|�|�|�|�|�|�|�|]+-$",text))>0 or len(re.findall(" -\w+",text))>0 or len(re.findall(r"\�",text))>0 or len(re.findall(r"\*+",text))>0 or len(re.findall(r"/.+/",text))>0:
#print text
#print len(re.findall(r"[\w|à|â|ç|è|é|ê|î|ô|ù|û|ü|]+-[^\w|à|â|ç|è|é|ê|î|ô|ù|û|ü|]|[\w|à|â|ç|è|é|ê|î|ô|ù|û|ü|]+-$",text))
##len(re.findall(r"[\w|à|â|ç|è|é|ê|î|ô|ù|û|ü|]+-[^\w|à|â|ç|è|é|ê|î|ô|ù|û|ü|]|[\w|à|â|ç|è|é|ê|î|ô|ù|û|ü|]+-$",text)) > 0\
#print len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$",text))
#len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$",text)) > 0 \
#len(re.findall(r"[\w|à|â|ç|è|é|ê|î|ô|ù|û|ü|]+-[^\w|à|â|ç|è|é|ê|î|ô|ù|û|ü|]|[\w|à|â|ç|è|é|ê|î|ô|ù|û|ü|]+-$",text)) > 0\
if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$",text)) > 0 \
or len(re.findall("[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0:
......@@ -32,11 +21,10 @@ def transformation_text(text):
bool=False
else:
# 4x4
# Remove noise sound (BIP) over Name of places and person
#text = re.sub(r"[^ ]+|[^ ]+|", "", text.strip())
if len(re.findall(r"\dx\d",text))>0:
text=re.sub(r"x"," ",text)
# remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable
if len(re.findall("\d+h\d+",text))>0:
heures=re.findall("\d+h\d+",text)
for h in heures:
......@@ -44,7 +32,10 @@ def transformation_text(text):
text_rep=split_h[0]+' heure '+split_h[1]
text=text.replace(h, text_rep)
text=re.sub(r',',' ',text)
text=re.sub(r'=\w+=','',text)
# remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable
text=re.sub(r'=','',text)
# Comment Transcriber
text=re.sub(r'\{.+\}','',text)
text=re.sub(r'\(.+\}','',text)
......@@ -59,10 +50,8 @@ def transformation_text(text):
text=re.sub(r'\.',' ',text)
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
text=re.sub(r"\?|/|\!|<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$","",text)
# Remove noise sound (BIP) over Name of places and person
#text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
text=re.sub(r"(¤.+¤)",'',text)
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text=re.sub(r"\?|/|\!|<|>","",text)
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
text=re.sub(r"(\+)", "", text)
......@@ -79,7 +68,9 @@ def transformation_text(text):
choosen_word = choosen_word.replace('/', '')
text = text.replace(unchoosen_text, choosen_word)
#print "Apres************"+text
# replace unknown syllable
# Remove noise sound (BIP) over Name of places and person
text=re.sub(r"(.+)",'',text)
# replace unknown syllable
text=re.sub(r"\*+","",text)
# cut of recording : OK
text=re.sub(r"\$+","",text)
......@@ -96,12 +87,20 @@ def transformation_text(text):
#print "********************************* NUM2WORD"
for num in num_list:
num_in_word = num2words(int(num), lang='fr')
num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text = text.replace(str(num), " " + str(num_in_word) + " ")
#print text
# replace n successive spaces with one space. : OK
text=re.sub(r"\s{2,}"," ",text)
text = re.sub("^ ", '', text)
# change bounding | to < and > : OK
#balise=set(re.findall(r"\|\w+_?\w+\|",text))
#if len(balise)>0:
# print(balise)
# for b in balise:
# new_balise='<'+b[1:len(b)-1]+'>'
# text=text.replace(b,new_balise)
# print(text)
# c'est l'essaim ....
text=text.lower()
return bool,text
......@@ -133,7 +132,7 @@ if __name__=="__main__":
# File text
# File speaker_gender
if bool and text!="":
print text.encode('utf-8')
print text
#for spk_tuple in speaker_gender:
# if spk_tuple[0]==spkr:
# print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1])
......@@ -145,7 +144,7 @@ if __name__=="__main__":
if count>0:
bool, text = transformation_text(text)
if bool and text!="":
print text.encode('utf-8')
print text
text=Element.tail.replace('\n', '')
count=count+1
elif Element.tag=="Comment" and has_attrib_speaker and not Element.tail is None:
......@@ -165,4 +164,4 @@ if __name__=="__main__":
if count > 0 and has_attrib_speaker and not Element.tail is None:
bool, text = transformation_text(text)
if bool and text != "":
print text.encode('utf-8')
print text
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment