Commit f8c50354 authored by Abdelwahab HEBA
Browse files

update cleaning step

parent 322dfcfc
......@@ -16,7 +16,6 @@ def transformation_text(text):
text=re.sub("mohamed","mohammed",text)
# character normalization:
text=re.sub("&","et",text)
text=re.sub("\+","plus",text)
text=re.sub("æ","ae",text)
text=re.sub("œ","oe",text)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
......@@ -39,15 +38,15 @@ def transformation_text(text):
text_rep=split_h[0]+' heure '+split_h[1]
text=text.replace(h, text_rep)
text=re.sub(r',',' ',text)
# remove silence character : OK
# remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable
# Liaison non standard remarquable
text=re.sub(r'=','',text)
# Comment Transcriber
# Comment Transcriber
text=re.sub(r'\{.+\}','',text)
text=re.sub(r'\(.+\}','',text)
#print "detecter (///|/|<|>)"
# Remove undecidable variant heared like on (n') en:
# Remove undecidable variant heared like on (n') en:
text=re.sub(r"\(.+\)|\(\)","",text)
#text = re.sub(r"(\+|[*]+|///|/|<|>)", "", text.strip())
#text=re.sub(r"-|_|\."," ",text.strip())
......@@ -55,52 +54,54 @@ def transformation_text(text):
text = re.sub(r'(O.K)', 'ok', text)
# Replace . with ''
text=re.sub(r'\.|,|;','',text)
#text=re.sub(r"{[^{]+}"," ",text.strip())
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text=re.sub(r":|\?|/|\!|#+","",text)
text=re.sub(r"%","pour cent",text)
# replace silence character with <sil> : OK
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
#text=re.sub(r"(\+)", "!SIL", text)
#text=re.sub(r"(///)", "!SIL", text)
#text=re.sub(r"(///)", "<long-sil>", text)
#if len(re.findall(r"/.+/", text)) > 0:
#print "AVANT***********"+text
#print "AVANT***********"+text
# for unchoosen_text in re.findall(r"/.+/", text):
# choose first undecideble word
# choose first undecideble word
# unchoosen_word=unchoosen_text.split(',')
# for choosen_word in unchoosen_word:
# isn't incomprehensible word
# isn't incomprehensible word
# if len(re.findall(r"\*+|\d+", choosen_word))==0:
# choosen_word = choosen_word.replace('/', '')
# text = text.replace(unchoosen_text, choosen_word)
#print "Apres************"+text
# Remove noise sound (BIP) over Name of places and person
#print "Apres************"+text
# Remove noise sound (BIP) over Name of places and person
#text=re.sub(r"(¤.+¤)",'<NOISE>',text)
# replace unkown syllable
# replace unkown syllable
text=re.sub(r"\*+","",text)
# cut of recording : OK
# cut of recording : OK
#text=re.sub(r"\$+","",text)
# remove " character: OK
# remove " character: OK
text = re.sub(r"\"+", "", text)
# t 'avais
text = re.sub(r"[ ]\'", " ", text)
text = re.sub(r"\'", "\' ", text)
text = re.sub(r"[ ]\'", "\'", text)
#text = re.sub(r"\'", "\' ", text)
# for example : A43
num_list = re.findall("\w+?-?\d+", text)
#num_list = re.findall("\w+?-?\d+", text)
num_list = re.findall("[a-zA-Z|à|è|é|ù]+\'*[a-zA-Z|à|é|è|ù]*[-]?\d+""", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'(\w+?)-?(\d+)',s)
split_between_char_int=re.findall(r'([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z]*)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
num_list = re.findall("\d+\w+", text)
#num_list = re.findall("\d+\w+", text)
num_list = re.findall("\d+[a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'(\d+\w+)',s)
split_between_char_int=re.findall(r'(\d+)([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*)',s)
#re.findall(r'\d+\w+',s)
num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
......@@ -115,14 +116,14 @@ def transformation_text(text):
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text = re.sub(r"(^|[ ])"+str(num)+"([ ]|$)"," " + str(num_in_word) + " ",text)
#print text
# replace n succesive spaces with one space. : OK
# replace n succesive spaces with one space. : OK
text=re.sub(r"\s{2,}"," ",text)
text=re.sub(r" $","",text)
text=re.sub("^ ", '', text)
# change bounding | to < and > : OK
# change bounding | to < and > : OK
#balise=set(re.findall(r"\|\w+_?\w+\|",text))
#if len(balise)>0:
#print(balise)
#print(balise)
# for b in balise:
# new_balise='<'+b[1:len(b)-1]+'>'
# text=text.replace(b,new_balise)
......
......@@ -88,7 +88,7 @@ def transformation_text(text):
#text = re.sub(r"\'", "\' ", text)
# for example : A43
#num_list = re.findall("\w+?-?\d+", text)
num_list = re.findall("[a-zA-Z|à|è|é|ù]+\'*[a-zA-Z]*[-]?\d+""", text)
num_list = re.findall("[a-zA-Z|à|è|é|ù]+\'*[a-zA-Z|à|é|è|ù]*[-]?\d+""", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z]*)-?(\d+)',s)
......@@ -97,10 +97,10 @@ def transformation_text(text):
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
#num_list = re.findall("\d+\w+", text)
num_list = re.findall("\d+[a-zA-Z]+\'*[a-zA-Z]*", text)
num_list = re.findall("\d+[a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'(\d+)([a-zA-Z]+\'*[a-zA-Z]*)',s)
split_between_char_int=re.findall(r'(\d+)([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*)',s)
#re.findall(r'\d+\w+',s)
num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment