Commit c7831348 authored by Abdelwahab HEBA
Browse files

modif cleaning

parent ab16de1f
...@@ -18,6 +18,9 @@ def transformation_text(text): ...@@ -18,6 +18,9 @@ def transformation_text(text):
text=re.sub("&","et",text) text=re.sub("&","et",text)
text=re.sub("æ","ae",text) text=re.sub("æ","ae",text)
text=re.sub("œ","oe",text) text=re.sub("œ","oe",text)
# ESTER 2 Problem "19ème" ====> "dix-neuvième"
text=re.sub("19ème","dix-neuvième",text)
text=re.sub("Canal \+","canal plus",text)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \ #if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
# len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$",text)) > 0 \ # len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$",text)) > 0 \
# or len(re.findall("[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0: # or len(re.findall("[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0:
...@@ -37,7 +40,7 @@ def transformation_text(text): ...@@ -37,7 +40,7 @@ def transformation_text(text):
split_h=h.split('h') split_h=h.split('h')
text_rep=split_h[0]+' heure '+split_h[1] text_rep=split_h[0]+' heure '+split_h[1]
text=text.replace(h, text_rep) text=text.replace(h, text_rep)
text=re.sub(r',',' ',text) text=re.sub(r',',' ',text)
# remove silence character : OK # remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text) #text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable # Liaison non standard remarquable
...@@ -57,7 +60,7 @@ def transformation_text(text): ...@@ -57,7 +60,7 @@ def transformation_text(text):
#text=re.sub(r"{[^{]+}"," ",text.strip()) #text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK # Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$ #<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text=re.sub(r":|\?|/|\!|#+","",text) text=re.sub(r":|\?|/|\!|#+","",text)
text=re.sub(r"%","pour cent",text) text=re.sub(r"%","pour cent",text)
# replace silence character with <sil> : OK # replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text) #text=re.sub(r"(\+)", "<sil>", text)
...@@ -88,19 +91,19 @@ def transformation_text(text): ...@@ -88,19 +91,19 @@ def transformation_text(text):
text = re.sub(r"\'", "\' ", text) text = re.sub(r"\'", "\' ", text)
# for example : A43 # for example : A43
#num_list = re.findall("\w+?-?\d+", text) #num_list = re.findall("\w+?-?\d+", text)
num_list = re.findall("[a-zA-Z|à|è|é|ù]+\'*[a-zA-Z|à|é|è|ù]*[-]?\d+""", text) num_list = re.findall("[a-zA-Z]+\'*[a-zA-Z]*[-]?\d+""", text)
if len(num_list) > 0: if len(num_list) > 0:
for s in num_list: for s in num_list:
split_between_char_int=re.findall(r'([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z]*)-?(\d+)',s) split_between_char_int=re.findall(r'([a-zA-Z]+\'*[a-zA-Z]*)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr') num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore') #num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text) #text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text) text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
#num_list = re.findall("\d+\w+", text) #num_list = re.findall("\d+\w+", text)
num_list = re.findall("\d+[a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*", text) num_list = re.findall("\d+[a-zA-Z]+\'*[a-zA-Z]*", text)
if len(num_list) > 0: if len(num_list) > 0:
for s in num_list: for s in num_list:
split_between_char_int=re.findall(r'(\d+)([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*)',s) split_between_char_int=re.findall(r'(\d+)([a-zA-Z]+\'*[a-zA-Z]*)',s)
#re.findall(r'\d+\w+',s) #re.findall(r'\d+\w+',s)
num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr') num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore') #num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
......
...@@ -18,6 +18,9 @@ def transformation_text(text): ...@@ -18,6 +18,9 @@ def transformation_text(text):
text=re.sub("&","et",text) text=re.sub("&","et",text)
text=re.sub("æ","ae",text) text=re.sub("æ","ae",text)
text=re.sub("œ","oe",text) text=re.sub("œ","oe",text)
# ESTER 2 Problem "19ème" ====> "dix-neuvième"
text=re.sub("19ème","dix-neuvième",text)
text=re.sub("Canal \+","canal plus",text)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \ #if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
# len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$",text)) > 0 \ # len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$",text)) > 0 \
# or len(re.findall("[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0: # or len(re.findall("[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0:
...@@ -37,7 +40,7 @@ def transformation_text(text): ...@@ -37,7 +40,7 @@ def transformation_text(text):
split_h=h.split('h') split_h=h.split('h')
text_rep=split_h[0]+' heure '+split_h[1] text_rep=split_h[0]+' heure '+split_h[1]
text=text.replace(h, text_rep) text=text.replace(h, text_rep)
text=re.sub(r',',' ',text) text=re.sub(r',',' ',text)
# remove silence character : OK # remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text) #text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable # Liaison non standard remarquable
...@@ -57,7 +60,7 @@ def transformation_text(text): ...@@ -57,7 +60,7 @@ def transformation_text(text):
#text=re.sub(r"{[^{]+}"," ",text.strip()) #text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK # Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$ #<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text=re.sub(r":|\?|/|\!|#+","",text) text=re.sub(r":|\?|/|\!|#+","",text)
text=re.sub(r"%","pour cent",text) text=re.sub(r"%","pour cent",text)
# replace silence character with <sil> : OK # replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text) #text=re.sub(r"(\+)", "<sil>", text)
...@@ -88,19 +91,19 @@ def transformation_text(text): ...@@ -88,19 +91,19 @@ def transformation_text(text):
text = re.sub(r"\'", "\' ", text) text = re.sub(r"\'", "\' ", text)
# for example : A43 # for example : A43
#num_list = re.findall("\w+?-?\d+", text) #num_list = re.findall("\w+?-?\d+", text)
num_list = re.findall("[a-zA-Z|à|è|é|ù]+\'*[a-zA-Z|à|é|è|ù]*[-]?\d+""", text) num_list = re.findall("[a-zA-Z]+\'*[a-zA-Z]*[-]?\d+""", text)
if len(num_list) > 0: if len(num_list) > 0:
for s in num_list: for s in num_list:
split_between_char_int=re.findall(r'([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z]*)-?(\d+)',s) split_between_char_int=re.findall(r'([a-zA-Z]+\'*[a-zA-Z]*)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr') num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore') #num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text) #text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text) text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
#num_list = re.findall("\d+\w+", text) #num_list = re.findall("\d+\w+", text)
num_list = re.findall("\d+[a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*", text) num_list = re.findall("\d+[a-zA-Z]+\'*[a-zA-Z]*", text)
if len(num_list) > 0: if len(num_list) > 0:
for s in num_list: for s in num_list:
split_between_char_int=re.findall(r'(\d+)([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*)',s) split_between_char_int=re.findall(r'(\d+)([a-zA-Z]+\'*[a-zA-Z]*)',s)
#re.findall(r'\d+\w+',s) #re.findall(r'\d+\w+',s)
num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr') num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore') #num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment