Commit c7831348 by Abdelwahab HEBA

Modify text-cleaning rules in transformation_text

parent ab16de1f
 ... ... @@ -18,6 +18,9 @@ def transformation_text(text): text=re.sub("&","et",text) text=re.sub("æ","ae",text) text=re.sub("œ","oe",text) # ESTER 2 Problem "19ème" ====> "dix-neuvième" text=re.sub("19ème","dix-neuvième",text) text=re.sub("Canal \+","canal plus",text) #if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \ # len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-\$",text)) > 0 \ # or len(re.findall("[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0: ... ... @@ -37,7 +40,7 @@ def transformation_text(text): split_h=h.split('h') text_rep=split_h[0]+' heure '+split_h[1] text=text.replace(h, text_rep) text=re.sub(r',',' ',text) text=re.sub(r',|¸',' ',text) # remove silence character : OK #text=re.sub(r"(/.+/","remplacer par la 1er",text) # Liaison non standard remarquable ... ... @@ -57,7 +60,7 @@ def transformation_text(text): #text=re.sub(r"{[^{]+}"," ",text.strip()) # Remove ? ! < > : OK #<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+\$ text=re.sub(r":|\?|/|\!|#+","",text) text=re.sub(r":|\?|/|\!|#+|²","",text) text=re.sub(r"%","pour cent",text) # replace silence character with : OK #text=re.sub(r"(\+)", "", text) ... ... 
@@ -88,19 +91,19 @@ def transformation_text(text): text = re.sub(r"\'", "\' ", text) # for example : A43 #num_list = re.findall("\w+?-?\d+", text) num_list = re.findall("[a-zA-Z|à|è|é|ù]+\'*[a-zA-Z|à|é|è|ù]*[-]?\d+""", text) num_list = re.findall("[a-zA-Z]+\'*[a-zA-Z]*[-]?\d+""", text) if len(num_list) > 0: for s in num_list: split_between_char_int=re.findall(r'([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z]*)-?(\d+)',s) split_between_char_int=re.findall(r'([a-zA-Z]+\'*[a-zA-Z]*)-?(\d+)',s) num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr') #num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore') #text = re.sub(r"(^|[ ])"+str(s)+"([ ]|\$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text) text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text) #num_list = re.findall("\d+\w+", text) num_list = re.findall("\d+[a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*", text) num_list = re.findall("\d+[a-zA-Z]+\'*[a-zA-Z]*", text) if len(num_list) > 0: for s in num_list: split_between_char_int=re.findall(r'(\d+)([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*)',s) split_between_char_int=re.findall(r'(\d+)([a-zA-Z]+\'*[a-zA-Z]*)',s) #re.findall(r'\d+\w+',s) num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr') #num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore') ... ...
 ... ... @@ -18,6 +18,9 @@ def transformation_text(text): text=re.sub("&","et",text) text=re.sub("æ","ae",text) text=re.sub("œ","oe",text) # ESTER 2 Problem "19ème" ====> "dix-neuvième" text=re.sub("19ème","dix-neuvième",text) text=re.sub("Canal \+","canal plus",text) #if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \ # len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-\$",text)) > 0 \ # or len(re.findall("[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0: ... ... @@ -37,7 +40,7 @@ def transformation_text(text): split_h=h.split('h') text_rep=split_h[0]+' heure '+split_h[1] text=text.replace(h, text_rep) text=re.sub(r',',' ',text) text=re.sub(r',|¸',' ',text) # remove silence character : OK #text=re.sub(r"(/.+/","remplacer par la 1er",text) # Liaison non standard remarquable ... ... @@ -57,7 +60,7 @@ def transformation_text(text): #text=re.sub(r"{[^{]+}"," ",text.strip()) # Remove ? ! < > : OK #<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+\$ text=re.sub(r":|\?|/|\!|#+","",text) text=re.sub(r":|\?|/|\!|#+|²","",text) text=re.sub(r"%","pour cent",text) # replace silence character with : OK #text=re.sub(r"(\+)", "", text) ... ... 
@@ -88,19 +91,19 @@ def transformation_text(text): text = re.sub(r"\'", "\' ", text) # for example : A43 #num_list = re.findall("\w+?-?\d+", text) num_list = re.findall("[a-zA-Z|à|è|é|ù]+\'*[a-zA-Z|à|é|è|ù]*[-]?\d+""", text) num_list = re.findall("[a-zA-Z]+\'*[a-zA-Z]*[-]?\d+""", text) if len(num_list) > 0: for s in num_list: split_between_char_int=re.findall(r'([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z]*)-?(\d+)',s) split_between_char_int=re.findall(r'([a-zA-Z]+\'*[a-zA-Z]*)-?(\d+)',s) num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr') #num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore') #text = re.sub(r"(^|[ ])"+str(s)+"([ ]|\$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text) text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text) #num_list = re.findall("\d+\w+", text) num_list = re.findall("\d+[a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*", text) num_list = re.findall("\d+[a-zA-Z]+\'*[a-zA-Z]*", text) if len(num_list) > 0: for s in num_list: split_between_char_int=re.findall(r'(\d+)([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z|à|é|è|ù]*)',s) split_between_char_int=re.findall(r'(\d+)([a-zA-Z]+\'*[a-zA-Z]*)',s) #re.findall(r'\d+\w+',s) num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr') #num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore') ... ...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!