Commit 322dfcfc authored by Abdelwahab HEBA
Browse files

new clean and number2word conversion

parent 5bcfb89f
......@@ -84,21 +84,24 @@ def transformation_text(text):
# remove " character: OK
text = re.sub(r"\"+", "", text)
# t 'avais
text = re.sub(r"[ ]\'", " ", text)
text = re.sub(r"\'", "\' ", text)
text = re.sub(r"[ ]\'", "\'", text)
#text = re.sub(r"\'", "\' ", text)
# for example : A43
num_list = re.findall("\w+?-?\d+", text)
#num_list = re.findall("\w+?-?\d+", text)
num_list = re.findall("[a-zA-Z|à|è|é|ù]+\'*[a-zA-Z]*[-]?\d+""", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'(\w+?)-?(\d+)',s)
split_between_char_int=re.findall(r'([a-zA-Z|à|é|è|ù]+\'*[a-zA-Z]*)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
num_list = re.findall("\d+\w+", text)
#num_list = re.findall("\d+\w+", text)
num_list = re.findall("\d+[a-zA-Z]+\'*[a-zA-Z]*", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'\d+\w+',s)
split_between_char_int=re.findall(r'(\d+)([a-zA-Z]+\'*[a-zA-Z]*)',s)
#re.findall(r'\d+\w+',s)
num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment