Commit a9dc6777 authored by Abdelwahab HEBA's avatar Abdelwahab HEBA
Browse files

Parse \d+\w+ reg

parent f53a9df0
......@@ -10,6 +10,8 @@ import os.path
def transformation_text(text):
# character normalization:
text=re.sub("&","et",text)
text=re.sub("\+","plus",text)
text=re.sub("æ","ae",text)
text=re.sub("œ","oe",text)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
......@@ -80,7 +82,25 @@ def transformation_text(text):
# t 'avais
text = re.sub(r"[ ]\'", " ", text)
text = re.sub(r"\'", "\' ", text)
# convert number if exist : OK
# for example : A43
num_list = re.findall("\w+?-?\d+", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'(\w+?)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
num_list = re.findall("\d+\w+", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'(\d+\w+)',s)
num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text = re.sub(r"(^|[ ])"+str(s)," "+ str(num_in_word)+ " " + str(split_between_char_int[0][1]) + " ",text)
# convert number if exist : OK
num_list = re.findall("\d+", text)
if len(num_list) > 0:
#print text
......@@ -89,15 +109,7 @@ def transformation_text(text):
num_in_word = num2words(int(num), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text = re.sub(r"(^|[ ])"+str(num)+"([ ]|$)"," " + str(num_in_word) + " ",text)
#print text
# for example : A43
num_list = re.findall("\w+?-?\d+", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'(\w+?)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
#print text
# replace n successive spaces with one space. : OK
text=re.sub(r"\s{2,}"," ",text)
text=re.sub(r" $","",text)
......
......@@ -48,7 +48,7 @@ if [ "$stage" -le 1 ]; then
split_files=$(eval "echo $split_prefix-{$(seq -s',' $normjobs)}")
# Tcof
#find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\
find $corpus_dir -mindepth 1 -maxdepth 1 -type f -name "*.trs" | sort |\
find $corpus_dir -mindepth 1 -maxdepth 1 -name "*.trs" | sort |\
tee $tmp_dir/all_texts.txt |\
utils/split_scp.pl - $split_files
echo "Checking the splits ..."
......
......@@ -10,6 +10,7 @@ import os.path
def transformation_text(text):
# character normalization:
text=re.sub("&","et",text)
text=re.sub("æ","ae",text)
text=re.sub("œ","oe",text)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
......@@ -80,7 +81,24 @@ def transformation_text(text):
# t 'avais
text = re.sub(r"[ ]\'", " ", text)
text = re.sub(r"\'", "\' ", text)
# convert number if exist : OK
# for example : A43
num_list = re.findall("\w+?-?\d+", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'(\w+?)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text = re.sub(r"(^|[ ])"+str(s)," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
num_list = re.findall("\d+\w+", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'\d+\w+',s)
num_in_word = num2words(int(split_between_char_int[0][0]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text = re.sub(r"(^|[ ])"+str(s)," "+ str(num_in_word)+ " " + str(split_between_char_int[0][1]) + " ",text)
# convert number if exist : OK
num_list = re.findall("\d+", text)
if len(num_list) > 0:
#print text
......@@ -89,18 +107,7 @@ def transformation_text(text):
num_in_word = num2words(int(num), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text = re.sub(r"(^|[ ])"+str(num)+"([ ]|$)"," " + str(num_in_word) + " ",text)
#print text
# for example : A43
num_list = re.findall("\w+?-?\d+", text)
if len(num_list) > 0:
print(text)
print(num_list)
for s in num_list:
split_between_char_int=re.findall(r'(\w+?)-?(\d+)',s)
print(split_between_char_int)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
#print text
# replace n successive spaces with one space. : OK
text=re.sub(r"\s{2,}"," ",text)
text=re.sub(r" $","",text)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment