Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
LINAGORA
L
LGS
Labs
kaldi-modelgen
Commits
322dfcfc
Commit
322dfcfc
authored
Oct 05, 2017
by
Abdelwahab HEBA
Browse files
new clean and number2word conversion
parent
5bcfb89f
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
9 additions
and
6 deletions
+9
-6
local/parseESTERSyncV2.py
local/parseESTERSyncV2.py
+9
-6
No files found.
local/parseESTERSyncV2.py
View file @
322dfcfc
...
...
@@ -84,21 +84,24 @@ def transformation_text(text):
# remove " character: OK
text
=
re
.
sub
(
r
"\"+"
,
""
,
text
)
# t 'avais
text
=
re
.
sub
(
r
"[ ]\'"
,
"
"
,
text
)
text
=
re
.
sub
(
r
"\'"
,
"
\'
"
,
text
)
text
=
re
.
sub
(
r
"[ ]\'"
,
"
\'
"
,
text
)
#
text = re.sub(r"\'", "\' ", text)
# for example : A43
num_list
=
re
.
findall
(
"\w+?-?\d+"
,
text
)
#num_list = re.findall("\w+?-?\d+", text)
num_list
=
re
.
findall
(
"[a-zA-Z|à|è|é|ù]+
\'
*[a-zA-Z]*[-]?\d+""", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int
=
re
.
findall
(
r
'(
\w+?
)-?(\d+)'
,
s
)
split_between_char_int=re.findall(r'(
[a-zA-Z|à|é|è|ù]+
\'
*[a-zA-Z]*
)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"
(
^|
[
])
"+str(s)+"
([
]
|
$
)
","
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
text = re.sub(r"
(
^|
[
])
"+str(s),"
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
num_list
=
re
.
findall
(
"\d+\w+"
,
text
)
#num_list = re.findall("
\
d
+
\
w
+
", text)
num_list = re.findall("
\
d
+
[
a
-
zA
-
Z
]
+
\
'*[a-zA-Z]*", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int
=
re
.
findall
(
r
'\d+\w+'
,
s
)
split_between_char_int=re.findall(r'
(
\
d
+
)([
a
-
zA
-
Z
]
+
\
'*[a-zA-Z]*)'
,
s
)
#re.findall(r'\d+\w+',s)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
0
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment