Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
LINAGORA
L
LGS
Labs
kaldi-modelgen
Commits
c7831348
Commit
c7831348
authored
Oct 09, 2017
by
Abdelwahab HEBA
Browse files
modif cleaning
parent
ab16de1f
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
12 deletions
+18
-12
local/lm/parseESTERSyncV2_text.py
local/lm/parseESTERSyncV2_text.py
+9
-6
local/parseESTERSyncV2.py
local/parseESTERSyncV2.py
+9
-6
No files found.
local/lm/parseESTERSyncV2_text.py
View file @
c7831348
...
...
@@ -18,6 +18,9 @@ def transformation_text(text):
text
=
re
.
sub
(
"&"
,
"et"
,
text
)
text
=
re
.
sub
(
"æ"
,
"ae"
,
text
)
text
=
re
.
sub
(
"œ"
,
"oe"
,
text
)
# ESTER 2 Problem "19ème" ====> "dix-neuvième"
text
=
re
.
sub
(
"19ème"
,
"dix-neuvième"
,
text
)
text
=
re
.
sub
(
"Canal \+"
,
"canal plus"
,
text
)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
# len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$",text)) > 0 \
# or len(re.findall("[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0:
...
...
@@ -37,7 +40,7 @@ def transformation_text(text):
split_h
=
h
.
split
(
'h'
)
text_rep
=
split_h
[
0
]
+
' heure '
+
split_h
[
1
]
text
=
text
.
replace
(
h
,
text_rep
)
text
=
re
.
sub
(
r
','
,
' '
,
text
)
text
=
re
.
sub
(
r
',
|¸
'
,
' '
,
text
)
# remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable
...
...
@@ -57,7 +60,7 @@ def transformation_text(text):
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text
=
re
.
sub
(
r
":|\?|/|\!|#+"
,
""
,
text
)
text
=
re
.
sub
(
r
":|\?|/|\!|#+
|²
"
,
""
,
text
)
text
=
re
.
sub
(
r
"%"
,
"pour cent"
,
text
)
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
...
...
@@ -88,19 +91,19 @@ def transformation_text(text):
text
=
re
.
sub
(
r
"\'"
,
"
\'
"
,
text
)
# for example : A43
#num_list = re.findall("\w+?-?\d+", text)
num_list
=
re
.
findall
(
"[a-zA-Z
|à|è|é|ù
]+
\'
*[a-zA-Z
|à|é|è|ù
]*[-]?\d+""", text)
num_list
=
re
.
findall
(
"[a-zA-Z]+
\'
*[a-zA-Z]*[-]?\d+""", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'([a-zA-Z
|à|é|è|ù
]+
\'
*[a-zA-Z]*)-?(\d+)',s)
split_between_char_int=re.findall(r'([a-zA-Z]+
\'
*[a-zA-Z]*)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"
(
^|
[
])
"+str(s)+"
([
]
|
$
)
","
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
text = re.sub(r"
(
^|
[
])
"+str(s),"
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
#num_list = re.findall("
\
d
+
\
w
+
", text)
num_list = re.findall("
\
d
+
[
a
-
zA
-
Z
|
à
|
é
|
è
|
ù
]
+
\
'*[a-zA-Z
|à|é|è|ù
]*", text)
num_list = re.findall("
\
d
+
[
a
-
zA
-
Z
]
+
\
'*[a-zA-Z]*", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'
(
\
d
+
)([
a
-
zA
-
Z
|
à
|
é
|
è
|
ù
]
+
\
'*[a-zA-Z
|à|é|è|ù
]*)'
,
s
)
split_between_char_int=re.findall(r'
(
\
d
+
)([
a
-
zA
-
Z
]
+
\
'*[a-zA-Z]*)'
,
s
)
#re.findall(r'\d+\w+',s)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
0
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
...
...
local/parseESTERSyncV2.py
View file @
c7831348
...
...
@@ -18,6 +18,9 @@ def transformation_text(text):
text
=
re
.
sub
(
"&"
,
"et"
,
text
)
text
=
re
.
sub
(
"æ"
,
"ae"
,
text
)
text
=
re
.
sub
(
"œ"
,
"oe"
,
text
)
# ESTER 2 Problem "19ème" ====> "dix-neuvième"
text
=
re
.
sub
(
"19ème"
,
"dix-neuvième"
,
text
)
text
=
re
.
sub
(
"Canal \+"
,
"canal plus"
,
text
)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
# len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$",text)) > 0 \
# or len(re.findall("[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0:
...
...
@@ -37,7 +40,7 @@ def transformation_text(text):
split_h
=
h
.
split
(
'h'
)
text_rep
=
split_h
[
0
]
+
' heure '
+
split_h
[
1
]
text
=
text
.
replace
(
h
,
text_rep
)
text
=
re
.
sub
(
r
','
,
' '
,
text
)
text
=
re
.
sub
(
r
',
|¸
'
,
' '
,
text
)
# remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable
...
...
@@ -57,7 +60,7 @@ def transformation_text(text):
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text
=
re
.
sub
(
r
":|\?|/|\!|#+"
,
""
,
text
)
text
=
re
.
sub
(
r
":|\?|/|\!|#+
|²
"
,
""
,
text
)
text
=
re
.
sub
(
r
"%"
,
"pour cent"
,
text
)
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
...
...
@@ -88,19 +91,19 @@ def transformation_text(text):
text
=
re
.
sub
(
r
"\'"
,
"
\'
"
,
text
)
# for example : A43
#num_list = re.findall("\w+?-?\d+", text)
num_list
=
re
.
findall
(
"[a-zA-Z
|à|è|é|ù
]+
\'
*[a-zA-Z
|à|é|è|ù
]*[-]?\d+""", text)
num_list
=
re
.
findall
(
"[a-zA-Z]+
\'
*[a-zA-Z]*[-]?\d+""", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'([a-zA-Z
|à|é|è|ù
]+
\'
*[a-zA-Z]*)-?(\d+)',s)
split_between_char_int=re.findall(r'([a-zA-Z]+
\'
*[a-zA-Z]*)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"
(
^|
[
])
"+str(s)+"
([
]
|
$
)
","
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
text = re.sub(r"
(
^|
[
])
"+str(s),"
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
#num_list = re.findall("
\
d
+
\
w
+
", text)
num_list = re.findall("
\
d
+
[
a
-
zA
-
Z
|
à
|
é
|
è
|
ù
]
+
\
'*[a-zA-Z
|à|é|è|ù
]*", text)
num_list = re.findall("
\
d
+
[
a
-
zA
-
Z
]
+
\
'*[a-zA-Z]*", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'
(
\
d
+
)([
a
-
zA
-
Z
|
à
|
é
|
è
|
ù
]
+
\
'*[a-zA-Z
|à|é|è|ù
]*)'
,
s
)
split_between_char_int=re.findall(r'
(
\
d
+
)([
a
-
zA
-
Z
]
+
\
'*[a-zA-Z]*)'
,
s
)
#re.findall(r'\d+\w+',s)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
0
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment