Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
LINAGORA
L
LGS
Labs
kaldi-modelgen
Commits
a9dc6777
Commit
a9dc6777
authored
Sep 08, 2017
by
Abdelwahab HEBA
Browse files
Parse \d+\w+ reg
parent
f53a9df0
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
43 additions
and
24 deletions
+43
-24
local/lm/parseESTERSyncV2_text.py
local/lm/parseESTERSyncV2_text.py
+22
-10
local/lm/train_lm.sh
local/lm/train_lm.sh
+1
-1
local/parseESTERSyncV2.py
local/parseESTERSyncV2.py
+20
-13
No files found.
local/lm/parseESTERSyncV2_text.py
View file @
a9dc6777
...
...
@@ -10,6 +10,8 @@ import os.path
def
transformation_text
(
text
):
# character normalization:
text
=
re
.
sub
(
"&"
,
"et"
,
text
)
text
=
re
.
sub
(
"\+"
,
"plus"
,
text
)
text
=
re
.
sub
(
"æ"
,
"ae"
,
text
)
text
=
re
.
sub
(
"œ"
,
"oe"
,
text
)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
...
...
@@ -80,7 +82,25 @@ def transformation_text(text):
# t 'avais
text
=
re
.
sub
(
r
"[ ]\'"
,
" "
,
text
)
text
=
re
.
sub
(
r
"\'"
,
"
\'
"
,
text
)
# convert number if exist : OK
# for example : A43
num_list
=
re
.
findall
(
"\w+?-?\d+"
,
text
)
if
len
(
num_list
)
>
0
:
for
s
in
num_list
:
split_between_char_int
=
re
.
findall
(
r
'(\w+?)-?(\d+)'
,
s
)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
1
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text
=
re
.
sub
(
r
"(^|[ ])"
+
str
(
s
),
" "
+
str
(
split_between_char_int
[
0
][
0
])
+
" "
+
str
(
num_in_word
)
+
" "
,
text
)
num_list
=
re
.
findall
(
"\d+\w+"
,
text
)
if
len
(
num_list
)
>
0
:
for
s
in
num_list
:
split_between_char_int
=
re
.
findall
(
r
'(\d+\w+)'
,
s
)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
0
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text
=
re
.
sub
(
r
"(^|[ ])"
+
str
(
s
),
" "
+
str
(
num_in_word
)
+
" "
+
str
(
split_between_char_int
[
0
][
1
])
+
" "
,
text
)
# convert number if exist : OK
num_list
=
re
.
findall
(
"\d+"
,
text
)
if
len
(
num_list
)
>
0
:
#print text
...
...
@@ -89,15 +109,7 @@ def transformation_text(text):
num_in_word
=
num2words
(
int
(
num
),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text
=
re
.
sub
(
r
"(^|[ ])"
+
str
(
num
)
+
"([ ]|$)"
,
" "
+
str
(
num_in_word
)
+
" "
,
text
)
#print text
# for example : A43
num_list
=
re
.
findall
(
"\w+?-?\d+"
,
text
)
if
len
(
num_list
)
>
0
:
for
s
in
num_list
:
split_between_char_int
=
re
.
findall
(
r
'(\w+?)-?(\d+)'
,
s
)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
1
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text
=
re
.
sub
(
r
"(^|[ ])"
+
str
(
s
)
+
"([ ]|$)"
,
" "
+
str
(
split_between_char_int
[
0
][
0
])
+
" "
+
str
(
num_in_word
)
+
" "
,
text
)
#print text
# replace n succesive spaces with one space. : OK
text
=
re
.
sub
(
r
"\s{2,}"
,
" "
,
text
)
text
=
re
.
sub
(
r
" $"
,
""
,
text
)
...
...
local/lm/train_lm.sh
View file @
a9dc6777
...
...
@@ -48,7 +48,7 @@ if [ "$stage" -le 1 ]; then
split_files
=
$(
eval
"echo
$split_prefix
-{
$(
seq
-s
','
$normjobs
)
}"
)
# Tcof
#find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\
find
$corpus_dir
-mindepth
1
-maxdepth
1
-type
f
-name
"*.trs"
|
sort
|
\
find
$corpus_dir
-mindepth
1
-maxdepth
1
-name
"*.trs"
|
sort
|
\
tee
$tmp_dir
/all_texts.txt |
\
utils/split_scp.pl -
$split_files
echo
"Checking the splits ..."
...
...
local/parseESTERSyncV2.py
View file @
a9dc6777
...
...
@@ -10,6 +10,7 @@ import os.path
def
transformation_text
(
text
):
# character normalization:
text
=
re
.
sub
(
"&"
,
"et"
,
text
)
text
=
re
.
sub
(
"æ"
,
"ae"
,
text
)
text
=
re
.
sub
(
"œ"
,
"oe"
,
text
)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
...
...
@@ -80,7 +81,24 @@ def transformation_text(text):
# t 'avais
text
=
re
.
sub
(
r
"[ ]\'"
,
" "
,
text
)
text
=
re
.
sub
(
r
"\'"
,
"
\'
"
,
text
)
# convert number if exist : OK
# for example : A43
num_list
=
re
.
findall
(
"\w+?-?\d+"
,
text
)
if
len
(
num_list
)
>
0
:
for
s
in
num_list
:
split_between_char_int
=
re
.
findall
(
r
'(\w+?)-?(\d+)'
,
s
)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
1
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text
=
re
.
sub
(
r
"(^|[ ])"
+
str
(
s
),
" "
+
str
(
split_between_char_int
[
0
][
0
])
+
" "
+
str
(
num_in_word
)
+
" "
,
text
)
num_list
=
re
.
findall
(
"\d+\w+"
,
text
)
if
len
(
num_list
)
>
0
:
for
s
in
num_list
:
split_between_char_int
=
re
.
findall
(
r
'\d+\w+'
,
s
)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
0
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
text
=
re
.
sub
(
r
"(^|[ ])"
+
str
(
s
),
" "
+
str
(
num_in_word
)
+
" "
+
str
(
split_between_char_int
[
0
][
1
])
+
" "
,
text
)
# convert number if exist : OK
num_list
=
re
.
findall
(
"\d+"
,
text
)
if
len
(
num_list
)
>
0
:
#print text
...
...
@@ -89,18 +107,7 @@ def transformation_text(text):
num_in_word
=
num2words
(
int
(
num
),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text
=
re
.
sub
(
r
"(^|[ ])"
+
str
(
num
)
+
"([ ]|$)"
,
" "
+
str
(
num_in_word
)
+
" "
,
text
)
#print text
# for example : A43
num_list
=
re
.
findall
(
"\w+?-?\d+"
,
text
)
if
len
(
num_list
)
>
0
:
print
(
text
)
print
(
num_list
)
for
s
in
num_list
:
split_between_char_int
=
re
.
findall
(
r
'(\w+?)-?(\d+)'
,
s
)
print
(
split_between_char_int
)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
1
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text
=
re
.
sub
(
r
"(^|[ ])"
+
str
(
s
)
+
"([ ]|$)"
,
" "
+
str
(
split_between_char_int
[
0
][
0
])
+
" "
+
str
(
num_in_word
)
+
" "
,
text
)
#print text
# replace n succesive spaces with one space. : OK
text
=
re
.
sub
(
r
"\s{2,}"
,
" "
,
text
)
text
=
re
.
sub
(
r
" $"
,
""
,
text
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment