Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
LINAGORA
L
LGS
Labs
kaldi-modelgen
Commits
f8c50354
Commit
f8c50354
authored
Oct 05, 2017
by
Abdelwahab HEBA
Browse files
update cleaning step
parent
322dfcfc
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
29 additions
and
28 deletions
+29
-28
local/lm/parseESTERSyncV2_text.py
local/lm/parseESTERSyncV2_text.py
+26
-25
local/parseESTERSyncV2.py
local/parseESTERSyncV2.py
+3
-3
No files found.
local/lm/parseESTERSyncV2_text.py
View file @
f8c50354
...
...
@@ -16,7 +16,6 @@ def transformation_text(text):
text
=
re
.
sub
(
"mohamed"
,
"mohammed"
,
text
)
# character normalization:
text
=
re
.
sub
(
"&"
,
"et"
,
text
)
text
=
re
.
sub
(
"\+"
,
"plus"
,
text
)
text
=
re
.
sub
(
"æ"
,
"ae"
,
text
)
text
=
re
.
sub
(
"œ"
,
"oe"
,
text
)
#if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
...
...
@@ -39,15 +38,15 @@ def transformation_text(text):
text_rep
=
split_h
[
0
]
+
' heure '
+
split_h
[
1
]
text
=
text
.
replace
(
h
,
text_rep
)
text
=
re
.
sub
(
r
','
,
' '
,
text
)
# remove silence character : OK
# remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable
# Liaison non standard remarquable
text
=
re
.
sub
(
r
'='
,
''
,
text
)
# Comment Transcriber
# Comment Transcriber
text
=
re
.
sub
(
r
'\{.+\}'
,
''
,
text
)
text
=
re
.
sub
(
r
'\(.+\}'
,
''
,
text
)
#print "detecter (///|/|<|>)"
# Remove undecidable variant heard like on (n') en:
# Remove undecidable variant heard like on (n') en:
text
=
re
.
sub
(
r
"\(.+\)|\(\)"
,
""
,
text
)
#text = re.sub(r"(\+|[*]+|///|/|<|>)", "", text.strip())
#text=re.sub(r"-|_|\."," ",text.strip())
...
...
@@ -55,52 +54,54 @@ def transformation_text(text):
text
=
re
.
sub
(
r
'(O.K)'
,
'ok'
,
text
)
# Replace . with ''
text
=
re
.
sub
(
r
'\.|,|;'
,
''
,
text
)
#text=re.sub(r"{[^{]+}"," ",text.strip())
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text
=
re
.
sub
(
r
":|\?|/|\!|#+"
,
""
,
text
)
text
=
re
.
sub
(
r
"%"
,
"pour cent"
,
text
)
# replace silence character with <sil> : OK
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
#text=re.sub(r"(\+)", "!SIL", text)
#text=re.sub(r"(///)", "!SIL", text)
#text=re.sub(r"(///)", "<long-sil>", text)
#if len(re.findall(r"/.+/", text)) > 0:
#print "AVANT***********"+text
#print "AVANT***********"+text
# for unchoosen_text in re.findall(r"/.+/", text):
# choose first undecidable word
# choose first undecidable word
# unchoosen_word=unchoosen_text.split(',')
# for choosen_word in unchoosen_word:
# isn't incomprehensible word
# isn't incomprehensible word
# if len(re.findall(r"\*+|\d+", choosen_word))==0:
# choosen_word = choosen_word.replace('/', '')
# text = text.replace(unchoosen_text, choosen_word)
#print "Apres************"+text
# Remove noise sound (BIP) over Name of places and person
#print "Apres************"+text
# Remove noise sound (BIP) over Name of places and person
#text=re.sub(r"(¤.+¤)",'<NOISE>',text)
# replace unknown syllable
# replace unknown syllable
text
=
re
.
sub
(
r
"\*+"
,
""
,
text
)
# cut of recording : OK
# cut of recording : OK
#text=re.sub(r"\$+","",text)
# remove " character: OK
# remove " character: OK
text
=
re
.
sub
(
r
"\"+"
,
""
,
text
)
# t 'avais
text
=
re
.
sub
(
r
"[ ]\'"
,
" "
,
text
)
text
=
re
.
sub
(
r
"\'"
,
"
\'
"
,
text
)
text
=
re
.
sub
(
r
"[ ]\'"
,
"
\'
"
,
text
)
#text = re.sub(r"\'", "\' ", text)
# for example : A43
num_list
=
re
.
findall
(
"\w+?-?\d+"
,
text
)
#num_list = re.findall("\w+?-?\d+", text)
num_list
=
re
.
findall
(
"[a-zA-Z|à|è|é|ù]+
\'
*[a-zA-Z|à|é|è|ù]*[-]?\d+""", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int
=
re
.
findall
(
r
'(
\w+?
)-?(\d+)'
,
s
)
split_between_char_int=re.findall(r'(
[a-zA-Z|à|é|è|ù]+
\'
*[a-zA-Z]*
)-?(\d+)',s)
num_in_word = num2words(int(split_between_char_int[0][1]), lang='fr')
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"
(
^|
[
])
"+str(s)+"
([
]
|
$
)
","
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
text = re.sub(r"
(
^|
[
])
"+str(s),"
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
num_list
=
re
.
findall
(
"\d+\w+"
,
text
)
#num_list = re.findall("
\
d
+
\
w
+
", text)
num_list = re.findall("
\
d
+
[
a
-
zA
-
Z
|
à
|
é
|
è
|
ù
]
+
\
'*[a-zA-Z|à|é|è|ù]*", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int
=
re
.
findall
(
r
'(\d+\w+)'
,
s
)
split_between_char_int=re.findall(r'
(
\
d
+
)([
a
-
zA
-
Z
|
à
|
é
|
è
|
ù
]
+
\
'*[a-zA-Z|à|é|è|ù]*)'
,
s
)
#re.findall(r'\d+\w+',s)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
0
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
#text = re.sub(r"(^|[ ])"+str(s)+"([ ]|$)"," " + str(split_between_char_int[0][0]) +" "+ str(num_in_word) + " ",text)
...
...
@@ -115,14 +116,14 @@ def transformation_text(text):
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text
=
re
.
sub
(
r
"(^|[ ])"
+
str
(
num
)
+
"([ ]|$)"
,
" "
+
str
(
num_in_word
)
+
" "
,
text
)
#print text
# replace n successive spaces with one space. : OK
# replace n successive spaces with one space. : OK
text
=
re
.
sub
(
r
"\s{2,}"
,
" "
,
text
)
text
=
re
.
sub
(
r
" $"
,
""
,
text
)
text
=
re
.
sub
(
"^ "
,
''
,
text
)
# change bounding | to < and > : OK
# change bounding | to < and > : OK
#balise=set(re.findall(r"\|\w+_?\w+\|",text))
#if len(balise)>0:
#print(balise)
#print(balise)
# for b in balise:
# new_balise='<'+b[1:len(b)-1]+'>'
# text=text.replace(b,new_balise)
...
...
local/parseESTERSyncV2.py
View file @
f8c50354
...
...
@@ -88,7 +88,7 @@ def transformation_text(text):
#text = re.sub(r"\'", "\' ", text)
# for example : A43
#num_list = re.findall("\w+?-?\d+", text)
num_list
=
re
.
findall
(
"[a-zA-Z|à|è|é|ù]+
\'
*[a-zA-Z]*[-]?\d+""", text)
num_list
=
re
.
findall
(
"[a-zA-Z|à|è|é|ù]+
\'
*[a-zA-Z
|à|é|è|ù
]*[-]?\d+""", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'([a-zA-Z|à|é|è|ù]+
\'
*[a-zA-Z]*)-?(\d+)',s)
...
...
@@ -97,10 +97,10 @@ def transformation_text(text):
#text = re.sub(r"
(
^|
[
])
"+str(s)+"
([
]
|
$
)
","
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
text = re.sub(r"
(
^|
[
])
"+str(s),"
" + str(split_between_char_int[0][0]) +"
"+ str(num_in_word) + "
",text)
#num_list = re.findall("
\
d
+
\
w
+
", text)
num_list = re.findall("
\
d
+
[
a
-
zA
-
Z
]
+
\
'*[a-zA-Z]*", text)
num_list = re.findall("
\
d
+
[
a
-
zA
-
Z
|
à
|
é
|
è
|
ù
]
+
\
'*[a-zA-Z
|à|é|è|ù
]*", text)
if len(num_list) > 0:
for s in num_list:
split_between_char_int=re.findall(r'
(
\
d
+
)([
a
-
zA
-
Z
]
+
\
'*[a-zA-Z]*)'
,
s
)
split_between_char_int=re.findall(r'
(
\
d
+
)([
a
-
zA
-
Z
|
à
|
é
|
è
|
ù
]
+
\
'*[a-zA-Z
|à|é|è|ù
]*)'
,
s
)
#re.findall(r'\d+\w+',s)
num_in_word
=
num2words
(
int
(
split_between_char_int
[
0
][
0
]),
lang
=
'fr'
)
#num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment