Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
LINAGORA
L
LGS
Labs
kaldi-modelgen
Commits
3ad59a3d
Commit
3ad59a3d
authored
Jan 24, 2017
by
Abdelwahab HEBA
Browse files
Normalize Text
parent
1748ca3f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
29 deletions
+18
-29
local/lm/normalize_text.sh
local/lm/normalize_text.sh
+1
-1
local/lm/parseText.py
local/lm/parseText.py
+17
-28
No files found.
local/lm/normalize_text.sh
View file @
3ad59a3d
...
@@ -27,7 +27,7 @@ for b in $(cat $in_list); do
...
@@ -27,7 +27,7 @@ for b in $(cat $in_list); do
echo
"Start processing
$id
at
$(
date
'+%T %F'
)
"
echo
"Start processing
$id
at
$(
date
'+%T %F'
)
"
in_file
=
$b
/
$id
.trs
in_file
=
$b
/
$id
.trs
[[
-f
"
$in_file
"
]]
||
{
echo
"WARNING:
$in_file
does not exists"
;
continue
;
}
[[
-f
"
$in_file
"
]]
||
{
echo
"WARNING:
$in_file
does not exists"
;
continue
;
}
$PYTHON
local
/lm/parseText.py
$in_file
|
\
python3
local
/lm/parseText.py
$in_file
|
\
$PYTHON
local
/lm/pre_filter.py /dev/stdin
$out_root
/corpus_train.txt
$PYTHON
local
/lm/pre_filter.py /dev/stdin
$out_root
/corpus_train.txt
processed
=
$((
processed
+
1
))
processed
=
$((
processed
+
1
))
echo
"Processing of
$id
has finished at
$(
date
'+%T %F'
)
[
$processed
texts ready so far]"
echo
"Processing of
$id
has finished at
$(
date
'+%T %F'
)
[
$processed
texts ready so far]"
...
...
local/lm/parseText.py
View file @
3ad59a3d
#!/usr/bin/env python
#!/usr/bin/env python
# -*- coding:
latin-1
-*-
# -*- coding:
utf-8
-*-
from
xml.etree
import
ElementTree
as
ET
from
xml.etree
import
ElementTree
as
ET
from
unicodedata
import
normalize
from
unicodedata
import
normalize
from
sys
import
argv
from
sys
import
argv
from
num2words
import
num2words
from
num2words
import
num2words
from
unidecode
import
unidecode
import
re
import
re
import
os.path
import
os.path
import
sys
# ( in text
# ( in text
# ) in text
# ) in text
def
transformation_text
(
text
):
def
transformation_text
(
text
):
bool
=
True
bool
=
True
if
"###"
in
text
or
len
(
re
.
findall
(
r
"\[.+\]"
,
text
))
>
0
or
\
if
"###"
in
text
or
len
(
re
.
findall
(
r
"\[.+\]"
,
text
))
>
0
or
\
...
@@ -22,7 +22,7 @@ def transformation_text(text):
...
@@ -22,7 +22,7 @@ def transformation_text(text):
else
:
else
:
# 4x4
# 4x4
# Remove noise sound (BIP) over Name of places and person
# Remove noise sound (BIP) over Name of places and person
#text = re.sub(r"[^ ]+|[^ ]+
|
", "", text.strip())
#text = re.sub(r"
¤
[^ ]+|[^ ]+
¤|¤
", "", text.strip())
if
len
(
re
.
findall
(
r
"\dx\d"
,
text
))
>
0
:
if
len
(
re
.
findall
(
r
"\dx\d"
,
text
))
>
0
:
text
=
re
.
sub
(
r
"x"
,
" "
,
text
)
text
=
re
.
sub
(
r
"x"
,
" "
,
text
)
if
len
(
re
.
findall
(
"\d+h\d+"
,
text
))
>
0
:
if
len
(
re
.
findall
(
"\d+h\d+"
,
text
))
>
0
:
...
@@ -50,12 +50,12 @@ def transformation_text(text):
...
@@ -50,12 +50,12 @@ def transformation_text(text):
text
=
re
.
sub
(
r
'\.'
,
' '
,
text
)
text
=
re
.
sub
(
r
'\.'
,
' '
,
text
)
#text=re.sub(r"{[^{]+}"," ",text.strip())
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
# Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|<\p{L}+[ ]|<\p{L}+$
#<[^\p{L}]|[^\p{L}]>|
#+|
<\p{L}+[ ]|<\p{L}+$
text
=
re
.
sub
(
r
":|\?|/|\!|<|>|#+"
,
""
,
text
)
text
=
re
.
sub
(
r
":|\?|/|\!|<|>|#+"
,
""
,
text
)
# replace silence character with <sil> : OK
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
#text=re.sub(r"(\+)", "<sil>", text)
text
=
re
.
sub
(
r
"(\+)"
,
""
,
text
)
text
=
re
.
sub
(
r
"(\+)"
,
"
!SIL
"
,
text
)
text
=
re
.
sub
(
r
"(///)"
,
""
,
text
)
text
=
re
.
sub
(
r
"(///)"
,
"
!SIL
"
,
text
)
#text=re.sub(r"(///)", "<long-sil>", text)
#text=re.sub(r"(///)", "<long-sil>", text)
if
len
(
re
.
findall
(
r
"/.+/"
,
text
))
>
0
:
if
len
(
re
.
findall
(
r
"/.+/"
,
text
))
>
0
:
#print "AVANT***********"+text
#print "AVANT***********"+text
...
@@ -68,8 +68,8 @@ def transformation_text(text):
...
@@ -68,8 +68,8 @@ def transformation_text(text):
choosen_word
=
choosen_word
.
replace
(
'/'
,
''
)
choosen_word
=
choosen_word
.
replace
(
'/'
,
''
)
text
=
text
.
replace
(
unchoosen_text
,
choosen_word
)
text
=
text
.
replace
(
unchoosen_text
,
choosen_word
)
#print "Apres************"+text
#print "Apres************"+text
# Remove noise sound (BIP) over Name of places and person
# Remove noise sound (BIP) over Name of places and person
text
=
re
.
sub
(
r
"(.+)"
,
''
,
text
)
text
=
re
.
sub
(
r
"(
¤
.+
¤
)"
,
''
,
text
)
# replace unkown syllable
# replace unkown syllable
text
=
re
.
sub
(
r
"\*+"
,
""
,
text
)
text
=
re
.
sub
(
r
"\*+"
,
""
,
text
)
# cut of recording : OK
# cut of recording : OK
...
@@ -80,7 +80,6 @@ def transformation_text(text):
...
@@ -80,7 +80,6 @@ def transformation_text(text):
text
=
re
.
sub
(
r
"[ ]\'"
,
" "
,
text
)
text
=
re
.
sub
(
r
"[ ]\'"
,
" "
,
text
)
text
=
re
.
sub
(
r
"\'"
,
"
\'
"
,
text
)
text
=
re
.
sub
(
r
"\'"
,
"
\'
"
,
text
)
# convert number if exist : OK
# convert number if exist : OK
num_list
=
re
.
findall
(
" \d+| \d+$"
,
text
)
num_list
=
re
.
findall
(
" \d+| \d+$"
,
text
)
if
len
(
num_list
)
>
0
:
if
len
(
num_list
)
>
0
:
#print text
#print text
...
@@ -96,21 +95,19 @@ def transformation_text(text):
...
@@ -96,21 +95,19 @@ def transformation_text(text):
# change bounding | to < and > : OK
# change bounding | to < and > : OK
#balise=set(re.findall(r"\|\w+_?\w+\|",text))
#balise=set(re.findall(r"\|\w+_?\w+\|",text))
#if len(balise)>0:
#if len(balise)>0:
#
print(balise)
#
print(balise)
# for b in balise:
# for b in balise:
# new_balise='<'+b[1:len(b)-1]+'>'
# new_balise='<'+b[1:len(b)-1]+'>'
# text=text.replace(b,new_balise)
# text=text.replace(b,new_balise)
#
print(text)
#
print(text)
# c'est l'essaim ....
# c'est l'essaim ....
text
=
text
.
lower
()
text
=
text
.
lower
()
return
bool
,
text
return
bool
,
text
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
# Inputs
# Inputs
file_trs
=
argv
[
1
]
file_trs
=
argv
[
1
]
basename
=
os
.
path
.
basename
(
file_trs
.
split
(
'.'
)[
0
])
#print(file_trs)
# MetaData File
#print file_trs
file_meta
=
file_trs
.
split
(
'.'
)[
0
]
+
'.xml'
#print file_trs.split('.')[0]
# Read Trans File
# Read Trans File
tree_trs
=
ET
.
parse
(
file_trs
)
tree_trs
=
ET
.
parse
(
file_trs
)
trsdoc
=
tree_trs
.
getroot
()
trsdoc
=
tree_trs
.
getroot
()
...
@@ -132,7 +129,7 @@ if __name__=="__main__":
...
@@ -132,7 +129,7 @@ if __name__=="__main__":
# File text
# File text
# File speaker_gender
# File speaker_gender
if
bool
and
text
!=
""
:
if
bool
and
text
!=
""
:
print
text
print
(
text
)
#for spk_tuple in speaker_gender:
#for spk_tuple in speaker_gender:
# if spk_tuple[0]==spkr:
# if spk_tuple[0]==spkr:
# print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1])
# print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1])
...
@@ -144,24 +141,16 @@ if __name__=="__main__":
...
@@ -144,24 +141,16 @@ if __name__=="__main__":
if
count
>
0
:
if
count
>
0
:
bool
,
text
=
transformation_text
(
text
)
bool
,
text
=
transformation_text
(
text
)
if
bool
and
text
!=
""
:
if
bool
and
text
!=
""
:
print
text
print
(
text
)
text
=
Element
.
tail
.
replace
(
'
\n
'
,
''
)
text
=
Element
.
tail
.
replace
(
'
\n
'
,
''
)
count
=
count
+
1
count
=
count
+
1
elif
Element
.
tag
==
"Comment"
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
elif
Element
.
tag
==
"Comment"
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
elif
Element
.
tag
==
"Event"
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
elif
Element
.
tag
==
"Event"
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
if
Element
.
get
(
'type'
)
==
'noise'
:
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
if
Element
.
get
(
'desc'
)
==
'rire'
:
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
else
:
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
elif
Element
.
get
(
'type'
)
==
'pronounce'
:
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
else
:
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
elif
Element
.
tag
==
"Who"
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
elif
Element
.
tag
==
"Who"
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
if
count
>
0
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
if
count
>
0
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
bool
,
text
=
transformation_text
(
text
)
bool
,
text
=
transformation_text
(
text
)
if
bool
and
text
!=
""
:
if
bool
and
text
!=
""
:
print
text
print
(
text
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment