Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
LINAGORA
L
LGS
Labs
kaldi-modelgen
Commits
3704eaca
Commit
3704eaca
authored
Jan 20, 2017
by
Abdelwahab HEBA
Browse files
Use python3 and fix encoding problem with LC_ALL in path.sh
parent
63f86c6d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
58 additions
and
66 deletions
+58
-66
local/data_prep.sh
local/data_prep.sh
+14
-14
local/parseTcofSync.py
local/parseTcofSync.py
+39
-48
path.sh
path.sh
+5
-4
No files found.
local/data_prep.sh
View file @
3704eaca
...
...
@@ -59,7 +59,7 @@ for meeting_dir in $(find $src -mindepth 1 -maxdepth 1 -type d | sort); do
# echo "Unexpected gender: '$reader_gender'"
#exit 1;
#fi
$PYTHON
local
/parseTcofSync.py
$meeting_dir
/
$meeting
.trs
$dst
>>
log.txt 2>&1
python3
local
/parseTcofSync.py
$meeting_dir
/
$meeting
.trs
$dst
>>
log.txt 2>&1
done
...
...
@@ -73,30 +73,30 @@ cat $segments | sort -k1 > $segments.txt
rm
$segments
mv
$segments
.txt
$segments
# wav
cat
$wav_scp
|
sort
-k1
>
$wav_scp
.txt
rm
$wav_scp
mv
$wav_scp
.txt
$wav_scp
cat
$wav_scp
|
sort
-k1
>
$wav_scp
.txt
rm
$wav_scp
mv
$wav_scp
.txt
$wav_scp
# # spk2gender
cat
$spk2gender
|
sort
-k1
>
$spk2gender
.txt
rm
$spk2gender
mv
$spk2gender
.txt
$spk2gender
cat
$spk2gender
|
sort
-k1
>
$spk2gender
.txt
rm
$spk2gender
mv
$spk2gender
.txt
$spk2gender
# # utt2spk
cat
$utt2spk
|
sort
-k1
>
$utt2spk
.txt
rm
$utt2spk
mv
$utt2spk
.txt
$utt2spk
cat
$utt2spk
|
sort
-k1
>
$utt2spk
.txt
rm
$utt2spk
mv
$utt2spk
.txt
$utt2spk
spk2utt
=
$dst
/spk2utt
utils/utt2spk_to_spk2utt.pl <
$utt2spk
>
$spk2utt
#|| exit 1
# spk2utt
cat
$spk2utt
|
sort
-k1
>
$spk2utt
.txt
rm
$spk2utt
mv
$spk2utt
.txt
$spk2utt
cat
$spk2utt
|
sort
-k1
>
$spk2utt
.txt
rm
$spk2utt
mv
$spk2utt
.txt
$spk2utt
ntrans
=
$(
wc
-l
<
$trans
)
nutt2spk
=
$(
wc
-l
<
$utt2spk
)
!
[
"
$ntrans
"
-eq
"
$nutt2spk
"
]
&&
\
echo
"Inconsistent #transcripts(
$ntrans
) and # utt2spk(
$nutt2spk
)"
#&& exit 1;
echo
"Inconsistent #transcripts(
$ntrans
) and # utt2spk(
$nutt2spk
)"
#&& exit 1;
utils/data/get_utt2dur.sh
$dst
1>&2
#|| exit 1
...
...
local/parseTcofSync.py
100644 → 100755
View file @
3704eaca
...
...
@@ -5,12 +5,12 @@ from xml.etree import ElementTree as ET
from
unicodedata
import
normalize
from
sys
import
argv
from
num2words
import
num2words
from
unidecode
import
unidecode
import
re
import
os.path
import
sys
# ( in text
# ) in text
def
transformation_text
(
text
):
bool
=
True
#print text
...
...
@@ -34,7 +34,6 @@ def transformation_text(text):
# 4x4
# Remove noise sound (BIP) over Name of places and person
#text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
text
=
re
.
sub
(
r
"(¤.+¤)"
,
'<NOISE>'
,
text
)
if
len
(
re
.
findall
(
r
"\dx\d"
,
text
))
>
0
:
text
=
re
.
sub
(
r
"x"
,
" "
,
text
)
if
len
(
re
.
findall
(
"\d+h\d+"
,
text
))
>
0
:
...
...
@@ -47,7 +46,7 @@ def transformation_text(text):
# remove silence character : OK
#text=re.sub(r"(/.+/","remplacer par la 1er",text)
# Liaison non standard remarquable
text
=
re
.
sub
(
r
'=
\w+=
'
,
''
,
text
)
text
=
re
.
sub
(
r
'='
,
''
,
text
)
# Comment Transcriber
text
=
re
.
sub
(
r
'\{.+\}'
,
''
,
text
)
text
=
re
.
sub
(
r
'\(.+\}'
,
''
,
text
)
...
...
@@ -62,7 +61,8 @@ def transformation_text(text):
text
=
re
.
sub
(
r
'\.'
,
' '
,
text
)
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
text
=
re
.
sub
(
r
"\?|/|\!|<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$"
,
""
,
text
)
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text
=
re
.
sub
(
r
"\?|/|\!|<|>"
,
""
,
text
)
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
text
=
re
.
sub
(
r
"(\+)"
,
"!SIL"
,
text
)
...
...
@@ -79,6 +79,8 @@ def transformation_text(text):
choosen_word
=
choosen_word
.
replace
(
'/'
,
''
)
text
=
text
.
replace
(
unchoosen_text
,
choosen_word
)
#print "Apres************"+text
# Remove noise sound (BIP) over Name of places and person
text
=
re
.
sub
(
r
"(¤.+¤)"
,
'<NOISE>'
,
text
)
# replace unkown syllable
text
=
re
.
sub
(
r
"\*+"
,
"<SPOKEN_NOISE>"
,
text
)
# cut of recording : OK
...
...
@@ -96,18 +98,27 @@ def transformation_text(text):
#print "********************************* NUM2WORD"
for
num
in
num_list
:
num_in_word
=
num2words
(
int
(
num
),
lang
=
'fr'
)
num_in_word
=
normalize
(
'NFKD'
,
num_in_word
).
encode
(
'ascii'
,
'ignore'
)
#
num_in_word=normalize('NFKD', num_in_word).encode('ascii', 'ignore')
text
=
text
.
replace
(
str
(
num
),
" "
+
str
(
num_in_word
)
+
" "
)
#print text
# replace n succesive spaces with one space. : OK
text
=
re
.
sub
(
r
"\s{2,}"
,
" "
,
text
)
text
=
re
.
sub
(
"^ "
,
''
,
text
)
# change bounding | to < and > : OK
balise
=
set
(
re
.
findall
(
r
"\|\w+_?\w+\|"
,
text
))
if
len
(
balise
)
>
0
:
print
(
balise
)
for
b
in
balise
:
new_balise
=
'<'
+
b
[
1
:
len
(
b
)
-
1
]
+
'>'
text
=
text
.
replace
(
b
,
new_balise
)
print
(
text
)
# c'est l'essaim ....
text
=
text
.
lower
()
return
bool
,
text
if
__name__
==
"__main__"
:
# Inputs
file_trs
=
argv
[
1
]
#print(file_trs)
#print file_trs
outdir
=
argv
[
2
]
basename
=
os
.
path
.
basename
(
file_trs
.
split
(
'.'
)[
0
])
...
...
@@ -128,9 +139,10 @@ if __name__=="__main__":
namespk
=
[]
for
spk
in
trsdoc
.
iter
(
'Speaker'
):
id_spk
=
spk
.
get
(
'id'
)
name_spk
=
spk
.
get
(
'name'
)
if
isinstance
(
name_spk
,
unicode
):
name_spk
=
normalize
(
'NFKD'
,
name_spk
).
encode
(
'ascii'
,
'ignore'
)
name_spk
=
unidecode
(
spk
.
get
(
'name'
))
#if isinstance(name_spk,str):
#print(type(name_spk))
#name_spk=normalize('NFKD', name_spk).encode('ascii', 'ignore')
speaker_id
.
append
(
id_spk
.
replace
(
" "
,
""
))
namespk
.
append
(
name_spk
.
lower
().
replace
(
" "
,
""
))
#Read MetaData To get Gender of Speaker (Gender and Name)
...
...
@@ -142,8 +154,7 @@ if __name__=="__main__":
for
loc
in
metadoc
.
iter
(
'locuteur'
):
if
loc
.
attrib
!=
dict
({}):
name_loc
=
loc
.
get
(
'identifiant'
)
if
isinstance
(
name_loc
,
unicode
):
name_loc
=
normalize
(
'NFKD'
,
name_loc
).
encode
(
'ascii'
,
'ignore'
)
name_loc
=
unidecode
(
name_loc
)
name_loc
=
name_loc
.
replace
(
" "
,
""
)
#print name_loc
#print name_loc
...
...
@@ -178,6 +189,8 @@ if __name__=="__main__":
Spk_that_contribute_to_meeting
=
set
([])
start_utt
=
0
end_utt
=
0
sourceEncoding
=
"iso-8859-1"
targetEncoding
=
"utf-8"
for
Element
in
trsdoc
.
iter
():
if
Element
.
tag
==
"Turn"
and
Element
.
get
(
'speaker'
)
is
None
:
has_attrib_speaker
=
False
...
...
@@ -195,11 +208,10 @@ if __name__=="__main__":
# File text
# File speaker_gender
if
bool
and
text
!=
""
:
#print seg_id+'\t'+spkr_id+'\t'+startTime+'\t'+endTime+'\t'+text
print
>>
segments_file
,
'%s %s %s %s'
%
(
seg_id
,
basename
,
start_utt
,
endTime
)
segments_file
.
write
(
seg_id
+
" "
+
basename
+
" "
+
str
(
start_utt
)
+
" "
+
str
(
endTime
)
+
"
\n
"
)
start_utt
=
endTime
print
>>
utt2spk_file
,
'%s %s'
%
(
seg_id
,
spkr_id
)
print
>>
text_file
,
'%s %s'
%
(
seg_id
,
text
.
encode
(
'utf-8'
)
)
utt2spk_file
.
write
(
seg_id
+
" "
+
spkr_id
+
"
\n
"
)
text_file
.
write
(
seg_id
+
" "
+
text
+
"
\n
"
)
#for spk_tuple in speaker_gender:
# if spk_tuple[0]==spkr:
# print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1])
...
...
@@ -226,20 +238,11 @@ if __name__=="__main__":
spkr_id
=
str
(
basename
)
+
'_spk-%03d'
%
int
(
spkr
.
split
(
'spk'
)[
1
])
bool
,
text
=
transformation_text
(
text
)
end_utt
=
Time_start_current_sync
# File wav.scp
# File utt2spk
# File text
# File speaker_gender
if
bool
and
text
!=
""
:
#print seg_id+'\t'+spkr_id+'\t'+startTime+'\t'+endTime+'\t'+text
print
>>
segments_file
,
'%s %s %s %s'
%
(
seg_id
,
basename
,
start_utt
,
end_utt
)
segments_file
.
write
(
seg_id
+
" "
+
basename
+
" "
+
str
(
start_utt
)
+
" "
+
str
(
end_utt
)
+
"
\n
"
)
start_utt
=
Time_start_current_sync
print
>>
utt2spk_file
,
'%s %s'
%
(
seg_id
,
spkr_id
)
print
>>
text_file
,
'%s %s'
%
(
seg_id
,
text
.
encode
(
'utf-8'
))
#for spk_tuple in speaker_gender:
# if spk_tuple[0]==spkr:
# print >> spk2gender,'%s %s' % (spkr_id, spk_tuple[1])
# break
utt2spk_file
.
write
(
seg_id
+
" "
+
spkr_id
+
"
\n
"
)
text_file
.
write
(
seg_id
+
" "
+
text
+
"
\n
"
)
text
=
Element
.
tail
.
replace
(
'
\n
'
,
''
)
count
=
count
+
1
elif
Element
.
tag
==
"Comment"
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
...
...
@@ -247,13 +250,13 @@ if __name__=="__main__":
elif
Element
.
tag
==
"Event"
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
if
Element
.
get
(
'type'
)
==
'noise'
:
if
Element
.
get
(
'desc'
)
==
'rire'
:
text
=
text
+
"
<
LAUGH
>
"
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
text
=
text
+
"
|
LAUGH
|
"
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
else
:
text
=
text
+
"
<
NOISE
>
"
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
text
=
text
+
"
|
NOISE
|
"
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
elif
Element
.
get
(
'type'
)
==
'pronounce'
:
text
=
text
+
"
<
SPOKEN_NOISE
>
"
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
text
=
text
+
"
|
SPOKEN_NOISE
|
"
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
else
:
text
=
text
+
"
<
NOISE
>
"
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
text
=
text
+
"
|
NOISE
|
"
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
elif
Element
.
tag
==
"Who"
and
has_attrib_speaker
and
not
Element
.
tail
is
None
:
text
=
text
+
" "
+
Element
.
tail
.
replace
(
'
\n
'
,
''
)
#else:
...
...
@@ -272,27 +275,15 @@ if __name__=="__main__":
int
(
spkr
.
split
(
'spk'
)[
1
]),
int
(
Turn_count
),
int
(
count
))
spkr_id
=
str
(
basename
)
+
'_spk-%03d'
%
int
(
spkr
.
split
(
'spk'
)[
1
])
bool
,
text
=
transformation_text
(
text
)
#print bool
#print text
# File wav.scp
# File text
# File speaker_gender
if
bool
and
text
!=
""
:
# print seg_id+'\t'+spkr_id+'\t'+startTime+'\t'+endTime+'\t'+text
print
>>
segments_file
,
'%s %s %s %s'
%
(
seg_id
,
basename
,
start_utt
,
endTime
)
print
>>
utt2spk_file
,
'%s %s'
%
(
seg_id
,
spkr_id
)
print
>>
text_file
,
'%s %s'
%
(
seg_id
,
text
.
encode
(
'utf-8'
))
#for spk_tuple in speaker_gender:
# if spk_tuple[0] == spkr:
# print >> spk2gender, '%s %s' % (seg_id, spk_tuple[1])
# break
#print speaker_gender
segments_file
.
write
(
seg_id
+
" "
+
basename
+
" "
+
str
(
start_utt
)
+
" "
+
str
(
endTime
)
+
"
\n
"
)
utt2spk_file
.
write
(
seg_id
+
" "
+
spkr_id
+
"
\n
"
)
text_file
.
write
(
seg_id
+
" "
+
text
+
"
\n
"
)
for
spk
in
speaker_gender
:
if
spk
[
0
]
in
Spk_that_contribute_to_meeting
:
spk_id
=
str
(
basename
)
+
'_spk-%03d'
%
int
(
spk
[
0
].
split
(
'spk'
)[
1
])
print
>>
spk2gender
,
'%s %s'
%
(
spk_id
,
spk
[
1
])
print
>>
wav_scp
,
'%s sox %s -t wav -r 16000 -c 1 - |'
%
(
basename
,
os
.
path
.
dirname
(
file_trs
)
+
'/'
+
basename
+
'.wav'
)
# print >> wav_scp, '%s sox %s -t wav -r 16000 -c 1 -' % (file_name, os.path.dirname(file_trs)+'/'+file_name+'.wav')
spk2gender
.
write
(
spk_id
+
" "
+
spk
[
1
]
+
"
\n
"
)
wav_scp
.
write
(
basename
+
" sox "
+
os
.
path
.
dirname
(
file_trs
)
+
'/'
+
basename
+
'.wav'
+
" -t wav -r 16000 -c 1 - |
\n
"
)
segments_file
.
close
()
utt2spk_file
.
close
()
text_file
.
close
()
...
...
path.sh
View file @
3704eaca
#!/usr/bin/env bash
export
KALDI_ROOT
=
`
pwd
`
/../../..
export
PATH
=
$PWD
/tools/festival/nsw/bin:
$PWD
/utils/:
$KALDI_ROOT
/tools/openfst/bin:
$PWD
:
$PATH
[
!
-f
$KALDI_ROOT
/tools/config/common_path.sh
]
&&
echo
>
&2
"The standard file
$KALDI_ROOT
/tools/config/common_path.sh is not present -> Exit!"
&&
exit
1
.
$KALDI_ROOT
/tools/config/common_path.sh
LANG
=
en_US
.UTF-8
LANGUAGE
=
en_US
.UTF-8
LC_ALL
=
en_US
.UTF-8
LANG
=
fr_Fr
.UTF-8
LANGUAGE
=
fr_FR
.UTF-8
LC_ALL
=
fr_FR
.UTF-8
# we use this both in the
(optional)
LM training and the G2P-related scripts
# we use this both in the
Data prepare (Normalization step) and in optional way in the
LM training and the G2P-related scripts
PYTHON
=
'python2.7'
PYTHON3
=
'python3'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment