Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
LINAGORA
L
LGS
Labs
kaldi-modelgen
Commits
0e544ebd
Commit
0e544ebd
authored
Mar 20, 2017
by
Abdelwahab HEBA
Browse files
Parse ACSYNT speech Database
parent
1697aa30
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
889 additions
and
0 deletions
+889
-0
local/ACSYNT_Parse/ParseACSYNT.py
local/ACSYNT_Parse/ParseACSYNT.py
+80
-0
local/ACSYNT_Parse/__pycache__/textgrid.cpython-35.pyc
local/ACSYNT_Parse/__pycache__/textgrid.cpython-35.pyc
+0
-0
local/ACSYNT_Parse/parseASYNT.sh
local/ACSYNT_Parse/parseASYNT.sh
+11
-0
local/ACSYNT_Parse/prep_ACSYNT.sh
local/ACSYNT_Parse/prep_ACSYNT.sh
+69
-0
local/ACSYNT_Parse/textgrid.py
local/ACSYNT_Parse/textgrid.py
+628
-0
local/ACSYNT_Parse/textgrid.pyc
local/ACSYNT_Parse/textgrid.pyc
+0
-0
local/data_prepACSYNT.sh
local/data_prepACSYNT.sh
+101
-0
No files found.
local/ACSYNT_Parse/ParseACSYNT.py
0 → 100755
View file @
0e544ebd
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Abdel Linagora@March17
from
textgrid
import
TextGrid
from
sys
import
argv
import
re
import
os.path
# ASCII transliterations used by the ACSYNT transcripts and the accented
# character each one stands for. Applied in this order, one after the other.
_ACCENT_REWRITES = (
    ("e^", "ê"),
    ("c,", "ç"),
    ("o^", "ô"),
    ("u`", "ù"),
    ("i^", "î"),
)


def transform_text(text):
    """Normalise one raw transcript label into lower-case training text.

    Steps, in order:
      1. if the label contains a ``<...>`` span, replace every ``<`` together
         with the two characters that follow it, and every ``>``, by a space;
      2. replace every ``[...]`` annotation by the token ``<noise>``;
      3. drop ``.``, backslashes and ``?``;
      4. rewrite the ASCII accent transliterations (``e^`` -> ``ê`` ...);
      5. strip trailing whitespace and collapse runs of whitespace;
      6. insert a space after each apostrophe (``j'y`` -> ``j' y``) unless the
         label contains a hyphenated elision such as ``vas-t'en``;
      7. lower-case the result.

    @param text: raw label text.
    @return: the normalised text. A leading space may remain when the label
             started with ``<..`` markup.
    """
    if re.search(r"<.+>", text):
        text = re.sub(r"<.{2}", " ", text)
        text = text.replace(">", " ")

    # One non-greedy-by-construction pattern: a greedy "\[.+\]" would also
    # swallow the speech between the first "[" and the last "]" of the label.
    text = re.sub(r"\[[^\]]+\]", "<noise>", text)

    text = text.replace(".", "")
    text = text.replace("\\", "")
    for ascii_form, accented in _ACCENT_REWRITES:
        text = text.replace(ascii_form, accented)
    text = text.replace("?", "")

    # delete whitespace at the end of the utterance
    text = re.sub(r"\s+$", "", text)
    # replace n successive whitespace characters with one space
    text = re.sub(r"\s{2,}", " ", text)

    # split elided words: j'y -> j' y
    # NOTE(review): the original branch taken for hyphenated elisions was an
    # incomplete statement ("a ="); it is treated here as "leave the text
    # untouched". The replacement "' " is assumed from the comment above --
    # confirm against the author's working copy.
    if not re.search(r"\w+-\w+'\w+", text):
        # Only split when something follows, so no trailing/double space is
        # introduced after the whitespace clean-up above.
        text = re.sub(r"'(?=\S)", "' ", text)

    return text.lower()
if __name__ == "__main__":
    # Usage: ParseACSYNT.py <file.TextGrid> <outdir>
    #
    # Reads the first tier of the TextGrid and appends one line per interval
    # to <outdir>/segments, <outdir>/text and <outdir>/utt2spk, plus one line
    # to <outdir>/wav.scp. The matching .wav is assumed to sit in the same
    # directory as the TextGrid file.
    TEXTGRID_file = argv[1]
    dirname = os.path.dirname(TEXTGRID_file)
    # Take the file name first and only then cut at the first dot; splitting
    # the whole path would return '' for inputs such as "./dir/x.TextGrid".
    basename = os.path.basename(TEXTGRID_file).split('.')[0]
    # os.path.join keeps the path relative when dirname is empty.
    WAV_file = os.path.join(dirname, basename + '.wav')

    # Output directory
    outdir = argv[2]

    # Parse TEXTGRID FILE
    with open(TEXTGRID_file, 'r') as TEXTGRID_io:
        TEXTGRID_obj = TextGrid(TEXTGRID_io.read())
    # Get only the first item (tier)
    Tier = TEXTGRID_obj.tiers[0]

    Spk_that_contribute_to_meeting = []
    # Used for utt2spk until the first "speaker:" prefix is seen.
    spkr_id = 1

    # Output files needed for kaldi input (opened in append mode so that
    # several TextGrid files can be accumulated into one data directory).
    with open(os.path.join(outdir, 'segments'), 'a') as segments_file, \
            open(os.path.join(outdir, 'utt2spk'), 'a') as utt2spk_file, \
            open(os.path.join(outdir, 'text'), 'a') as text_file, \
            open(os.path.join(outdir, 'wav.scp'), 'a') as wav_scp:
        for count, (deb_seg, end_seg, text) in enumerate(Tier.simple_transcript):
            # NOTE(review): both numeric fields of the utterance id are the
            # segment counter, not the speaker index written to utt2spk.
            seg_id = str(basename) + '_spk-%03d_seg-%05d' % (count, count)
            text = transform_text(text)
            # Split on the first colon only so that colons inside the
            # utterance do not truncate it.
            split_spkr_text = text.split(':', 1)
            if len(split_spkr_text) > 1:
                speaker_label = split_spkr_text[0]
                if speaker_label not in Spk_that_contribute_to_meeting:
                    Spk_that_contribute_to_meeting.append(speaker_label)
                spkr = Spk_that_contribute_to_meeting.index(speaker_label) + 1
                spkr_id = str(basename) + '_spk-%03d' % spkr
                text = split_spkr_text[1]
            # NOTE(review): nesting was lost in the source; the leading-space
            # strip is applied to every segment here.
            text = re.sub("^ ", "", text)

            segments_file.write(seg_id + " " + basename + " "
                                + str(round(float(deb_seg), 3)) + " "
                                + str(round(float(end_seg), 3)) + "\n")
            text_file.write(seg_id + " " + text + "\n")
            utt2spk_file.write(seg_id + " " + str(spkr_id) + "\n")

        wav_scp.write(basename + " sox " + WAV_file
                      + " -t wav -r 16000 -c 1 - |\n")
\ No newline at end of file
local/ACSYNT_Parse/__pycache__/textgrid.cpython-35.pyc
0 → 100644
View file @
0e544ebd
File added
local/ACSYNT_Parse/parseASYNT.sh
0 → 100755
View file @
0e544ebd
#!/usr/bin/env bash
# @ Abdel HEBA
# Sort the ACSYNT corpus into read text, two-speaker meetings and
# presentations by delegating to prep_ACSYNT.sh.
#
# Usage: parseASYNT.sh [corpus_dir] [out_dir]
# Both arguments are optional; the defaults are the original author's paths.
corpus=${1:-/home/lingora/Documents/Linagora/Data/ACSYNT}
outdir=${2:-/home/lingora/Documents/Linagora/Data/ACSYNT/ACSYNT_Final}

# Quoted so that paths containing spaces reach the script as one argument.
local/ACSYNT_Parse/prep_ACSYNT.sh "$corpus" "$outdir"
local/ACSYNT_Parse/prep_ACSYNT.sh
0 → 100755
View file @
0e544ebd
#!/usr/bin/env bash
# Abdel @LINAGORA - DONE
#
# Copy the ACSYNT corpus into <out_dir>, separated by kind of speech, and
# normalise the annotation file extension from .TEXTGRID to .TextGrid.
#
# Usage: prep_ACSYNT.sh <corpus_dir> <out_dir>

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <corpus_dir> <out_dir>" >&2
    exit 1
fi

corpus=$1
# CD1 CD2 CD3 are the directories which contain the data
ACSYNT_CD1=$corpus/ACSYNT_CD1/ACSYNT_CD1
ACSYNT_CD2=$corpus/ACSYNT_CD2/ACSYNT_CD2
ACSYNT_CD3=$corpus/ACSYNT_CD3/ACSYNT_CD3

# Separate read speech, prepared speech and meetings.
# 4 letters: 3 for the ID and the last one for the kind:
# E: meeting, P: prepared speech, T: read speech
out=$2
out_prepared=$out/prepared_speech
out_meeting=$out/meeting
out_story=$out/story

mkdir -p "$out" "$out_prepared" "$out_story" "$out_meeting"

# Rename every *.TEXTGRID file directly inside directory $1 to *.TextGrid.
rename_textgrid_ext() {
    local f
    for f in "$1"/*.TEXTGRID; do
        # The glob stays literal when nothing matches; skip that case.
        [ -e "$f" ] || continue
        mv "$f" "${f%.TEXTGRID}.TextGrid"
    done
}

# Copy each directory directly under $1 into $2, then fix the TextGrid
# extension inside every copied directory.
copy_session_dirs() {
    local src=$1 dest=$2 dir
    while IFS= read -r -d '' dir; do
        cp -r "$dir" "$dest"
        rename_textgrid_ext "$dest/$(basename "$dir")"
    done < <(find "$src" -mindepth 1 -maxdepth 1 -type d -print0)
}

# NUL-delimited find output keeps directory names with spaces intact.
while IFS= read -r -d '' type_speech; do
    type=$(basename "$type_speech")

    if [[ $type == *Entretien* ]]; then
        # This directory holds meetings
        copy_session_dirs "$type_speech" "$out_meeting"
    fi

    if [[ $type == *Presentation* ]]; then
        # This directory holds prepared speech
        copy_session_dirs "$type_speech" "$out_prepared"
    fi

    if [[ $type == *Text* ]]; then
        # This directory holds read stories: one output directory per wav
        # file, containing every file that shares the wav file's stem.
        while IFS= read -r -d '' file_story; do
            file_name=$(basename "$file_story" .wav)
            mkdir -p "$out_story/$file_name"
            cp "$type_speech/$file_name"* "$out_story/$file_name"
            # change the .TEXTGRID extension to .TextGrid
            rename_textgrid_ext "$out_story/$file_name"
        done < <(find "$type_speech" -mindepth 1 -maxdepth 1 -name '*.wav' -print0)
        echo "Text..."
    fi
done < <(find "$ACSYNT_CD1" "$ACSYNT_CD2" "$ACSYNT_CD3" -mindepth 1 -maxdepth 1 -type d -print0)
\ No newline at end of file
local/ACSYNT_Parse/textgrid.py
0 → 100755
View file @
0e544ebd
# Natural Language Toolkit: TextGrid analysis
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Margaret Mitchell <itallow@gmail.com>
# Steven Bird <sb@csse.unimelb.edu.au> (revisions)
# URL: <http://www.nltk.org>
# For license information, see LICENSE.TXT
#
"""
Tools for reading TextGrid files, the format used by Praat.
Module contents
===============
The textgrid corpus reader provides 5 data items and 2 functions
for each textgrid file. For each tier in the file, the reader
provides 9 data items and 2 functions.
For the full textgrid file:
- size
The number of tiers in the file.
- xmin
First marked time of the file.
- xmax
Last marked time of the file.
- t_time
xmax - xmin.
- text_type
The style of TextGrid format:
- ooTextFile: Organized by tier.
- ChronTextFile: Organized by time.
- OldooTextFile: Similar to ooTextFile.
- to_chron()
Convert given file to a ChronTextFile format.
- to_oo()
Convert given file to an ooTextFile format.
For each tier:
- text_type
The style of TextGrid format, as above.
- classid
The style of transcription on this tier:
- IntervalTier: Transcription is marked as intervals.
- TextTier: Transcription is marked as single points.
- nameid
The name of the tier.
- xmin
First marked time of the tier.
- xmax
Last marked time of the tier.
- size
Number of entries in the tier.
- transcript
The raw transcript for the tier.
- simple_transcript
The transcript formatted as a list of tuples: (time1, time2, utterance).
- tier_info
List of (classid, nameid, xmin, xmax, size, transcript).
- min_max()
A tuple of (xmin, xmax).
- time(non_speech_marker)
Returns the utterance time of a given tier.
Excludes entries that begin with a non-speech marker.
"""
# needs more cleanup, subclassing, epydoc docstrings
import
sys
import
re
# Values of a tier's "class" field.
TEXTTIER = "TextTier"
INTERVALTIER = "IntervalTier"

# File-level header of the long ("ooTextFile") format.
# Groups: xmin, xmax, number of tiers. All patterns below are verbose-mode
# regexes, so literal spaces must be escaped.
OOTEXTFILE = re.compile(r"""(?x)
            xmin\ =\ (.*)[\r\n]+
            xmax\ =\ (.*)[\r\n]+
            [\s\S]+?size\ =\ (.*)[\r\n]+
""")

# File-level header of the chronological format:
#   <xmin> <xmax> ! Time domain.
#   <size> ! Number of tiers.
# Groups: xmin, xmax, number of tiers.
# The separator between xmin and xmax is written as "[ ]" rather than as a
# backslash at the end of the line: a trailing escaped space is invisible and
# easily stripped, which leaves an escaped newline that can never match.
# The final '"' is the opening quote of the first tier header line.
CHRONTEXTFILE = re.compile(r"""(?x)
            [\r\n]+(\S+)[ ]
            (\S+)\ +!\ Time\ domain.\ *[\r\n]+
            (\S+)\ +!\ Number\ of\ tiers.\ *[\r\n]+"
""")

# File-level header of the short ("OldooTextFile") format: bare values, one
# per line. Groups: xmin, xmax, number of tiers.
OLDOOTEXTFILE = re.compile(r"""(?x)
            [\r\n]+(\S+)
            [\r\n]+(\S+)
            [\r\n]+.+[\r\n]+(\S+)
""")
#################################################################
# TextGrid Class
#################################################################
class TextGrid(object):
    """
    Class to manipulate the TextGrid format used by Praat.
    Separates each tier within this file into its own Tier
    object.  Each TextGrid object has
    a number of tiers (size), xmin, xmax, a text type to help
    with the different styles of TextGrid format, and tiers with their
    own attributes.
    """

    def __init__(self, read_file):
        """
        Takes the *contents* of a TextGrid file (a string) and initializes
        the attributes of the TextGrid. Use L{TextGrid.load} to start from
        a file name instead.

        @param read_file: full text of a TextGrid file.
        @ivar size:  Number of tiers.
        @ivar xmin: xmin.
        @ivar xmax: xmax.
        @ivar t_time:  Total time of TextGrid file.
        @ivar text_type:  TextGrid format.
        @ivar tiers:  A list of tier objects.
        """
        self.read_file = read_file
        self.size = 0
        self.xmin = 0
        self.xmax = 0
        self.t_time = 0
        # Cursor for next(); it was read there without ever being set.
        self.idx = -1
        self.text_type = self._check_type()
        self.tiers = self._find_tiers()

    def __iter__(self):
        """Iterate over the tiers, in file order."""
        return iter(self.tiers)

    def next(self):
        """
        Return the next tier, advancing the internal cursor.

        @raise StopIteration: when the cursor is on the last tier.
        """
        if self.idx == (self.size - 1):
            raise StopIteration
        self.idx += 1
        return self.tiers[self.idx]

    # Python 3 spelling of the iterator protocol method.
    __next__ = next

    @staticmethod
    def load(file):
        """
        @param file: path of a file in TextGrid format
        @return: the parsed TextGrid.
        """
        # Context manager so the handle is closed even if parsing fails.
        with open(file) as handle:
            return TextGrid(handle.read())

    def _load_tiers(self, header):
        """
        Iterates over each tier and grabs tier information.

        @param header: regular expression matching the start of a tier.
        @return: list of Tier objects.
        """
        tiers = []
        if self.text_type == "ChronTextFile":
            # Entries of all tiers are interleaved by time; each entry line
            # starts with the 1-based number of the tier it belongs to.
            tier_headers = re.compile(header).findall(self.read_file)
            tier_re = r' \d+.?\d* \d+.?\d*[\r\n]+"[^"]*"'
            for i in range(0, self.size):
                tier_info = [tier_headers[i]] + \
                    re.findall(str(i + 1) + tier_re, self.read_file)
                tier_info = "\n".join(tier_info)
                tiers.append(Tier(tier_info, self.text_type, self.t_time))
            return tiers

        # Other formats: a tier runs from its header up to the next header
        # or the end of the text.
        tier_re = header + r"[\s\S]+?(?=" + header + r"|$$)"
        for match in re.compile(tier_re).finditer(self.read_file):
            (begin, end) = match.span()
            tier_info = self.read_file[begin:end]
            tiers.append(Tier(tier_info, self.text_type, self.t_time))
        return tiers

    def _check_type(self):
        """
        Figures out the TextGrid format from the first four lines.

        @return: "ooTextFile", "OldooTextFile" or "ChronTextFile".
        @raise TypeError: if the text has fewer than four lines or the
            first line is not a known file-type marker.
        """
        m = re.match(r"(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file)
        if m is None:
            raise TypeError("Cannot read file -- try TextGrid.load()")
        type_id = m.group(1).strip()
        xmin = m.group(4)
        if type_id == 'File type = "ooTextFile"':
            # The short format has the same marker but bare values.
            if "xmin" not in xmin:
                text_type = "OldooTextFile"
            else:
                text_type = "ooTextFile"
        elif type_id == '"Praat chronological TextGrid text file"':
            text_type = "ChronTextFile"
        else:
            # Interpolate the marker; it used to be passed as a second
            # exception argument and never formatted into the message.
            raise TypeError("Unknown format '(%s)'" % type_id)
        return text_type

    def _find_tiers(self):
        """
        Splits the textgrid file into substrings corresponding to tiers,
        and fills in xmin, xmax, t_time and size from the file header.

        @return: list of Tier objects.
        """
        if self.text_type == "ooTextFile":
            m = OOTEXTFILE
            header = r" +item \["
        elif self.text_type == "ChronTextFile":
            m = CHRONTEXTFILE
            header = r'"\S+" ".*" \d+\.?\d* \d+\.?\d*'
        elif self.text_type == "OldooTextFile":
            m = OLDOOTEXTFILE
            header = r'".*"[\r\n]+".*"'

        file_info = m.findall(self.read_file)[0]
        self.xmin = float(file_info[0])
        self.xmax = float(file_info[1])
        self.t_time = self.xmax - self.xmin
        self.size = int(file_info[2])
        return self._load_tiers(header)

    def to_chron(self):
        """
        @return:  String in Chronological TextGrid file format.
        """
        parts = [
            '"Praat chronological TextGrid text file"\n',
            str(self.xmin) + " " + str(self.xmax) + " ! Time domain.\n",
            str(self.size) + " ! Number of tiers.\n",
        ]
        for idx, tier in enumerate(self.tiers, 1):
            parts.append('"' + tier.classid + '" "' + tier.nameid + '" '
                         + str(tier.xmin) + " " + str(tier.xmax) + "\n")
            for entry in tier.simple_transcript:
                # Interval entries are (xmin, xmax, text), point entries
                # (time, mark): everything before the last field is a time.
                times = " ".join(str(t) for t in entry[:-1])
                parts.append(str(idx) + " " + times + "\n")
                parts.append('"' + entry[-1] + '"\n')
        return "".join(parts)

    def to_oo(self):
        """
        @return:  A string in OoTextGrid (long text) file format.
        """
        lines = [
            'File type = "ooTextFile"',
            'Object class = "TextGrid"',
            '',
            'xmin = %s' % self.xmin,
            'xmax = %s' % self.xmax,
            'tiers? <exists>',
            # Must agree with the number of items written below.
            'size = %s' % len(self.tiers),
            'item []:',
        ]
        for number, tier in enumerate(self.tiers, 1):
            is_points = tier.classid == TEXTTIER
            kind = "points" if is_points else "intervals"
            entries = tier.simple_transcript
            lines.append('    item [%s]:' % number)
            lines.append('        class = "%s"' % tier.classid)
            lines.append('        name = "%s"' % tier.nameid)
            lines.append('        xmin = %s' % tier.xmin)
            lines.append('        xmax = %s' % tier.xmax)
            # len(entries) rather than tier.size, which is None for tiers
            # read from the chronological format.
            lines.append('        %s: size = %s' % (kind, len(entries)))
            for entry_no, entry in enumerate(entries, 1):
                lines.append('        %s [%s]:' % (kind, entry_no))
                # Praat writes a double quote inside a string as "".
                label = entry[-1].replace('"', '""')
                if is_points:
                    lines.append('            number = %s'
                                 % str(entry[0]).strip())
                    lines.append('            mark = "%s"' % label)
                else:
                    lines.append('            xmin = %s'
                                 % str(entry[0]).strip())
                    lines.append('            xmax = %s'
                                 % str(entry[1]).strip())
                    lines.append('            text = "%s"' % label)
        return "\n".join(lines) + "\n"
#################################################################
# Tier Class
#################################################################
class Tier(object):
    """
    A container for each tier.
    """

    def __init__(self, tier, text_type, t_time):
        """
        Initializes attributes of the tier: class, name, xmin, xmax
        size, transcript, total time.
        Utilizes text_type to guide how to parse the file.

        @param tier:  text of a single tier, cut out of the TextGrid file.
        @param text_type:  TextGrid format
        @param t_time:  Total time of TextGrid file.
        @ivar classid:  Type of tier (point or interval).
        @ivar nameid:  Name of tier.
        @ivar xmin:  xmin of the tier.
        @ivar xmax:  xmax of the tier.
        @ivar size:  Number of entries in the tier
            (None for the chronological format).
        @ivar transcript:  The raw transcript for the tier.
        @ivar simple_transcript:  The parsed list of entries.
        """
        self.tier = tier
        self.text_type = text_type
        self.t_time = t_time
        # NOTE: the instance attributes 'classid' and 'transcript' shadow
        # the methods of the same name defined further down.
        self.classid = ""
        self.nameid = ""
        self.xmin = 0
        self.xmax = 0
        self.size = 0
        self.transcript = ""
        self.tier_info = ""
        self._make_info()
        self.simple_transcript = self.make_simple_transcript()
        if self.classid != TEXTTIER:
            self.mark_type = "intervals"
        else:
            self.mark_type = "points"
        # Set for every tier, whatever its class.
        self.header = [("class", self.classid), ("name", self.nameid),
                       ("xmin", self.xmin), ("xmax", self.xmax),
                       ("size", self.size)]

    def __iter__(self):
        """Iterate over the entries of simple_transcript."""
        # Returning self was not a valid iterator: there is no __next__.
        return iter(self.simple_transcript)

    def _make_info(self):
        """
        Figures out most attributes of the tier object:
        class, name, xmin, xmax, size, transcript.

        @raise IndexError: if the tier text does not match the layout
            expected for self.text_type.
        """
        trans = r"([\S\s]*)"
        if self.text_type == "ChronTextFile":
            classid = r'"(.*)" +'
            nameid = r'"(.*)" +'
            xmin = r"(\d+\.?\d*) +"
            xmax = r"(\d+\.?\d*) *[\r\n]+"
            # No size values are given in the Chronological Text File format.
            self.size = None
            size = ""
        elif self.text_type == "ooTextFile":
            classid = r' +class = "(.*)" *[\r\n]+'
            nameid = r' +name = "(.*)" *[\r\n]+'
            xmin = r" +xmin = (\d+\.?\d*) *[\r\n]+"
            xmax = r" +xmax = (\d+\.?\d*) *[\r\n]+"
            size = r" +\S+: size = (\d+) *[\r\n]+"
        elif self.text_type == "OldooTextFile":
            classid = r'"(.*)" *[\r\n]+'
            nameid = r'"(.*)" *[\r\n]+'
            xmin = r"(\d+\.?\d*) *[\r\n]+"
            xmax = r"(\d+\.?\d*) *[\r\n]+"
            size = r"(\d+) *[\r\n]+"
        m = re.compile(classid + nameid + xmin + xmax + size + trans)
        self.tier_info = m.findall(self.tier)[0]
        self.classid = self.tier_info[0]
        self.nameid = self.tier_info[1]
        self.xmin = float(self.tier_info[2])
        self.xmax = float(self.tier_info[3])
        if self.size is not None:
            self.size = int(self.tier_info[4])
        # The transcript is always the last group, with or without a size.
        self.transcript = self.tier_info[-1]

    def make_simple_transcript(self):
        """
        @return:  Transcript of the tier, in form [(start_time end_time label)]
            for interval tiers and [(time label)] for point tiers.
        """
        if self.text_type == "ChronTextFile":
            trans_head = ""
            trans_xmin = r" (\S+)"
            trans_xmax = r" (\S+)[\r\n]+"
            trans_text = r'"([\S\s]*?)"'
        elif self.text_type == "ooTextFile":
            trans_head = r" +\S+ \[\d+\]: *[\r\n]+"
            trans_xmin = r" +\S+ = (\S+) *[\r\n]+"
            trans_xmax = r" +\S+ = (\S+) *[\r\n]+"
            trans_text = r' +\S+ = "([^"]*?)"'
        elif self.text_type == "OldooTextFile":
            trans_head = ""
            trans_xmin = r"(.*)[\r\n]+"
            trans_xmax = r"(.*)[\r\n]+"
            trans_text = r'"([\S\s]*?)"'
        if self.classid == TEXTTIER:
            # Point tiers carry a single time per entry.
            trans_xmin = ""
        trans_m = re.compile(trans_head + trans_xmin + trans_xmax + trans_text)
        self.simple_transcript = trans_m.findall(self.transcript)
        return self.simple_transcript

    def transcript(self):
        """
        @return:  Transcript of the tier, as it appears in the file.

        NOTE: not reachable through an instance, because __init__ assigns
        an attribute with the same name; read the attribute instead.
        """
        return self.transcript

    def time(self, non_speech_char="."):
        """
        @param non_speech_char: entries whose label starts with this marker
            are not counted.
        @return: Utterance time of a given tier.
        Screens out entries that begin with a non-speech marker.
        Point tiers have no durations, so they always yield 0.0.
        """
        total = 0.0
        if self.classid != TEXTTIER:
            for (time1, time2, utt) in self.simple_transcript:
                utt = utt.strip()
                if not utt:
                    continue
                # An empty marker screens nothing out.
                if non_speech_char and utt.startswith(non_speech_char):
                    continue
                total += (float(time2) - float(time1))
        return total

    def tier_name(self):
        """
        @return:  Tier name of a given tier.
        """
        return self.nameid

    def classid(self):
        """
        @return:  Type of transcription on tier.

        NOTE: not reachable through an instance, because __init__ assigns
        an attribute with the same name; read the attribute instead.
        """
        return self.classid

    def min_max(self):
        """
        @return:  (xmin, xmax) tuple for a given tier.
        """
        return (self.xmin, self.xmax)

    def __repr__(self):
        # A zero-length file would otherwise raise ZeroDivisionError.
        if self.t_time:
            percent = 100 * self.time() / self.t_time
        else:
            percent = 0.0
        return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % (
            self.classid, self.nameid, self.xmin, self.xmax, percent)

    def __str__(self):
        return self.__repr__() + "\n  " + "\n  ".join(
            " ".join(row) for row in self.simple_transcript)
def demo_TextGrid(demo_data):
    """
    Parse demo_data as a TextGrid and print its tier count followed by
    every tier, numbered from 1.

    @param demo_data: text of a TextGrid file.
    """
    print("** Demo of the TextGrid class. **")

    grid = TextGrid(demo_data)
    print("Tiers: %s" % (grid.size))

    tier_number = 0
    for tier in grid:
        tier_number += 1
        print("\n***")
        print("Tier: %s" % (tier_number))
        print(tier)
def
demo
():
# Each demo demonstrates different TextGrid formats.