Commit 0e544ebd authored by Abdelwahab HEBA's avatar Abdelwahab HEBA
Browse files

Parse ACSYNT speech Database

parent 1697aa30
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Abdel Linagora@March17
from textgrid import TextGrid
from sys import argv
import re
import os.path
def transform_text(text):
    """
    Normalise one raw transcription string taken from a TextGrid interval.

    Steps, in order:
      1. If the text contains a "<...>" span, drop every "<" together with
         the two characters that follow it, and replace every ">" by a space.
      2. Replace each bracketed annotation "[...]" by the token "<noise>".
      3. Remove full stops, backslashes and question marks.
      4. Rewrite the ASCII accent notation (e^, c,, o^, u`, i^) as the
         accented letter.
      5. Strip trailing whitespace and collapse runs of whitespace.
      6. Put a space after every apostrophe ("j'y" -> "j' y"), unless the
         utterance contains a hyphenated elision such as "va-t'en", in
         which case apostrophes are left untouched for the whole utterance.
      7. Lower-case the result.

    :param text: raw utterance text.
    :return: the normalised utterance.
    """
    # Inline "<xx ...>" markup: "<" plus the two following characters go,
    # and the closing ">" becomes a space.
    if re.search(r"<.+>", text):
        text = re.sub(r"<.{2}", " ", text)
        text = re.sub(r">", " ", text)

    # Non-greedy on purpose: with a greedy ".+" the speech lying between two
    # multi-word annotations ("[a b] speech [c d]") was swallowed as well.
    text = re.sub(r"\[.+?\]", "<noise>", text)

    text = text.replace(".", "")
    text = text.replace("\\", "")

    # ASCII accent notation -> accented letter.
    for ascii_form, accented in (("e^", "ê"), ("c,", "ç"), ("o^", "ô"),
                                 ("u`", "ù"), ("i^", "î")):
        text = text.replace(ascii_form, accented)

    text = text.replace("?", "")
    # Delete whitespace at the end of the utterance.
    text = re.sub(r"\s+$", "", text)
    # Replace n successive whitespace characters with one space.
    text = re.sub(r"\s{2,}", " ", text)

    # Split elided words: "j'y" -> "j' y".  The original branch for
    # hyphenated elisions was an unfinished assignment ("a=") whose value
    # was never used; such utterances are returned without splitting.
    # NOTE(review): confirm this is the intended handling for "va-t'en".
    if not re.search(r"\w+-\w+'\w+", text):
        text = text.replace("'", "' ")

    return text.lower()
if __name__ == "__main__":
    # Usage: <script> <file.TextGrid> <outdir>
    #
    # Input: a TextGrid file; the matching .wav file is assumed to be in the
    # same directory under the same base name.
    # Output: one line per interval of the first tier is appended to
    # <outdir>/segments, <outdir>/text and <outdir>/utt2spk, and one line
    # for the recording is appended to <outdir>/wav.scp.
    TEXTGRID_file = argv[1]
    dirname = os.path.dirname(TEXTGRID_file)
    # Take the stem from the file name only, so that a dot in a directory
    # component (e.g. "./corpus/x.TextGrid") cannot truncate it.
    basename = os.path.basename(TEXTGRID_file).split('.')[0]
    # os.path.join keeps the path relative when dirname is empty instead of
    # producing "/<basename>.wav".
    WAV_file = os.path.join(dirname, basename + '.wav')
    # Output directory
    outdir = argv[2]

    # Parse the TextGrid file before touching any output file.
    with open(TEXTGRID_file, 'r') as TEXTGRID_io:
        TEXTGRID_obj = TextGrid(TEXTGRID_io.read())
    # Only the first tier is used.
    first_tier = TEXTGRID_obj.tiers[0]

    # Speaker labels in order of first appearance; index + 1 is the number
    # used in the speaker id.
    Spk_that_contribute_to_meeting = []
    # Written as-is ("1") until a segment with a "label:" prefix is seen;
    # afterwards the last seen speaker is carried over to unlabelled segments.
    spkr_id = 1

    with open(os.path.join(outdir, 'segments'), 'a') as segments_file, \
            open(os.path.join(outdir, 'utt2spk'), 'a') as utt2spk_file, \
            open(os.path.join(outdir, 'text'), 'a') as text_file, \
            open(os.path.join(outdir, 'wav.scp'), 'a') as wav_scp:
        for count, (deb_seg, end_seg, text) in enumerate(first_tier.simple_transcript):
            # NOTE(review): both fields carry the segment counter, so the
            # "spk" part of the segment id is not the speaker index written
            # to utt2spk -- confirm whether that is intended.
            seg_id = str(basename) + '_spk-%03d_seg-%05d' % (count, count)
            text = transform_text(text)
            # "label: utterance" -- split on the first colon only so that a
            # colon inside the utterance no longer drops the rest of it.
            speaker, colon, utterance = text.partition(':')
            if colon:
                if speaker not in Spk_that_contribute_to_meeting:
                    Spk_that_contribute_to_meeting.append(speaker)
                spkr = Spk_that_contribute_to_meeting.index(speaker) + 1
                spkr_id = str(basename) + '_spk-%03d' % spkr
                text = re.sub("^ ", "", utterance)
            segments_file.write(seg_id + " " + basename + " "
                                + str(round(float(deb_seg), 3)) + " "
                                + str(round(float(end_seg), 3)) + "\n")
            text_file.write(seg_id + " " + text + "\n")
            utt2spk_file.write(seg_id + " " + str(spkr_id) + "\n")
        # One entry per recording: sox command that writes the audio as
        # 16 kHz mono wav to standard output.
        wav_scp.write(basename + " sox " + WAV_file + " -t wav -r 16000 -c 1 - |\n")
\ No newline at end of file
#!/usr/bin/env bash
# @ Abdel HEBA
# Sort the ACSYNT corpus into read text, meetings with 2 "loc" (presumably
# "locuteurs", i.e. speakers) and presentations by running the preparation
# script.
#
# Usage: <this script> [corpus_dir [output_dir]]
# Both arguments are optional; without them the original hard-coded
# locations are used.
corpus=${1:-/home/lingora/Documents/Linagora/Data/ACSYNT}
outdir=${2:-$corpus/ACSYNT_Final}
# Quoted so that paths containing spaces reach the script as one argument each.
local/ACSYNT_Parse/prep_ACSYNT.sh "$corpus" "$outdir"
#!/usr/bin/env bash
# Abdel @LINAGORA - DONE
#
# Sort the ACSYNT corpus into three output directories by speech type.
#
# Usage: prep_ACSYNT.sh <corpus_dir> <output_dir>
#
# Every first-level directory found under the three CD directories is
# classified by its name:
#   *Entretien*     -> <output_dir>/meeting          (sub-directories copied)
#   *Presentation*  -> <output_dir>/prepared_speech  (sub-directories copied)
#   *Text*          -> <output_dir>/story            (one directory per .wav)
# In the copies, files ending in ".TEXTGRID" are renamed to ".TextGrid".
#
# Author's note on naming: 4 letters, 3 for the ID and the last one for the
# type -- E: meeting, P: prepared speech, T: read speech.
corpus=$1
out=$2

# CD1, CD2 and CD3 are the directories which contain the data.
ACSYNT_CD1=$corpus/ACSYNT_CD1/ACSYNT_CD1
ACSYNT_CD2=$corpus/ACSYNT_CD2/ACSYNT_CD2
ACSYNT_CD3=$corpus/ACSYNT_CD3/ACSYNT_CD3

out_prepared=$out/prepared_speech
out_meeting=$out/meeting
out_story=$out/story
mkdir -p "$out" "$out_prepared" "$out_story" "$out_meeting"

# Rename every "*.TEXTGRID" file directly inside directory $1 to "*.TextGrid".
# Only the extension is rewritten, never an occurrence inside the base name.
normalize_textgrid_ext() {
    local dir=$1 path
    for path in "$dir"/*.TEXTGRID; do
        # An unmatched glob is left as the literal pattern; skip it.
        [ -e "$path" ] || continue
        mv "$path" "${path%.TEXTGRID}.TextGrid"
    done
}

# Copy each first-level sub-directory of $1 into $2, then normalise the
# TextGrid extension inside each copy.
copy_sessions() {
    local src=$1 dest=$2 session
    while IFS= read -r session; do
        # No trailing slash on the source, so the directory itself (not only
        # its contents) is copied by both GNU and BSD cp.
        cp -r "$session" "$dest"
        normalize_textgrid_ext "$dest/$(basename "$session")"
    done < <(find "$src" -mindepth 1 -maxdepth 1 -type d)
}

# For each "<name>.wav" directly inside $1, create $2/<name> and copy every
# file of $1 whose name starts with <name> into it.
copy_stories() {
    local src=$1 dest=$2 wav_path file_name
    while IFS= read -r wav_path; do
        file_name=$(basename "$wav_path" .wav)
        mkdir -p "$dest/$file_name"
        # The glob stays outside the quotes on purpose.
        cp "$src/$file_name"* "$dest/$file_name"
        normalize_textgrid_ext "$dest/$file_name"
    done < <(find "$src" -mindepth 1 -maxdepth 1 -name '*.wav')
}

# Reading find's output line by line keeps paths with spaces intact.
while IFS= read -r type_speech; do
    type_name=$(basename "$type_speech")
    # Three independent tests, as in the original: a name matching several
    # keywords is processed once per match.
    if [[ $type_name == *Entretien* ]]; then
        # This directory holds meetings.
        copy_sessions "$type_speech" "$out_meeting"
    fi
    if [[ $type_name == *Presentation* ]]; then
        # This directory holds prepared speech.
        copy_sessions "$type_speech" "$out_prepared"
    fi
    if [[ $type_name == *Text* ]]; then
        # This directory holds read stories.
        copy_stories "$type_speech" "$out_story"
        echo "Text..."
    fi
done < <(find "$ACSYNT_CD1" "$ACSYNT_CD2" "$ACSYNT_CD3" -mindepth 1 -maxdepth 1 -type d)
\ No newline at end of file
# Natural Language Toolkit: TextGrid analysis
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Margaret Mitchell <itallow@gmail.com>
# Steven Bird <sb@csse.unimelb.edu.au> (revisions)
# URL: <http://www.nltk.org>
# For license information, see LICENSE.TXT
#
"""
Tools for reading TextGrid files, the format used by Praat.
Module contents
===============
The textgrid corpus reader provides 5 data items and 2 functions
for each textgrid file. For each tier in the file, the reader
provides 9 data items and 2 functions.
For the full textgrid file:
- size
The number of tiers in the file.
- xmin
First marked time of the file.
- xmax
Last marked time of the file.
- t_time
xmax - xmin.
- text_type
The style of TextGrid format:
- ooTextFile: Organized by tier.
- ChronTextFile: Organized by time.
- OldooTextFile: Similar to ooTextFile.
- to_chron()
Convert given file to a ChronTextFile format.
- to_oo()
Convert given file to an ooTextFile format.
For each tier:
- text_type
The style of TextGrid format, as above.
- classid
The style of transcription on this tier:
- IntervalTier: Transcription is marked as intervals.
- TextTier: Transcription is marked as single points.
- nameid
The name of the tier.
- xmin
First marked time of the tier.
- xmax
Last marked time of the tier.
- size
Number of entries in the tier.
- transcript
The raw transcript for the tier.
- simple_transcript
The transcript formatted as a list of tuples: (time1, time2, utterance).
- tier_info
List of (classid, nameid, xmin, xmax, size, transcript).
- min_max()
A tuple of (xmin, xmax).
- time(non_speech_marker)
Returns the utterance time of a given tier.
Excludes entries that begin with a non-speech marker.
"""
# needs more cleanup, subclassing, epydoc docstrings
import sys
import re
# Values of a tier's class id.
TEXTTIER = "TextTier"
INTERVALTIER = "IntervalTier"

# File-level header patterns, one per TextGrid flavour.  Each captures
# (xmin, xmax, number of tiers) as strings.  All three are verbose-mode
# patterns: unescaped whitespace in them is ignored.
OOTEXTFILE = re.compile(r"""(?x)
    xmin\ =\ (.*)[\r\n]+
    xmax\ =\ (.*)[\r\n]+
    [\s\S]+?size\ =\ (.*)[\r\n]+
""")

# xmin and xmax share one line ("0 2.8 ! Time domain.").  The separator is
# written as "\ +" so that it does not depend on trailing whitespace in this
# source file: a bare backslash at the end of the line escapes the newline
# instead and the pattern then never matches.
# The final double quote is the opening quote of the first tier header line.
CHRONTEXTFILE = re.compile(r"""(?x)
    [\r\n]+(\S+)\ +
    (\S+)\ +!\ Time\ domain.\ *[\r\n]+
    (\S+)\ +!\ Number\ of\ tiers.\ *[\r\n]+"
""")

OLDOOTEXTFILE = re.compile(r"""(?x)
    [\r\n]+(\S+)
    [\r\n]+(\S+)
    [\r\n]+.+[\r\n]+(\S+)
""")
#################################################################
# TextGrid Class
#################################################################
class TextGrid(object):
    """
    Class to manipulate the TextGrid format used by Praat.
    Separates each tier within this file into its own Tier
    object.  Each TextGrid object has
    a number of tiers (size), xmin, xmax, a text type to help
    with the different styles of TextGrid format, and tiers with their
    own attributes.
    """

    def __init__(self, read_file):
        """
        Takes the full text of a TextGrid file as input and initializes
        the attributes of the TextGrid.  Use L{load} to start from a
        file name instead.

        @type read_file: A string holding the contents of a TextGrid file.
        @param size:  Number of tiers.
        @param xmin: xmin.
        @param xmax: xmax.
        @param t_time:  Total time of TextGrid file.
        @param text_type:  TextGrid format.
        @type tiers:  A list of tier objects.
        @raise TypeError: if the format cannot be recognised.
        """
        self.read_file = read_file
        self.size = 0
        self.xmin = 0
        self.xmax = 0
        self.t_time = 0
        # Cursor used by next(); -1 means "before the first tier".
        self.idx = -1
        self.text_type = self._check_type()
        self.tiers = self._find_tiers()

    def __iter__(self):
        """Yields the tiers in file order."""
        for tier in self.tiers:
            yield tier

    def next(self):
        """
        @return: The tier following the one returned by the previous call.
        @raise StopIteration: once the last tier has been returned.
        """
        if self.idx == (self.size - 1):
            raise StopIteration
        self.idx += 1
        return self.tiers[self.idx]

    # Python 3 spelling of the same method.
    __next__ = next

    @staticmethod
    def load(file):
        """
        @param file: a file in TextGrid format
        @return: A TextGrid built from the contents of that file.
        """
        # The context manager closes the handle even if parsing fails.
        with open(file) as handle:
            return TextGrid(handle.read())

    def _load_tiers(self, header):
        """
        Iterates over each tier and grabs tier information.

        @param header: Regular expression matching the start of a tier.
        @return: A list of Tier objects.
        """
        tiers = []
        if self.text_type == "ChronTextFile":
            # Entries of all tiers are interleaved in this format: collect
            # the tier headers, then gather the entries by tier number.
            tier_headers = re.findall(header, self.read_file)
            tier_re = r' \d+.?\d* \d+.?\d*[\r\n]+"[^"]*"'
            for i in range(0, self.size):
                entries = re.findall(str(i + 1) + tier_re, self.read_file)
                tier_info = "\n".join([tier_headers[i]] + entries)
                tiers.append(Tier(tier_info, self.text_type, self.t_time))
            return tiers

        # Other formats: a tier runs from its header up to the next header
        # or the end of the text.
        tier_re = header + r"[\s\S]+?(?=" + header + r"|$$)"
        for match in re.finditer(tier_re, self.read_file):
            tiers.append(Tier(match.group(0), self.text_type, self.t_time))
        return tiers

    def _check_type(self):
        """
        Figures out the TextGrid format from the first four lines.

        @return: "ooTextFile", "OldooTextFile" or "ChronTextFile".
        @raise TypeError: if the text has fewer than four lines or an
            unknown first line.
        """
        m = re.match("(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file)
        try:
            type_id = m.group(1).strip()
        except AttributeError:
            raise TypeError("Cannot read file -- try TextGrid.load()")
        xmin = m.group(4)
        if type_id == "File type = \"ooTextFile\"":
            # Both oo flavours share the first line; only the long one
            # spells out "xmin" on the fourth line.
            if "xmin" not in xmin:
                text_type = "OldooTextFile"
            else:
                text_type = "ooTextFile"
        elif type_id == "\"Praat chronological TextGrid text file\"":
            text_type = "ChronTextFile"
        else:
            # Interpolate the offending line; it used to be passed as a
            # second exception argument and never reached the message.
            raise TypeError("Unknown format '(%s)'" % (type_id,))
        return text_type

    def _find_tiers(self):
        """
        Reads xmin, xmax and the number of tiers from the file header, then
        splits the textgrid file into substrings corresponding to tiers.

        @return: A list of Tier objects.
        """
        if self.text_type == "ooTextFile":
            m = OOTEXTFILE
            header = r" +item \["
        elif self.text_type == "ChronTextFile":
            m = CHRONTEXTFILE
            header = r'"\S+" ".*" \d+\.?\d* \d+\.?\d*'
        elif self.text_type == "OldooTextFile":
            m = OLDOOTEXTFILE
            header = r'".*"[\r\n]+".*"'

        file_info = m.findall(self.read_file)[0]
        self.xmin = float(file_info[0])
        self.xmax = float(file_info[1])
        self.t_time = self.xmax - self.xmin
        self.size = int(file_info[2])
        tiers = self._load_tiers(header)
        return tiers

    def to_chron(self):
        """
        @return: String in Chronological TextGrid file format.
        """
        parts = [
            "\"Praat chronological TextGrid text file\"\n",
            str(self.xmin) + " " + str(self.xmax) + " ! Time domain.\n",
            str(self.size) + " ! Number of tiers.\n",
        ]
        for idx, tier in enumerate(self.tiers, 1):
            parts.append("\"" + tier.classid + "\" \"" + tier.nameid + "\" "
                         + str(tier.xmin) + " " + str(tier.xmax) + "\n")
            for entry in tier.simple_transcript:
                # Interval entries are (xmin, xmax, text); point entries
                # are (time, mark).  The last element is always the label.
                times = " ".join(str(value) for value in entry[:-1])
                parts.append(str(idx) + " " + times + "\n")
                parts.append("\"" + entry[-1] + "\"\n")
        return "".join(parts)

    def to_oo(self):
        """
        @return: A string in OoTextGrid file format, one statement per
            line and indented per nesting level, laid out so that the
            ooTextFile patterns of this module can read it back.
        """
        lines = [
            "File type = \"ooTextFile\"",
            "Object class = \"TextGrid\"",
            "",
            "xmin = %s" % self.xmin,
            "xmax = %s" % self.xmax,
            "tiers? <exists>",
            "size = %s" % self.size,
            "item []:",
        ]
        for number, tier in enumerate(self.tiers, 1):
            entries = tier.simple_transcript
            if tier.classid != TEXTTIER:
                kind = "intervals"
            else:
                kind = "points"
            lines.append("%4s%s [%s]:" % ("", "item", number))
            lines.append("%8s%s = \"%s\"" % ("", "class", tier.classid))
            lines.append("%8s%s = \"%s\"" % ("", "name", tier.nameid))
            lines.append("%8s%s = %s" % ("", "xmin", tier.xmin))
            lines.append("%8s%s = %s" % ("", "xmax", tier.xmax))
            # Counted from the entries because a tier read from a
            # chronological file has no stored size.
            lines.append("%8s%s: size = %s" % ("", kind, len(entries)))
            for position, entry in enumerate(entries, 1):
                lines.append("%8s%s [%s]:" % ("", kind, position))
                if kind == "intervals":
                    (xmin, xmax, text) = entry
                    lines.append("%12s%s = %s" % ("", "xmin", xmin))
                    lines.append("%12s%s = %s" % ("", "xmax", xmax))
                    lines.append("%12s%s = \"%s\"" % ("", "text", text))
                else:
                    (time, mark) = entry
                    lines.append("%12s%s = %s" % ("", "time", time))
                    lines.append("%12s%s = \"%s\"" % ("", "mark", mark))
        return "\n".join(lines) + "\n"
#################################################################
# Tier Class
#################################################################
class Tier(object):
    """
    A container for each tier.
    """

    def __init__(self, tier, text_type, t_time):
        """
        Initializes attributes of the tier: class, name, xmin, xmax
        size, transcript, total time.
        Utilizes text_type to guide how to parse the file.

        @type tier: A string; the portion of the TextGrid text that
            belongs to this tier.
        @param text_type:  TextGrid format
        @param t_time:  Total time of TextGrid file.
        @param classid:  Type of tier (point or interval).
        @param nameid:  Name of tier.
        @param xmin:  xmin of the tier.
        @param xmax:  xmax of the tier.
        @param size:  Number of entries in the tier
        @param transcript:  The raw transcript for the tier.
        """
        self.tier = tier
        self.text_type = text_type
        self.t_time = t_time
        self.classid = ""
        self.nameid = ""
        self.xmin = 0
        self.xmax = 0
        self.size = 0
        self.transcript = ""
        self.tier_info = ""
        self._make_info()
        self.simple_transcript = self.make_simple_transcript()
        if self.classid != TEXTTIER:
            self.mark_type = "intervals"
        else:
            self.mark_type = "points"
        self.header = [("class", self.classid), ("name", self.nameid),
                       ("xmin", self.xmin), ("xmax", self.xmax),
                       ("size", self.size)]

    def __iter__(self):
        """
        @return: An iterator over the entries of simple_transcript.
        """
        # Returning self was not usable: the class defines no next method,
        # so any attempt to iterate a tier raised TypeError.
        return iter(self.simple_transcript)

    def _make_info(self):
        """
        Figures out most attributes of the tier object:
        class, name, xmin, xmax, transcript.
        """
        trans = r"([\S\s]*)"
        if self.text_type == "ChronTextFile":
            classid = r'"(.*)" +'
            nameid = r'"(.*)" +'
            xmin = r"(\d+\.?\d*) +"
            xmax = r"(\d+\.?\d*) *[\r\n]+"
            # No size values are given in the Chronological Text File format.
            self.size = None
            size = ""
        elif self.text_type == "ooTextFile":
            classid = r' +class = "(.*)" *[\r\n]+'
            nameid = r' +name = "(.*)" *[\r\n]+'
            xmin = r" +xmin = (\d+\.?\d*) *[\r\n]+"
            xmax = r" +xmax = (\d+\.?\d*) *[\r\n]+"
            size = r" +\S+: size = (\d+) *[\r\n]+"
        elif self.text_type == "OldooTextFile":
            classid = r'"(.*)" *[\r\n]+'
            nameid = r'"(.*)" *[\r\n]+'
            xmin = r"(\d+\.?\d*) *[\r\n]+"
            xmax = r"(\d+\.?\d*) *[\r\n]+"
            size = r"(\d+) *[\r\n]+"
        m = re.compile(classid + nameid + xmin + xmax + size + trans)
        self.tier_info = m.findall(self.tier)[0]
        self.classid = self.tier_info[0]
        self.nameid = self.tier_info[1]
        self.xmin = float(self.tier_info[2])
        self.xmax = float(self.tier_info[3])
        # size stays None for chronological files, which have no size group.
        if self.size is not None:
            self.size = int(self.tier_info[4])
        self.transcript = self.tier_info[-1]

    def make_simple_transcript(self):
        """
        @return: Transcript of the tier, in form [(start_time end_time label)];
            for a TextTier the entries are (time, label) pairs.
        """
        if self.text_type == "ChronTextFile":
            trans_head = ""
            trans_xmin = r" (\S+)"
            trans_xmax = r" (\S+)[\r\n]+"
            trans_text = r'"([\S\s]*?)"'
        elif self.text_type == "ooTextFile":
            trans_head = r" +\S+ \[\d+\]: *[\r\n]+"
            trans_xmin = r" +\S+ = (\S+) *[\r\n]+"
            trans_xmax = r" +\S+ = (\S+) *[\r\n]+"
            trans_text = r' +\S+ = "([^"]*?)"'
        elif self.text_type == "OldooTextFile":
            trans_head = ""
            trans_xmin = r"(.*)[\r\n]+"
            trans_xmax = r"(.*)[\r\n]+"
            trans_text = r'"([\S\s]*?)"'
        if self.classid == TEXTTIER:
            # Points carry a single time value.
            trans_xmin = ""
        trans_m = re.compile(trans_head + trans_xmin + trans_xmax + trans_text)
        self.simple_transcript = trans_m.findall(self.transcript)
        return self.simple_transcript

    # NOTE: on instances this method is hidden by the "transcript"
    # attribute assigned in __init__ and _make_info.
    def transcript(self):
        """
        @return: Transcript of the tier, as it appears in the file.
        """
        return self.transcript

    def time(self, non_speech_char="."):
        """
        @param non_speech_char: Entries whose label starts with this marker
            are not counted.
        @return: Utterance time of a given tier.
        Screens out entries that begin with a non-speech marker.
        Always 0.0 for a TextTier.
        """
        total = 0.0
        if self.classid != TEXTTIER:
            for (time1, time2, utt) in self.simple_transcript:
                utt = utt.strip()
                if not utt:
                    continue
                # Honour the parameter; the marker used to be hard-coded
                # to "." whatever the caller passed.
                if non_speech_char and utt.startswith(non_speech_char):
                    continue
                total += (float(time2) - float(time1))
        return total

    def tier_name(self):
        """
        @return:  Tier name of a given tier.
        """
        return self.nameid

    # NOTE: on instances this method is hidden by the "classid" attribute
    # assigned in __init__ and _make_info.
    def classid(self):
        """
        @return:  Type of transcription on tier.
        """
        return self.classid

    def min_max(self):
        """
        @return:  (xmin, xmax) tuple for a given tier.
        """
        return (self.xmin, self.xmax)

    def __repr__(self):
        # A zero-length file would otherwise raise ZeroDivisionError here.
        if self.t_time:
            share = 100 * self.time() / self.t_time
        else:
            share = 0.0
        return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % (
            self.classid, self.nameid, self.xmin, self.xmax, share)

    def __str__(self):
        rows = "\n  ".join(" ".join(row) for row in self.simple_transcript)
        return self.__repr__() + "\n  " + rows
def demo_TextGrid(demo_data):
    """
    Parse demo_data as a TextGrid and print its tier count followed by
    every tier, numbered from 1.

    @param demo_data: Text of a TextGrid file.
    """
    print("** Demo of the TextGrid class. **")

    grid = TextGrid(demo_data)
    print("Tiers: %s" % (grid.size))

    for number, tier in enumerate(grid, 1):
        print("\n***")
        print("Tier: %s" % (number))
        print(tier)
def demo():
# Each demo demonstrates different TextGrid formats.