parseTcof.py~ 5.72 KB
Newer Older
Abdelwahab HEBA's avatar
Abdelwahab HEBA committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from xml.dom import minidom
from unicodedata import normalize
from sys import argv

import re
import os.path

def transformation_text(text):
    """Normalise one transcription line and tell whether it should be kept.

    A line containing "###" or "(" is rejected: "Supprimer Ligne" is printed
    and the text is returned unchanged together with False.  (The original
    note on this test reads "voir - amorces"; presumably it refers to false
    starts in the transcription convention -- TODO confirm.)

    Otherwise the raw text is printed and cleaned in this order:

    * "+", runs of "*", "///", "/", "<" and ">" are removed (notes left in
      the original script describe "*"/"***" as incomprehensible syllables);
    * "-", "_" and "." become a space;
    * a standalone "O K" token (what "O.K." has become at this point) is
      rewritten as "ok";
    * "{...}" comments are replaced by a space;
    * any token that starts or ends with the currency sign marker, and any
      isolated marker, is removed;
    * a standalone "4x4" token becomes "quatre fois quatre";
    * runs of spaces are collapsed and the result is stripped.

    Args:
        text: the transcription line to clean.

    Returns:
        A ``(keep, text)`` tuple: ``keep`` is False when the line must be
        dropped, True otherwise; ``text`` is the cleaned (or, for a dropped
        line, untouched) string.
    """
    if "###" in text or "(" in text:
        print("Supprimer Ligne")
        return False, text

    print(text)
    text = re.sub(r"(\+|[*]+|///|/|<|>)", "", text.strip())
    text = re.sub(r"-|_|\.", " ", text.strip())
    # Lookarounds restrict the match to a whole token, so "O K" embedded in a
    # longer word is left alone while start/middle/end of line all match.
    text = re.sub(r"(?<!\S)O K(?!\S)", "ok", text)
    # The class excludes both braces so a comment stops at its own closing
    # brace instead of running on to a later stray "}".
    text = re.sub(r"\{[^{}]*\}", " ", text.strip())
    text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
    # Done before the final strip/collapse and without requiring surrounding
    # spaces, so "4x4" is also expanded at the very start or end of the line.
    text = re.sub(r"(?<!\S)4x4(?!\S)", "quatre fois quatre", text)
    text = re.sub(r" +", " ", text.strip())
    return True, text

if __name__=="__main__":
    # Usage: parseTcof.py <transcription.trs> <output_dir>
    #
    # Reads one transcription file and the .xml metadata file that sits next
    # to it (same path, ".xml" extension), then appends one recording's worth
    # of lines to segments, utt2spk, text, wav.scp and spk2gender in
    # <output_dir>.
    file_trs = argv[1]
    outdir = argv[2]
    # Strip only the final extension: splitting on the first "." truncated
    # any path containing an earlier dot (e.g. "./corpus/x.trs" gave "").
    base_path = os.path.splitext(file_trs)[0]
    rec_id = os.path.basename(base_path)
    print(base_path)

    # Read Trans File
    trsdoc = minidom.parse(file_trs)

    # Read MetaData Of speaker (ID and Name); the two lists are index-aligned.
    speaker_id = []
    namespk = []
    for spk in trsdoc.getElementsByTagName('Speaker'):
        id_spk = normalize('NFKD', spk.attributes['id'].value).encode('utf-8', 'ignore')
        name_spk = normalize('NFKD', spk.attributes['name'].value).encode('utf-8', 'ignore')
        speaker_id.append(id_spk)
        namespk.append(name_spk.lower())

    # Read MetaData To get Gender of Speaker (Gender and Name)
    xmldoc = minidom.parse(base_path + '.xml')
    locuteur = xmldoc.getElementsByTagName('locuteur')
    sexe = xmldoc.getElementsByTagName('sexe')
    # Speaker id -> gender.  The n-th <locuteur> carrying an 'identifiant'
    # attribute is paired with the n-th <sexe> element of the document.
    gender_by_spk = {}
    count = 0
    for loc in locuteur:
        if not loc.hasAttribute('identifiant'):
            continue
        name_loc = normalize('NFKD', loc.attributes['identifiant'].value).encode('utf-8', 'ignore')
        # Default to 'm' when the gender is not mentioned.  The check looks at
        # this speaker's own <sexe> element (not always the first one) and
        # tolerates a document with fewer <sexe> than speakers.
        gender_loc = 'm'
        if count < len(sexe) and sexe[count].childNodes:
            gender_loc = "".join(t.nodeValue for t in sexe[count].childNodes if t.nodeType == t.TEXT_NODE)
            gender_loc = normalize('NFKD', gender_loc).encode('utf-8', 'ignore').lower()
        # Raises ValueError when the metadata names a speaker that the
        # transcription does not declare.  setdefault keeps the first entry
        # for a speaker id, as the original first-match scan did.
        gender_by_spk.setdefault(speaker_id[namespk.index(name_loc.lower())], gender_loc)
        count = count + 1

    # Output files needed for kaldi input, opened in append mode.  They are
    # tracked in a dict so that every handle that was opened is closed, even
    # when a later step raises.
    outputs = {}
    try:
        for out_name in ('segments', 'utt2spk', 'text', 'wav.scp', 'spk2gender'):
            outputs[out_name] = open(outdir + '/' + out_name, 'a')

        count = 1
        for Turn in trsdoc.getElementsByTagName('Turn'):
            # Get id_spkr, StartSegment and EndSegment
            spkr = normalize('NFKD', Turn.attributes['speaker'].value).encode('utf-8', 'ignore')
            startTime = normalize('NFKD', Turn.attributes['startTime'].value).encode('utf-8', 'ignore')
            endTime = normalize('NFKD', Turn.attributes['endTime'].value).encode('utf-8', 'ignore')
            # Get Text: only the Turn's direct text nodes, re-joined token by
            # token.  Each token is prefixed with a space, so a non-empty text
            # starts with a space (kept to leave the 'text' file unchanged).
            field_text = "".join(t.nodeValue for t in Turn.childNodes if t.nodeType == t.TEXT_NODE)
            text = "".join(' ' + token for token in field_text.encode('utf-8', 'ignore').split())
            # TODO: normalisation through transformation_text() is not applied
            # yet; the call was left disabled in the original script.
            if text == "":
                continue
            seg_id = '%s_seg-%07d' % (rec_id, count)
            # The speaker attribute is expected to look like "spk<N>".
            spkr_id = '%s_spk-%03d' % (rec_id, int(spkr.split('spk')[1]))
            outputs['segments'].write('%s %s %s %s\n' % (seg_id, rec_id, startTime, endTime))
            outputs['utt2spk'].write('%s %s\n' % (seg_id, spkr_id))
            outputs['text'].write('%s %s\n' % (seg_id, text))
            # NOTE(review): this line is keyed by the utterance id; Kaldi's
            # spk2gender is normally keyed by speaker id -- confirm what the
            # downstream recipe expects before changing it.
            if spkr in gender_by_spk:
                outputs['spk2gender'].write('%s %s\n' % (seg_id, gender_by_spk[spkr]))
            count = count + 1

        # The .wav is expected next to the .trs.  Building it from base_path
        # avoids the "/x.wav" path produced when file_trs has no directory.
        # NOTE(review): the command has no trailing "|" -- confirm how the
        # downstream recipe consumes wav.scp.
        outputs['wav.scp'].write('%s sox %s -t wav -r 16000 -c 1 -\n' % (rec_id, base_path + '.wav'))
    finally:
        for handle in outputs.values():
            handle.close()