Commit 8552dfae authored by Abdelwahab HEBA
Browse files

remove useless files

parent 7427bc90
#!/bin/bash
# Copyright 2016 Linagora (author: Abdel HEBA)
# see research.linagora.com OpenPaas Project and https://hubl.in for meetings
# GPL
#
# Prepares a Kaldi-style data directory from a TCOF corpus tree.
#
# Usage: <this-script> <src-dir> <dst-dir>
#   <src-dir>  directory holding one sub-directory per meeting; each meeting
#              directory <m> is expected to contain <m>.trs and <m>.wav
#   <dst-dir>  output directory (created if needed); the data files that the
#              parser appends to are removed first so reruns start clean
#
# Side effects: appends parser stdout to ./file_ok.txt and stderr to ./log.txt.
# NOTE(review): every 'exit 1' is still commented out, as in the original, so
# the script only warns and keeps going on errors. Confirm this is intended.
source path.sh

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <src-dir> <dst-dir>"
  echo "e.g: $0 /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/train data/train"
  #exit 1
fi

src=$1
dst=$2

# all utterances are Wav compressed, we use sox for reading signal in binary format
# 'command -v' is the portable replacement for 'which'.
if ! command -v sox >/dev/null 2>&1; then
  echo "Please install 'sox' on All worker nodes"
  echo "apt-get install sox"
  #exit 1
fi

# TODO: think about the train/test split step
#echo "=== Starting initial Tcof Data preparation ..."
#echo "--- Making test/train data split ..."

mkdir -p "$dst" #|| exit 1;
# Quoted so that an empty or space-containing $src is tested as one path.
[ ! -d "$src" ] && echo "$0: no such directory $src" #&& exit 1;

wav_scp=$dst/wav.scp
trans=$dst/text
utt2spk=$dst/utt2spk
spk2gender=$dst/spk2gender
utt2dur=$dst/utt2dur
# The parser shipped with this script opens 'segments' in append mode too, so
# it has to be reset together with the other outputs.
segments=$dst/segments

for stale_file in "$wav_scp" "$trans" "$utt2spk" "$spk2gender" "$utt2dur" "$segments"; do
  if [[ -f "$stale_file" ]]; then
    rm -- "$stale_file"
  fi
done

# To be reviewed:
# cat lexicon/lexicon | awk '{print $1}' | egrep "_|-|'" | egrep -v '^-|-$|\)$' > lexicon/lex

# The directory list is read line by line on fd 3 so that names containing
# spaces survive and the parser keeps the script's own stdin.
while IFS= read -r meeting_dir <&3; do
  meeting=$(basename -- "$meeting_dir")
  #if ! [ $meeting -eq $meeting ]; then
  #echo "$0 unexpected subdirectory name $reader"
  #exit 1;
  #fi

  # Warn when EITHER file is missing (the original only warned when both were).
  if [[ ! -f "$meeting_dir/${meeting}.trs" || ! -f "$meeting_dir/${meeting}.wav" ]]; then
    echo " Missing ${meeting}.trs or ${meeting}.wav file " #&& exit 1
  fi

  #dir.tsr contains metadata gender of speaker
  #reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}')
  #if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then
  # echo "Unexpected gender: '$reader_gender'"
  #exit 1;
  #fi

  # $PYTHON is not set in this script (presumably by path.sh); it is left
  # unquoted on purpose so it may carry interpreter options.
  # shellcheck disable=SC2086
  $PYTHON local/parseTcof.py "$meeting_dir/${meeting}.trs" "$dst" >> file_ok.txt 2>> log.txt
done 3< <(find "$src" -mindepth 1 -maxdepth 1 -type d | sort)

#spk2utt=$dst/spk2utt
# utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt #|| exit 1
# ntrans=$(wc -l <$trans)
# nutt2spk=$(wc -l <$utt2spk)
# ! [ "$ntrans" -eq "$nutt2spk" ] && \
# echo "Inconsistent #transcripts($ntrans) and # utt2spk($nutt2spk)" #&& exit 1;
# utils/data/get_utt2dur.sh $dst 1>&2 #|| exit 1
# utils/validate_data_dir.sh --no-feats $dst #|| exit 1;
# echo "Successfully prepared data in $dst.."
#exit 0
#num2Words
pip install num2words
python setup.py install
python setup.py test
from num2words import num2words
num2words(42, lang='fr')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from xml.dom import minidom
from unicodedata import normalize
from sys import argv
import re
import os.path
def transformation_text(text):
    """Clean one transcript line and tell the caller whether to keep it.

    A line containing "###" or "(" is rejected and handed back untouched.
    Any other line is normalised step by step:
      * "+", runs of "*", "///", "/", "<" and ">" are removed;
      * "-", "_" and "." become spaces;
      * a spelled-out "O K" becomes "ok";
      * "{...}" comments become a space;
      * words carrying the "¤" marker (and stray "¤") are removed;
      * runs of spaces are collapsed;
      * the token "4x4" is spelled out as "quatre fois quatre".

    Args:
        text: raw utterance text.

    Returns:
        A ``(keep, text)`` tuple; ``keep`` is False when the line must be
        dropped, in which case ``text`` is the unmodified input.

    Side effects:
        Prints a debug line to stdout ("Supprimer Ligne" for a rejected
        line, the raw text otherwise).
    """
    # Rejected lines: see "amorces" (false starts) in the transcription guide.
    if "###" in text or "(" in text:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Supprimer Ligne")
        return False, text

    # print "detecter (///|/|<|>)"
    print(text)
    text = re.sub(r"(\+|[*]+|///|/|<|>)", "", text.strip())
    text = re.sub(r"-|_|\.", " ", text.strip())
    text = re.sub(r"(O K | O K|^O K$)", " ok ", text)
    text = re.sub(r"{[^{]+}", " ", text.strip())
    # text = re.sub(r"¤[^¤]+¤", "", text.strip())
    text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
    text = re.sub(r" +", " ", text.strip())
    # The text has just been stripped, so "4x4" can sit at either end of the
    # line; match it as a whole space-delimited token instead of requiring a
    # literal space on both sides (which could never match at the edges).
    text = re.sub(r"(^| )4x4(?= |$)", r"\1quatre fois quatre", text)

    # Markers that earlier drafts only detected and that are not handled
    # separately here: "***" / "*" (incomprehensible syllables) and "$$$"
    # (cut in the recording).
    return True, text
# Script entry point: reads one .trs XML transcript (argv[1]) plus the .xml
# metadata file next to it, and appends Kaldi input files under argv[2].
# NOTE(review): this copy of the code has lost its indentation; the nesting of
# the loops and conditionals below has to be restored before it can run.
if __name__=="__main__":
file_trs=argv[1]
outdir=argv[2]
# Debug output: the input path up to its first '.'.
# NOTE(review): split('.')[0] cuts at the first dot anywhere in the path, not
# only at the extension - confirm that no directory name contains a dot.
print file_trs.split('.')[0]
# Output files needed as Kaldi input; all are opened in append mode, so lines
# from earlier runs stay in place unless the caller removes the files first.
segments_file = open(outdir + '/segments', 'a')
utt2spk_file = open(outdir + '/utt2spk', 'a')
text_file = open(outdir + '/text', 'a')
wav_scp = open(outdir + '/wav.scp', 'a')
spk2gender= open(outdir + '/spk2gender', 'a')
# Read the transcript file
trsdoc= minidom.parse(file_trs)
# Read speaker metadata (id and name) from the <Speaker> elements
Speaker= trsdoc.getElementsByTagName('Speaker')
# speaker_id and namespk are parallel lists: same index, same speaker.
speaker_id=[]
namespk=[]
for spk in Speaker:
id_spk=spk.attributes['id'].value
id_spk=normalize('NFKD', id_spk).encode('utf-8', 'ignore')
name_spk=spk.attributes['name'].value
name_spk=normalize('NFKD', name_spk).encode('utf-8', 'ignore')
# Spaces are removed from both values; names are additionally lower-cased so
# they can be matched against the metadata file below.
speaker_id.append(id_spk.replace(" ",""))
namespk.append(name_spk.lower().replace(" ",""))
# Read the metadata file to get each speaker's gender (gender and name)
file_xml=file_trs.split('.')[0]+'.xml'
xmldoc= minidom.parse(file_xml)
locuteur= xmldoc.getElementsByTagName('locuteur')
sexe= xmldoc.getElementsByTagName('sexe')
# speaker_gender collects [speaker id, gender] pairs.
speaker_gender=[]
# count indexes into the <sexe> node list.
# NOTE(review): this relies on <sexe> elements lining up one-to-one with the
# <locuteur> elements being visited - confirm against the metadata schema.
count=0
print namespk
print speaker_id
for loc in locuteur:
if loc.hasAttribute('identifiant'):
name_loc=loc.attributes['identifiant'].value
name_loc=normalize('NFKD', name_loc).encode('utf-8', 'ignore').replace(" ","")
print name_loc
# If the speaker's gender is not given, fall back to 'm'.
# namespk.index(...) raises ValueError when the metadata name has no matching
# <Speaker> name in the transcript.
if sexe[count].childNodes==[]:
speaker_gender.append([speaker_id[namespk.index(name_loc.lower())],'m'])
else:
gender_loc="".join(t.nodeValue for t in sexe[count].childNodes if t.nodeType == t.TEXT_NODE)
gender_loc=normalize('NFKD', gender_loc).encode('utf-8', 'ignore')
speaker_gender.append([speaker_id[namespk.index(name_loc.lower())],gender_loc.lower()])
count=count+1
print speaker_gender
#g_spk='m' if gender_spk=='male' else 'f'
#speaker_gender.append([id_spk,g_spk])
#print speaker_gender
Turnlist= trsdoc.getElementsByTagName('Turn')
#print len(Turnlist)
# NOTE(review): 'a' is only referenced by commented-out code further down.
a=""
# From here on count is reused as the running segment number (starts at 1).
count=1
#print "#id_utt\tid_Seg\tid_Spkr\tstartTime\tendTime\tText"
for Turn in Turnlist:
# Get the speaker id of the turn
att_spk=Turn.attributes['speaker'].value
spkr=normalize('NFKD', att_spk).encode('utf-8', 'ignore')
# Get the segment start time
att_startTime=Turn.attributes['startTime'].value
startTime=normalize('NFKD', att_startTime).encode('utf-8', 'ignore')
# Get the segment end time
att_endTime=Turn.attributes['endTime'].value
endTime=normalize('NFKD', att_endTime).encode('utf-8', 'ignore')
# Get the text: only the direct text-node children of the <Turn> are joined.
field_text="".join(t.nodeValue for t in Turn.childNodes if t.nodeType == t.TEXT_NODE)
#print field_text.encode('utf-8','ignore')
#a=a.decode('unicode_escape').encode('utf-8','ignore').split()
_text=field_text.encode('utf-8','ignore').split()
# Re-join the whitespace-split tokens, each prefixed by one space, so a
# non-empty result starts with a space.
text=""
for x in _text:
text=text+' '+x
# Text transformation still to be applied (call currently disabled)
#bool,text=transformation_text(text)
bool=True
seg_id=str(os.path.basename(file_trs.split('.')[0]))+'_seg-%07d' % count
# NOTE(review): assumes the speaker attribute looks like 'spk<N>' with a
# single speaker per turn; anything else makes int(...) fail - confirm.
spkr_id=str(os.path.basename(file_trs.split('.')[0]))+'_spk-%03d' % int(spkr.split('spk')[1])
if bool and text!="":
#print seg_id+'\t'+spkr_id+'\t'+startTime+'\t'+endTime+'\t'+text
# segments line: segment id, recording id (input file stem), start, end
print >> segments_file, '%s %s %s %s' % (seg_id, os.path.basename(file_trs.split('.')[0]), startTime, endTime)
print >> utt2spk_file, '%s %s' % (seg_id, spkr_id)
print >> text_file, '%s %s' % (seg_id, text)
# Write the gender of the first speaker_gender entry whose id equals the
# turn's speaker attribute.
# NOTE(review): the line is keyed by seg_id, not spkr_id - confirm that this
# is what the consumer of spk2gender expects.
for spk_tuple in speaker_gender:
if spk_tuple[0]==spkr:
print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1])
break
count=count+1
# wav.scp line: recording id followed by a sox command for the .wav file that
# shares the transcript's directory and stem.
print >> wav_scp, '%s sox %s -t wav -r 16000 -c 1 -' % (os.path.basename(file_trs.split('.')[0]), os.path.dirname(file_trs)+'/'+os.path.basename(file_trs.split('.')[0])+'.wav')
# NOTE(review): spk2gender is opened above but is not closed here.
segments_file.close()
utt2spk_file.close()
text_file.close()
wav_scp.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from xml.dom import minidom
from unicodedata import normalize
from sys import argv
import re
import os.path
def transformation_text(text):
    """Clean one transcript line and tell the caller whether to keep it.

    A line containing "###" or "(" is rejected and handed back untouched.
    Any other line is normalised step by step:
      * "+", runs of "*", "///", "/", "<" and ">" are removed;
      * "-", "_" and "." become spaces;
      * a spelled-out "O K" becomes "ok";
      * "{...}" comments become a space;
      * words carrying the "¤" marker (and stray "¤") are removed;
      * runs of spaces are collapsed;
      * the token "4x4" is spelled out as "quatre fois quatre".

    Args:
        text: raw utterance text.

    Returns:
        A ``(keep, text)`` tuple; ``keep`` is False when the line must be
        dropped, in which case ``text`` is the unmodified input.

    Side effects:
        Prints a debug line to stdout ("Supprimer Ligne" for a rejected
        line, the raw text otherwise).
    """
    # Rejected lines: see "amorces" (false starts) in the transcription guide.
    if "###" in text or "(" in text:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Supprimer Ligne")
        return False, text

    # print "detecter (///|/|<|>)"
    print(text)
    text = re.sub(r"(\+|[*]+|///|/|<|>)", "", text.strip())
    text = re.sub(r"-|_|\.", " ", text.strip())
    text = re.sub(r"(O K | O K|^O K$)", " ok ", text)
    text = re.sub(r"{[^{]+}", " ", text.strip())
    # text = re.sub(r"¤[^¤]+¤", "", text.strip())
    text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
    text = re.sub(r" +", " ", text.strip())
    # The text has just been stripped, so "4x4" can sit at either end of the
    # line; match it as a whole space-delimited token instead of requiring a
    # literal space on both sides (which could never match at the edges).
    text = re.sub(r"(^| )4x4(?= |$)", r"\1quatre fois quatre", text)

    # Markers that earlier drafts only detected and that are not handled
    # separately here: "***" / "*" (incomprehensible syllables) and "$$$"
    # (cut in the recording).
    return True, text
# Script entry point: reads one .trs XML transcript (argv[1]) plus the .xml
# metadata file next to it, and appends Kaldi input files under argv[2].
# NOTE(review): this copy of the code has lost its indentation; the nesting of
# the loops and conditionals below has to be restored before it can run.
if __name__=="__main__":
file_trs=argv[1]
outdir=argv[2]
# Debug output: the input path up to its first '.'.
# NOTE(review): split('.')[0] cuts at the first dot anywhere in the path, not
# only at the extension - confirm that no directory name contains a dot.
print file_trs.split('.')[0]
# Output files needed as Kaldi input; all are opened in append mode, so lines
# from earlier runs stay in place unless the caller removes the files first.
segments_file = open(outdir + '/segments', 'a')
utt2spk_file = open(outdir + '/utt2spk', 'a')
text_file = open(outdir + '/text', 'a')
wav_scp = open(outdir + '/wav.scp', 'a')
spk2gender= open(outdir + '/spk2gender', 'a')
# Read the transcript file
trsdoc= minidom.parse(file_trs)
# Read speaker metadata (id and name) from the <Speaker> elements
Speaker= trsdoc.getElementsByTagName('Speaker')
# speaker_id and namespk are parallel lists: same index, same speaker.
speaker_id=[]
namespk=[]
for spk in Speaker:
id_spk=spk.attributes['id'].value
id_spk=normalize('NFKD', id_spk).encode('utf-8', 'ignore')
name_spk=spk.attributes['name'].value
name_spk=normalize('NFKD', name_spk).encode('utf-8', 'ignore')
# Ids are stored as-is; names are lower-cased (spaces are kept) so they can be
# matched against the metadata file below.
speaker_id.append(id_spk)
namespk.append(name_spk.lower())
# Read the metadata file to get each speaker's gender (gender and name)
file_xml=file_trs.split('.')[0]+'.xml'
xmldoc= minidom.parse(file_xml)
locuteur= xmldoc.getElementsByTagName('locuteur')
sexe= xmldoc.getElementsByTagName('sexe')
# speaker_gender collects [speaker id, gender] pairs.
speaker_gender=[]
# count indexes into the <sexe> node list.
# NOTE(review): this relies on <sexe> elements lining up one-to-one with the
# <locuteur> elements being visited - confirm against the metadata schema.
count=0
for loc in locuteur:
if loc.hasAttribute('identifiant'):
name_loc=loc.attributes['identifiant'].value
name_loc=normalize('NFKD', name_loc).encode('utf-8', 'ignore')
# If the speaker's gender is not given, fall back to 'm'.
# NOTE(review): the emptiness test always inspects sexe[0], while the else
# branch reads sexe[count]; this looks like it should be sexe[count] - confirm.
# namespk.index(...) raises ValueError when the metadata name has no matching
# <Speaker> name in the transcript.
if sexe[0].childNodes==[]:
speaker_gender.append([speaker_id[namespk.index(name_loc.lower())],'m'])
else:
gender_loc="".join(t.nodeValue for t in sexe[count].childNodes if t.nodeType == t.TEXT_NODE)
gender_loc=normalize('NFKD', gender_loc).encode('utf-8', 'ignore')
speaker_gender.append([speaker_id[namespk.index(name_loc.lower())],gender_loc.lower()])
count=count+1
#g_spk='m' if gender_spk=='male' else 'f'
#speaker_gender.append([id_spk,g_spk])
#print speaker_gender
Turnlist= trsdoc.getElementsByTagName('Turn')
#print len(Turnlist)
# NOTE(review): 'a' is only referenced by commented-out code further down.
a=""
# From here on count is reused as the running segment number (starts at 1).
count=1
#print "#id_utt\tid_Seg\tid_Spkr\tstartTime\tendTime\tText"
for Turn in Turnlist:
# Get the speaker id of the turn
att_spk=Turn.attributes['speaker'].value
spkr=normalize('NFKD', att_spk).encode('utf-8', 'ignore')
# Get the segment start time
att_startTime=Turn.attributes['startTime'].value
startTime=normalize('NFKD', att_startTime).encode('utf-8', 'ignore')
# Get the segment end time
att_endTime=Turn.attributes['endTime'].value
endTime=normalize('NFKD', att_endTime).encode('utf-8', 'ignore')
# Get the text: only the direct text-node children of the <Turn> are joined.
field_text="".join(t.nodeValue for t in Turn.childNodes if t.nodeType == t.TEXT_NODE)
#print field_text.encode('utf-8','ignore')
#a=a.decode('unicode_escape').encode('utf-8','ignore').split()
_text=field_text.encode('utf-8','ignore').split()
# Re-join the whitespace-split tokens, each prefixed by one space, so a
# non-empty result starts with a space.
text=""
for x in _text:
text=text+' '+x
# Text transformation still to be applied (call currently disabled)
#bool,text=transformation_text(text)
bool=True
seg_id=str(os.path.basename(file_trs.split('.')[0]))+'_seg-%07d' % count
# NOTE(review): assumes the speaker attribute looks like 'spk<N>' with a
# single speaker per turn; anything else makes int(...) fail - confirm.
spkr_id=str(os.path.basename(file_trs.split('.')[0]))+'_spk-%03d' % int(spkr.split('spk')[1])
if bool and text!="":
#print seg_id+'\t'+spkr_id+'\t'+startTime+'\t'+endTime+'\t'+text
# segments line: segment id, recording id (input file stem), start, end
print >> segments_file, '%s %s %s %s' % (seg_id, os.path.basename(file_trs.split('.')[0]), startTime, endTime)
print >> utt2spk_file, '%s %s' % (seg_id, spkr_id)
print >> text_file, '%s %s' % (seg_id, text)
# Write the gender of the first speaker_gender entry whose id equals the
# turn's speaker attribute.
# NOTE(review): the line is keyed by seg_id, not spkr_id - confirm that this
# is what the consumer of spk2gender expects.
for spk_tuple in speaker_gender:
if spk_tuple[0]==spkr:
print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1])
break
count=count+1
# wav.scp line: recording id followed by a sox command for the .wav file that
# shares the transcript's directory and stem.
print >> wav_scp, '%s sox %s -t wav -r 16000 -c 1 -' % (os.path.basename(file_trs.split('.')[0]), os.path.dirname(file_trs)+'/'+os.path.basename(file_trs.split('.')[0])+'.wav')
# NOTE(review): spk2gender is opened above but is not closed here.
segments_file.close()
utt2spk_file.close()
text_file.close()
wav_scp.close()
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment