Commit 8e434695 authored by Abdelwahab HEBA

V1 sys FR

parent a4dda09b
@@ -15,6 +15,6 @@
#export mkgraph_cmd="queue.pl --mem 8G"
# Local machine
export train_cmd="run.pl --mem 2G"
export decode_cmd="run.pl --mem 4G"
export train_cmd="run.pl --mem 8G"
export decode_cmd="run.pl --mem 8G"
export mkgraph_cmd="run.pl --mem 8G"
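# For reference (illustrative paths), these wrappers are invoked as:
#   $train_cmd JOB=1:4 exp/tri1/log/acc.JOB.log gmm-acc-stats-ali ...
# run.pl runs the jobs locally; queue.pl would submit them to a grid engine.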
# based on Elyes Config
--window-type=hamming
--use-energy=false # only fbank outputs
--sample-frequency=16000
--num-mel-bins=23
--use-energy=false # only non-default option.
--sample-frequency=16000
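# These options are read via --config by the Kaldi feature binaries, e.g.
# (illustrative): compute-fbank-feats --config=conf/fbank.conf scp:wav.scp ark:-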
#!/bin/bash
#
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# To be run from one directory above this script.
perl -e 'while(<>){
s/\|(\d)/\| $1/g; s/(\d)\|/$1 \|/g;
if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool.
elsif (m: (Mean|Sum/Avg|)\s*\|\s*\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|:
&& (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } } # sclite.
if (defined $bestline){ print $bestline; } ' | \
awk 'BEGIN{ FS="%WER"; } { if(NF == 2) { print FS$2" "$1; } else { print $0; }}' | \
awk 'BEGIN{ FS="Sum/Avg"; } { if(NF == 2) { print $2" "$1; } else { print $0; }}' | \
awk '{ if($1!~/%WER/) { print "%WER "$9" "$0; } else { print $0; }}' | \
sed -e 's|\s\s*| |g' -e 's|\:$||' -e 's|\:\s*\|\s*$||'
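# The pipeline above picks the best-WER line out of scoring logs; it matches
# kaldi compute-wer output of the form (illustrative numbers):
#   %WER 23.45 [ 1234 / 5263, 123 ins, 456 del, 655 sub ]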
#!/bin/bash
# Copyright 2016 Linagora (author: Abdel HEBA)
# see research.linagora.com OpenPaas Project and https://hubl.in for meetings
# GPL
source path.sh
LANG=en_US.ISO-8859-15
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir> <dst-dir>"
echo "e.g: $0 /home/lingora/Documents/Linagora/Data/Tcof/tcof/3/Corpus/train data/train"
  exit 1
fi
src=$1
dst=$2
# All utterances are compressed WAVs; we use sox to read the signal in binary format
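# For reference, a wav.scp entry that reads audio through a sox pipe typically
# looks like (utterance id and path illustrative):
#   meeting1-001 sox /path/to/meeting1.wav -t wav -r 16000 -c 1 - |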
if ! which sox >&/dev/null; then
@@ -38,7 +40,7 @@ trans=$dst/text; [[ -f "$trans" ]] && rm $trans
utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk
spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender
utt2dur=$dst/utt2dur; [[ -f "$utt2dur" ]] && rm $utt2dur
segments=$dst/segments; [[ -f "$segments" ]] && rm $segments
# to be reviewed
# cat lexicon/lexicon | awk '{print $1}' | egrep "_|-|'" | egrep -v '^-|-$|\)$' > lexicon/lex
@@ -57,24 +59,50 @@ for meeting_dir in $(find $src -mindepth 1 -maxdepth 1 -type d | sort); do
# echo "Unexpected gender: '$reader_gender'"
#exit 1;
#fi
-$PYTHON local/parseTcof.py $meeting_dir/$meeting.trs $dst >> log.txt 2>&1
+$PYTHON local/parseTcofSync.py $meeting_dir/$meeting.trs $dst >> log.txt 2>&1
done
#spk2utt=$dst/spk2utt
# utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt #|| exit 1
# ntrans=$(wc -l <$trans)
# nutt2spk=$(wc -l <$utt2spk)
# ! [ "$ntrans" -eq "$nutt2spk" ] && \
# echo "Inconsistent #transcripts($ntrans) and # utt2spk($nutt2spk)" #&& exit 1;
# utils/data/get_utt2dur.sh $dst 1>&2 #|| exit 1
# utils/validate_data_dir.sh --no-feats $dst #|| exit 1;
# echo "Successfully prepared data in $dst.."
# Sort all files by key (first field): Kaldi tools expect sorted data files.
# text
sort -k1 $trans > $trans.txt
mv $trans.txt $trans
# segments
sort -k1 $segments > $segments.txt
mv $segments.txt $segments
# wav
sort -k1 $wav_scp > $wav_scp.txt
mv $wav_scp.txt $wav_scp
# spk2gender
sort -k1 $spk2gender > $spk2gender.txt
mv $spk2gender.txt $spk2gender
# utt2spk
sort -k1 $utt2spk > $utt2spk.txt
mv $utt2spk.txt $utt2spk
spk2utt=$dst/spk2utt
utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt #|| exit 1
# spk2utt
sort -k1 $spk2utt > $spk2utt.txt
mv $spk2utt.txt $spk2utt
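# For reference, utt2spk_to_spk2utt.pl inverts the mapping (ids illustrative):
#   utt2spk: "spk1-utt001 spk1" / "spk1-utt002 spk1"  ->  spk2utt: "spk1 spk1-utt001 spk1-utt002"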
ntrans=$(wc -l <$trans)
nutt2spk=$(wc -l <$utt2spk)
! [ "$ntrans" -eq "$nutt2spk" ] && \
echo "Inconsistent #transcripts($ntrans) and # utt2spk($nutt2spk)" #&& exit 1;
utils/data/get_utt2dur.sh $dst 1>&2 #|| exit 1
utils/validate_data_dir.sh --no-feats $dst #|| exit 1;
echo "Successfully prepared data in $dst.."
#exit 0
#!/bin/bash
# Copyright 2014 Vassil Panayotov
# Apache 2.0
# Prepares the test-time language model (G) transducers
# (adapted from wsj/s5/local/wsj_format_data.sh)
. ./path.sh || exit 1;
# begin configuration section
src_dir=data/lang
# end configuration section
. utils/parse_options.sh || exit 1;
set -e
if [ $# -ne 1 ]; then
echo "Usage: $0 <lm-dir>"
echo "e.g.: $0 /export/a15/vpanayotov/data/lm"
echo ", where:"
echo " <lm-dir> is the directory in which the language model is stored/downloaded"
echo "Options:"
echo " --src-dir <dir> # source lang directory, default data/lang"
exit 1
fi
lm_dir=$1
if [ ! -d $lm_dir ]; then
echo "$0: expected source LM directory $lm_dir to exist"
exit 1;
fi
if [ ! -f $src_dir/words.txt ]; then
echo "$0: expected $src_dir/words.txt to exist."
exit 1;
fi
tmpdir=data/local/lm_tmp.$$
trap "rm -r $tmpdir" EXIT
mkdir -p $tmpdir
for lm_suffix in tgsmall tgmed tglarge fglarge; do
# Note: compiling G.fst for the larger LMs (tglarge, fglarge) can take a while.
test=${src_dir}_test_${lm_suffix}
mkdir -p $test
cp -r ${src_dir}/* $test
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
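  # G.fst now encodes the ARPA LM as an FST over words.txt symbols, with the
  # #0 disambiguation symbol standing in for backoff arcs.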
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
done
echo "Succeeded in formatting data."
exit 0
#!/bin/bash
# Copyright 2017 Abdel HEBA @ Linagora
# Apache 2.0
# Auto-generates pronunciations using Sequitur G2P
. path.sh || exit 1
[ -z "$PYTHON" ] && PYTHON=python2.7
if [ $# -ne 3 ]; then
echo "Usage: $0 <vocab> <g2p-model-dir> <out-lexicon>"
echo "e.g.: $0 data/local/dict/g2p/vocab_autogen.1 /export/a15/vpanayotov/data/g2p data/local/dict/g2p/lexicon_autogen.1"
echo ", where:"
echo " <vocab> - input vocabulary, that's words for which we want to generate pronunciations"
echo " <g2p-model-dir> - source directory where g2p model is located"
echo " <out-lexicon> - the output, i.e. the generated pronunciations"
exit 1
fi
vocab=$1
g2p_model_dir=$2
out_lexicon=$3
[ ! -f $vocab ] && echo "Can't find the G2P input file: $vocab" && exit 1;
sequitur_model=$g2p_model_dir/model-3
# It turns out that Sequitur has a bug that prevents it from outputting pronunciations
# for some (admittedly peculiar) words; we specify these exceptions manually below
g2p_exceptions="HH HH" # more such entries can be added, separated by "\n"
# $sequitur (the g2p.py script) and $sequitur_path (its library directory) are
# expected to be set by path.sh; the fallbacks below assume (assumption) the
# standard Kaldi tools layout.
[ -z "$sequitur" ] && sequitur=$KALDI_ROOT/tools/sequitur/g2p.py
[ -z "$sequitur_path" ] && sequitur_path=$(dirname $sequitur)/lib
[ ! -f $sequitur ] && \
  echo "Can't find the Sequitur G2P script. Please check $KALDI_ROOT/tools for installation scripts and instructions" && \
  exit 1;
[ ! -d $sequitur_path ] && echo "Can't find '$sequitur_path' - please fix your Sequitur installation" && exit 1
[ ! -f $sequitur_model ] && echo "Can't find the Sequitur model file: $sequitur_model" && exit 1
PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
--model=$sequitur_model --apply $vocab \
>${out_lexicon}.tmp || exit 1
awk 'NR==FNR{p[$1]=$0; next;} {if ($1 in p) print p[$1]; else print}' \
<(echo -e $g2p_exceptions) ${out_lexicon}.tmp >$out_lexicon || exit 1
rm ${out_lexicon}.tmp
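# For reference, each output line pairs a word with its generated pronunciation,
# e.g. (word and phones illustrative): BONJOUR b o~ Z u R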
exit 0
#!/bin/bash
# Copyright 2017 @ Linagora Abdel HEBA
# Trains Sequitur G2P models on a pronunciation dictionary (here fr.dict)
# "stage" can be used to skip some of the initial steps
stage=1
. utils/parse_options.sh || exit 1
. path.sh || exit 1
if [ $# -ne "2" ]; then
echo "Usage: $0 <cmudict-download-dir> <g2p-dir>"
echo "e.g.: $0 data/local/dict/cmudict data/local/g2p_model"
exit 1
fi
cmudict_dir=$1
g2p_dir=$2
mkdir -p $cmudict_dir
mkdir -p $g2p_dir
cmudict_clean=$cmudict_dir/fr.dict
if [ $stage -le 1 ]; then
echo "Downloading and preparing CMUdict"
if [ ! -s $cmudict_dir/fr.dict ]; then
# à modifier
echo "voir telechargement fr.dict"
#svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1;
else
echo "CMUdict copy found in $cmudict_dir - skipping download!"
fi
fi
model_1=$g2p_dir/model-1
if [ $stage -le 2 ]; then
echo "Training first-order G2P model (log in '$g2p_dir/model-1.log') ..."
  # $sequitur, $sequitur_path and $PYTHON are expected to be set by path.sh
  PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
    --train $cmudict_dir/fr.dict --devel 5% --write-model $model_1 >$g2p_dir/model-1.log 2>&1 || exit 1
fi
model_2=$g2p_dir/model-2
if [ $stage -le 3 ]; then
echo "Training second-order G2P model (log in '$g2p_dir/model-2.log') ..."
PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
    --model $model_1 --ramp-up --train $cmudict_clean \
    --devel 5% --write-model $model_2 \
    >$g2p_dir/model-2.log 2>&1 || exit 1
fi
model_3=$g2p_dir/model-3
if [ $stage -le 4 ]; then
echo "Training third-order G2P model (log in '$g2p_dir/model-3.log') ..."
PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
--model $model_2 --ramp-up --train $cmudict_clean \
--devel 5% --write-model $model_3 \
>$g2p_dir/model-3.log 2>&1 || exit 1
fi
model_4=$g2p_dir/model-4
if [ $stage -le 5 ]; then
echo "Training fourth-order G2P model (log in '$g2p_dir/model-4.log') ..."
PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
--model $model_3 --ramp-up --train $cmudict_clean \
--devel 5% --write-model $model_4 \
>$g2p_dir/model-4.log 2>&1 || exit 1
fi
model_5=$g2p_dir/model-5
if [ $stage -le 6 ]; then
echo "Training fifth-order G2P model (log in '$g2p_dir/model-5.log') ..."
PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
--model $model_4 --ramp-up --train $cmudict_clean \
--devel 5% --write-model $model_5 \
>$g2p_dir/model-5.log 2>&1 || exit 1
fi
echo "G2P training finished OK!"
exit 0
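# Typical chain (illustrative): run this script to build model-1..model-5, then
# point the apply script above (which expects model-3 in <g2p-model-dir>) at
# $g2p_dir together with a vocabulary file to generate lexicon entries.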
#!/bin/bash
# Copyright 2017 Abdel HEBA Linagora GSO
# Extract and normalize text from file.trs built with Transcriber for subsequent language model training
echo $@
. path.sh || exit 1
if [[ $# -ne 2 ]]; then
echo "Usage: $0 <input-book-dirs> <output-root>"
exit 1
fi
in_list=$1
out_root=$2
[[ -f "$in_list" ]] || { echo "The input file '$in_list' does not exists!"; exit 1; }
mkdir -p $out_root
processed=0
for b in $(cat $in_list); do
id=$(basename $b)
echo "Start processing $id at $(date '+%T %F')"
in_file=$b/$id.trs
[[ -f "$in_file" ]] || { echo "WARNING: $in_file does not exists"; continue; }
$PYTHON local/lm/parseText.py $in_file |\
$PYTHON local/lm/pre_filter.py /dev/stdin $out_root/corpus_train.txt
processed=$((processed + 1))
echo "Processing of $id has finished at $(date '+%T %F') [$processed texts ready so far]"
done
echo "$processed texts processed OK and stored under '$out_root'"
#!/usr/bin/env python
# -*- coding: latin-1 -*-
from xml.etree import ElementTree as ET
from unicodedata import normalize
from sys import argv
from num2words import num2words
import regex as re  # the third-party 'regex' package is needed for the \p{L} classes below
import os.path
# ( in text
# ) in text
def transformation_text(text):
    keep=True
    # Drop the line when it contains: ### | bracketed annotations [..] |
    # word fragments / false starts (amorces) | incomprehensible syllables
    if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
            len(re.findall(r"\p{L}+-[^\p{L}]|\p{L}+-$", text)) > 0 \
            or len(re.findall(r"[^\p{L}]-\p{L}+|^-\p{L}+", text)) > 0:
        # line dropped
        keep=False
    else:
        # "4x4" -> "4 4" (digit-x-digit patterns only)
        if len(re.findall(r"\dx\d", text)) > 0:
            text = re.sub(r"(\d)x(\d)", r"\1 \2", text)
        # times such as "10h30" -> "10 heure 30"
        if len(re.findall(r"\d+h\d+", text)) > 0:
            heures = re.findall(r"\d+h\d+", text)
            for h in heures:
                split_h = h.split('h')
                text_rep = split_h[0] + ' heure ' + split_h[1]
                text = text.replace(h, text_rep)
        text = re.sub(r',', ' ', text)
        # non-standard liaison markup (=word=)
        text = re.sub(r'=\w+=', '', text)
        # Transcriber comments {...} and (...)
        text = re.sub(r'\{.+\}', '', text)
        text = re.sub(r'\(.+\)', '', text)
#print "detecter (///|/|<|>)"
# Remove undecidable variant heared like on (n') en:
text=re.sub(r"\(.+\)","",text)
#text = re.sub(r"(\+|[*]+|///|/|<|>)", "", text.strip())
#text=re.sub(r"-|_|\."," ",text.strip())
text=re.sub(r'(O.K.)','ok',text)
text = re.sub(r'(O.K)', 'ok', text)
# Replace . with ' '
text=re.sub(r'\.',' ',text)
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
text=re.sub(r"\?|/|\!|<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$","",text)
# Remove noise sound (BIP) over Name of places and person
#text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
text=re.sub(r"(¤.+¤)",'',text)
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
text=re.sub(r"(\+)", "", text)
text=re.sub(r"(///)", "", text)
#text=re.sub(r"(///)", "<long-sil>", text)
if len(re.findall(r"/.+/", text)) > 0:
#print "AVANT***********"+text
for unchoosen_text in re.findall(r"/.+/", text):
# choose first undecideble word
unchoosen_word=unchoosen_text.split(',')
for choosen_word in unchoosen_word:
# isn't incomprehensible word
if len(re.findall(r"\*+|\d+", choosen_word))==0:
choosen_word = choosen_word.replace('/', '')
text = text.replace(unchoosen_text, choosen_word)
#print "Apres************"+text
        # remove unknown-syllable markers (*)
        text = re.sub(r"\*+", "", text)
        # recording-cut markers ($)
        text = re.sub(r"\$+", "", text)
        # remove " characters
        text = re.sub(r"\"+", "", text)
        # normalize apostrophes, e.g. "t 'avais" -> "t' avais"
        text = re.sub(r"[ ]\'", " ", text)
        text = re.sub(r"\'", "\' ", text)
        # convert numbers to words, if any
        num_list = re.findall(r" \d+| \d+$", text)
        if len(num_list) > 0:
            for num in num_list:
                num_in_word = num2words(int(num), lang='fr')
                num_in_word = normalize('NFKD', num_in_word).encode('ascii', 'ignore')
                text = text.replace(str(num), " " + str(num_in_word) + " ")
        # collapse successive spaces into one and strip the leading space
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub("^ ", '', text)
        text = text.lower()
    return keep, text
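# Illustrative behavior (approximate, assuming the markers above):
#   transformation_text("il est 10h30 +")    -> (True, "il est dix heure trente")
#   transformation_text("blabla ### blabla") -> (False, text unchanged)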
if __name__=="__main__":
# Inputs
file_trs=argv[1]
basename=os.path.basename(file_trs.split('.')[0])
# MetaData File
file_meta = file_trs.split('.')[0] + '.xml'
#print file_trs.split('.')[0]
# Read Trans File
tree_trs = ET.parse(file_trs)
trsdoc= tree_trs.getroot()
text=""
Turn_count=0
count=0
has_attrib_speaker=False
# set for uniq add
for Element in trsdoc.iter():
if Element.tag=="Turn" and Element.get('speaker') is None:
has_attrib_speaker=False
elif Element.tag=="Turn":
# If the latest Utterance of previous Speaker is the latest one of his Turn speech
if Turn_count>0:
count = 0
bool, text = transformation_text(text)
# File wav.scp
# File utt2spk
# File text
# File speaker_gender
if bool and text!="":
print text.encode('utf-8')
#for spk_tuple in speaker_gender:
# if spk_tuple[0]==spkr:
# print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1])
# break
has_attrib_speaker=True
# count sync for computing start and end utterance
Turn_count = Turn_count+1
elif Element.tag=="Sync" and has_attrib_speaker:
if count>0:
bool, text = transformation_text(text)
if bool and text!="":
print text.encode('utf-8')
text=Element.tail.replace('\n', '')
count=count+1
elif Element.tag=="Comment" and has_attrib_speaker and not Element.tail is None:
text=text+" "+Element.tail.replace('\n', '')
elif Element.tag=="Event" and has_attrib_speaker and not Element.tail is None :
if Element.get('type')=='noise':
if Element.get('desc')=='rire':
text=text+" "+Element.tail.replace('\n', '')
else:
text=text+" "+Element.tail.replace('\n', '')
elif Element.get('type')=='pronounce':
text=text+" "+Element.tail.replace('\n', '')
else:
text=text+" "+Element.tail.replace('\n', '')
elif Element.tag=="Who" and has_attrib_speaker and not Element.tail is None:
text=text+" "+Element.tail.replace('\n', '')
    if count > 0 and has_attrib_speaker and not Element.tail is None:
        keep, text = transformation_text(text)
        if keep and text != "":
            print text.encode('utf-8')
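# Usage (illustrative): python2.7 local/parseTcofSync.py meeting.trs data/train
# Normalized utterance lines are printed to stdout (the caller redirects them to log.txt).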
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from xml.dom import minidom
from unicodedata import normalize
from sys import argv
from num2words import num2words
import re
import os.path
def transformation_text(text):
    keep=True
    # Drop the line when it contains: ### | bracketed annotations [..] | word
    # fragments / false starts (amorces) | BIP markers | undecidable variants /../
    if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
            len(re.findall(r"\w+-[^\w+]|\w+-$", text)) > 0 or \
            len(re.findall(r" -\w+", text)) > 0 or \
            len(re.findall(r"\¤", text)) > 0 or \
            len(re.findall(r"/.+/", text)) > 0:
        # line dropped
        keep=False
else:
# 4x4
text=re.sub(r"4x4","quatre fois quatre",text)
        # convert numbers to words, if any
num_list=re.findall(" \d+ ",text)
if len(num_list)>0:
for num in num_list:
num_in_word=num2words(int(num), lang='fr')
text=normalize('NFKD', text.replace(num,num_in_word)).encode('utf-8', 'ignore')
        # remove silence markers: + (short) and /// (long)
        #text=re.sub(r"(\+)", "<sil>", text)
        #text=re.sub(r"(///)", "<sil>", text)
        text=re.sub(r"(\+|///)", "", text.strip())
        # non-standard liaison markup (=word=)
        text=re.sub(r'=\w+=','',text)
        # Transcriber comments {...}
        text=re.sub(r'\{.+\}','',text)
        # Replace the noise marker (BIP) over names of places and persons with <noise>
        text=re.sub(r"(¤.+¤)",'<noise>',text)
        # Remove undecidable variants heard, e.g. "on (n') en"
        text=re.sub(r"\(.+\)","",text)