Commit a6fbf1c3 authored by Gaurav Kumar
Browse files

Added new scripts for processing Arabic script, converting lattices, and extracting n-best lists

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4266 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 05a3f2b9
#!/usr/bin/env python
"""Rewrite a romanized word lexicon using the script form of each word.

Usage: <this script> <lexicon-location> <word-lexicon> <out-lexicon>

  lexicon-location  directory prefix under which
                    callhome_arabic_lexicon_991012/ar_lex.v07 is found
  word-lexicon      space-separated input file whose first field is a
                    romanized word
  out-lexicon       output file, written as UTF-8
"""
import codecs
import io
import os
import sys

# Path of the lexicon file relative to the <lexicon-location> argument.
LEXICON_RELATIVE_PATH = "callhome_arabic_lexicon_991012/ar_lex.v07"


def load_roman_to_script(lexicon_location):
    """Return a dict mapping field 1 (romanized) to field 2 (script).

    The lexicon is read as ISO-8859-6 and split on tabs. Lines that do not
    have at least two tab-separated fields (e.g. blank lines) are skipped
    instead of raising IndexError.
    """
    # Historical behaviour: plain concatenation, which only works when the
    # argument ends with a path separator. Fall back to a proper join so a
    # directory given without the trailing '/' also works.
    lexicon_path = lexicon_location + LEXICON_RELATIVE_PATH
    if not os.path.exists(lexicon_path):
        lexicon_path = os.path.join(lexicon_location, LEXICON_RELATIVE_PATH)

    roman_to_script = {}
    with codecs.open(lexicon_path, encoding="iso-8859-6") as lexicon:
        for line in lexicon:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue
            roman_to_script[fields[0]] = fields[1]
    return roman_to_script


def convert_lexicon(lexicon_location, word_lexicon_loc, out_lexicon_loc):
    """Copy the word lexicon to out_lexicon_loc, replacing known first fields.

    Every input line is written out; only the first space-separated field is
    replaced, and only when it has an entry in the romanized-to-script map.
    """
    roman_to_script = load_roman_to_script(lexicon_location)

    # Decode the input explicitly so that joining it with the (unicode)
    # script words never relies on an implicit ASCII/locale conversion.
    with io.open(word_lexicon_loc, encoding="utf-8") as word_lexicon, \
            codecs.open(out_lexicon_loc, "w+", encoding="utf-8") as out_lexicon:
        for line in word_lexicon:
            fields = line.strip().split(" ")
            if fields[0] in roman_to_script:
                fields[0] = roman_to_script[fields[0]]
            out_lexicon.write(" ".join(fields) + '\n')


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.stderr.write(
            "Usage: %s <lexicon-location> <word-lexicon> <out-lexicon>\n"
            % argv[0])
        return 1
    convert_lexicon(argv[1], argv[2], argv[3])
    return 0


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/env bash
# Author : Gaurav Kumar, Johns Hopkins University
# Creates n-best lists from Kaldi lattices
# This script needs to be run from one level above this directory
#
# Usage: <this script> <latdir> <decode-dir> <acoustic-scale> [<partition>]
#   latdir          directory where all output files are written
#   decode-dir      directory containing the lat.*.gz lattices
#   acoustic-scale  value passed to lattice-to-nbest --acoustic-scale
#   partition       optional prefix used in every output file name
#
# For each lattice the script writes <latdir>/<partition>.lat.N.{bin,nbest,
# ali,words,lmscore,acscore,roman,script}; the per-lattice .roman/.script
# files are then appended to <latdir>/<partition>.all.nbest.{roman,script}.

. ./path.sh

if [ $# -lt 3 ]; then
  echo "Enter the latdir (where the n-best will be put), the decode dir containing lattices and the acoustic scale"
  echo "Usage: $0 <latdir> <decode-dir> <acoustic-scale> [<partition>]"
  exit 1
fi

noNBest=100
maxProcesses=10

latdir=$1
decode_dir=$2
acoustic_scale=$3
partition=${4:-}

symTable=exp/tri5a/graph/words.txt
scriptSymTable=data/local/dict/lexicon_script.txt
stage=0

if [ ! -d "$decode_dir" ]; then
  # Exit status is kept at 0, as before; only the diagnostic is new.
  echo "$0: decode dir '$decode_dir' does not exist, nothing to do" >&2
  exit 0
fi

mkdir -p "$latdir" || exit 1

allNBest=$latdir/$partition.all.nbest
runningProcesses=0
failures=0
pids=()
bnames=()

# Waits for every background job of the current batch and counts the ones
# that exited non-zero (a bare `wait` would discard their exit statuses).
wait_for_batch() {
  local pid
  for pid in "${pids[@]}"; do
    wait "$pid" || failures=$((failures + 1))
  done
  pids=()
}

for l in "$decode_dir"/lat.*.gz; do
  # Without a match the glob stays literal; skip it.
  [ -e "$l" ] || continue

  # Extract file name; output prefix is <latdir>/<partition>.lat.N
  bname=${l##*/}
  bname="$latdir/$partition.${bname%.gz}"
  bnames+=("$bname")

  (
    # Remove results of an earlier run so a failure below cannot leave
    # stale text to be merged.
    rm -f "$bname.roman" "$bname.script"

    # Unzip the file first
    gunzip -c "$l" > "$bname.bin" || exit 1

    if [ "$stage" -le 0 ]; then
      # Extract n-best from the lattices
      lattice-to-nbest --acoustic-scale="$acoustic_scale" --n="$noNBest" \
        "ark:$bname.bin" "ark:$bname.nbest" || exit 1

      # Convert the n-best lattice to linear word based sentences
      nbest-to-linear "ark,t:$bname.nbest" "ark,t:$bname.ali" "ark,t:$bname.words" \
        "ark,t:$bname.lmscore" "ark,t:$bname.acscore" || exit 1

      # Convert the int to word for each sentence. Each job writes its own
      # file: appending to the shared output from parallel jobs could
      # interleave the text of different lattices.
      utils/int2sym.pl -f 2- "$symTable" < "$bname.words" > "$bname.roman" || exit 1

      # Use the script version of the symbol table too
      utils/int2sym.pl -f 2- "$scriptSymTable" < "$bname.words" > "$bname.script" || exit 1
    fi

    echo "Done getting n-best"
  ) &
  pids+=("$!")

  runningProcesses=$((runningProcesses + 1))
  echo "#### Processes running =  $runningProcesses  ####"

  if [ "$runningProcesses" -eq "$maxProcesses" ]; then
    echo "#### Waiting for slot ####"
    wait_for_batch
    runningProcesses=0
    echo "#### Done waiting ####"
  fi
done

wait_for_batch

# Merge the per-lattice results sequentially, in glob order. Appending (not
# truncating) keeps the previous behaviour with respect to existing files.
for bname in "${bnames[@]}"; do
  if [ -f "$bname.roman" ]; then
    cat "$bname.roman" >> "$allNBest.roman"
  fi
  if [ -f "$bname.script" ]; then
    cat "$bname.script" >> "$allNBest.script"
  fi
done

if [ "$failures" -gt 0 ]; then
  echo "$0: $failures lattice job(s) failed" >&2
  exit 1
fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment