Commit 05a3f2b9 authored by Gaurav Kumar
Browse files

Added new scripts for processing Arabic script, converting lattices, and extracting n-best lists

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4265 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent e2700de1
......@@ -24,8 +24,12 @@ fi
# Compute oracle transcriptions for the lattices in $latticeDir.
# The reference text has non-speech tokens stripped with sed, is converted to
# integer word ids with utils/sym2int.pl using $symTable (the `-f 2-` option
# presumably leaves the first field, the utterance id, untouched — confirm
# against sym2int.pl), and is piped to lattice-oracle on stdin (`ark:-`).
# lattice-oracle reads the gunzipped lat.*.gz archives, writes a text archive
# to $oracleDir/oracle.tra, and sends stderr to $oracleDir/oracle.log.
# NOTE(review): two `cat $textFile | ...` pipeline heads appear below (one
# without and one with `--map-oov [oov]` / the `[hes]` filter). This looks
# like an old/new pair from a diff rendered without +/- markers — confirm
# which one the real script keeps.
mkdir -p $oracleDir
cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | \
utils/sym2int.pl -f 2- $symTable | \
# Since the lexicon is built from the LDC lexicon, there are words in the dataset
# that do not appear in the lexicon. These have to be marked as OOV.
# Removing [hes] symbols as well. This is not consistent with the scoring scheme used
# while scoring 1-best.
cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | sed 's:\[hes\]::g' | \
utils/sym2int.pl --map-oov [oov] -f 2- $symTable | \
$KALDI_ROOT/src/latbin/lattice-oracle --word-symbol-table=$symTable "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- ark,t:$oracleDir/oracle.tra 2>$oracleDir/oracle.log
# Sort oracle.tra in place by its first field, keeping one line per key.
sort -k1,1 -u $oracleDir/oracle.tra -o $oracleDir/oracle.tra
......@@ -3,17 +3,17 @@
# Output directory; created here and passed as the first argument to
# local/latconvert.sh below.
outDir=exp/lat
mkdir -p $outDir
# The block below is guarded by `[ $stage -lt 1 ]`, so it only runs while
# stage is below 1.
# NOTE(review): two assignments to `stage` follow; read literally, the second
# one (stage=2) is the value in effect and the `-lt 1` block is skipped. This
# looks like an old/new pair from a diff rendered without +/- markers —
# confirm against the real script.
stage=1
stage=2
if [ $stage -lt 1 ]; then
# First convert all lattices into the pruned, minimized version
decodeDir=exp/tri5a/decode_dev
# NOTE(review): acousticScale is assigned twice in a row; read literally the
# later value (0.08333) is the one passed to latconvert.sh. Presumably 0.8333
# is the old value from the diff — confirm.
acousticScale=0.8333
acousticScale=0.08333
local/latconvert.sh $outDir $decodeDir $acousticScale
# Same conversion for the test decode directory (same duplicated assignment).
decodeDir=exp/tri5a/decode_test
acousticScale=0.8333
acousticScale=0.08333
local/latconvert.sh $outDir $decodeDir $acousticScale
fi
......@@ -30,3 +30,20 @@ if [ $stage -lt 2 ]; then
symTable=exp/tri5a/graph/words.txt
local/get_oracle.sh $latticeDir $symTable $textFile
fi
# Stage 3: make sure the script lexicon exists, then extract n-best lists
# from the dev and test decode directories. Runs only while stage is below 3.
if [ "$stage" -lt 3 ]; then
  # Create a script lexicon if it does not exist
  scriptLexicon=data/local/dict/lexicon_script.txt
  if [ ! -f "$scriptLexicon" ]; then
    local/callhome_prepare_script_dict.py /export/corpora/LDC/LDC99L22/ \
      exp/tri5a/graph/words.txt "$scriptLexicon"
  fi

  # Now get the n-best files from the lattices. The dev and test sets use the
  # same acoustic scale and differ only in the decode directory and set name,
  # so both are handled by one loop (dev first, then test).
  acousticScale=0.08333
  for dataSet in dev test; do
    decodeDir=exp/tri5a/decode_$dataSet
    # Arguments: output dir, decode dir, acoustic scale, set name.
    local/get_nbest.sh "$outDir" "$decodeDir" "$acousticScale" "$dataSet"
  done
fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment