Commit faae7bb6 authored by Stefan Kombrink's avatar Stefan Kombrink
Browse files

adding script to train a RNN LM based on rnnlm+WSJ data

git-svn-id: 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 45ece9fb
# Require exactly three positional arguments (hidden-layer size, number of
# classes, output model name); otherwise print the usage text and stop.
# NOTE(review): the script name after "steps/" is missing from the usage
# strings below -- restore it once the real file name is confirmed.
if [ $# -ne 3 ]; then
  echo "Usage: steps/ <#hidden> <#class> <rnn-outfile-name>"
  echo "This is going to take a long time, try it first with "
  echo "steps/ 5 140 test"
  echo "but to improve over baseline you'd need e.g. 350 hidden, 200 classes"
  exit 1
fi
# Positional arguments, in the order given by the usage text:
#   $1 - number of hidden neurons, $2 - number of classes,
#   $3 - output model name (any directory part is stripped).
NumHidden=$1
NumClass=$2
ModelName=$(basename -- "$3")

# Start from an empty scratch directory.
# NOTE(review): TMP is not assigned anywhere in the visible part of this
# script; the :? guard aborts instead of running rm -rf on an empty path.
rm -rf -- "${TMP:?TMP (scratch directory) must be set}"
mkdir -p -- "$TMP"

# NOTE(review): the name of the file to source is missing on the next line,
# so it currently always fails and exits -- restore the intended file name.
. || exit 1;

# Everything below runs inside the scratch directory; stop if we cannot enter
# it, otherwise later steps would write into the caller's directory.
cd "$TMP" || exit 1
echo "Using $TMP"
echo "##DAN - TODO! Copy these data somehow from the LCD std DVD set!!! "
echo "##(So far, I copy them locally from Brno :)"
echo "Using 99% for training, 1% for validation,limiting vocab to 20k word list"

# Copies text from stdin to stdout, replacing every whitespace-separated token
# that is not in the word list with <UNK>. Tokens are joined by single spaces
# with no trailing space.
#   $1 - word list file; the first field of each line is a known word
# Returns non-zero (from awk) when the word list cannot be read.
map_oov_to_unk() {
  awk -v vocab="$1" '
    BEGIN {
      # Compare against 0 explicitly: getline returns -1 on a read error,
      # which would otherwise keep the loop spinning forever.
      while ((rc = (getline < vocab)) > 0) v[$1] = $2
      if (rc < 0) {
        print "cannot read word list: " vocab > "/dev/stderr"
        exit 1
      }
      close(vocab)
    }
    {
      # Print tokens through a fixed format so that a "%" inside the corpus
      # text is never interpreted as a printf directive.
      for (i = 1; i <= NF; i++)
        printf "%s%s", (i > 1 ? " " : ""), (($i in v) ? $i : "<UNK>")
      print ""
    }'
}

# Decompress the raw corpus and restrict it to the known vocabulary.
# NOTE(review): WordSym is not assigned in the visible part of this script.
gzip -dc /mnt/matylda2/data/WSJ1/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/* \
  | map_oov_to_unk "$WordSym" > WSJ.txt

# Every 100th line goes to the validation set, the rest to the training set.
awk 'FNR % 100 == 0' WSJ.txt > WSJ.valid
awk 'FNR % 100 != 0' WSJ.txt > WSJ.train
echo "Downloading latest version of rnnlm"
# NOTE(review): no download command is visible here; rnnlm-0.3b.tgz presumably
# has to be present in the current directory already -- confirm.
tar xzf rnnlm-0.3b.tgz || exit 1

# Build inside a subshell: the working directory of the script itself never
# changes, so a failed cd cannot make a later "cd .." leave the scratch dir.
# -lm is placed after the sources so the linker sees it once symbols are known.
(
  cd rnnlm-0.3b || exit 1
  g++ -O2 -funroll-loops -fprefetch-loop-arrays -g rnnlmlib.cpp rnnlm.cpp -o ../rnnlm -lm
) || exit 1
echo "Training a RNN language model ($ModelName) with $NumHidden hidden neurons and $NumClass classes"
# Train the model, writing it to ./rnn. Stop on failure so the scratch
# directory (and any partial output or logs in it) is kept for inspection.
time ./rnnlm -rnnlm rnn -train WSJ.train -valid WSJ.valid -hidden "$NumHidden" -class "$NumClass" -bptt 5 -bptt-block 10 -debug 2 -alpha 0.1 || exit 1

# Move the model out to the parent directory BEFORE deleting the scratch
# directory; if the move fails, bail out rather than delete the trained model.
mv -- rnn "../$ModelName" || exit 1

echo "Deleting $TMP"
# The :? guard refuses to run rm -rf on an empty or unset path.
rm -rf -- "${TMP:?}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment