Commit e378b3eb authored by Stefan Kombrink
Browse files

fix usage of wrong vocab!

git-svn-id: 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent eac28048
......@@ -12,31 +12,32 @@ NumHidden=$1
ModelName=$(basename $3)
rm -rf $TMP
mkdir -p $TMP
. || exit 1;
cd $TMP
echo "Using $TMP"
echo "Extracting word symbols from baseline ngram LM"
gzcat data_prep/ | awk '/^\\data/{dump=1;next}{if (dump)print}' | awk '/1-grams/{dump=1}/2-grams/{dump=0}{if (dump)print $2}' | awk '$0!=""{print}' > $TMP/vocab
cd $TMP
echo "##DAN - TODO! Copy these data somehow from the LCD std DVD set!!! "
echo "##(So far, I copy them locally from Brno :)"
echo "Using 99% for training, 1% for validation,limiting vocab to 20k word list"
gzcat /mnt/matylda2/data/WSJ1/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/* | awk '
while (getline<"'$WordSym'")v[$1]=$2
while (getline<"vocab")v[$1]=$2
for (i=1;i<=NF;i++)
if ($i in v)printf $i" ";
if ($i in v)printf $i" "
else printf "<UNK> ";
print ""
print "";
}' | sed 's/ $//' > WSJ.txt
cat WSJ.txt | awk 'FNR%100==0{print $0}' > WSJ.valid
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment