Commit e459d6d1 authored by Dan Povey's avatar Dan Povey
Browse files

trunk: adding Fisher-english example scripts for data-cleanup method (doesn't...

trunk: adding Fisher-english example scripts for data-cleanup method (doesn't seem to help on Fisher, but may be useful elsewhere.)

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4255 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 4eaefc9b
# Summarize the best WER for each decode directory. Quote "$x" (SC2086) but
# leave the wer_* glob unquoted so it still expands to the per-LMWT files.
for x in exp/*/decode_dev; do grep WER "$x"/wer_* | utils/best_wer.sh; done
%WER 50.00 [ 19571 / 39141, 1893 ins, 4738 del, 12940 sub ] exp/tri1/decode_dev/wer_12
%WER 49.52 [ 19384 / 39141, 1774 ins, 5035 del, 12575 sub ] exp/tri2/decode_dev/wer_13
%WER 42.57 [ 16664 / 39141, 1908 ins, 4080 del, 10676 sub ] exp/tri3a/decode_dev/wer_12
%WER 35.67 [ 13963 / 39141, 1810 ins, 3347 del, 8806 sub ] exp/tri4a/decode_dev/wer_13
%WER 32.09 [ 12560 / 39141, 1680 ins, 3131 del, 7749 sub ] exp/tri5a/decode_dev/wer_14
%WER 49.72 [ 19461 / 39141, 1999 ins, 4578 del, 12884 sub ] exp/tri1/decode_dev/wer_12
%WER 49.00 [ 19181 / 39141, 1812 ins, 4848 del, 12521 sub ] exp/tri2/decode_dev/wer_13
%WER 41.86 [ 16384 / 39141, 1735 ins, 4152 del, 10497 sub ] exp/tri3a/decode_dev/wer_13
%WER 34.73 [ 13593 / 39141, 1719 ins, 3365 del, 8509 sub ] exp/tri4a/decode_dev/wer_14
%WER 31.07 [ 12163 / 39141, 1869 ins, 2705 del, 7589 sub ] exp/tri5a/decode_dev/wer_13
%WER 31.13 [ 12184 / 39141, 1939 ins, 2584 del, 7661 sub ] exp/tri5a_0.1/decode_dev/wer_12
%WER 23.66 [ 9259 / 39141, 1495 ins, 2432 del, 5332 sub ] exp/nnet6c4_gpu/decode_dev/wer_11
......
......@@ -156,12 +156,35 @@ steps/train_sat.sh --cmd "$train_cmd" \
exp/tri5a/graph data/dev exp/tri5a/decode_dev
)&
#
# steps/cleanup/find_bad_utts.sh --nj 200 --cmd "$train_cmd" data/train data/lang \
# exp/tri5a exp/tri5a_cleanup
# The step below won't run by default; it demonstrates a data-cleaning method.
# It doesn't seem to help in this setup; maybe the data was clean enough already.
false && (
  # Align each training transcript against the tri5a model and write
  # per-utterance error statistics to exp/tri5a_cleanup/all_info.txt
  # (fields: <utt-id> <num-errors> <num-ref-words> ...).
  steps/cleanup/find_bad_utts.sh --nj 200 --cmd "$train_cmd" data/train data/lang \
    exp/tri5a exp/tri5a_cleanup || exit 1;

  # with threshold of 0.05 we keep 1.1 million out of 1.6 million utterances, and
  # around 8.7 million out of 18.1 million words
  # with threshold of 0.1 we keep 1.3 million out of 1.6 million utterances, and
  # around 13.2 million out of 18.1 million words.
  thresh=0.1

  # Keep utterances whose error count is at most thresh * (reference length).
  # Pass the shell variable via 'awk -v' and read the file directly (no
  # 'cat file | awk' needed).
  awk -v threshold="$thresh" '{ errs=$2; ref=$3; if (errs <= threshold*ref) { print $1; } }' \
    exp/tri5a_cleanup/all_info.txt > uttlist
  utils/subset_data_dir.sh --utt-list uttlist data/train "data/train.thresh$thresh" || exit 1;

  steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
    "data/train.thresh$thresh" data/lang exp/tri4a "exp/tri4a_ali_$thresh" || exit 1;

  # NOTE(review): train on the cleaned-up subset, not the full data/train: the
  # alignments in exp/tri4a_ali_$thresh were generated on data/train.thresh$thresh,
  # and retraining on the filtered data is the point of this demo.
  steps/train_sat.sh --cmd "$train_cmd" \
    10000 300000 "data/train.thresh$thresh" data/lang "exp/tri4a_ali_$thresh" \
    "exp/tri5a_$thresh" || exit 1;

  (
    utils/mkgraph.sh data/lang_test "exp/tri5a_$thresh" "exp/tri5a_$thresh/graph" || exit 1;
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      "exp/tri5a_$thresh/graph" data/dev "exp/tri5a_$thresh/decode_dev"
  )&
)
# local/run_for_spkid.sh
# we don't have the results for the step below yet.
# local/run_nnet2.sh
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment