Commit 4676a3a1 authored by Ho Yin Chan's avatar Ho Yin Chan
Browse files

trunk:egs/hkust/ a better Chinese character error rate scoring script for future use

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2804 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent ef2ac10d
#!/usr/bin/perl
# Copyright Hong Kong University of Science and Technology (Author: Ricky Chan) 2013.
# Copyright 2013 Hong Kong University of Science and Technology (Author: Ricky Chan Ho Yin)
#
# Apache 2.0.
#
# A script to convert Kaldi Chinese word transcription to Chinese character transcription using a word2char mapping (e.g. a word2char_map likes "195k_chinese_word2char_map")
# This is helpful for Chinese character word error rate scoring
# A script to convert Kaldi Chinese words transcription to Chinese characters transcription.
# This is helpful for Chinese character error rate scoring.
#
# If no option is applied, by default the script converts the Chinese words transcription to Chinese characters transcription \
# by assuming the input Chinese words/characters are 3 bytes UTF8 code.
# Continuous English/ASCII characters without space are treated as single token.
#
# When the --useword2charmap option is applied, an input Chinese word to Chinese character mapping table \
# (e.g. a word2char_map like "195k_chinese_word2char_map") is used for converting the corresponding Chinese words \
# to separate Chinese characters.
#
# When the --encodeoutput option is applied, the script runs as in the default mode (no option) except that the \
# output Chinese characters are written in a readable encoded format. The output Chinese characters are encoded in \
# the same way as in the open-source HTK toolkit from the Cambridge University Engineering Department.
use POSIX();
if($#ARGV+1 != 2) {
print "usage: perl hkust_word2char_tran.pl chinese_word2char_map tran_file \n";
# Print the command-line usage summary and terminate the script.
#
# This is only called when the arguments are invalid, so the text is written
# to STDERR (STDOUT carries the converted transcription and is normally
# redirected into a file) and the script exits with a non-zero status so the
# calling shell script can detect the failure.
# Takes no arguments and never returns.
sub printUsage {
    print STDERR "usage: perl hkust_word2ch_tran.pl [--useword2charmap chinese_word2char_map|--encodeoutput] tran_file \n";
    print STDERR "e.g. perl hkust_word2ch_tran.pl tran_file \n";
    print STDERR "e.g. perl hkust_word2ch_tran.pl --useword2charmap 195k_chinese_word2char_map tran_file \n";
    print STDERR "e.g. perl hkust_word2ch_tran.pl --encodeoutput tran_file \n";
    exit 1;
}
# Encode one byte as a readable escape: a backslash followed by three octal
# digits (e.g. "A" -> "\101", byte 0xE4 -> "\344").
#
# Argument: a one-character string; only the first character is examined.
# Returns:  the 4-character escape string.
#
# Only the low 9 bits of the character code are used (three 3-bit groups).
# All working variables are lexical so the sub does not overwrite package
# globals of the calling script.
sub encodeByteCharacter {
    my ($char) = @_;
    my $code = ord($char);
    return sprintf("\\%o%o%o", ($code >> 6) & 7, ($code >> 3) & 7, $code & 7);
}
# The script accepts between one and three command-line arguments;
# any other count is a usage error.
unless(@ARGV >= 1 && @ARGV <= 3) {
    printUsage();
}
$word2charfile=$ARGV[0];
$tranfile=$ARGV[1];
$useMapping=0;
$useEncodeoutput=0;
%word2charlist=();
open(INFILE, $word2charfile) || die("Can't open chinese word to char map: ".$word2charfile."\n");
while(<INFILE>){
chomp;
@line=split(/\s+/);
$a=$line[0];
$b="";
for($i=1; $i<scalar(@line); $i++) {
$b=$b . " " . $line[$i];
if(@ARGV == 2) {
if($ARGV[0] ne "--encodeoutput") {
printUsage();
}
$word2charlist{$a}=$b;
$useEncodeoutput=1;
$tranfile=$ARGV[1];
}
elsif(@ARGV == 3) {
if($ARGV[0] ne "--useword2charmap") {
printUsage();
}
$useMapping=1;
$word2charfile=$ARGV[1];
$tranfile=$ARGV[2];
}
else {
$tranfile=$ARGV[0];
}
close(INFILE);
# if Chinese word to character map is provided, read it
#
# On each line the first whitespace-separated field is the word and the
# remaining fields are stored as its expansion.  The stored value has every
# field prefixed by a single space, so it can be printed directly after the
# previous output token.
if($useMapping) {
    %word2charlist=();
    # 3-arg open with a lexical handle: the file name is taken literally and
    # is never interpreted as a mode or pipe specification.
    open(my $mapfh, '<', $word2charfile) || die("Can't open Chinese word to char map: ".$word2charfile."\n");
    while(my $entry = <$mapfh>) {
        chomp $entry;
        # split ' ' ignores leading whitespace, so an indented line still
        # yields the word as the first field.
        my ($word, @chars) = split(' ', $entry);
        # Blank lines carry no mapping; skip them instead of storing an empty key.
        next unless defined $word;
        $word2charlist{$word} = join('', map { " ".$_ } @chars);
    }
    close($mapfh);
}
# process kaldi transcription
open(INFILE, $tranfile) || die("Can't open transcription file ".$tranfile."\n");
while(<INFILE>) {
chomp;
......@@ -34,13 +89,49 @@ while(<INFILE>) {
## utt_id
print $line[0];
## utt_character_word
for($i=1; $i<scalar(@line); $i++) {
if(!exists($word2charlist{$line[$i]})) {
print " ".$line[$i];
if($useMapping) {
if(!exists($word2charlist{$line[$i]})) {
print " ".$line[$i];
}
else {
print $word2charlist{$line[$i]};
}
}
else {
print $word2charlist{$line[$i]};
# Split the current token into output tokens:
#   - a run of consecutive printable ASCII characters is kept as one token;
#   - every multi-byte UTF-8 character is printed as its own token, either
#     raw or (with --encodeoutput) with each byte escaped as \ooo.
# NOTE(review): this assumes $line[$i] holds raw bytes (no decoding layer on
# the input handle), so split(//) yields one element per byte - confirm.
@carray = split(//, $line[$i]);
$wspace=0;   # true while we are inside a run of ASCII characters
$l=0;
while($l<@carray) {
    $c = $carray[$l];
    # Explicit printable-ASCII range.  POSIX::isprint was removed in Perl 5.24
    # and its result depended on the current locale.
    if($c =~ /\A[\x20-\x7e]\z/) {
        print " " unless $wspace;
        print $c;
        $wspace=1;
        $l=$l+1;
    }
    else { ## here we find a non-ASCII (e.g. Chinese) character
        # Derive the UTF-8 sequence length from the lead byte rather than
        # assuming 3 bytes, so 2- and 4-byte characters do not misalign the
        # rest of the token.  Anything that is not a valid lead byte is
        # consumed as a single byte.
        my $lead = ord($c);
        my $nbytes = 1;
        if   ($lead >= 0xC0 && $lead <= 0xDF) { $nbytes = 2; }
        elsif($lead >= 0xE0 && $lead <= 0xEF) { $nbytes = 3; }
        elsif($lead >= 0xF0 && $lead <= 0xF7) { $nbytes = 4; }
        # Never read past the end of a truncated sequence.
        if($l + $nbytes > @carray) {
            $nbytes = @carray - $l;
        }
        my @bytes = @carray[$l .. $l + $nbytes - 1];
        if($useEncodeoutput) {
            ## print the utf8 character in readable encoded format, byte by byte
            @bytes = map { encodeByteCharacter($_) } @bytes;
        }
        print " ".join('', @bytes);
        $l=$l+$nbytes;
        $wspace=0;
    }
}
}
}
print "\n";
......
......@@ -14,7 +14,7 @@ max_lmwt=20
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score_basic.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo "Usage: local/score_basic_ext.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
......@@ -61,10 +61,14 @@ filter_text <$data/text >$dir/scoring/text.filt
unset LC_ALL
#for Chinese character error rate
perl local/ext/hkust_word2ch_tran.pl local/ext/195k_chinese_word2char_map $dir/scoring/text.filt > $dir/scoring/cchar.filt
perl local/ext/hkust_word2ch_tran.pl $dir/scoring/text.filt > $dir/scoring/cchar.filt
# perl local/ext/hkust_word2ch_tran.pl --useword2charmap local/ext/195k_chinese_word2char_map $dir/scoring/text.filt > $dir/scoring/cchar.filt
# perl local/ext/hkust_word2ch_tran.pl --encodeoutput $dir/scoring/text.filt > $dir/scoring/cchar.filt
for lmwt in `seq $min_lmwt $max_lmwt`; do
perl local/ext/hkust_word2ch_tran.pl local/ext/195k_chinese_word2char_map $dir/scoring/$lmwt.txt > $dir/scoring/${lmwt}.cchar
perl local/ext/hkust_word2ch_tran.pl $dir/scoring/$lmwt.txt > $dir/scoring/${lmwt}.cchar
# perl local/ext/hkust_word2ch_tran.pl --useword2charmap local/ext/195k_chinese_word2char_map $dir/scoring/$lmwt.txt > $dir/scoring/${lmwt}.cchar
# perl local/ext/hkust_word2ch_tran.pl --encodeoutput $dir/scoring/$lmwt.txt > $dir/scoring/${lmwt}.cchar
done
export LC_ALL=C
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment