Commit 2cf6d87d authored by Ho Yin Chan's avatar Ho Yin Chan
Browse files

trunk/egs/hkust: subdict extraction script is modified for handling multiple...

trunk/egs/hkust: subdict extraction script is modified for handling multiple pronunciations generation

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2725 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent d0a8ce3c
#!/usr/bin/perl
# Copyright Hong Kong University of Science and Technology (Author: Ricky Chan) 2013.
#
# A script for dictionary generation with a input dict and wordlist
#
# example of dict format as follow:
# WORD1 ph1 ph2
# WORD2 ph1 ph2 ph3
# WORDX ph4
# WORDY ph4 ph5
# WORDZ ph3 ph1
#
# example of wordlist (support phrase of words) format as follow:
# WORD1
# WORD2
# WORDX WORDY
# WORDX WORDY WORDZ
if($#ARGV+1 != 2) {
print "usage: perl hkust_extract_subdict.pl dict wordlist \n";
if($#ARGV+1 != 2 && $#ARGV+1 != 3) {
printUsage();
exit;
}
$usespron=0;
if(@ARGV == 3) {
if($ARGV[2] ne "--spron") {
printUsage();
exit;
}
$usespron=1;
}
$dictfile=$ARGV[0];
$inputfile=$ARGV[1];
......@@ -19,30 +43,88 @@ while(<INFILE>){
for($i=1; $i<scalar(@line); $i++) {
$b=$b . " " . $line[$i];
}
$dictionarylist{$a}=$b;
push ( @{ $dictionarylist{$a} }, $b );
}
close(INFILE);
open(INFILE, $inputfile) || die("Can't open wordlist ".$inputfile."\n");
while(<INFILE>) {
chomp;
$phrase = $_;
@line = split(/\s+/);
for($i=0; $i<scalar(@line); $i++) {
print $line[$i]." ";
## single pronunciation handling
if($usespron==1) {
if(scalar(@line)==0) {
next;
}
for($i=0; $i<scalar(@line); $i++) {
print $line[$i]." ";
}
print "\t";
for($i=0; $i<scalar(@line); $i++) {
if(!exists($dictionarylist{$line[$i]})) {
print " _NOT_FOUND_";
}
else {
@ref=@{ $dictionarylist{$line[$i]} };
print $ref[0]."";
}
}
print "\n";
next;
}
print "\t";
for($i=0; $i<scalar(@line); $i++) {
if(! exists($dictionarylist{$line[$i]})) {
print "_NOT_FOUND_ ";
## multiple pronunciations handling
@pronlist=();
@tmppronlist=();
if(scalar(@line)>0) {
$word = $line[$0];
if(!exists($dictionarylist{$word})) {
push(@pronlist, '_NOT_FOUND_');
}
else {
print $dictionarylist{$line[$i]}." ";
@ref=@{ $dictionarylist{$word} };
for($i=0; $i<scalar(@ref); $i++) {
push(@pronlist, $ref[$i]."");
}
}
for($i=1; $i<scalar(@line); $i++) {
$word = $line[$i];
if(!exists($dictionarylist{$word})) {
for($j=0; $j<scalar(@pronlist); $j++) {
$pronlist[$j] = $pronlist[$j]." _NOT_FOUND_";
}
}
else {
@ref=@{ $dictionarylist{$word} };
while(scalar(@pronlist)>0) {
push(@tmppronlist, shift(@pronlist));
}
while(scalar(@tmppronlist)>0) {
$tmppron = shift(@tmppronlist);
for($j=0; $j<scalar(@ref); $j++) {
push(@pronlist, $tmppron." ".$ref[$j]);
}
}
}
}
for($i=0; $i<scalar(@pronlist); $i++) {
print $phrase."\t".$pronlist[$i]."\n";
}
}
print "\n";
}
close(INFILE);
sub printUsage {
print "usage: perl hkust_extract_subdict.pl dict wordlist [--spron]\n\n";
print "### this script handle multiple pronunciations for dict in default\n";
print "### if you want to extract single(top) pronunciation from dict, please use the option --spron\n\n";
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment