#!/usr/bin/env perl
# Copyright Hong Kong University of Science and Technology (Author: Ricky Chan) 2013.
#
# A script for dictionary generation with an input dict and a wordlist
#
# example of dict format as follows:
# WORD1 ph1 ph2
# WORD2 ph1 ph2 ph3
# WORDX ph4
# WORDY ph4 ph5
# WORDZ ph3 ph1
#
# example of wordlist (supports phrases of words) format as follows:
# WORD1
# WORD2
# WORDX WORDY
# WORDX WORDY WORDZ

use strict;
use warnings;

# Main program.
#
# Command line: dict wordlist [--spron]
#
# Loads every pronunciation of every word from `dict`, then for each line
# of `wordlist` (a single word or a whitespace-separated phrase) prints the
# pronunciation(s) of the whole line to STDOUT:
#   * default:  one "<line>\t<pron>" row per combination of the per-word
#               pronunciations (cartesian product, dictionary order);
#   * --spron:  a single row built from the first pronunciation of each word.
# Words missing from the dictionary are rendered as _NOT_FOUND_.

# Exactly two positional arguments are required; the only accepted third
# argument is the literal flag --spron.
if (@ARGV != 2 && @ARGV != 3) {
  printUsage();
  exit;
}

my $usespron = 0;
if (@ARGV == 3) {
  if ($ARGV[2] ne "--spron") {
    printUsage();
    exit;
  }
  $usespron = 1;
}

my ($dictfile, $inputfile) = @ARGV[0, 1];

# word => list of pronunciations, in dictionary order.  Every phone is
# stored with a leading space (" ph1 ph2"); the output code below depends
# on that exact shape.
my %dictionarylist;

# Three-argument open: the file name can never be interpreted as a mode.
open(my $dict_fh, '<', $dictfile)
  or die("Can't open dict $dictfile: $!\n");
while (my $entry = <$dict_fh>) {
  chomp $entry;

  # split ' ' ignores leading whitespace, so an indented entry does not
  # produce an empty "word" in front of the real one.
  my ($word, @phones) = split(' ', $entry);
  next unless defined $word;    # blank line: nothing to record

  push @{ $dictionarylist{$word} }, join('', map { ' ' . $_ } @phones);
}
close($dict_fh);

open(my $list_fh, '<', $inputfile)
  or die("Can't open wordlist $inputfile: $!\n");
while (my $phrase = <$list_fh>) {
  chomp $phrase;

  my @words = split(' ', $phrase);
  next unless @words;           # blank line: no output in either mode

  ## single pronunciation handling
  if ($usespron) {
    # Every word is followed by one space, then a tab separates the words
    # from the concatenated first pronunciations.
    my $word_field = join('', map { $_ . ' ' } @words);

    my $pron_field = '';
    for my $word (@words) {
      $pron_field .= (exists $dictionarylist{$word}
                        ? $dictionarylist{$word}[0]
                        : ' _NOT_FOUND_');
    }

    print $word_field, "\t", $pron_field, "\n";
    next;
  }

  ## multiple pronunciations handling
  my ($first_word, @other_words) = @words;

  # Seed the candidate list with every pronunciation of the first word.
  my @pronlist = exists $dictionarylist{$first_word}
    ? @{ $dictionarylist{$first_word} }
    : ('_NOT_FOUND_');

  for my $word (@other_words) {
    if (!exists $dictionarylist{$word}) {
      # Unknown word: extend every candidate in place.
      $_ .= ' _NOT_FOUND_' for @pronlist;
    }
    else {
      # Known word: each candidate is combined with each alternative,
      # keeping candidates in order and alternatives in dictionary order.
      my @alternatives = @{ $dictionarylist{$word} };
      @pronlist = map {
        my $prefix = $_;
        map { $prefix . ' ' . $_ } @alternatives;
      } @pronlist;
    }
  }

  for my $pron (@pronlist) {
    print $phrase, "\t", $pron, "\n";
  }
}
close($list_fh);

# Write the command-line synopsis and a short note about the --spron
# option to STDOUT.  Takes no arguments.
sub printUsage {
    print "usage: perl hkust_extract_subdict.pl dict wordlist [--spron]\n\n",
          "### this script handle multiple pronunciations for dict in default\n",
          "### if you want to extract single(top) pronunciation from dict, please use the option --spron\n\n";
}