Commit adfa50a6 authored by Vassil Panayotov's avatar Vassil Panayotov
Browse files

Removing the egs/rm/s4 recipe. It's based on the older generation 's3'...

Removing the egs/rm/s4 recipe. It's based on the older generation 's3' recipes, and now there are better examples using free data in Kaldi

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4654 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 970a5484
......@@ -11,8 +11,6 @@ Each subdirectory of this directory contains the
scripts for a sequence of experiments.
s5 is the currently recommended setup.
s4: A recipe based on a freely available subset of the RM data, distributed by CMU
s5: This is the "new-new-style" recipe. It is now finished.
All further work will be on top of this style of recipe. Note:
unlike previous recipes, this now uses the same underlying
......
This recipe is using a publicly available subset of Resource Management data,
distributed by CMU.
To run the recipe, first download the data by running ./getdata.sh.
Then execute ./run.sh to perform all steps automatically, or run the
individual commands manually by copy/pasting them.
The script and data layout are based on egs/rm/s3 recipe, with several exceptions:
- because this recipe uses pre-extracted feature vectors no conversion from .sph
to .wav format and consequent feature extraction is needed. The features are just
converted from CMU Sphinx feature files to Kaldi Tables.
- only one test set is available instead of several (e.g. mar87, oct87 and so on)
as in the original recipe
- no speaker-dependent processing
- only the steps up to tri2a stage are implemented
- on the plus side it requires less disk space (about 220MB)
--use-energy=false # only non-default option.
# No non-default options for now.
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>
#!/bin/bash
# Copyright 2012 Vassil Panayotov
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Downloads and extracts CMU's RM1 feature files and the pre-built G.fst
# grammar into $RM1_ROOT (which is exported by path.sh).
source path.sh

# Download and extract CMU's feature files
mkdir -p "$RM1_ROOT" || exit 1

# Try CMU's server first; fall back to the SourceForge mirror.
# Bail out if both downloads fail, so we don't try to untar a missing file.
wget -P "$RM1_ROOT" http://www.speech.cs.cmu.edu/databases/rm1/rm1_cepstra.tar.gz ||
  wget -P "$RM1_ROOT" http://sourceforge.net/projects/kaldi/files/rm1_cepstra.tar.gz ||
  { echo "$0: failed to download rm1_cepstra.tar.gz"; exit 1; }

tar -C "$RM1_ROOT"/ -xf "$RM1_ROOT"/rm1_cepstra.tar.gz || exit 1

# Download the G.fst graph produced from 'wp_gram.txt'
wget -P "$RM1_ROOT" http://sourceforge.net/projects/kaldi/files/RM_G.fst
\ No newline at end of file
#!/bin/bash
# This script basically calls the supplied decoding script
# once for each test set (in parallel on the same machine),
# and then averages the resulting WERs.
# The interpretation of the decode-dir-1, etc., as inputs,
# outputs and so on, depends on the decoding script you call.
# It assumes the model directory is one level up from decode-dir-1.
#
# Usage: local/decode.sh [--mono] <decode-script> <decode-dir-1>

mono_opt=
if [ "$1" == "--mono" ]; then
  mono_opt=$1;
  shift;
fi

script=$1
decode_dir_1=$2 # e.g. exp/sgmm3b/decode
dir=`dirname $decode_dir_1` # model dir, e.g. exp/sgmm3b

if [ $# -ne 2 ]; then
  echo "Usage: scripts/decode.sh <decode-script> <decode-dir-1>"
  exit 1;
fi

if [ ! -x $script -o ! -d $dir ]; then
  echo "scripts/decode.sh: Either no such script $script or not executable, or no such dir $dir"
  exit 1;
fi

scripts/mkgraph.sh $mono_opt data/lang_test $dir $dir/graph

# Run the decode in the background and wait, mirroring the multi-test-set
# layout of the original recipe even though there is only one set here.
$script $dir data/test data/lang $decode_dir_1/ &
wait

# The publicly available RM subset has just one test set (instead of mar87 etc.),
# so no averaging is needed
grep WER $decode_dir_1/wer* || echo "Error decoding $decode_dir_1: no WER results found."
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# usage: make_trans.sh prefix in.flist input.snr out.txt out.scp
# prefix is first letters of the database "key" (rest are numeric)
# in.flist is just a list of filenames, probably of .sph files.
# input.snr is an snr format file from the RM dataset.
# out.txt is the output transcriptions in format "key word1 word\n"
# out.scp is the output scp file, which is as in.scp but has the
# database-key first on each line.
# Reads from first argument e.g. $rootdir/rm1_audio1/rm1/doc/al_sents.snr
# and second argument train_wav.scp
# Writes to standard output trans.txt

use strict;
use warnings;

if (@ARGV != 5) {
    die "usage: make_trans.sh prefix in.flist input.snr out.txt out.scp\n";
}
my ($prefix, $in_flist, $input_snr, $out_txt, $out_scp) = @ARGV;

# (1) Read the SNOR file: each non-comment line has the form
# "TRANSCRIPTION (UTTERANCE-ID)".  Comment lines start with ';'.
open(my $snr_fh, "<", $input_snr) || die "Opening SNOR file $input_snr";
my %trans_of;    # utterance-id (upper case) -> transcription text
while (<$snr_fh>) {
    next if m/^;/;
    m/(.+) \((.+)\)/ || die "bad line $_";
    $trans_of{$2} = $1;
}
close($snr_fh);

# (2) Walk the feature-file list; for each .mfc path derive the speaker and
# utterance names, build a sortable key, and emit the transcription and
# keyed scp lines.
open(my $flist_fh, "<", $in_flist) || die "Opening file list $in_flist";
open(my $txt_fh, ">", $out_txt)    || die "Open output transcription file $out_txt";
open(my $scp_fh, ">", $out_scp)    || die "Open output scp file $out_scp";
while (<$flist_fh>) {
    # Paths are expected to look like .../<speaker>/<utterance>.mfc
    m:/(\w+)/(\w+)\.mfc\s+$:i || die "bad scp line $_";
    my ($spkname, $uttname) = ($1, $2);
    $uttname =~ tr/a-z/A-Z/;    # SNOR keys are upper case.
    defined $trans_of{$uttname} || die "no trans for sent $uttname";
    $spkname =~ s/_//g;    # remove underscore from spk name to make key nicer.
    my $key = $prefix . "_" . $spkname . "_" . $uttname;
    $key =~ tr/A-Z/a-z/;    # Make it all lower case,
    # to make the numerical and string-sorted orders the same.
    print {$txt_fh} "$key $trans_of{$uttname}\n";
    print {$scp_fh} "$key $_";
}
close($flist_fh);
close($txt_fh) || die "Closing output.";
close($scp_fh) || die "Closing output.";
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Copyright 2012 Vassil Panayotov
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Prepares the local data directory (data/local) from the CMU-distributed
# RM1 subset: file lists, sorted transcriptions/scp files, the pre-built
# grammar G.fst, and a normalized lexicon.
#
# To be run from one directory above this script.

# Note: when creating your own data preparation scripts, it's a good idea
# to make sure that the speaker id (if present) is a prefix of the utterance
# id, that the output scp file is sorted on utterance id, and that the
# transcription file is exactly the same length as the scp file and is also
# sorted on utterance id (missing transcriptions should be removed from the
# scp file using e.g. scripts/filter_scp.pl)
if [ $# != 1 ]; then
echo "Usage: ../../local/RM_data_prep.sh /path/to/RM"
exit 1;
fi

# C locale so that `sort` uses plain byte order, which Kaldi expects.
export LC_ALL=C

RMROOT=$1

mkdir -p data/local
cd data/local

if [ ! -f $RMROOT/RM_G.fst -o ! -d $RMROOT/rm1 ]; then
echo "Required data is missing. You can download the data by running ./getdata.sh"
exit 1;
fi

# Make a list of files
# (expand each fileid into a full path to its .mfc feature file)
cat $RMROOT/rm1/etc/rm1_train.fileids | \
xargs -I_x_ echo $RMROOT/rm1/feat/_x_.mfc > train.flist
cat $RMROOT/rm1/etc/rm1_test.fileids | \
xargs -I_x_ echo $RMROOT/rm1/feat/_x_.mfc > test.flist

# make_trans.pl also creates the utterance id's and the kaldi-format scp file.
# this is needed, because the original "al_sents.snr" file is not available
# (and because CMU's train utterances have tags like '<sil>' added)
# The pipeline below upper-cases the training transcripts, strips <s>/<sil>
# style tags and utterance-variant markers like "(2)", collapses whitespace,
# and concatenates the (already clean) test transcripts on top.
cat $RMROOT/rm1/etc/rm1_train.transcription |\
tr '[a-z]' '[A-Z]' |\
sed -E -e 's:</?S(IL)?>: :g' -e 's:\([0-9]\): :g' -e 's: +: :g' -e 's:^ +::' |\
cat $RMROOT/rm1/etc/rm1_test.transcription - \
> al_sents.snr

# training set
../../local/make_trans.pl trn train.flist al_sents.snr train_trans.txt train.scp
# sort on utterance id (first field), as required by Kaldi tools
mv train_trans.txt tmp; sort -k 1 tmp > train_trans.txt
mv train.scp tmp; sort -k 1 tmp > train.scp
rm tmp

# test set
../../local/make_trans.pl test test.flist al_sents.snr test_trans.txt test.scp
mv test_trans.txt tmp; sort -k 1 tmp > test_trans.txt
mv test.scp tmp; sort -k 1 tmp > test.scp
rm tmp

# We already have the features, so sph2pipe step is skipped and
# given the limited data the speaker-dependent processing is also not used

# "wp_gram.txt" is no longer available from LDC's website, so we are just using a
# pre-built grammar WFST (G.fst). The word-pair grammar is a finite-state description
# of the allowed utterances, which just enumerates the words that can follow each word
# in the vocabulary. G.fst is constructed by adding output arcs to each node
# representing a word, one for each word that is allowed to follow, and the
# probability mass is distributed uniformly among all these arcs.
#../../scripts/make_rm_lm.pl $RMROOT/LDC93S3B/disc_1/doc/wp_gram.txt > G.txt || exit 1;
cp $RMROOT/RM_G.fst ./G.fst

# Convert the CMU's lexicon to a form which the other scripts expect
# (leave only the first pronunciation variant and convert the phones to lower case)
# egrep removes alternative pronunciations, which are marked like "WORD(2)";
# the sed keeps the head word intact (\1) and lower-cases the phone string (\L\3).
cat $RMROOT/rm1/etc/rm1.dic | \
egrep -v '\(' | \
sed -e "s/^\([[:alnum:]-]\+\('[[:alpha:]]\+\)\?\)\(.*\)/\1\L\3/g" > lexicon.txt

echo RM_data_prep succeeded.
#!/bin/bash
#
# Copyright 2012 Vassil Panayotov
# modified from:
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Builds data/lang and data/lang_test from the files prepared in data/local:
# word/phone symbol tables, the lexicon FSTs (L.fst, L_align.fst,
# L_disambig.fst), the grammar G.fst, and the HMM topology -- then runs
# several sanity checks on the resulting FSTs.
#
# To be run from one directory above this script.
if [ -f path.sh ]; then . path.sh; fi

data_list="train test"

for x in lang lang_test $data_list; do
mkdir -p data/$x
done

# Copy stuff into its final location:
for x in $data_list; do
cp data/local/${x}.scp data/$x/mfc.scp || exit 1;
cp data/local/${x}_trans.txt data/$x/text || exit 1;
done

# We are not using make_words_symtab.pl for symbol table creation in this
# recipe, because CMU's lexicon have several words that are not in the
# word-pair grammar
# Build words.txt directly from the lexicon: <eps> gets id 0, each lexicon
# word gets its line number as id, and !SIL is appended with the next id.
cat data/local/lexicon.txt | \
awk 'BEGIN{print "<eps>\t0";} {print $1 "\t" NR;} END{print "!SIL\t" NR+1;}' \
> data/lang/words.txt
scripts/make_phones_symtab.pl < data/local/lexicon.txt > data/lang/phones.txt
cp data/lang/words.txt data/lang_test/words.txt

silphones="sil"; # This would in general be a space-separated list of all silence phones. E.g. "sil vn"
# Generate colon-separated lists of silence and non-silence phones.
scripts/silphones.pl data/lang/phones.txt "$silphones" data/lang/silphones.csl \
data/lang/nonsilphones.csl

# Work out how many disambiguation symbols the lexicon needs (#1, #2, ...).
ndisambig=`scripts/add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt`
ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST.
scripts/add_disambig.pl data/lang/phones.txt $ndisambig > data/lang_test/phones_disambig.txt
cp data/lang_test/phones_disambig.txt data/lang/ # needed for MMI.

# Compile the lexicon into an FST, sorted on output (word) labels for
# composition with G.fst.
silprob=0.5 # same prob as word
scripts/make_lexicon_fst.pl data/local/lexicon.txt $silprob sil | \
fstcompile --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > data/lang/L.fst

# Create L_align.fst, which is as L.fst but with alignment symbols (#1 and #2 at the
# beginning and end of words, on the input side)... useful if we
# ever need to e.g. create ctm's-- these are used to work out the
# word boundaries.
cat data/local/lexicon.txt | \
awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' | \
scripts/make_lexicon_fst.pl - 0.5 sil | \
fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > data/lang_test/L_align.fst

# L_disambig.fst has the disambiguation symbols (c.f. Mohri's papers)
scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt $silprob sil '#'$ndisambig | \
fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
--keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel \
> data/lang_test/L_disambig.fst
cp data/lang_test/L_disambig.fst data/lang/ # Needed for MMI training.

# Compilation is no longer needed, because we are using a pre-built G.fst
#fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang/words.txt --keep_isymbols=false \
# --keep_osymbols=false data/local/G.txt > data/lang_test/G.fst
cp data/local/G.fst data/lang_test/

# Sanity checks below only warn (echo) rather than abort.
# Checking that G is stochastic [note, it wouldn't be for an Arpa]
fstisstochastic data/lang_test/G.fst || echo Error: G is not stochastic

# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.

# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.

# Checking that disambiguated lexicon times G is determinizable
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
fstdeterminize >/dev/null || echo Error

# Checking that LG is stochastic:
fsttablecompose data/lang/L.fst data/lang_test/G.fst | \
fstisstochastic || echo Error: LG is not stochastic.

# Checking that L_disambig.G is stochastic:
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
fstisstochastic || echo Error: LG is not stochastic.

## Check lexicon.
## just have a look and make sure it seems sane.
echo "First few lines of lexicon FST:"
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head

# Instantiate the prototype topology for the actual phone id lists.
silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'`
nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'`
cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \
sed "s:SILENCEPHONES:$silphonelist:" > data/lang/topo

for x in phones.txt words.txt silphones.csl nonsilphones.csl topo; do
cp data/lang/$x data/lang_test/$x || exit 1;
done

echo RM_format_data succeeded.
#!/bin/bash
# Environment setup sourced by the other scripts in this recipe: puts the
# Kaldi/OpenFst binaries on PATH and defines where the RM1 subset lives.

# path to Kaldi's root directory
root=`pwd`/../../..

export PATH=${root}/src/bin:${root}/tools/openfst/bin:${root}/src/fstbin/:${root}/src/gmmbin/:${root}/src/featbin/:${root}/src/fgmmbin:${root}/src/sgmmbin:${root}/src/lm:${root}/src/latbin:$PATH

# path to the directory in which the subset of RM corpus is stored
export RM1_ROOT=`pwd`/data/download

# C locale gives plain byte-order sorting, which the Kaldi tools expect.
export LC_ALL=C
# NOTE(review): "LC_LOCALE_ALL" is not a standard locale variable (the
# standard one, LC_ALL, is already set above) -- this looks like a typo
# with no effect; confirm before removing.
export LC_LOCALE_ALL=C
#!/bin/bash
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Top-level driver for the recipe: data preparation, feature conversion,
# then monophone and two triphone training/decoding passes (up to tri2a).
# Assumes the data has already been downloaded via ./getdata.sh.
source ./path.sh

# call the next line with the directory where the RM data is
local/rm_data_prep.sh $RM1_ROOT || exit 1;

local/rm_format_data.sh || exit 1;

# the directory, where you want to store MFCC features.
featdir=data/rm_feats

# convert the Sphinx feature files to Kaldi tables
for x in train test; do
steps/make_mfcc.sh data/$x exp/make_mfcc/$x $featdir || exit 1;
done

# use a 1000-utterance subset to bootstrap the monophone system
scripts/subset_data_dir.sh data/train 1000 data/train.1k || exit 1;

# train monophone system.
steps/train_mono.sh data/train.1k data/lang exp/mono || exit 1;

# monophone decoding
local/decode.sh --mono steps/decode_deltas.sh exp/mono/decode || exit 1;

# Get alignments from monophone system.
steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali || exit 1;

# train tri1 [first triphone pass]
steps/train_deltas.sh data/train data/lang exp/mono_ali exp/tri1 || exit 1;

# decode tri1
local/decode.sh steps/decode_deltas.sh exp/tri1/decode || exit 1;

# align tri1
steps/align_deltas.sh --graphs "ark,s,cs:gunzip -c exp/tri1/graphs.fsts.gz|" \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;

# train tri2a [delta+delta-deltas]
steps/train_deltas.sh data/train data/lang exp/tri1_ali exp/tri2a || exit 1;

# decode tri2a
local/decode.sh steps/decode_deltas.sh exp/tri2a/decode || exit 1;
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Adds some specified number of disambig symbols to a symbol table.
# Adds these as #1, #2, etc.
# If the --include-zero option is specified, includes an extra one
# #0.
# The input table is echoed unchanged to stdout, followed by the new symbols
# numbered from one past the largest existing id.

use strict;
use warnings;

if (!(@ARGV == 2 || (@ARGV == 3 && $ARGV[0] eq "--include-zero"))) {
    die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt ";
}

my $include_zero = 0;
if (@ARGV == 3) {
    $include_zero = 1;
    $ARGV[0] eq "--include-zero" || die "Bad option/first argument $ARGV[0]";
    shift @ARGV;
}

my ($input, $nsyms) = @ARGV;

open(my $in_fh, "<", $input) || die "Opening file $input";

# Pass the table through unchanged while tracking the largest id seen,
# so the new symbols don't clash even if the table isn't sorted by id.
my $lastsym;
while (<$in_fh>) {
    my @fields = split(" ", $_);
    @fields == 2 || die "Bad line $_";
    $lastsym = $fields[1]
      if !defined($lastsym) || $fields[1] > $lastsym;
    print;
}
close($in_fh);

if (!defined($lastsym)) {
    die "Empty symbol file?";
}

if ($include_zero) {
    $lastsym++;
    print "#0 $lastsym\n";
}

for (my $n = 1; $n <= $nsyms; $n++) {
    my $id = $n + $lastsym;
    print "#$n $id\n";
}
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# NOTE(review): script does not use strict/warnings and uses bareword
# filehandles; left as-is because only part of the script is visible here.

if(@ARGV != 2) {
die "Usage: add_lex_disambig.pl [ --sil silphone ] lexicon.txt lexicon_disambig.txt "
}

($lexfn, $lexoutfn) = @ARGV;

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

# (1) Read in the lexicon.
# Each entry is stored as the whitespace-normalized line "WORD phone1 phone2 ...".
@L = ( );
while(<L>) {
@A = split(" ", $_);
push @L, join(" ", @A);
}

# (2) Work out the count of each phone-sequence in the
# lexicon.
# %count maps a phone-sequence string to how many words share it
# (homophones need disambiguation).
foreach $l (@L) {
@A = split(" ", $l);
shift @A; # Remove word.
$count{join(" ",@A)}++;
}

# (3) For each left sub-sequence of each phone-sequence, note down
# that exists (for identifying prefixes of longer strings).
foreach $l (@L) {
@A = split(" ", $l);
shift @A; # Remove word.
while(@A > 0) {
pop @A; # Remove last phone
$issubseq{join(" ",@A)} = 1;
}
}