Commit c6267c08 authored by Dan Povey's avatar Dan Povey
Browse files

Adding swbd s4 scripts (unfinished as yet)

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@627 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 2bfc6292
......@@ -11,6 +11,14 @@ About the Switchboard corpus
Each subdirectory of this directory contains the
scripts for a sequence of experiments.
s1:
[adding first version of scripts. This is a draft.
Note: this is closest to the s3/ script for WSJ.]
s3:
Adding first version of scripts. Only the earliest stages of
this are finished.
Note: this is closest to the s3/ scripts for WSJ.
s4:
This is currently unfinished; I had to commit it due to
certain SVN limitations. Changing the topology of the
silence phones after finding a problem with the
transition-probs.
......@@ -18,9 +18,7 @@
# This script makes lists of phones that are shared for building the monophone
# system and shared during phone clustering when creating questions for the
# triphone system. It puts out a line for each "real" phone, and that line
# contains all the versions of the "real" phones. With --nosil we have no
# silences in these sets; otherwise we share the silence phones SIL SPN and NSN
# into one group.
# contains all the versions of the "real" phones.
# Takes as standard input e.g. data/lang/phones.txt
......
......@@ -19,7 +19,7 @@ exit 1;
# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.
# Caution: some of the graph creation steps use quite a bit of memory, so you
# should run this on a machine that has sufficient memory.
# Data prep
......
--use-energy=false # only non-default option.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.333 <Transition> 1 0.333 <Transition> 2 0.333 </State>
<State> 1 <PdfClass> 1 <Transition> 0 0.333 <Transition> 1 0.333 <Transition> 2 0.333 </State>
<State> 2 <PdfClass> 2 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
</Topology>
#!/bin/awk -f
# Collects the set of phones used in a dictionary file.
# For every input line the first field is skipped (presumably the word being
# defined) and each remaining field is recorded as a phone.
{
  for (col = 2; col <= NF; col++) {
    phone_count[$col] += 1
  }
}
# After all input has been read, print each distinct phone on its own line.
# The SIL phone is deliberately left out of the list.
END {
  for (phone in phone_count) {
    if (phone != "SIL") {
      print phone
    }
  }
}
\ No newline at end of file
#!/bin/bash
#
# To be run from one directory above this script.
# The input is two directory names (possibly the same) containing the
# 2000 Hub5 english evaluation test set and transcripts, which are
# respectively:
# LDC2002S09 LDC2002T43
# e.g. see
#http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2002S09
#http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002T43
#
# Example usage:
# local/eval2000_data_prep.sh /mnt/matylda2/data/HUB5_2000/ /mnt/matylda2/data/HUB5_2000/
# If you just copied the CDs directly, both directories might end with "hub5e_00".
# [note: I'm not sure about this though, I didn't see the original CD's].
# The first directory ($sdir) contains the speech data, and the directory
# $sdir/english/
# should exist.
# The second directory ($tdir) contains the transcripts, and the directory
# $tdir/2000_hub5_eng_eval_tr
# should exist; in particular we need the file
# $tdir/2000_hub5_eng_eval_tr/reference/hub5e00.english.000405.stm
# [just change this script if you don't have this type of structure in
# the way you unpacked it].
if [ $# -ne 2 ]; then
  echo "Usage: local/eval2000_data_prep.sh <speech-dir> <transcription-dir>"
  echo e.g. local/eval2000_data_prep.sh /mnt/matylda2/data/HUB5_2000/ /mnt/matylda2/data/HUB5_2000/
  echo See comments in the script for more details
  exit 1
fi

sdir=$1
tdir=$2
if [ ! -d "$sdir/english" ]; then
  echo "Expecting directory $sdir/english to be present"
  exit 1
fi
if [ ! -d "$tdir/2000_hub5_eng_eval_tr" ]; then
  echo "Expecting directory $tdir/2000_hub5_eng_eval_tr to be present"
  exit 1
fi

# All intermediate files go here; the final ones are copied to data/eval2000.
dir=data/local/eval2000
mkdir -p $dir || exit 1

# List the .sph files.  If the glob matches nothing it stays a literal
# pattern, so skip non-existent entries and fail on an empty list.
for x in "$sdir"/english/*.sph; do
  [ -e "$x" ] || continue
  echo "$x"
done > $dir/sph.flist
if [ ! -s $dir/sph.flist ]; then
  echo "No .sph files found in $sdir/english"
  exit 1
fi

# sph_sides.scp has lines "<basename-without-.sph> <path-to-sph-file>".
awk '{name = $0; gsub(/\.sph$/,"",name); gsub(/.*\//,"",name); print(name " " $0)}' \
  $dir/sph.flist > $dir/sph_sides.scp

sph2pipe=$(cd ../../.. && pwd)/tools/sph2pipe_v2.5/sph2pipe
[ ! -f "$sph2pipe" ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;

# Two entries per sph file: <name>-A reads channel 1 and <name>-B reads
# channel 2, each as an sph2pipe command ending in "|".
awk -v sph2pipe="$sph2pipe" '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
    printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' $dir/sph_sides.scp | \
  sort > $dir/wav_sides.scp

#cat /mnt/matylda2/data/HUB5_2000/2000_hub5_eng_eval_tr/reference/english/*.txt | \
# awk '/<contraction/{next;} /</{print;}'| head

# Get segments file...
#segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56
pem=$sdir/english/hub5e_00.pem
[ ! -f "$pem" ] && echo "No such file $pem" && exit 1;
# pem file has lines like:
#en_4156 A unknown_speaker 301.85 302.48
# Lines containing ";;" are dropped.
# NOTE(review): %06d truncates time*100 rather than rounding; the same formula
# is applied to the stm file below, so the two sets of utterance ids agree.
grep -v ';;' "$pem" | \
  awk '{spk=$1"-"$2; utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); print utt,spk,$4,$5;}' | \
  sort > $dir/segments

# stm file has lines like:
#en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
stm=$tdir/2000_hub5_eng_eval_tr/reference/hub5e00.english.000405.stm
[ ! -f "$stm" ] && echo "No such file $stm" && exit 1;
# text.all has lines "<utt-id> <words from field 7 onwards>".  Words are
# printed through an explicit "%s" format so that a "%" inside a transcript
# token is never interpreted as a printf conversion.
grep -v ';;' "$stm" | \
  awk '{spk=$1"-"$2; utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); printf("%s", utt);
        for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' | sort > $dir/text.all

# We'll use the stm file for sclite scoring. There seem to be various errors
# in the stm file that upset hubscr.pl, and we fix them here.
# The sed expressions are quoted: unquoted, "<B_ASIDE>" would be parsed by the
# shell as input/output redirections instead of being passed to sed.
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' "$stm" > $dir/stm || exit 1

cp "$tdir/2000_hub5_eng_eval_tr/reference/en20000405_hub5.glm" $dir/glm || exit 1

# next line uses process substitution
# Just checking that the segments are the same in pem vs. stm.
if ! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments); then
  echo "Segments from pem file and stm file do not match."
  exit 1
fi

grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text

# create an utt2spk file that assumes each conversation side is
# a separate speaker.
awk '{print $1,$2;}' $dir/segments > $dir/utt2spk
scripts/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt || exit 1

dest=data/eval2000
mkdir -p $dest || exit 1
for x in wav_sides.scp segments text utt2spk spk2utt stm glm; do
  cp $dir/$x $dest/$x || exit 1
done

echo Data preparation and formatting completed for Eval 2000
echo "(but not MFCC extraction)"
#!/bin/bash
# Usage: is_sorted.sh [script-file]
# This script returns 0 (success) if the script file argument [or standard input]
# is sorted and 1 otherwise.
# Sorting is checked in the C locale (byte order).

export LC_ALL=C

if [ $# -gt 1 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ]; then
  echo "Usage: is_sorted.sh [script-file]"
  exit 1
fi

# With no argument (or an empty one) the data comes from standard input.
scp=${1:--}

# "sort -c" only checks the order, so no temporary files are needed.  Its
# own diagnostics are discarded; the exit status is interpreted below.
if [ "$scp" == "-" ]; then
  sort -c 2>/dev/null
else
  sort -c -- "$scp" 2>/dev/null
fi
ret=$?

case $ret in
  0)
    exit 0
    ;;
  1)
    echo "is_sorted.sh: script file $scp is not sorted";
    exit 1
    ;;
  *)
    # Any other status means sort could not process the input (e.g. the file
    # is missing or unreadable); never report that as "sorted".
    echo "is_sorted.sh: error reading script file $scp" >&2
    exit 1
    ;;
esac
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This is specialized for this particular WSJ setup.
# It creates the extra questions that specialize within each phone to ask
# about the stress markers and positions. We create one question for each stress-marker
# type and each position.
# We also have a question that asks about silences... just because
# our recipe may remove silence from the standard questions.
# The standard input of this script should be a phones.txt frile.
echo "SIL NSN SPN LAU";
echo "SIL NSN"
echo "SPN LAU"
echo "SIL"
echo "NSN"
echo "SPN"
echo "LAU'
# Questions about position and stress. stdin==phones.txt
grep -v eps | awk '{print $1}' | \
perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_";
$phone=$1; $stress=$2; $pos=$3;
$full_phone ="$1$2$3";
$pos2list{$pos} = $pos2list{$pos} . $full_phone . " ";
$stress2list{$stress} = $stress2list{$stress} . $full_phone . " ";
}
foreach $k (keys %pos2list) { print "$pos2list{$k}\n"; }
foreach $k (keys %stress2list) { print "$stress2list{$k}\n"; } '
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# To be run from .. (one directory up from here)
#
# usage: make_mfcc.sh [--cmd <command>] [--num-jobs <n>] <data-dir> <log-dir> <abs-path-to-mfccdir>
# Splits <data-dir>/segments into <n> parts, runs one feature-extraction job
# per part through <command>, and concatenates the per-job .scp files into
# <data-dir>/feats.scp.

cmd="scripts/run.pl"
nj=4

# Optional leading flags, accepted in either order (hence two passes).
# "$1" is quoted so the tests stay valid once the arguments run out.
for x in 1 2; do
  if [ "$1" == "--cmd" ]; then
    shift
    cmd="$1";
    shift;
    [ "$cmd" == "" ] && echo Empty value given to --cmd option && exit 1;
  fi
  if [ "$1" == "--num-jobs" ]; then
    shift
    nj=$1
    shift
  fi
done

if [ $# != 3 ]; then
  echo "usage: make_mfcc.sh <data-dir> <log-dir> <abs-path-to-mfccdir>";
  exit 1;
fi

if [ -f path.sh ]; then . path.sh; fi

data=$1
logdir=$2
mfccdir=$3

# use "name" as part of name of the archive.
name=$(basename "$data")

mkdir -p "$mfccdir" || exit 1;
mkdir -p "$logdir" || exit 1;

scp=$data/wav_sides.scp
segments=$data/segments
config=conf/mfcc.conf

for f in "$scp" "$segments" "$config"; do
  if [ ! -f "$f" ]; then
    echo "make_mfcc.sh: no such file $f"
    exit 1;
  fi
done

# note: in general, the double-parenthesis construct in bash "((" is "C-style
# syntax" where we can get rid of the $ for variable names, and omit spaces.
# The "for" loop in this style is a special construct.
split_segments=()
for ((n=1; n<=nj; n++)); do
  split_segments+=("$logdir/segments$n")
done

scripts/split_scp.pl "$segments" "${split_segments[@]}" || exit 1;

# A single marker file records that some job failed.  The same path has to be
# used when clearing it, when touching it and when testing for it, otherwise
# failed jobs go unnoticed.
errfile=$logdir/.error.$name
rm -f "$errfile"

for ((n=1; n<=nj; n++)); do
  log=$logdir/make_mfcc.$n.log
  # $cmd is intentionally unquoted: it may hold a command plus its options.
  # The escaped "\|" is passed through to $cmd as a literal pipe symbol.
  $cmd "$log" \
    extract-segments "scp:$scp" "$logdir/segments${n}" ark:- \| \
    compute-mfcc-feats --verbose=2 "--config=$config" ark:- \
    "ark,scp:$mfccdir/raw_mfcc_$name.$n.ark,$mfccdir/raw_mfcc_$name.$n.scp" \
    || touch "$errfile" &
done
wait;

if [ -f "$errfile" ]; then
  echo "Error producing mfcc features for $name:"
  tail "$logdir"/make_mfcc.*.log
  exit 1;
fi

# concatenate the .scp files together.
rm -f "$data/feats.scp"
for ((n=1; n<=nj; n++)); do
  cat "$mfccdir/raw_mfcc_$name.$n.scp" >> "$data/feats.scp"
done

# Remove only the split segment files created above.
rm -f "${split_segments[@]}"

echo "Succeeded creating MFCC features for $name"
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script makes lists of phones that are shared for building the monophone
# system and shared during phone clustering when creating questions for the
# triphone system. It puts out a line for each "real" phone, and that line
# contains all the versions of the "real" phones.
# Takes as standard input e.g. data/lang/phones.txt
if [ $# != 0 ]; then
echo "Usage: make_shared_phones.sh < phones.txt"
exit 1;
fi
echo SIL
echo NSN
echo SPN
echo LAU
# This script reads from the standard input.
grep -v eps | grep -v -E 'SIL|NSN|SPN|LAU' | awk '{print $1}' | \
perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_";
$phone=$1; $stress=$2; $position=$3;
if($phone eq $curphone){ print " $phone$stress$position"; }
else { if(defined $curphone){ print "\n"; } $curphone=$phone; print "$phone$stress$position"; }} print "\n"; '
#!/usr/bin/perl
# Copyright 2011 Milos Janda
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Takes a transcript file with lines like
#  40po031e THE RATE FELL TO SIX %PERCENT IN NOVEMBER NINETEEN EIGHTY SIX .PERIOD
# on the standard input.
# There are two command-line arguments:
#   1. the filename of a dictionary file with lines like
#        ZYUGANOV Z Y UW1 G AA0 N AA0 V
#   2. the spoken-noise word.
# Every transcript word that is not the first field of some dictionary line
# (an OOV) is replaced with the spoken-noise word; the modified transcript
# goes to the standard output.  The count of each OOV is printed on the
# standard error, in order of increasing count.

use strict;
use warnings;

@ARGV == 2 || die "Usage: oov2unk.pl dict spoken-noise-word < transcript > transcript2";

my ($dict, $spoken_noise_word) = @ARGV;

# Record the first field of every dictionary line as a known word.
my %seen;
open(my $dict_fh, "<", $dict) || die "Died opening dictionary file $dict\n";
while (my $entry = <$dict_fh>) {
  my ($word) = split(" ", $entry);
  $seen{$word} = 1 if defined $word;
}
close($dict_fh);

my %oov;
while (my $line = <STDIN>) {
  my ($utt, @words) = split(" ", $line);
  $utt = "" unless defined $utt;  # a blank input line yields a blank output line
  print $utt;
  foreach my $word (@words) {
    if (defined $seen{$word}) {
      print " $word";
    } else {
      $oov{$word}++;
      print " $spoken_noise_word";
    }
  }
  print "\n";
}

# Ties are broken alphabetically so that the report order is reproducible.
foreach my $w (sort { $oov{$a} <=> $oov{$b} || $a cmp $b } keys %oov) {
  print STDERR "$w $oov{$w}\n";
}
#!/bin/bash
# Remove excess utterances once they appear more than a specified
# number of times with the same transcription, in a data set.
# E.g. useful for removing excess "uh-huh" from training.
#
# Usage: remove_dup_utts.sh max-count src-data-dir dest-data-dir
# Reads text, feats.scp and utt2spk (plus wav.scp and spk2gender when present)
# from src-data-dir and writes the filtered versions to dest-data-dir.

if [ $# != 3 ]; then
  echo "Usage: remove_dup_utts.sh max-count src-data-dir dest-data-dir"
  exit 1;
fi

maxcount=$1
srcdir=$2
destdir=$3

[ ! -f "$srcdir/text" ] && echo "Invalid input directory $srcdir" && exit 1;
mkdir -p "$destdir" || exit 1

# First pass: count how often each transcription (everything after the first
# field) occurs.  Second pass: a line is always kept while its transcription
# occurs fewer than max-count times; otherwise it is kept with probability
# max-count/n, using a fixed pseudo-random sequence so that results are
# repeatable from run to run.
perl -e '
  $maxcount = shift @ARGV;
  @all = ();
  $p1 = 103349; $p2 = 71147; $k = 0;
  sub random { # our own random number generator: predictable.
    $k = ($k + $p1) % $p2;
    return ($k / $p2);
  }
  while(<>) {
    push @all, $_;
    @A = split(" ", $_);
    shift @A;
    $text = join(" ", @A);
    $count{$text} ++;
  }
  foreach $line (@all) {
    @A = split(" ", $line);
    shift @A;
    $text = join(" ", @A);
    $n = $count{$text};
    if ($n < $maxcount || random() < ($maxcount / $n)) {
      print $line;
    }
  }' "$maxcount" <"$srcdir/text" >"$destdir/text" || exit 1

echo "Reduced number of utterances from $(wc -l <"$srcdir/text") to $(wc -l <"$destdir/text")"

# Restrict the remaining files to the utterances (and speakers) that survived.
scripts/filter_scp.pl "$destdir/text" <"$srcdir/feats.scp" >"$destdir/feats.scp" || exit 1
scripts/filter_scp.pl "$destdir/text" <"$srcdir/utt2spk" >"$destdir/utt2spk" || exit 1
scripts/utt2spk_to_spk2utt.pl <"$destdir/utt2spk" >"$destdir/spk2utt" || exit 1

# Optional files.  These are "if" blocks rather than "[ -f ... ] && ..." so
# that a missing optional file does not become the exit status of the script.
if [ -f "$srcdir/wav.scp" ]; then
  scripts/filter_scp.pl "$destdir/feats.scp" <"$srcdir/wav.scp" >"$destdir/wav.scp" || exit 1
fi
if [ -f "$srcdir/spk2gender" ]; then
  scripts/filter_scp.pl "$destdir/spk2utt" <"$srcdir/spk2gender" >"$destdir/spk2gender" || exit 1
fi

exit 0
#!/bin/bash
# Converts the .sph files named in a file list to .wav files under another
# directory tree, and writes the list of the resulting .wav paths.
# With --fake only the output list is written; no audio is converted.

fake=false
if [ "$1" == "--fake" ]; then
  fake=true
  shift
fi

if [ $# -ne 4 ]; then
  echo "Usage: $0 [--fake] <sph-dir> <wav-dir> <sph-flist-in> <wav-flist-out>" >&2
  exit 1
fi

sphdir=$1   # e.g. /mnt/matylda2/data/RM
wavdir=$2   # e.g. /mnt/matylda6/jhu09/qpovey/kaldi_rm_wav
flistin=$3  # e.g. train_sph.flist, contains sph files in sphdir
flistout=$4 # e.g. train_wav.flist, contains wav files in wavdir

# Filter: rewrites each path on stdin from the sph tree to the wav tree.
# Only a literal ".sph" at the very end of the path is changed to ".wav"; an
# unanchored pattern with an unescaped dot could rewrite an earlier part of
# the path instead.
sph_to_wav_path() {
  sed -e "s:$sphdir:$wavdir:" -e 's:\.sph$:.wav:'
}

if [ "$fake" == false ]; then
  while IFS= read -r x; do
    [ -n "$x" ] || continue
    y=$(printf '%s\n' "$x" | sph_to_wav_path)
    mkdir -p "$(dirname "$y")" || exit 1
    # stdin is redirected so the converter cannot consume the file list.
    ../../tools/sph2pipe_v2.5/sph2pipe -f wav "$x" "$y" </dev/null || exit 1;
  done <"$flistin"
fi

sph_to_wav_path <"$flistin" >"$flistout" || exit 1;
#!/bin/bash
#
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
# for example /mnt/matylda2/data/SWITCHBOARD_1R2
. path.sh

#check existing directories
if [ $# != 1 ]; then
  echo "Usage: swbd_p1_data_prep.sh /path/to/SWBD"
  exit 1;
fi

SWBD_DIR=$1

# Audio data directory check.  This is done before changing directory, and a
# relative path is made absolute, so that the argument is interpreted
# relative to the directory the script was started from.
if [ ! -d "$SWBD_DIR" ]; then
  echo "Error: swbd_p1_data_prep.sh requires a directory argument (got $SWBD_DIR)"
  exit 1;
fi
case $SWBD_DIR in
  /*) ;;
  *) SWBD_DIR=$PWD/$SWBD_DIR ;;
esac

DIR=$PWD
mkdir -p data/local || exit 1
cd data/local || exit 1

# Trans directory check
if [ ! -d swb_ms98_transcriptions ]; then
  # To get the SWBD transcriptions and dict, do:
  echo " *** Downloading trascriptions and dictionary ***"
  wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz || exit 1
  tar -xf switchboard_word_alignments.tar.gz || exit 1
else
  echo "Directory with transcriptions exists, skipping downloading"
fi

# Option A: SWBD dictionary file check
if [ ! -f swb_ms98_transcriptions/sw-ms98-dict.text ]; then
  echo "SWBD dictionary file does not exist"
  exit 1;
fi

# find sph audio files
find "$SWBD_DIR" -iname '*.sph' > train_sph.flist

# Only a warning: the script carries on with however many files were found.
n_sph=$(( $(wc -l < train_sph.flist) ))
if [ "$n_sph" -ne 2435 ]; then
  echo Warning: expected 2435 data data files, found $n_sph
fi
# (1a) Transcriptions preparation
# make basic transcription file (add segments info)
# Field 1 of each input line supplies the conversation name (first six
# characters, with a leading "sw" turned into "sw0") and the side (seventh
# character); fields 2 and 3 are the start and end times, written into the
# utterance id in hundredths of a second, rounded.  The remaining fields are
# the words, which are upper-cased.  Words are printed through an explicit
# "%s" format so that a "%" in the text is never treated as a printf
# conversion.
awk '{name=substr($1,1,6); gsub("^sw","sw0",name); side=substr($1,7,1); stime=$2; etime=$3;
  printf("%s-%s_%06.0f-%06.0f", name, side, int(100*stime+0.5), int(100*etime+0.5));
  for(i=4;i<=NF;i++) printf(" %s", toupper($i)); printf("\n")}' \
  swb_ms98_transcriptions/*/*/*-trans.text > transcripts1.txt