Commit 7398f6ea authored by Peng Qi's avatar Peng Qi
Browse files

Added speaker ID patch into SWBD and FSH

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4256 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent e459d6d1
...@@ -5,17 +5,30 @@ ...@@ -5,17 +5,30 @@
stage=0 stage=0
# Consume any leading --calldata flags before the generic option parser runs.
# calldata stays empty unless the flag was given; the first argument that is
# not --calldata stops the scan and is left in place.
calldata=
while [ $# -gt 0 ]; do
  if [ "$1" = "--calldata" ]; then
    calldata=1
  else
    break
  fi
  shift
done
. utils/parse_options.sh . utils/parse_options.sh
if [ $# -eq 0 ]; then if [ $# -eq 0 ]; then
echo "$0 <fisher-dir-1> [<fisher-dir-2> ...]" echo "$0 [--calldata] <fisher-dir-1> [<fisher-dir-2> ...]"
echo " e.g.: $0 /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19\\" echo " e.g.: $0 /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19\\"
echo " /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13" echo " /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13"
echo " (We also support a single directory that has the contents of all of them)" echo " (We also support a single directory that has the contents of all of them)"
echo " If specified, --calldata will be used to map Kaldi speaker ID to real"
echo " speaker PIN released with the Fisher corpus."
exit 1; exit 1;
fi fi
# Check that the arguments are all absolute pathnames. # Check that the arguments are all absolute pathnames.
for dir in $*; do for dir in $*; do
...@@ -178,5 +191,17 @@ if [ $stage -le 4 ]; then ...@@ -178,5 +191,17 @@ if [ $stage -le 4 ]; then
fi fi
fi fi
if [ ! -z "$calldata" ]; then # fix speaker IDs
  # Merge the calldata tables found under the doc/ directories of both
  # transcript parts into one temporary table.
  cat $links/fe_03_p{1,2}_tran/doc/*calldata.tbl > $tmpdir/combined-calldata.tbl
  # Without any table content the Perl script would silently keep every old
  # speaker ID, so stop here instead.
  if [ ! -s $tmpdir/combined-calldata.tbl ]; then
    echo "$0: no calldata.tbl content found under $links/fe_03_p{1,2}_tran/doc" >&2
    exit 1;
  fi
  # Writes the *.new files next to the originals in data/train_all.
  local/fisher_fix_speakerid.pl $tmpdir/combined-calldata.tbl data/train_all || exit 1;
  utils/utt2spk_to_spk2utt.pl data/train_all/utt2spk.new > data/train_all/spk2utt.new || exit 1;
  # patch files: keep the previous version as <file>.old, then install <file>.new
  for f in spk2utt utt2spk text segments spk2gender; do
    cp data/train_all/$f data/train_all/$f.old || exit 1;
    cp data/train_all/$f.new data/train_all/$f || exit 1;
  done
  rm $tmpdir/combined-calldata.tbl
fi
echo "Data preparation succeeded" echo "Data preparation succeeded"
#!/usr/bin/perl -w
# Author: Peng Qi (pengqi@cs.stanford.edu)
# This script maps Fisher conversation-side speaker IDs to the true physical
# speakers and fixes the utterance IDs accordingly. Expected to be run from
# one directory level above (the directory containing local/).
# Return a copy of the first argument with leading and trailing whitespace
# removed; the caller's value is not modified.
sub trim {
  my ($raw) = @_;
  my $stripped = $raw;
  # A single global substitution removes the whitespace run at either end.
  $stripped =~ s/^\s+|\s+$//g;
  return $stripped;
}
# Command-line handling. Exactly two arguments are required: the combined
# Fisher calldata table and the data directory whose files are rewritten.
if ($#ARGV != 1) {
  print "Usage: fisher_fix_speakerid.pl <fisher-calldata-tbl-file> <data-dir>\n";
  print "E.g.: fisher_fix_speakerid.pl data/local/train/combined-calldata.tbl data/train_all\n";
  # Stop here: continuing with missing arguments only fails later with a
  # less helpful error.
  exit 1;
}
# Kept as package variables because the rest of the script refers to them
# by these names.
$tab_file = $ARGV[0];
$dir = $ARGV[1];
# Build %conv_to_spk, a lookup from conversation side to speaker ID.
# Keys have the form fe_03_<field 0>-A / fe_03_<field 0>-B; the values are
# the whitespace-trimmed fields 5 (side A) and 10 (side B) of each
# comma-separated line of the calldata table.
%conv_to_spk = ();
open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n";
while (my $line = <$conv_tab>) {
  chomp $line;
  my @fields = split "," , $line;
  # Field 0 is used as-is (not trimmed) when forming the keys.
  $fields[5] = trim($fields[5]);
  $fields[10] = trim($fields[10]);
  $conv_to_spk{'fe_03_' . $fields[0] . '-A'} = $fields[5];
  $conv_to_spk{'fe_03_' . $fields[0] . '-B'} = $fields[10];
}
close($conv_tab);
# fix utt2spk
# Rewrite $dir/utt2spk into $dir/utt2spk.new. For every utterance whose
# conversation side is present in %conv_to_spk, both the speaker ID and the
# utterance ID are replaced; all other lines are copied through and their
# conversation is reported once at the end.
%missingconv = ();
open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n";
open(my $utt2spk_new, '>', $dir . '/utt2spk.new')
  or die "Could not open '$dir/utt2spk.new' for writing $!\n";
while (my $line = <$utt2spk>) {
  chomp $line;
  my @fields = split " " , $line;
  # The first 13 characters of the utterance ID are the lookup key, matching
  # the fe_03_<id>-<side> keys of %conv_to_spk.
  my $convid = substr $fields[0], 0, 13;
  if (exists $conv_to_spk{ $convid }) {
    my $spkid = "fe_03_" . $conv_to_spk{ $convid };
    # New utterance ID: new speaker ID, a dash, then the old utterance ID
    # without its first 6 characters.
    my $newuttid = $spkid . '-' . (substr $fields[0], 6);
    print $utt2spk_new "$newuttid $spkid\n";
  } else {
    # Remember only the 5-character conversation number for the warning.
    my $callid = substr $convid, 6, 5;
    $missingconv{$callid} = 1;
    print $utt2spk_new $fields[0]." ".$fields[1]."\n";
  }
}
close($utt2spk);
# Buffered write errors only surface at close, so check it.
close($utt2spk_new) or die "Could not close '$dir/utt2spk.new' $!\n";
# Sorted so the warnings come out in a stable order.
foreach my $conv (sort keys %missingconv) {
  print "Warning: Conversation ID '$conv' not found in calldata table, retaining old speaker IDs\n";
}
# fix spk2gender
# Optional step: only runs when $dir/spk2gender can be opened. Each line's
# first field is looked up in %conv_to_spk and, when found, replaced by the
# corresponding fe_03_<speaker> ID; the second field is copied unchanged.
# NOTE(review): if several conversation sides map to the same speaker, that
# speaker appears on several output lines -- confirm this is handled
# downstream.
if (open(my $spk2gender, '<', $dir . '/spk2gender')) {
  open(my $spk2gender_new, '>', $dir . '/spk2gender.new')
    or die "Could not open '$dir/spk2gender.new' for writing $!\n";
  while (my $line = <$spk2gender>) {
    chomp $line;
    my @fields = split " ", $line;
    my $convid = $fields[0];
    if (exists $conv_to_spk{ $convid }) {
      my $spkid = "fe_03_" . $conv_to_spk{ $convid };
      print $spk2gender_new $spkid." ".$fields[1]."\n";
    } else {
      print $spk2gender_new $fields[0]." ".$fields[1]."\n";
    }
  }
  close($spk2gender);
  # Buffered write errors only surface at close, so check it.
  close($spk2gender_new) or die "Could not close '$dir/spk2gender.new' $!\n";
}
# fix segments and text
# Both files start each line with the utterance ID, so the same prefix
# rewrite is applied to each: lines whose first 13 characters are a key of
# %conv_to_spk get "fe_03_<speaker>-" followed by the old line without its
# first 6 characters; every other line is copied unchanged. Output goes to
# <file>.new.
foreach my $file ('segments','text') {
  open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n";
  open(my $newfile, '>', "$dir/$file.new")
    or die "Could not open '$dir/$file.new' for writing $!\n";
  while (my $line = <$oldfile>) {
    chomp $line;
    my $convid = substr $line, 0, 13;
    if (exists $conv_to_spk{$convid}) {
      my $spkid = $conv_to_spk{$convid};
      print $newfile "fe_03_$spkid-" . (substr $line, 6) . "\n";
    } else {
      print $newfile "$line\n";
    }
  }
  close($oldfile);
  # Buffered write errors only surface at close, so check it.
  close($newfile) or die "Could not close '$dir/$file.new' $!\n";
}
...@@ -10,6 +10,12 @@ set -e ...@@ -10,6 +10,12 @@ set -e
# the next command produces the data in local/train_all # the next command produces the data in local/train_all
local/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \ local/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \
/export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13 /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13
# You could also try specifying the --calldata argument to this command as below.
# If specified, the script will use actual speaker personal identification
# numbers released with the dataset, i.e. real speaker IDs. Note: --calldata has
# to be the first argument of this script.
# local/fisher_data_prep.sh --calldata /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \
# /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13
# at BUT: # at BUT:
# local/fisher_data_prep.sh /mnt/matylda6/jhu09/qpovey/FISHER/LDC2005T19 /mnt/matylda2/data/FISHER/ # local/fisher_data_prep.sh /mnt/matylda6/jhu09/qpovey/FISHER/LDC2005T19 /mnt/matylda2/data/FISHER/
......
...@@ -10,11 +10,18 @@ ...@@ -10,11 +10,18 @@
## you unpacked this. We are just doing a "find" command to locate ## you unpacked this. We are just doing a "find" command to locate
## the .sph files. ## the .sph files.
## The second input is optional and should point to a directory containing
## the Switchboard transcriptions/documentation (specifically, the conv.tab file).
## If specified, the script will try to use the actual speaker PINs provided
## with the corpus instead of the conversation side ID (Kaldi default). We
## use "find" to locate this file, so we make no assumptions about the
## directory structure. (Peng Qi, Aug 2014)
. path.sh . path.sh
#check existing directories #check existing directories
if [ $# != 1 ]; then if [ $# != 1 -a $# != 2 ]; then
echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD" echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]"
exit 1; exit 1;
fi fi
...@@ -144,6 +151,17 @@ for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do ...@@ -144,6 +151,17 @@ for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
cp data/local/train/$f data/train/$f || exit 1; cp data/local/train/$f data/train/$f || exit 1;
done done
if [ $# == 2 ]; then # fix speaker IDs
  # Search the documentation directory for conv.tab. The Perl script takes
  # exactly one table, so use the first match and fail clearly when there is
  # none.
  find "$2" -name conv.tab > $dir/conv.tab
  conv_tab=`head -n 1 $dir/conv.tab`
  if [ -z "$conv_tab" ]; then
    echo "$0: could not find conv.tab under $2" >&2
    exit 1;
  fi
  # Writes the *.new files next to the originals in data/train.
  local/swbd1_fix_speakerid.pl "$conv_tab" data/train || exit 1;
  utils/utt2spk_to_spk2utt.pl data/train/utt2spk.new > data/train/spk2utt.new || exit 1;
  # patch files: keep the previous version as <file>.old, then install <file>.new
  for f in spk2utt utt2spk text segments; do
    cp data/train/$f data/train/$f.old || exit 1;
    cp data/train/$f.new data/train/$f || exit 1;
  done
  rm $dir/conv.tab
fi
echo Switchboard-1 data preparation succeeded. echo Switchboard-1 data preparation succeeded.
......
#!/usr/bin/perl -w
# Author: Peng Qi (pengqi@cs.stanford.edu)
# This script maps Switchboard speaker IDs to the true physical speakers
# and fixes the utterance IDs accordingly. Expected to be run from one
# directory level above (the directory containing local/).
# Return a copy of the first argument with leading and trailing whitespace
# removed; the caller's value is not modified.
sub trim {
  my ($raw) = @_;
  my $stripped = $raw;
  # A single global substitution removes the whitespace run at either end.
  $stripped =~ s/^\s+|\s+$//g;
  return $stripped;
}
# Command-line handling. Exactly two arguments are required: the Switchboard
# conv.tab file and the data directory whose files are rewritten.
if ($#ARGV != 1) {
  print "Usage: swbd1_fix_speakerid.pl <swbd-conv-tab-file> <data-dir>\n";
  print "E.g.: swbd1_fix_speakerid.pl /datasets/SWBD1Transcripts/tables/conv.tab data/train\n";
  # Stop here: continuing with missing arguments only fails later with a
  # less helpful error.
  exit 1;
}
# Kept as package variables because the rest of the script refers to them
# by these names.
$tab_file = $ARGV[0];
$dir = $ARGV[1];
# Read the comma-separated conversation table and fill %conv_to_spk: for
# each row, the keys sw0<field 0>-A and sw0<field 0>-B map to the trimmed
# fields 2 and 3 respectively.
%conv_to_spk = ();
open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n";
while (my $row = <$conv_tab>) {
  chomp $row;
  my @cols = split "," , $row;
  # Field 0 is used untrimmed; only the two speaker fields are cleaned up.
  my $side_a = trim($cols[2]);
  my $side_b = trim($cols[3]);
  $conv_to_spk{'sw0' . $cols[0] . '-A'} = $side_a;
  $conv_to_spk{'sw0' . $cols[0] . '-B'} = $side_b;
}
close($conv_tab);
# fix utt2spk
# Rewrite $dir/utt2spk into $dir/utt2spk.new. For every utterance whose
# conversation side is present in %conv_to_spk, both the speaker ID and the
# utterance ID are replaced; all other lines are copied through and their
# conversation is reported once at the end.
%missingconv = ();
open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n";
open(my $utt2spk_new, '>', $dir . '/utt2spk.new')
  or die "Could not open '$dir/utt2spk.new' for writing $!\n";
while (my $line = <$utt2spk>) {
  chomp $line;
  my @fields = split " " , $line;
  # The first 9 characters of the utterance ID are the lookup key, matching
  # the sw0<id>-<side> keys of %conv_to_spk.
  my $convid = substr $fields[0], 0, 9;
  if (exists $conv_to_spk{ $convid }) {
    my $spkid = "sw" . $conv_to_spk{ $convid };
    # New utterance ID: new speaker ID, a dash, then the old utterance ID
    # without its first 2 characters.
    my $newuttid = $spkid . '-' . (substr $fields[0], 2);
    print $utt2spk_new "$newuttid $spkid\n";
  } else {
    # Remember only the 4-character conversation number for the warning.
    my $convnum = substr $convid, 3, 4;
    $missingconv{$convnum} = 1;
    print $utt2spk_new $fields[0]." ".$fields[1]."\n";
  }
}
close($utt2spk);
# Buffered write errors only surface at close, so check it.
close($utt2spk_new) or die "Could not close '$dir/utt2spk.new' $!\n";
# Sorted so the warnings come out in a stable order.
foreach my $conv (sort keys %missingconv) {
  print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n";
}
# fix segments and text
# Both files start each line with the utterance ID, so the same prefix
# rewrite is applied to each: lines whose first 9 characters are a key of
# %conv_to_spk get "sw<speaker>-" followed by the old line without its first
# 2 characters; every other line is copied unchanged. Output goes to
# <file>.new.
foreach my $file ('segments','text') {
  open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n";
  open(my $newfile, '>', "$dir/$file.new")
    or die "Could not open '$dir/$file.new' for writing $!\n";
  while (my $line = <$oldfile>) {
    chomp $line;
    my $convid = substr $line, 0, 9;
    if (exists $conv_to_spk{$convid}) {
      my $spkid = $conv_to_spk{$convid};
      print $newfile "sw$spkid-" . (substr $line, 2) . "\n";
    } else {
      print $newfile "$line\n";
    }
  }
  close($oldfile);
  # Buffered write errors only surface at close, so check it.
  close($newfile) or die "Could not close '$dir/$file.new' $!\n";
}
...@@ -15,6 +15,15 @@ ...@@ -15,6 +15,15 @@
. path.sh . path.sh
set -e # exit on error set -e # exit on error
# Prepare Switchboard data. This command can also take a second optional argument
# which specifies the directory containing the Switchboard documentation. Specifically, if
# this argument is given, the script will look for the conv.tab file and correct
# speaker IDs to the actual speaker personal identification numbers released in
# the documentation. The documentation can be found here:
# https://catalog.ldc.upenn.edu/docs/LDC97S62/
# Note: if you are using this link, make sure you rename conv_tab.csv to conv.tab
# after downloading.
# Usage: local/swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_docs]
local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62 local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62
# local/swbd1_data_prep.sh /home/dpovey/data/LDC97S62 # local/swbd1_data_prep.sh /home/dpovey/data/LDC97S62
# local/swbd1_data_prep.sh /data/corpora0/LDC97S62 # local/swbd1_data_prep.sh /data/corpora0/LDC97S62
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment