Skip to content
Run_ESTER2.ipynb 6.61 KiB
Newer Older
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Source the recipe environment from the current directory. The explicit\n",
    "# './' keeps the '.' builtin from searching $PATH for these file names.\n",
    ". ./path.sh\n",
    "# cmd.sh presumably defines $train_cmd, which the MFCC cell passes to --cmd.\n",
    ". ./cmd.sh\n",
    "# Directory names reused by the later cells.\n",
    "idata_kaldi=data-ESTER2-V1\n",
    "exp_kaldi=exp-ESTER2-V1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "prepare train\n",
      "utils/data/get_utt2dur.sh: working out data-ESTER2-V1/train/utt2dur from data-ESTER2-V1/train/segments\n",
      "utils/data/get_utt2dur.sh: computed data-ESTER2-V1/train/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/train\n",
      "Successfully prepared data in data-ESTER2-V1/train..\n",
      "prepare test\n",
      "utils/data/get_utt2dur.sh: working out data-ESTER2-V1/test/utt2dur from data-ESTER2-V1/test/segments\n",
      "utils/data/get_utt2dur.sh: computed data-ESTER2-V1/test/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/test\n",
      "Successfully prepared data in data-ESTER2-V1/test..\n",
      "prepare dev\n",
      "utils/data/get_utt2dur.sh: working out data-ESTER2-V1/dev/utt2dur from data-ESTER2-V1/dev/segments\n",
      "utils/data/get_utt2dur.sh: computed data-ESTER2-V1/dev/utt2dur\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/dev\n",
      "Successfully prepared data in data-ESTER2-V1/dev..\n"
     ]
    }
   ],
   "source": [
    "# Root of the ESTER2 corpus; one sub-directory per split is handed to the\n",
    "# preparation script below.\n",
    "data=/fast/LINAGORA/Corpus/Corpus/ESTER2/corpus\n",
    "# Previously used output directory, kept for reference:\n",
    "#idata_kaldi=data-ESTER-V4\n",
    "for part in train test dev; do\n",
    "  echo \"prepare $part\"\n",
    "  # Alternative preparation script, kept for reference:\n",
    "  #local/data_prepTCOF.sh $data/$part $idata_kaldi/$part\n",
    "  # Original author's note (translated from French): problem with event (URL:)\n",
    "  # Expansions are quoted so the paths are never word-split or globbed.\n",
    "  local/data_prepESTER.sh \"$data/$part\" \"$idata_kaldi/$part\"\n",
    "done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/train exp-ESTER2-V1/make_mfcc/train mfcc-ESTER2-V1\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/train\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for train\n",
      "steps/compute_cmvn_stats.sh data-ESTER2-V1/train exp-ESTER2-V1/make_mfcc/train mfcc-ESTER2-V1\n",
      "Succeeded creating CMVN stats for train\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/test exp-ESTER2-V1/make_mfcc/test mfcc-ESTER2-V1\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/test\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for test\n",
      "steps/compute_cmvn_stats.sh data-ESTER2-V1/test exp-ESTER2-V1/make_mfcc/test mfcc-ESTER2-V1\n",
      "Succeeded creating CMVN stats for test\n",
      "steps/make_mfcc.sh --cmd run.pl --mem 64G --nj 32 data-ESTER2-V1/dev exp-ESTER2-V1/make_mfcc/dev mfcc-ESTER2-V1\n",
      "utils/validate_data_dir.sh: Successfully validated data-directory data-ESTER2-V1/dev\n",
      "steps/make_mfcc.sh [info]: segments file exists: using that.\n",
      "Succeeded creating MFCC features for dev\n",
      "steps/compute_cmvn_stats.sh data-ESTER2-V1/dev exp-ESTER2-V1/make_mfcc/dev mfcc-ESTER2-V1\n",
      "Succeeded creating CMVN stats for dev\n"
     ]
    }
   ],
   "source": [
    "# Experiment and MFCC output directory names for this run.\n",
    "exp_kaldi=exp-ESTER2-V1\n",
    "mfccdir=mfcc-ESTER2-V1\n",
    "for part in train test dev; do\n",
    "    # MFCC features. $train_cmd is quoted so it reaches --cmd as one argument;\n",
    "    # the path arguments are quoted so they are never word-split or globbed.\n",
    "    steps/make_mfcc.sh --cmd \"$train_cmd\" --nj 32 \\\n",
    "        \"$idata_kaldi/$part\" \"$exp_kaldi/make_mfcc/$part\" \"$mfccdir\"\n",
    "    # Alternative (disabled): MFCC features + pitch\n",
    "    #steps/make_mfcc_pitch.sh --cmd \"$train_cmd\" --nj 12 $idata_kaldi/$part $exp_kaldi/make_mfcc/$part $mfccdir\n",
    "    # CMVN statistics for the split that was just processed.\n",
    "    steps/compute_cmvn_stats.sh \\\n",
    "        \"$idata_kaldi/$part\" \"$exp_kaldi/make_mfcc/$part\" \"$mfccdir\"\n",
    "    # Optional clean-up step (disabled):\n",
    "    #utils/fix_data_dir.sh $idata_kaldi/$part\n",
    "done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Splitting into 2 parts, to allow for parallel processing ...\n",
      "Checking the splits ...\n",
      "Performing text normalization (2 jobs) - check data-ESTER2-V1/local_ESTER12/lm/norm/tmp/txt_norm.JOB.log ...\n",
      "Finished OK\n",
      "Selecting the vocabulary (400000 words) ...\n",
      "Making the corpus and the vocabulary ...\n",
      "Word counts saved to 'data-ESTER2-V1/local_ESTER12/lm/word_counts.txt'\n",
      "Vocabulary saved as 'data-ESTER2-V1/local_ESTER12/lm/meeting-vocab.txt'\n",
      "All unique sentences (in sorted order) stored in 'data-ESTER2-V1/local_ESTER12/lm/meeting-lm-norm.txt.gz'\n",
      "Counting the total number word tokens in the corpus ...\n",
      "There are 1075067 tokens in the corpus\n",
      "Training a 3-gram LM ...\n",
      "This implementation assumes that you have a lot of free RAM(> 12GB) on your machine\n",
      "If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html\n",
      "3,8M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz\n",
      "Creating a 'small' pruned 3-gram LM (threshold: 0.0000003) ...\n",
      "data-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "3,0M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tgsmall.arpa.gz\n",
      "Creating a 'medium' pruned 3-gram LM (threshold: 0.0000001) ...\n",
      "data-ESTER2-V1/local_ESTER12/lm/lm_tglarge.arpa.gz: line 10: warning: non-zero probability for <unk> in closed-vocabulary LM\n",
      "3,5M\tdata-ESTER2-V1/local_ESTER12/lm/lm_tgmed.arpa.gz\n",
      "Training a 4-gram LM ...\n",
      "4,5M\tdata-ESTER2-V1/local_ESTER12/lm/lm_fglarge.arpa.gz\n"
     ]
    }
   ],
   "source": [
    "# Input handed to the LM training script: the ESTER2 training split.\n",
    "LM_train_text=/fast/LINAGORA/Corpus/Corpus/ESTER2/corpus/train\n",
    "# All three output arguments live under one directory; name it once.\n",
    "lm_dir=$idata_kaldi/local_ESTER12/lm\n",
    "# Arguments: input text, norm/tmp dir, norm/norm_texts dir, LM output dir.\n",
    "# Quoted so the paths are never word-split or globbed.\n",
    "local/lm/train_lm.sh \"$LM_train_text\" \\\n",
    "    \"$lm_dir/norm/tmp\" \"$lm_dir/norm/norm_texts\" \"$lm_dir\""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Bash",
   "language": "bash",
   "name": "bash"
  },
  "language_info": {
   "codemirror_mode": "shell",
   "file_extension": ".sh",
   "mimetype": "text/x-sh",
   "name": "bash"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}