kaldi sre16/v1中run.sh

江阳羽
2023-12-01
#!/bin/bash
# Copyright      2017   David Snyder
#                2017   Johns Hopkins University (Author: Daniel Garcia-Romero)
#                2017   Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
#
# See README.txt for more info on data required.
# Results (mostly EERs) are inline (内嵌) in comments below.
#
# This example demonstrates a "bare bones" NIST SRE 2016 recipe using ivectors.
# 本实例展示了sre2016 使用ivectors方法的骨架方法
# In the future, we will add score-normalization and a more effective form of
# PLDA domain adaptation.
# 将来我们会加上分数归一化和一种PLDA域自适应的更有效的方法

# 本demo是经典的i-vector+PLDA方法

# nist-sre16的比赛规则：
# https://www.nist.gov/itl/iad/mig/speaker-recognition-evaluation-2016
# The basic task in NIST’s speaker recognition evaluations is speaker detection, i.e., 
# to determine whether a specified target speaker is speaking during a given segment of speech.
# 也就是说给一段两人的谈话录音，判断某个人是否说话了，某个人可以是两人之一，也可以不是
# 一段录音有两个人说话，没有进行说话人分割

# 根据evaluation plan，训练数据还可以用the Fisher corpus，但是这里没用
# the Fisher corpus：https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/lrec2004-fisher-corpus.pdf
# https://catalog.ldc.upenn.edu/LDC2004T19

# 理论知识，i-vector提取过程：https://blog.csdn.net/veritasalice/article/details/89215180
# https://www.zhihu.com/question/63978977

. ./cmd.sh
. ./path.sh
# 引用文件

# Abort the recipe as soon as any command exits with a non-zero status.
set -e

# Output directories.  MFCC features and VAD decisions are both written to
# the same ./mfcc directory under the current working directory.
mfccdir="$(pwd)/mfcc"
vaddir="$mfccdir"

# SRE16 trial lists.  According to the evaluation plan (as noted by the
# annotator) the SRE16 test set has only two major languages: Tagalog
# (spoken in the Philippines) and Cantonese.
sre16_trials="data/sre16_eval_test/trials"          # all trials (pooled)
sre16_trials_tgl="data/sre16_eval_test/trials_tgl"  # Tagalog trials
sre16_trials_yue="data/sre16_eval_test/trials_yue"  # Cantonese trials

# First stage to execute.  Every section of this script is guarded by an
# "if [ $stage -le N ]" test, so raising this value skips the earlier stages.
stage=0

# Stage 0: data preparation.  "-le" is the integer "less than or equal" test,
# so this section runs when stage <= 0.
if [ "$stage" -le 0 ]; then
  # Path to some, but not all of the training corpora
  data_root=/export/corpora/LDC

  # Prepare telephone and microphone speech from Mixer6
  # (LDC2013S03: https://catalog.ldc.upenn.edu/LDC2013S03).
  # Arguments: <source-dir> <output-dir>.
  # Annotator's notes: the corpus holds both microphone data (several mics,
  # 16 kHz 1-channel flac/ms-wav files, so a mic has to be selected and the
  # audio downsampled) and telephone recordings (8 kHz 2-channel NIST SPHERE
  # files), which are merged into one data directory.  make_mx6.sh is noted
  # to call local/make_mx6_mic.pl, utils/combine_data.sh,
  # utils/subset_data_dir.sh, local/make_mx6_calls.pl and
  # utils/fix_data_dir.sh.
  local/make_mx6.sh "${data_root}/LDC2013S03" data/

  # Prepare SRE10 test and enroll. Includes microphone interview speech.
  # NOTE: This corpus is now available through the LDC as LDC2017S06
  # (https://catalog.ldc.upenn.edu/LDC2017S06).
  # Arguments: <path-to-SRE10-eval> <path-to-output>.
  # Annotator's notes: this is the 2010 NIST SRE test set (microphone and
  # telephone speech, 8 kHz), collected as part of the Mixer 6 project.
  # make_sre10.pl is much like local/make_mx6_calls.pl and
  # local/make_mx6_mic.pl: it mainly converts the audio format, generates the
  # data-directory files and checks some files.
  local/make_sre10.pl /export/corpora5/SRE/SRE2010/eval/ data/

  # Prepare SRE08 test and enroll. Includes some microphone speech.
  # LDC2011S05 (https://catalog.ldc.upenn.edu/LDC2011S05) is the 2008 NIST SRE
  # Training Set Part 1 (microphone and telephone speech, 8 kHz).
  local/make_sre08.pl "${data_root}/LDC2011S08" "${data_root}/LDC2011S05" data/

  # This prepares the older NIST SREs from 2004-2006.
  # Annotator's notes: make_sre.sh calls make_sre.pl (compare with
  # local/make_mx6_mic.pl and local/make_mx6_calls.pl).
  # Usage: make_sre.pl <path-to-data> <name-of-source> <sre-ref> <output-dir>,
  # where <name-of-source> is effectively the SRE year.
  local/make_sre.sh "${data_root}" data/

  # Combine all SREs prior to 2016 and Mixer6 into one dataset.
  # Arguments: <dest-dir> followed by the nine source directories.
  utils/combine_data.sh data/sre \
    data/sre2004 \
    data/sre2005_train data/sre2005_test \
    data/sre2006_train data/sre2006_test_1 data/sre2006_test_2 \
    data/sre08 data/mx6 data/sre10
  utils/validate_data_dir.sh --no-text --no-feats data/sre
  utils/fix_data_dir.sh data/sre

  # Further reading on the utilities used above:
  #   combine_data.sh       https://blog.csdn.net/yj13811596648/article/details/103343669
  #   fix_data_dir.sh       https://blog.csdn.net/yj13811596648/article/details/103316332
  #   validate_data_dir.sh  https://blog.csdn.net/yj13811596648/article/details/103316292

  # Prepare SWBD corpora.  Per the annotator, each of these is telephone
  # speech stored as 2-channel ulaw at 8 kHz.
  # LDC2001S13: https://catalog.ldc.upenn.edu/LDC2001S13
  # (annotator's note: some recordings flagged as bad audio are skipped).
  local/make_swbd_cellular1.pl "${data_root}/LDC2001S13" \
    data/swbd_cellular1_train
  # LDC2004S07: https://catalog.ldc.upenn.edu/LDC2004S07
  local/make_swbd_cellular2.pl /export/corpora5/LDC/LDC2004S07 \
    data/swbd_cellular2_train
  # LDC98S75: https://catalog.ldc.upenn.edu/LDC98S75
  local/make_swbd2_phase1.pl "${data_root}/LDC98S75" \
    data/swbd2_phase1_train
  # LDC99S79: https://catalog.ldc.upenn.edu/LDC99S79
  local/make_swbd2_phase2.pl /export/corpora5/LDC/LDC99S79 \
    data/swbd2_phase2_train
  # LDC2002S06: https://catalog.ldc.upenn.edu/LDC2002S06
  local/make_swbd2_phase3.pl /export/corpora5/LDC/LDC2002S06 \
    data/swbd2_phase3_train

  # Combine all SWB corpora into one dataset.
  # Arguments: <dest-dir> followed by the five source directories.
  utils/combine_data.sh data/swbd \
    data/swbd_cellular1_train \
    data/swbd_cellular2_train \
    data/swbd2_phase1_train \
    data/swbd2_phase2_train \
    data/swbd2_phase3_train

  # Prepare NIST SRE 2016 evaluation data.
  local/make_sre16_eval.pl /export/corpora5/SRE/R149_0_1 data

  # Prepare unlabeled Cantonese and Tagalog development data. This dataset
  # was distributed to SRE participants.
  # Annotator's note: no public catalog page was found for LDC2016E46; it
  # serves as the in-domain corpus for SRE16.
  local/make_sre16_unlabeled.pl \
    /export/corpora5/SRE/LDC2016E46_SRE16_Call_My_Net_Training_Data data
fi

# make ***.pl 做的主要事情还是转换声音格式和降采样，生成一些文件，检查一些文件

# Stage 1: feature extraction.  (Annotator's note: the recipe is split into
# numbered stages so that it stays organised and can be run step by step.)
if [ "$stage" -le 1 ]; then
  # Make MFCCs and compute the energy-based VAD for each dataset.
  # fix_data_dir.sh is run after each of the two steps.
  for name in sre swbd sre16_eval_enroll sre16_eval_test sre16_major; do
    steps/make_mfcc.sh \
      --mfcc-config conf/mfcc.conf \
      --nj 40 \
      --cmd "$train_cmd" \
      "data/${name}" exp/make_mfcc "$mfccdir"
    utils/fix_data_dir.sh "data/${name}"

    sid/compute_vad_decision.sh \
      --nj 40 \
      --cmd "$train_cmd" \
      "data/${name}" exp/make_vad "$vaddir"
    utils/fix_data_dir.sh "data/${name}"
  done
fi

# 解释
# steps/make_mfcc.sh  https://blog.csdn.net/yj13811596648/article/details/102817366
# fix_data_dir.sh  https://blog.csdn.net/yj13811596648/article/details/103316332
# sid/compute_vad_decision.sh   https://blog.csdn.net/yj13811596648/article/details/102817405

# 理论知识，i-vector提取过程：https://blog.csdn.net/veritasalice/article/details/89215180
# https://www.zhihu.com/question/63978977
# Stage 2: UBM training on data/sre16_major.
if [ "$stage" -le 2 ]; then
  # Train the UBM.  A diagonal UBM is trained first (the positional "2048"
  # is the <num-gauss> argument of train_diag_ubm.sh) ...
  sid/train_diag_ubm.sh \
    --cmd "$train_cmd --mem 20G" \
    --nj 40 \
    --num-threads 8 \
    --subsample 1 \
    data/sre16_major 2048 exp/diag_ubm

  # ... and the full-covariance UBM is then trained starting from
  # exp/diag_ubm.
  sid/train_full_ubm.sh \
    --cmd "$train_cmd --mem 25G" \
    --nj 40 \
    --remove-low-count-gaussians false \
    --subsample 1 \
    data/sre16_major exp/diag_ubm exp/full_ubm
fi

# A diagonal UBM is trained first, and a full UBM is then obtained from it.
# Reference notes for sid/train_diag_ubm.sh (its header comment and usage
# text).  The text is handed to the ":" no-op builtin through a quoted
# here-document, so it is neither executed nor subject to "$" expansion.
# (The earlier ".<<EOF" form ran the "." builtin without a filename, which
# fails with "filename argument required" and stops the script under set -e.)
: <<'EOF'
train_diag_ubm.sh中的介绍：
# This is a modified version of steps/train_diag_ubm.sh, specialized for
# speaker-id, that does not require to start with a trained model, that applies
# sliding-window CMVN, and that expects voice activity detection (vad.scp) in
# the data directory.  We initialize the GMM using gmm-global-init-from-feats,
# which sets the means to random data points and then does some iterations of
# E-M in memory.  After the in-memory initialization we train for a few
# iterations in parallel.

  echo "Usage: $0  <data> <num-gauss> <output-dir>"
  echo " e.g.: $0 data/train 1024 exp/diag_ubm"
  echo "Options: "
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <num-jobs|4>                                # number of parallel jobs to run."
  echo "  --num-iters <niter|20>                           # number of iterations of parallel "
  echo "                                                   # training (default: $num_iters)"
  echo "  --stage <stage|-2>                               # stage to do partial re-run from."
  echo "  --num-gselect <n|30>                             # Number of Gaussians per frame to"
  echo "                                                   # limit computation to, for speed"
  echo " --subsample <n|5>                                 # In main E-M phase, use every n"
  echo "                                                   # frames (a speedup)"
  echo "  --num-frames <n|500000>                          # Maximum num-frames to keep in memory"
  echo "                                                   # for model initialization"
  echo "  --num-iters-init <n|20>                          # Number of E-M iterations for model"
  echo "                                                   # initialization"
  echo " --initial-gauss-proportion <proportion|0.5>       # Proportion of Gaussians to start with"
  echo "                                                   # in initialization phase (then split)"
  echo " --num-threads <n|32>                              # number of threads to use in initialization"
  echo "                                                   # phase (must match with parallel-opts option)"
  echo " --parallel-opts <string|'--num-threads 32'>       # Option should match number of threads in"
  echo "                                                   # --num-threads option above"
  echo " --min-gaussian-weight <weight|0.0001>             # min Gaussian weight allowed in GMM"
  echo "                                                   # initialization (this relatively high"
  echo "                                                   # value keeps counts fairly even)"
  echo " --delta-window <n|3>                              # number of frames of context used to"
  echo "                                                   # calculate delta"
  echo " --delta-order <n|2>                               # number of delta features"
  echo " --apply-cmn <true,false|true>                     # if true, apply sliding window cepstral mean"
  echo "                                                   # normalization to features"
  
EOF

# Reference notes for sid/train_full_ubm.sh (its header comment and usage
# text).  The text is handed to the ":" no-op builtin through a quoted
# here-document, so nothing in it (including the "exit 1;" line) is executed
# or expanded.  (The earlier ".<<EOF" form ran the "." builtin without a
# filename, which fails with "filename argument required" and stops the
# script under set -e.)
: <<'EOF'
# This trains a full-covariance UBM from an existing (diagonal or full) UBM,
# for a specified number of iterations.  This is for speaker-id systems
# (we use features specialized for that, and vad).

  echo "Usage: steps/train_full_ubm.sh <data> <old-ubm-dir> <new-ubm-dir>"
  echo "Trains a full-covariance UBM starting from an existing diagonal or"
  echo "full-covariance UBM system."
  echo " e.g.: steps/train_full_ubm.sh --num-iters 8 data/train exp/diag_ubm exp/full_ubm"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|16>                                      # number of parallel training jobs"
  echo "  --num-gselect <n|20>                             # Number of Gaussians to select using"
  echo "                                                   # initial model (diagonalized if needed)"
  echo "  --subsample <n|5>                                # Take every n'th sample, for efficiency"
  echo "  --num-iters <n|4>                                # Number of iterations of E-M"
  echo "  --min-gaussian-weight <weight|1.0e-05>           # Minimum Gaussian weight (below this,"
  echo "                                                   # we won't update, and will remove Gaussians"
  echo "                                                   # if --remove-low-count-gaussians is true"
  echo "  --remove-low-count-gaussians <true,false|true>   # If true, remove Gaussians below min-weight"
  echo "                                                   # (will only happen on last iteration, in any case"
  echo "  --cleanup <true,false|true>                      # If true, clean up accumulators, intermediate"
  echo "                                                   # models and gselect info"
  exit 1;
  echo " --apply-cmn <true,false|true>                     # if true, apply sliding window cepstral mean"
  echo "                                                   # normalization to features"
  
EOF


# Stage 3: i-vector extractor training.
if [ "$stage" -le 3 ]; then
  # Train the i-vector extractor on the union of the SWBD and SRE data.
  # (Annotator's note: this shows that only the SRE and SWBD data are used
  # here; no Fisher data is involved.)
  utils/combine_data.sh data/swbd_sre data/swbd data/sre
  sid/train_ivector_extractor.sh \
    --cmd "$train_cmd --mem 35G" \
    --ivector-dim 600 \
    --num-iters 5 \
    exp/full_ubm/final.ubm data/swbd_sre exp/extractor
fi
# Explanatory notes.
# combine_data.sh  https://blog.csdn.net/yj13811596648/article/details/103343669
# sid/train_ivector_extractor.sh: the two here-documents below hold the
# annotator's notes on its parallelisation settings and its usage text.
# Both are handed to the ":" no-op builtin through a quoted here-document, so
# they are neither executed nor subject to "$" expansion.  (The earlier
# ".<<EOF" form ran the "." builtin without a filename, which fails with
# "filename argument required" and stops the script under set -e.)
: <<'EOF'
   This script trains the i-vector extractor.  Note:并行化有3个独立的级别：num_threads、num_processes和num_jobs。这似乎有点过分。
   这与最小化内存使用和磁盘I/O有关，受到各种限制。
   -“num_threads”是程序使用的线程数；
	   “num_processes”是单个作业生成的单独进程数，然后对内存中的累加器求和。
	   我们的建议是：将num_threads设置为最小值（4，或者您的机器有多少虚拟内核）。
	   （由于需要锁定各种全局数量，程序不能使用超过4个具有良好CPU利用率的线程）。
   -将num_processes设置为每台计算机上的虚拟核心数除以num_threads。例如4，如果你有16个虚拟核。
	   如果你在一个共享的队列中忙于其他人的工作，那么将其设置为低于这个最大值可能是明智的，或者你的工作不会被安排。
	   如果内存不足，则需要小心；在我们的正常设置中，每个进程使用大约5G。
   -将num-jobs设置为尽可能多的作业（每个作业使用$num-threads*$num-processs   CPU），队列将允许您一次运行，但不要超过10或20，否则累加器的求和可能会变慢。如果你有很多数据，你可能需要更多的工作。
EOF

: <<'EOF'
  sid/train_ivector_extractor.sh
  echo "Usage: $0 <fgmm-model> <data> <extractor-dir>"
  echo " e.g.: $0 exp/ubm_2048_male/final.ubm data/train_male exp/extractor_male"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-iters <#iters|10>                          # Number of iterations of E-M"
  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
  echo "  --num-processes <n|4>                            # Number of processes for each queue job (relates"
  echo "                                                   # to summing accs in memory)"
  echo "  --num-threads <n|4>                              # Number of threads for each process (can't be usefully"
  echo "                                                   # increased much above 4)"
  echo "  --stage <stage|-4>                               # To control partial reruns"
  echo "  --num-gselect <n|20>                             # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --sum-accs-opt <option|''>                       # Option e.g. '-l hostname=a15' to localize"
  echo "                                                   # sum-accs process to nfs server."
  echo " --apply-cmn <true,false|true>                     # if true, apply sliding window cepstral mean"
  echo "                                                   # normalization to features"
EOF


# Stage 4: data augmentation.
# Annotator's note: augmentation is used only for the data that trains the
# PLDA model; the UBM and the i-vector extractor above were trained without
# it.
# In this section, we augment the SRE data with reverberation,
# noise, music, and babble, and combined it with the clean SRE
# data.  The combined list will be used to train the PLDA model.

if [ "$stage" -le 4 ]; then
  # get_utt2num_frames.sh produces the utt2num_frames file read below (the
  # number of frames per utterance).  Multiplying by the frame shift gives
  # the per-recording value written to data/sre/reco2dur.
  utils/data/get_utt2num_frames.sh --nj 40 --cmd "$train_cmd" data/sre
  frame_shift=0.01
  awk -v frame_shift="$frame_shift" '{print $1, $2*frame_shift;}' \
    data/sre/utt2num_frames > data/sre/reco2dur

  # Download and unpack RIRS_NOISES only if the directory is not there yet.
  if [ ! -d "RIRS_NOISES" ]; then
    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
    unzip rirs_noises.zip
  fi

  # Make a version with reverberated speech.
  # rvb_opts is a bash array: "=()" creates it empty and "+=( ... )" appends
  # elements (this is array syntax, not a subshell or command substitution).
  # Expanding it as "${rvb_opts[@]}" below keeps each quoted
  # "0.5, <rir_list>" value as a single argument.
  rvb_opts=()
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")

  # Make a reverberated version of the SRE list.  Note that we don't add any
  # additive noise here.
  steps/data/reverberate_data_dir.py \
    "${rvb_opts[@]}" \
    --speech-rvb-probability 1 \
    --pointsource-noise-addition-probability 0 \
    --isotropic-noise-addition-probability 0 \
    --num-replications 1 \
    --source-sampling-rate 8000 \
    data/sre data/sre_reverb
  # Reuse the VAD decisions of the clean data for the reverberated copy.
  cp data/sre/vad.scp data/sre_reverb/
  # copy_data_dir.sh copies to another directory, possibly adding a specified
  # prefix or suffix to the utterance and/or speaker names; the recording-ids
  # stay the same.
  utils/copy_data_dir.sh --utt-suffix "-reverb" data/sre_reverb data/sre_reverb.new
  # Swap the suffixed copy into place.  Nothing is lost: the copy in
  # data/sre_reverb.new is made before the old directory is removed, and the
  # mv then gives it the original name.
  rm -rf data/sre_reverb
  mv data/sre_reverb.new data/sre_reverb

  # Prepare the MUSAN corpus, which consists of music, speech, and noise
  # suitable for augmentation.
  # Annotator's notes: make_musan.sh calls make_musan.py to generate the data
  # directories.  Unlike RIRS_NOISES, MUSAN is not downloaded here; it is
  # expected to already exist at the path below.
  local/make_musan.sh /export/corpora/JHU/musan data

  # Get the duration of the MUSAN recordings.  This will be used by the
  # script augment_data_dir.py.
  for name in speech noise music; do
    # get_utt2dur.sh operates on a data directory and adds the utt2dur file
    # if it does not already exist.  utt2dur maps each utterance to its
    # duration in seconds; it is worked out from the 'segments' file or, if
    # that is absent, from wav.scp (first by interrogating the headers, and
    # if this fails by reading the wave files in entirely).
    utils/data/get_utt2dur.sh "data/musan_${name}"
    mv "data/musan_${name}/utt2dur" "data/musan_${name}/reco2dur"
  done

  # Augment with musan_noise
  steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/sre data/sre_noise
  # Notes on augment_data_dir.py, kept as a here-document fed to the ":"
  # no-op builtin; the quoted delimiter disables all expansion.  (The earlier
  # ".<<EOF" form ran the "." builtin without a filename, which fails and
  # stops the script under set -e.)
: <<'EOF'
	# augment_data_dir.py：This script generates augmented data.  It is based on
	# steps/data/reverberate_data_dir.py but doesn't handle reverberation.
	# It is designed to be somewhat simpler and more flexible for augmenting with
	# additive noise.
		"Augment the data directory with additive noises. "
        "Noises are separated into background and foreground noises which are added together or "
        "separately.  Background noises are added to the entire recording, and repeated as necessary "
        "to cover the full length.  Multiple overlapping background noises can be added, to simulate "
        "babble, for example.  Foreground noises are added sequentially, according to a specified "
        "interval.  See also steps/data/reverberate_data_dir.py "
        "Usage: augment_data_dir.py [options...] <in-data-dir> <out-data-dir> "
        "E.g., steps/data/augment_data_dir.py --utt-suffix aug --fg-snrs 20:10:5:0 --bg-snrs 20:15:10 "
        "--num-bg-noise 1:2:3 --fg-interval 3 --fg-noise-dir data/musan_noise --bg-noise-dir "
        "data/musan_music data/train data/train_aug", formatter_class=argparse.ArgumentDefaultsHelpFormatter
EOF
  # Augment with musan_music
  steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/sre data/sre_music
  # Augment with musan_speech
  steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/sre data/sre_babble

  # Combine reverb, noise, music, and babble into one directory.
  # Arguments: <dest-dir> followed by the four source directories.
  utils/combine_data.sh data/sre_aug data/sre_reverb data/sre_noise data/sre_music data/sre_babble

  # Take a random subset of the augmentations (64k is roughly the size of the SRE dataset)
  # subset_data_dir.sh: https://blog.csdn.net/yj13811596648/article/details/103344314
  utils/subset_data_dir.sh data/sre_aug 64000 data/sre_aug_64k
  utils/fix_data_dir.sh data/sre_aug_64k

  # Make MFCCs for the augmented data.  Note that we should already have the
  # vad.scp from the clean version at this point, which is identical to the
  # clean version!  No VAD is computed in this stage; for the clean data the
  # feature-extraction stage ran make_mfcc.sh first and then
  # compute_vad_decision.sh.
  # NOTE(review): the annotator asks whether "clean" here should instead mean
  # the augmented data after VAD - left open, confirm against the recipe.
  # Arguments: <data-dir> <log-dir> <mfcc-output-dir>.
  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
    data/sre_aug_64k exp/make_mfcc "$mfccdir"

  # Combine the clean and augmented SRE list.  This is now roughly
  # double the size of the original clean list.
  # Arguments: <dest-dir> <src1> <src2>.
  utils/combine_data.sh data/sre_combined data/sre_aug_64k data/sre
fi

# Stage 5: i-vector extraction for four data directories (data/sre_combined,
# data/sre16_major, data/sre16_eval_test and data/sre16_eval_enroll), each
# written to its own exp/ivectors_* directory.
if [ "$stage" -le 5 ]; then
  # Extract i-vectors for SRE data (includes Mixer 6). We'll use this for
  # things like LDA or PLDA.
  sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
    exp/extractor data/sre_combined \
    exp/ivectors_sre_combined
  # Notes on extract_ivectors.sh, kept as a here-document fed to the ":"
  # no-op builtin; the quoted delimiter disables all expansion.  (The earlier
  # ".<<EOF" form ran the "." builtin without a filename, which fails and
  # stops the script under set -e.)
: <<'EOF'
extract_ivectors.sh:
# This script extracts iVectors for a set of utterances, given
# features and a trained iVector extractor.

echo "Usage: $0 <extractor-dir> <data> <ivector-dir>"
  echo " e.g.: $0 exp/extractor_2048_male data/train_male exp/ivectors_male"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|10>                                      # Number of jobs (also see num-threads)"
  echo "  --num-threads <n|1>                              # Number of threads for each job"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --num-gselect <n|20>                             # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --min-post <min-post|0.025>                      # Pruning threshold for posteriors"
  echo " --apply-cmn <true,false|true>                     # if true, apply sliding window cepstral mean"
  echo "                                                   # normalization to features"
EOF

  # The SRE16 major is an unlabeled dataset consisting of Cantonese
  # and Tagalog.  This is useful for things like centering, whitening and
  # score normalization.
  sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
    exp/extractor data/sre16_major \
    exp/ivectors_sre16_major

  # The SRE16 test data
  sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
    exp/extractor data/sre16_eval_test \
    exp/ivectors_sre16_eval_test

  # The SRE16 enroll data
  sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
    exp/extractor data/sre16_eval_enroll \
    exp/ivectors_sre16_eval_enroll
fi

# Stage 6: centering mean, LDA, PLDA training and PLDA adaptation.
# $train_cmd is intentionally left unquoted where it is used as the command
# itself, so that it can split into a launcher plus options (presumably it
# is defined in cmd.sh - confirm there).
if [ "$stage" -le 6 ]; then
  # Compute the mean vector for centering the evaluation i-vectors.
  # Input: the i-vectors extracted from sre16_major.
  # Output: exp/ivectors_sre16_major/mean.vec.
  $train_cmd exp/ivectors_sre16_major/log/compute_mean.log \
    ivector-mean \
    scp:exp/ivectors_sre16_major/ivector.scp \
    exp/ivectors_sre16_major/mean.vec \
    || exit 1

  # This script uses LDA to decrease the dimensionality prior to PLDA.
  # The sre_combined i-vectors are mean-subtracted with
  # ivector-subtract-global-mean and an LDA with $lda_dim (200) output
  # dimensions is estimated from them.  The resulting transform.mat is the
  # output of ivector-compute-lda, i.e. the LDA transform (it is not produced
  # by the i-vector extractor).
  lda_dim=200
  $train_cmd exp/ivectors_sre_combined/log/lda.log \
    ivector-compute-lda --total-covariance-factor=0.0 --dim="$lda_dim" \
    "ark:ivector-subtract-global-mean scp:exp/ivectors_sre_combined/ivector.scp ark:- |" \
    ark:data/sre_combined/utt2spk \
    exp/ivectors_sre_combined/transform.mat \
    || exit 1

  # Train the PLDA model with ivector-compute-plda.  The input pipeline runs
  # ivector-subtract-global-mean, then transform-vec (a command that is given
  # transform.mat and reads/writes archives; presumably it applies the LDA
  # transform to each vector), then ivector-normalize-length.
  # Output: exp/ivectors_sre_combined/plda.
  $train_cmd exp/ivectors_sre_combined/log/plda.log \
    ivector-compute-plda \
    ark:data/sre_combined/spk2utt \
    "ark:ivector-subtract-global-mean scp:exp/ivectors_sre_combined/ivector.scp ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:-  ark:- |" \
    exp/ivectors_sre_combined/plda \
    || exit 1

  # Here we adapt the out-of-domain PLDA model to SRE16 major, a pile
  # of unlabeled in-domain data.  In the future, we will include a clustering
  # based approach for domain adaptation.
  # ivector-adapt-plda takes the exp/ivectors_sre_combined/plda model trained
  # above as input and writes exp/ivectors_sre16_major/plda_adapt.
  $train_cmd exp/ivectors_sre16_major/log/plda_adapt.log \
    ivector-adapt-plda --within-covar-scale=0.75 --between-covar-scale=0.25 \
    exp/ivectors_sre_combined/plda \
    "ark:ivector-subtract-global-mean scp:exp/ivectors_sre16_major/ivector.scp ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    exp/ivectors_sre16_major/plda_adapt \
    || exit 1
fi

# Stage 7: scoring with the out-of-domain PLDA model and EER reporting.
if [ "$stage" -le 7 ]; then
  # Get results using the out-of-domain PLDA model.
  # ivector-plda-scoring writes its scores to exp/scores/sre16_eval_scores.
  # $train_cmd is left unquoted on purpose so it can split into a launcher
  # plus options.
  $train_cmd exp/scores/log/sre16_eval_scoring.log \
    ivector-plda-scoring --normalize-length=true \
    --num-utts=ark:exp/ivectors_sre16_eval_enroll/num_utts.ark \
    "ivector-copy-plda --smoothing=0.0 exp/ivectors_sre_combined/plda - |" \
    "ark:ivector-mean ark:data/sre16_eval_enroll/spk2utt scp:exp/ivectors_sre16_eval_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_sre16_major/mean.vec ark:- ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "ark:ivector-subtract-global-mean exp/ivectors_sre16_major/mean.vec scp:exp/ivectors_sre16_eval_test/ivector.scp ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "cat '$sre16_trials' | cut -d\  --fields=1,2 |" exp/scores/sre16_eval_scores || exit 1;

  # Keep only the score lines whose first field appears in the Tagalog trial
  # list (per the annotator, its ids are a subset of those in the full score
  # file), writing them to exp/scores/sre16_eval_tgl_scores.
  utils/filter_scp.pl "$sre16_trials_tgl" exp/scores/sre16_eval_scores \
    > exp/scores/sre16_eval_tgl_scores
  # Notes on filter_scp.pl, kept as a here-document fed to the ":" no-op
  # builtin; the quoted delimiter disables all expansion.  (The earlier
  # ".<<EOF" form ran the "." builtin without a filename, which fails and
  # stops the script under set -e.)
: <<'EOF'
filter_scp.pl:
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch

"Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
      "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
      "Note: only the first field of each line in id_list matters.  With --exclude, prints\n" .
      "only the lines that were *not* in id_list.\n" .
      "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
      "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
      "-f option, add 1 to the argument.\n" 
EOF

  # Same filtering for the Cantonese trials.
  utils/filter_scp.pl "$sre16_trials_yue" exp/scores/sre16_eval_scores \
    > exp/scores/sre16_eval_yue_scores

  # EER computation.  paste joins a trial list and its score file column by
  # column (https://www.runoob.com/linux/linux-comm-paste.html); awk prints
  # field 6 followed by field 3 of each joined line (presumably the score
  # and the trial label - confirm against the file formats); compute-eer
  # reads that from stdin, and "2>/dev/null" discards its stderr output.
  # Pooled EER over all SRE16 trials:
  pooled_eer=$(paste "$sre16_trials" exp/scores/sre16_eval_scores \
    | awk '{print $6, $3}' \
    | compute-eer - 2>/dev/null)

  # EER on the Tagalog trials:
  tgl_eer=$(paste "$sre16_trials_tgl" exp/scores/sre16_eval_tgl_scores \
    | awk '{print $6, $3}' \
    | compute-eer - 2>/dev/null)

  # EER on the Cantonese trials:
  yue_eer=$(paste "$sre16_trials_yue" exp/scores/sre16_eval_yue_scores \
    | awk '{print $6, $3}' \
    | compute-eer - 2>/dev/null)

  echo "Using Out-of-Domain PLDA, EER: Pooled ${pooled_eer}%, Tagalog ${tgl_eer}%, Cantonese ${yue_eer}%"
  # EER: Pooled 13.65%, Tagalog 17.73%, Cantonese 9.612%
  # (out-of-domain PLDA: pooled, Tagalog and Cantonese EERs respectively)
fi

# Stage 8: scoring with the adapted PLDA model and EER reporting.
if [ "$stage" -le 8 ]; then
  # Get results using an adapted PLDA model. In the future we'll replace
  # this (or add to this) with a clustering based approach to PLDA adaptation.
  # ivector-plda-scoring is used again, this time reading
  # exp/ivectors_sre16_major/plda_adapt; the scores are written to
  # exp/scores/sre16_eval_scores_adapt.
  $train_cmd exp/scores/log/sre16_eval_scoring_adapt.log \
    ivector-plda-scoring --normalize-length=true \
    --num-utts=ark:exp/ivectors_sre16_eval_enroll/num_utts.ark \
    "ivector-copy-plda --smoothing=0.0 exp/ivectors_sre16_major/plda_adapt - |" \
    "ark:ivector-mean ark:data/sre16_eval_enroll/spk2utt scp:exp/ivectors_sre16_eval_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_sre16_major/mean.vec ark:- ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "ark:ivector-subtract-global-mean exp/ivectors_sre16_major/mean.vec scp:exp/ivectors_sre16_eval_test/ivector.scp ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "cat '$sre16_trials' | cut -d\  --fields=1,2 |" \
    exp/scores/sre16_eval_scores_adapt \
    || exit 1

  # Split the adapted scores into per-language files (Tagalog, Cantonese).
  utils/filter_scp.pl "$sre16_trials_tgl" exp/scores/sre16_eval_scores_adapt \
    > exp/scores/sre16_eval_tgl_scores_adapt
  utils/filter_scp.pl "$sre16_trials_yue" exp/scores/sre16_eval_scores_adapt \
    > exp/scores/sre16_eval_yue_scores_adapt

  # EERs for the pooled, Tagalog and Cantonese trials.  Each pipeline pastes
  # a trial list next to its score file, prints fields 6 and 3 with awk and
  # feeds them to compute-eer, whose stderr is discarded.
  pooled_eer=$(paste "$sre16_trials" exp/scores/sre16_eval_scores_adapt \
    | awk '{print $6, $3}' \
    | compute-eer - 2>/dev/null)
  tgl_eer=$(paste "$sre16_trials_tgl" exp/scores/sre16_eval_tgl_scores_adapt \
    | awk '{print $6, $3}' \
    | compute-eer - 2>/dev/null)
  yue_eer=$(paste "$sre16_trials_yue" exp/scores/sre16_eval_yue_scores_adapt \
    | awk '{print $6, $3}' \
    | compute-eer - 2>/dev/null)

  echo "Using Adapted PLDA, EER: Pooled ${pooled_eer}%, Tagalog ${tgl_eer}%, Cantonese ${yue_eer}%"
  # EER: Pooled 12.98%, Tagalog 17.8%, Cantonese 8.35%
  # (adapted PLDA: pooled, Tagalog and Cantonese EERs respectively)


  # Using the official SRE16 scoring software, we obtain the following equalized results:
  #
  # -- Pooled --
  # EER:         13.08
  # min_Cprimary: 0.72
  # act_Cprimary: 0.73

  # -- Cantonese --
  # EER:          8.23
  # min_Cprimary: 0.59
  # act_Cprimary: 0.59

  # -- Tagalog --
  # EER:         17.87
  # min_Cprimary: 0.84
  # act_Cprimary: 0.87
fi
kaldi sre16/v1中run.sh

相关阅读

相关文章

相关问答

相关文档