This is not praise. It means four things:
1. There is no documentation! None of the framework code carries any comments — not even function docstrings.
2. It has not been effectively tested — presumably they were racing the clock! Even the example in the official README does not run!
3. It is a framework, and a very un-Pythonic one at that, riddled with inline hacks, wrappers, and baffling syntax.
4. It leans heavily on class static methods and global functions, a pattern that breaks object orientation and is genuinely hard to follow.
Although these four points mean it really does not live up to Facebook's golden brand, as a learner you still have to get it running — so let the repair job on this "brand-new jalopy" from the Facebook garage begin!
Debugging Fairseq was genuinely painful: jumps appear out of nowhere, and you never know when or where control will transfer next.
If you want to build a pipeline wrapper on top of Fairseq's mechanisms, you are in for a lot of work!
The code quality is very low: the model, the model's config, and the model pipeline's parameters are all held in variables named model; the task, the task name, and the task's parameter set are all named task. You have to debug the surrounding context to learn what each one means, and there is no guarantee that two entries into the same function carry the same meaning.
The above criticism applies to version 1.0, as of 2020-11-23.
Skipped... there are plenty of tutorials online, and you have to do this part yourself anyway:
Fairseq
apex
Download the pretrained model
Download the training data (Librispeech)
First, run the example from the official README and see what happens:
import torch
from fairseq.models.wav2vec import Wav2VecModel

# load the downloaded checkpoint (a dict holding 'args', 'model', etc.)
cp = torch.load('/home/**/Documents/Research/fairseq/model/wav2vec_vox_960h_pl.pt.zip')
model = Wav2VecModel.build_model(cp['args'], task=None)
model.load_state_dict(cp['model'])
model.eval()

# push a random 16 kHz waveform through the feature extractor and aggregator
wav_input_16khz = torch.randn(1, 10000)
z = model.feature_extractor(wav_input_16khz)
c = model.feature_aggregator(z)
Guess what happens?
>>> Exception "AttributeError"
'Namespace' object has no attribute 'prediction_steps'
File: /home/**/Documents/workspace/fairseq/fairseq/models/wav2vec/wav2vec.py, Line: 175
Exception "unhandled AttributeError"
'Namespace' object has no attribute 'prediction_steps'
File: /home/**/Documents/workspace/fairseq/fairseq/models/wav2vec/wav2vec.py, Line: 175
Ha — surprised? There is nothing magical here, just plain bad code. The model class looks like this:
@register_model("wav2vec", dataclass=Wav2VecConfig)
class Wav2VecModel(BaseFairseqModel):
    @classmethod
    def build_model(cls, cfg: Wav2VecConfig, task: FairseqTask):
        """Build a new model instance."""
        model = Wav2VecModel(cfg)
        logger.info(model)
        return model

    def __init__(self, cfg: Wav2VecConfig):
        super().__init__()

        self.prediction_steps = cfg.prediction_steps
        offset = cfg.offset

        if cfg.activation == "relu":
            activation = nn.ReLU()
        elif cfg.activation == "gelu":
            activation = nn.GELU()
        else:
            raise Exception("unknown activation " + cfg.activation)
Debugging shows that this cfg argument is not a Wav2VecConfig at all: what actually arrives (cp['args']) has type <class 'argparse.Namespace'>, with the following contents:
Namespace(activation_dropout=0.1, adam_betas='(0.9, 0.98)', adam_eps=1e-08, all_gather_list_size=16384,
apply_mask=True, arch='wav2vec_ctc', attention_dropout=0.0, best_checkpoint_metric='wer',
bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_suffix='', clip_norm=0.0,
cpu=False, criterion='wav2vec', curriculum=0, data='/private/home/qiantong/w2v/data/train_36s_short_960h_rescore',
data_buffer_size=10, dataset_impl=None, ddp_backend='no_c10d', decay_steps=250000, device_id=0, disable_validation=False,
distributed_backend='nccl', distributed_init_method='tcp://learnfair1679:56443', distributed_no_spawn=True,
distributed_num_procs=None, distributed_port=56443, distributed_rank=0, distributed_world_size=24,
distributed_wrapper='DDP',
dropout=0.0, dropout_input=0,
empty_cache_freq=0, enable_padding=False, fast_stat_sync=False, feature_grad_mult=0.0, final_dropout=0.0, final_lr_scale=0.05, find_unused_parameters=False,
finetune_from_model=None, fix_batches_to_gpus=False,
fixed_validation_seed=None, fp16=True, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, freeze_finetune_updates=10000,
hold_steps=210000, init_lr_scale=0.01, keep_best_checkpoints=-1, keep_interval_updates=-1,
keep_last_epochs=-1, labels='ltr', layerdrop=0.1, localsgd_frequency=3, log_format='json', log_interval=500, lr=[3e-05], lr_scheduler='tri_stage',
mask_channel_length=64, mask_channel_other=0.0, mask_channel_prob=0.1, mask_channel_selection='static',
mask_length=10, mask_other=0.0, mask_prob=0.1, mask_selection='static', max_epoch=0, max_sample_size=None,
max_sentences=None, max_sentences_valid=None, max_tokens=1280000, max_tokens_valid=1280000, max_update=500000, maximize_best_checkpoint_metric=False,
memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001,
min_lr=-1, min_sample_size=None, model_parallel_size=1, no_epoch_checkpoints=True, no_last_checkpoints=False,
no_mask_channel_overlap=False, no_mask_overlap=False, no_pretrained_weights=False, no_progress_bar=False, no_save=False,
no_save_optimizer_state=False, no_seed_provided=False, normalize=True, nprocs_per_node=8, num_workers=4,
optimizer='adam', optimizer_overrides='{}', patience=-1, pipeline_balance=None, pipeline_checkpoint='never',
pipeline_chunks=None, pipeline_devices=None, pipeline_model_parallel=False, profile=False, quantization_config_path=None,
remove_bpe='letter', required_batch_size_multiple=8, required_seq_len_multiple=1, reset_dataloader=False,
reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt',
sample_rate=16000, save_dir='/checkpoint/michaelauli/asr/pseudolbl/960h_vox.fp16.u500000.savg.nrm.ltr.m_static.mstd0.mask10.mprob0.1.ld0.1.mc_static.mcstd0.maskc64.mcprob0.1.fgm0.0.ffu10000.lr3e-05.warmup40000.hld210000.dec250000.frs0.05.fd0.0.drop0.0.ad0.1.atd0.0.ms1280000.sd1337.uf1.ngpu24',
save_interval=1, save_interval_updates=0, scoring='bleu', seed=1337, sentence_avg=True, skip_invalid_size_inputs_valid_test=False, slowmo_algorithm='LocalSGD',
slowmo_momentum=None, stop_time_hours=0, task='audio_pretraining', tensorboard_logdir='',
threshold_loss_scale=None, tokenizer=None, tpu=False, train_subset='train', update_freq=[1], use_bmuf=False,
use_old_adam=False, user_dir=None, valid_subset='dev_other', validate_after_updates=10000,
validate_interval=1, validate_interval_updates=0, w2v_args=Namespace(activation_dropout=0.0, activation_fn='gelu', adam_betas='(0.9,0.98)',
adam_eps=1e-06, arch='wav2vec2', attention_dropout=0.1, attention_type='default', augment=False,
best_checkpoint_metric='loss', bpe=None, bucket_cap_mb=25, centroids=None, clip_norm=25, codebook_negatives=0, combine_banks=False, combine_dataset=False, conv_bias=True,
conv_feature_layers='[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2', conv_logs=True, conv_pos=128, conv_pos_groups=16, conv_pos_layers=1, cpu=False, criterion='wav2vec',
cross_sample_negatives=0, curriculum=0, data='/private/home/abaevski/data/librivox/no_silence', dataset_impl=None, ddp_backend='c10d', debug=False, device_id=0, disable_validation=False,
distributed_backend='nccl', distributed_init_method='tcp://learnfair1331:55498', distributed_no_spawn=True, distributed_port=55498, distributed_rank=0, distributed_world_size=128,
div_drop_percent=0, div_pen_threshold=None, dropout=0.0, dropout_features=0.1, dropout_input=0.1, duplicate_negatives=0, empty_cache_freq=0, enable_padding=False,
encode_padded_indiv=False, encoder_attention_heads=16, encoder_embed_dim=1024, encoder_ffn_embed_dim=4096, encoder_layerdrop=0.0, encoder_layers=24, encoder_normalize_before=True,
encoder_optimizer_params=None, encoder_schedule=0, end_learning_rate=0.0, extractor_mode='layer_norm', extractor_model=None, extractor_norm_location='default',
fast_stat_sync=False, feature_glu=False, feature_grad_mult=1.0, feature_noise=0, feature_noise_last=0, features_pen=True, final_dim=768, find_unused_parameters=True, finetune_extractor=True, fix_batches_to_gpus=False, fixed_validation_seed=None, force_anneal=None, fp16=True, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, group_norm_features=False, group_norm_groups=512,
gumbel_noise_gain=1, infomax=True, input_noise=0.0, keep_interval_updates=1, keep_last_epochs=-1, label_smoothing=0.0, labels=None, latent_groups=2, latent_temp='(2.0,0.1,0.999995)',
latent_var_banks=2, latent_vars=320, layer_norm_after=-1, layer_norm_before=0, layer_norm_features=True, layer_norm_first=True, layer_norm_repr=True, lazy_load_labels=False,
log_format='json', log_interval=100, logit_temp=0.1, loss_weights=None, lr=[0.005], lr_scheduler='polynomial_decay', mask_min_space=1, mask_multiple_length=10,
mask_prob=0.65, mask_same_channels=False, mask_same_timesteps=False, mask_selection='static',
mask_stdev=0.0, masking_schedule=0, max_epoch=0, max_positions=8000, max_pred_length=0, max_sample_size=320000, max_sentences=None, max_sentences_valid=None, max_tokens=1200000, max_tokens_valid=1200000, max_update=1000000,
maximize_best_checkpoint_metric=False, memory_efficient_fp16=False, min_loss_scale=0.0001, min_lr=-1, min_sample_size=32000, mlp_mi=768, negatives_from_everywhere=False, new_emb_pen=True, new_logit_pen=False, no_bert_init=False,
no_epoch_checkpoints=True, no_last_checkpoints=False, no_mask_channel_overlap=False, no_mask_overlap=False, no_norm_after=None, no_progress_bar=False, no_save=False, no_save_optimizer_state=False, no_token_positional_embeddings=True, noise_type='gaussian', norm_init_weight=1.0, normalize=True, num_negatives=100, num_workers=6, optimizer='adam', optimizer_overrides='{}',
penalize_transformer=False, penalty_coeff='[0,0,0.1,0]', penalty_temp=1.0, pooler_activation_fn='tanh', pooler_dropout=0.0, power=1.0, pre_norm=False, predict_everything=False, predictor_grad_mult=1.0, preemp=False, project_quantized=True,
quantize_input=False, quantize_targets=True, quantized=False, quantizer_chance=0.0, quantizer_grad_mult=1.0, quantizer_init=True, quantizer_init_gain=1.0, quantizer_init_normal=True, relative_positional_embeddings=0, required_batch_size_multiple=8, rescale_sample_size=False,
reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', same_quantizer=False, sample_rate=16000, save_dir='/checkpoint/abaevski/asr/speechbert_raw_big_dbg/prenorm_ln_stable_repr_lr.qtz.lnfrst.lnr.cb.nrm.mlp768.pq.lv320.lvb2.ab0.9_0.98.lr0.005.wu32000.mask10.mprob0.65.mstd0.drp_i0.1.drp_f0.1.in0.0.nt_gaus.lnf-1.ng512.fgm1.0.nep.qini.qini1.pen0_0_0.1_0.cpl1.ld0.0.wd0.01.uf1.mu1000000.s5.ngpu128',
save_interval=1, save_interval_updates=25000, scale_input=0, scp=False, seed=5, sentence_avg=False, siamese_extractor=False, siamese_feature_layers=None, skip_connections=False,
skip_invalid_size_inputs_valid_test=True, skip_main_loss_prob=0, soft=False, squeeze_constant=20, squeeze_logits='norm_temp', squeeze_pos_emb='add', squeeze_quantizer_logits=False, static_preemp=False, tanh_after_norm=0, target_labels=None,
task='audio_pretraining', tensorboard_logdir='', threshold_loss_scale=None, tokenizer=None, total_num_update=1000000, train_on_full=False, train_subset='train',
unprojected_feats=False, update_freq=[1], use_aggregator_feats=False, use_bmuf=False, use_old_adam=False, user_dir=None, valid_subset='valid', validate_after_updates=0,
validate_interval=1, validate_interval_updates=0, warmup_updates=32000, weight_decay=0.01, weight_norm=False), w2v_path='/private/home/abaevski/models/wav2vec2/wav2vec_vox_new.pt',
warmup_steps=40000, weight_decay=0.0, wer_args="('/checkpoint/abaevski/data/speech/libri/4-gram.bin','/checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst',1.57,-0.64)", zero_infinity=True, zero_sharding='none')
In one word: damn!
After writing all this, do we know how to fix it? Obviously not.
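Before going further, here is a minimal sketch (reusing the checkpoint path from the example above) for inspecting what the checkpoint file actually holds:

import torch

# Load the checkpoint onto the CPU; the path is the one from the README example.
cp = torch.load(
    "/home/**/Documents/Research/fairseq/model/wav2vec_vox_960h_pl.pt.zip",
    map_location="cpu",
)

# The keys vary between Fairseq versions: newer checkpoints carry a 'cfg'
# section, older ones only an 'args' Namespace.
print(list(cp.keys()))
print(type(cp.get("args")))  # <class 'argparse.Namespace'>, not Wav2VecConfig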
Tip 1: absolute config paths cannot be passed. Strictly speaking you cannot call this a bug, because Hydra is simply designed this way.
Where is the problem? In hydra/_internal/utils.py: only relative config paths can be passed, and passing an absolute one derails the computation.
def compute_search_path_dir(
    calling_file: Optional[str],
    calling_module: Optional[str],
    config_path: Optional[str],
) -> str:
    if calling_file is not None:
        abs_base_dir = realpath(dirname(calling_file))

        if config_path is not None:
            search_path_dir = join(abs_base_dir, config_path)
        else:
            search_path_dir = abs_base_dir

        search_path_dir = normpath(search_path_dir)
    elif calling_module is not None:
        last_dot = calling_module.rfind(".")
        if last_dot != -1:
            calling_module = calling_module[0:last_dot]
        else:
            calling_module = ""
But did Facebook never think about deploying code and data separately? If the code sits in an install directory — say, inside an anaconda environment — while the config files sit in a user directory, computing that relative path becomes mission impossible.
Aside: Facebook's Hydra architect must have had water on the brain!
Do it yourself and want for nothing — patch this block to support global (absolute) config paths:
if calling_file is not None:
    abs_base_dir = realpath(dirname(calling_file))

    if config_path is not None:
        # linger fix: let absolute config paths pass through unchanged
        # (note: make sure `os` is imported at the top of utils.py)
        if not os.path.isabs(config_path):
            search_path_dir = join(abs_base_dir, config_path)
        else:
            search_path_dir = config_path
        # linger fix end
    else:
        search_path_dir = abs_base_dir

    search_path_dir = normpath(search_path_dir)
Simple, right? Now configuration and code can live in separate places!
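To see what the patch buys you, here is a standalone sketch of the resolution logic (a hypothetical helper for illustration, not Hydra's API):

import os
from os.path import dirname, join, normpath, realpath

def resolve_config_dir(calling_file: str, config_path: str) -> str:
    # Patched behavior: absolute config paths pass through untouched,
    # relative paths are still resolved next to the calling file.
    abs_base_dir = realpath(dirname(calling_file))
    if os.path.isabs(config_path):
        return normpath(config_path)
    return normpath(join(abs_base_dir, config_path))

# Relative: resolved inside the install tree (the only case stock Hydra handles).
print(resolve_config_dir("/opt/conda/lib/fairseq_cli/hydra_train.py", "config"))
# Absolute: a user directory is used as-is, so config and code can be separated.
print(resolve_config_dir("/opt/conda/lib/fairseq_cli/hydra_train.py",
                         "/home/linger/Documents/Research/fairseq/config"))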
Tip 2: command-line overrides cannot be passed
To be fair, I simply could not figure out how to pass them: overrides written as data.key kept erroring out, and a long chase turned up no method or workaround. So what now?
You keep getting error messages like this one:
The debugged program raised the exception Exception
" -- Process 0 terminated with the following error: Traceback (most recent call last): File "/home/***/anaconda3/envs/lSrv08/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19,
in _wrap fn(i, *args) File "/home/***/Documents/workspace/fairseq/fairseq/distributed_utils.py", line 300,
in distributed_main main(cfg, **kwargs) File "/home/***/Documents/workspace/fairseq/fairseq_cli/train.py", line 51,
in main assert ( AssertionError: Must specify batch size either with --max-tokens or --batch-size "
File: /home/***/anaconda3/envs/lSrv08/lib/python3.8/site-packages/torch/multiprocessing/spawn.py, Line: 118
Break here?
Since we can now pass a global config file, let's just copy a config file and write the parameters into it directly.
But how do you know what can be configured?
$ python fairseq_cli/hydra_train.py --help
hydra_train is powered by Hydra.
== Configuration groups ==
Compose your configuration from those groups (group=option)
bpe: bert, byte_bpe, fastbpe, gpt2, hf_byte_bpe, sentencepiece, subword_nmt
criterion: adaptive_loss, cross_entropy, ctc, wav2vec
lr_scheduler: cosine, fixed, inverse_sqrt, polynomial_decay, reduce_lr_on_plateau, tri_stage, triangular
model: transformer_lm, wav2vec, wav2vec2, wav2vec_ctc, wav2vec_seq2seq
model/transformer_lm: transformer_lm_baevski_gbw, transformer_lm_baevski_wiki103, transformer_lm_big, transformer_lm_gbw, transformer_lm_gpt, transformer_lm_gpt2_big, transformer_lm_gpt2_medium, transformer_lm_gpt2_small, transformer_lm_wiki103
model/wav2vec: vq_wav2vec_gumbel
model/wav2vec2: wav2vec2_base, wav2vec2_large
optimizer: adam, nag
scoring: bleu, sacrebleu, wer
task: audio_pretraining, dummy_lm, language_modeling
tokenizer: moses
== Config ==
Override anything in the config (foo.bar=value)
_name: null
common:
_name: null
no_progress_bar: false
log_interval: 100
log_format: null
tensorboard_logdir: null
wandb_project: null
seed: 1
cpu: false
tpu: false
bf16: false
memory_efficient_bf16: false
fp16: false
memory_efficient_fp16: false
fp16_no_flatten_grads: false
fp16_init_scale: 128
fp16_scale_window: null
fp16_scale_tolerance: 0.0
min_loss_scale: 0.0001
threshold_loss_scale: null
user_dir: null
empty_cache_freq: 0
all_gather_list_size: 16384
model_parallel_size: 1
quantization_config_path: null
profile: false
reset_logging: true
common_eval:
_name: null
path: null
post_process: null
quiet: false
model_overrides: '{}'
results_path: null
distributed_training:
_name: null
distributed_world_size: 3
distributed_rank: 0
distributed_backend: nccl
distributed_init_method: null
distributed_port: -1
device_id: 0
distributed_no_spawn: false
ddp_backend: c10d
bucket_cap_mb: 25
fix_batches_to_gpus: false
find_unused_parameters: false
fast_stat_sync: false
broadcast_buffers: false
distributed_wrapper: DDP
slowmo_momentum: null
slowmo_algorithm: LocalSGD
localsgd_frequency: 3
nprocs_per_node: 3
pipeline_model_parallel: false
pipeline_balance: null
pipeline_devices: null
pipeline_chunks: 0
pipeline_encoder_balance: null
pipeline_encoder_devices: null
pipeline_decoder_balance: null
pipeline_decoder_devices: null
pipeline_checkpoint: never
zero_sharding: none
tpu: ${common.tpu}
dataset:
_name: null
num_workers: 1
skip_invalid_size_inputs_valid_test: false
max_tokens: null
batch_size: null
required_batch_size_multiple: 8
required_seq_len_multiple: 1
dataset_impl: null
data_buffer_size: 10
train_subset: train
valid_subset: valid
validate_interval: 1
validate_interval_updates: 0
validate_after_updates: 0
fixed_validation_seed: null
disable_validation: false
max_tokens_valid: null
batch_size_valid: null
curriculum: 0
gen_subset: test
num_shards: 1
shard_id: 0
optimization:
_name: null
max_epoch: 0
max_update: 0
stop_time_hours: 0.0
clip_norm: 0.0
sentence_avg: false
update_freq:
- 1
lr:
- 0.25
min_lr: -1.0
use_bmuf: false
checkpoint:
_name: null
save_dir: checkpoints
restore_file: checkpoint_last.pt
finetune_from_model: null
reset_dataloader: false
reset_lr_scheduler: false
reset_meters: false
reset_optimizer: false
optimizer_overrides: '{}'
save_interval: 1
save_interval_updates: 0
keep_interval_updates: -1
keep_last_epochs: -1
keep_best_checkpoints: -1
no_save: false
no_epoch_checkpoints: false
no_last_checkpoints: false
no_save_optimizer_state: false
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: false
patience: -1
checkpoint_suffix: ''
checkpoint_shard_count: 1
model_parallel_size: ${common.model_parallel_size}
distributed_rank: ${distributed_training.distributed_rank}
bmuf:
_name: null
block_lr: 1.0
block_momentum: 0.875
global_sync_iter: 50
warmup_iterations: 500
use_nbm: false
average_sync: false
distributed_world_size: ${distributed_training.distributed_world_size}
generation:
_name: null
beam: 5
nbest: 1
max_len_a: 0.0
max_len_b: 200
min_len: 1
match_source_len: false
unnormalized: false
no_early_stop: false
no_beamable_mm: false
lenpen: 1.0
unkpen: 0.0
replace_unk: null
sacrebleu: false
score_reference: false
prefix_size: 0
no_repeat_ngram_size: 0
sampling: false
sampling_topk: -1
sampling_topp: -1.0
constraints: null
temperature: 1.0
diverse_beam_groups: -1
diverse_beam_strength: 0.5
diversity_rate: -1.0
print_alignment: false
print_step: false
lm_path: null
lm_weight: 0.0
iter_decode_eos_penalty: 0.0
iter_decode_max_iter: 10
iter_decode_force_max_iter: false
iter_decode_with_beam: 1
iter_decode_with_external_reranker: false
retain_iter_history: false
retain_dropout: false
retain_dropout_modules: null
decoding_format: null
no_seed_provided: false
eval_lm:
_name: null
output_word_probs: false
output_word_stats: false
context_window: 0
softmax_batch: 9223372036854775807
interactive:
_name: null
buffer_size: 0
input: '-'
model: ???
task: null
criterion:
_name: cross_entropy
sentence_avg: ${optimization.sentence_avg}
optimizer: null
lr_scheduler:
_name: fixed
force_anneal: null
lr_shrink: 0.1
warmup_updates: 0
lr: ${optimization.lr}
scoring: null
bpe: null
tokenizer: null
Powered by Hydra (https://hydra.cc)
Use --hydra-help to view Hydra specific help
Open the config file and add or modify the corresponding sections — and remember, nothing may be defined twice!
Tip 3: Fairseq's data paths default to train.tsv and valid.tsv
For example, to switch to your own dataset, create train.tsv and valid.tsv in a directory and point the task at it:
task:
  _name: audio_pretraining
  data: /data/Cache/SLR18/
  max_sample_size: 250000
  min_sample_size: 32000
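A quick sanity check — a sketch assuming the task.data directory above — that the files Fairseq expects are actually in place:

import os

data_dir = "/data/Cache/SLR18/"  # the task.data value from the config above
for name in ("train.tsv", "valid.tsv"):
    path = os.path.join(data_dir, name)
    print(path, "ok" if os.path.exists(path) else "MISSING")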
Tip 4: out of memory — even the 24 GB of VRAM on a 3090 is not enough. Take three seconds to envy that deep-pocketed 64 × V100 rig.
OOM: Ran out of memory with exception: CUDA out of memory. Tried to allocate 20.83 GiB (GPU 1; 23.70 GiB total capacity; 1.99 GiB already allocated; 20.20 GiB free; 2.24 GiB reserved in total by PyTorch)
Cutting batch_size to 10 was no damn use: still 20.83 GiB.
Setting max_tokens: 1400000 -> 800000 did nothing either.
What is going on? It turns out Fairseq also sizes the samples for the validation phase, and tracing shows it defaults to roughly 1300 samples per validation batch, versus only 3-4 during training.
The knobs are batch_size_valid and max_tokens_valid; the exact values you will have to find for yourself. Change
max_tokens_valid: null
batch_size_valid: null
to
max_tokens_valid: 200000
or
batch_size_valid: 20
The standard training command:
python fairseq_cli/hydra_train.py \
distributed_training.distributed_world_size=2 +optimization.update_freq='[12]' \
--config-path /home/linger/Documents/Research/fairseq/config/finetuning \
--config-name vox_960h_2
Possible problems
A typical error:
FileNotFoundError: [Errno 2] No such file or directory: '~/Documents/Research/fairseq/cache/dev_other.ltr'
Fix:
python examples/wav2vec/wav2vec_manifest.py /data/Cache/SLR12/dev-other --dest ~/Documents/Research/fairseq/cache --ext flac --valid-percent 0
split=dev_other
python examples/wav2vec/libri_labels.py ~/Documents/Research/fairseq/cache/dev_other.tsv --output-dir ~/Documents/Research/fairseq/cache/ --output-name $split
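For reference, the manifest that wav2vec_manifest.py writes is a plain TSV: the first line is the audio root directory, and every following line is a file path relative to that root plus its sample count. A sketch for reading one back (the file contents in the comment are illustrative):

# dev_other.tsv looks roughly like:
#   /data/Cache/SLR12/dev-other
#   116/288045/116-288045-0000.flac	106160
#   ...
with open("dev_other.tsv") as f:
    root = f.readline().strip()  # first line: the audio root directory
    for line in f:
        rel_path, n_samples = line.rstrip("\n").split("\t")
        print(root, rel_path, int(n_samples))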
You can also prepare a smaller dataset and YAML config just for debugging:
dataset:
  num_workers: 0 # bug: multi-process data loading cannot be debugged
  max_tokens: 1280000
  skip_invalid_size_inputs_valid_test: true
  valid_subset: dev_other
  max_tokens_valid: 1280000
For example: distributed_training.distributed_world_size=1 --config-path /home/linger/Documents/Research/fairseq/config/finetuning --config-name demo
Also note: in the pretrained model, 'latent_temp' is stored as the string '(2.0,0.1,0.999995)', but the parameter system needs it to be the tuple (2.0, 0.1, 0.999995).
Call-stack analysis:
hydra_train -> cli_main -> hydra_main -> _run_hydra ->
hydra = run_and_report(
    lambda: Hydra.create_main_hydra2(
        task_name=task_name, config_search_path=search_path, strict=strict
    )
)
if args.run:
    run_and_report(
        lambda: hydra.run(
            config_name=config_name,
            task_function=task_function,
            overrides=args.overrides,
        )
    )
elif args.multirun:
    run_and_report(
        lambda: hydra.multirun(
            config_name=config_name,
            task_function=task_function,
            overrides=args.overrides,
        )
    )
From here we land in a peculiar call site: hydra/_internal/hydra.py

class Hydra:
    @classmethod
    def create_main_hydra2(...)
And this line is truly a WTF piece of writing: a variable named hydra, defined inside the implementation of the Hydra class itself????????
hydra = cls(task_name=task_name, config_loader=config_loader)
from hydra.core.global_hydra import GlobalHydra
hydra/_internal/config_loader_impl.py

class ConfigLoaderImpl(ConfigLoader):
    """
    Configuration loader
    """
/home/linger/Documents/workspace/fairseq/fairseq/tasks/audio_pretraining.py
    build_model
/home/linger/Documents/workspace/fairseq/fairseq/tasks/fairseq_task.py
    model = models.build_model(cfg, self)
/home/linger/Documents/workspace/fairseq/fairseq/models/__init__.py
    def build_model(cfg: FairseqDataclass, task):
/home/linger/Documents/workspace/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py
    @classmethod
    def build_model(cls, cfg: Wav2Vec2CtcConfig, task: FairseqTask):
        """Build a new model instance."""
        w2v_encoder = Wav2VecEncoder(cfg, task.target_dictionary)
        return cls(cfg, w2v_encoder)
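The dispatch along this chain boils down to a registry lookup keyed by the config's _name; a simplified sketch of the mechanism (not Fairseq's exact code):

MODEL_REGISTRY = {}

def register_model(name, dataclass=None):
    # Decorator that records a model class under its string name.
    def wrapper(cls):
        MODEL_REGISTRY[name] = cls
        return cls
    return wrapper

def build_model(cfg, task):
    # cfg may be a dataclass, an OmegaConf node, or an argparse.Namespace --
    # which is exactly why the same variable name keeps changing meaning.
    name = getattr(cfg, "_name", None) or getattr(cfg, "arch", None)
    return MODEL_REGISTRY[name].build_model(cfg, task)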
The core loading mechanism:
if cfg.w2v_args is None:
    state = checkpoint_utils.load_checkpoint_to_cpu(cfg.w2v_path, arg_overrides)
    # `state` is the loaded checkpoint with arg_overrides applied on top of
    # its 'args' or 'cfg' section
    w2v_args = state.get("cfg", None)
    if w2v_args is None:
        # fall back to the legacy 'args' Namespace when 'cfg' is absent
        w2v_args = convert_namespace_to_omegaconf(state["args"])

    # linger: coerce the latent_temp string back into a tuple
    try:
        w2v_args.model.w2v_args.model.latent_temp = eval(
            w2v_args.model.w2v_args.model.latent_temp
        )
    except:
        pass
    # linger

    # store the state's cfg into cfg.w2v_args
    cfg.w2v_args = w2v_args
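As an aside, ast.literal_eval would be a safer way than eval to coerce that stored string back into a tuple (a sketch):

import ast

latent_temp = "(2.0,0.1,0.999995)"           # as stored in the checkpoint
latent_temp = ast.literal_eval(latent_temp)  # -> (2.0, 0.1, 0.999995), no code execution

Dumping the cfg carried by the loaded state shows the full nested structure: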
>>> print(json.dumps(eval(str(state.get("cfg", None) )), indent=4))
{
"_name": null,
"common": {
"_name": null,
"no_progress_bar": false,
"log_interval": 500,
"log_format": "json",
"tensorboard_logdir": "",
"wandb_project": null,
"seed": 1337,
"cpu": false,
"tpu": false,
"bf16": false,
"memory_efficient_bf16": false,
"fp16": true,
"memory_efficient_fp16": false,
"fp16_no_flatten_grads": false,
"fp16_init_scale": 128,
"fp16_scale_window": null,
"fp16_scale_tolerance": 0.0,
"min_loss_scale": 0.0001,
"threshold_loss_scale": null,
"user_dir": null,
"empty_cache_freq": 0,
"all_gather_list_size": 16384,
"model_parallel_size": 1,
"quantization_config_path": null,
"profile": false,
"reset_logging": true
},
"common_eval": {
"_name": null,
"path": null,
"post_process": "letter",
"quiet": false,
"model_overrides": "{}",
"results_path": null
},
"distributed_training": {
"_name": null,
"distributed_world_size": 24,
"distributed_rank": 0,
"distributed_backend": "nccl",
"distributed_init_method": "tcp://learnfair1679:56443",
"distributed_port": 56443,
"device_id": 0,
"distributed_no_spawn": true,
"ddp_backend": "no_c10d",
"bucket_cap_mb": 25,
"fix_batches_to_gpus": false,
"find_unused_parameters": false,
"fast_stat_sync": false,
"broadcast_buffers": false,
"distributed_wrapper": "DDP",
"slowmo_momentum": null,
"slowmo_algorithm": "LocalSGD",
"localsgd_frequency": 3,
"nprocs_per_node": 8,
"pipeline_model_parallel": false,
"pipeline_balance": null,
"pipeline_devices": null,
"pipeline_chunks": null,
"pipeline_encoder_balance": null,
"pipeline_encoder_devices": null,
"pipeline_decoder_balance": null,
"pipeline_decoder_devices": null,
"pipeline_checkpoint": "never",
"zero_sharding": "none",
"tpu": false
},
"dataset": {
"_name": null,
"num_workers": 4,
"skip_invalid_size_inputs_valid_test": false,
"max_tokens": 1280000,
"batch_size": null,
"required_batch_size_multiple": 8,
"required_seq_len_multiple": 1,
"dataset_impl": null,
"data_buffer_size": 10,
"train_subset": "train",
"valid_subset": "dev_other",
"validate_interval": 1,
"validate_interval_updates": 0,
"validate_after_updates": 10000,
"fixed_validation_seed": null,
"disable_validation": false,
"max_tokens_valid": 1280000,
"batch_size_valid": null,
"curriculum": 0,
"gen_subset": "test",
"num_shards": 1,
"shard_id": 0
},
"optimization": {
"_name": null,
"max_epoch": 0,
"max_update": 500000,
"stop_time_hours": 0.0,
"clip_norm": 0.0,
"sentence_avg": true,
"update_freq": [
1
],
"lr": [
3e-05
],
"min_lr": -1.0,
"use_bmuf": false
},
"checkpoint": {
"_name": null,
"save_dir": "/checkpoint/michaelauli/asr/pseudolbl/960h_vox.fp16.u500000.savg.nrm.ltr.m_static.mstd0.mask10.mprob0.1.ld0.1.mc_static.mcstd0.maskc64.mcprob0.1.fgm0.0.ffu10000.lr3e-05.warmup40000.hld210000.dec250000.frs0.05.fd0.0.drop0.0.ad0.1.atd0.0.ms1280000.sd1337.uf1.ngpu24",
"restore_file": "checkpoint_last.pt",
"finetune_from_model": null,
"reset_dataloader": false,
"reset_lr_scheduler": false,
"reset_meters": false,
"reset_optimizer": false,
"optimizer_overrides": "{}",
"save_interval": 1,
"save_interval_updates": 0,
"keep_interval_updates": -1,
"keep_last_epochs": -1,
"keep_best_checkpoints": -1,
"no_save": false,
"no_epoch_checkpoints": true,
"no_last_checkpoints": false,
"no_save_optimizer_state": false,
"best_checkpoint_metric": "wer",
"maximize_best_checkpoint_metric": false,
"patience": -1,
"checkpoint_suffix": "",
"checkpoint_shard_count": 1,
"model_parallel_size": 1,
"distributed_rank": 0
},
"bmuf": {
"_name": null,
"block_lr": 1.0,
"block_momentum": 0.875,
"global_sync_iter": 50,
"warmup_iterations": 500,
"use_nbm": false,
"average_sync": false,
"distributed_world_size": 24
},
"generation": {
"_name": null,
"beam": 5,
"nbest": 1,
"max_len_a": 0.0,
"max_len_b": 200,
"min_len": 1,
"match_source_len": false,
"unnormalized": false,
"no_early_stop": false,
"no_beamable_mm": false,
"lenpen": 1.0,
"unkpen": 0.0,
"replace_unk": null,
"sacrebleu": false,
"score_reference": false,
"prefix_size": 0,
"no_repeat_ngram_size": 0,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"constraints": null,
"temperature": 1.0,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"print_alignment": false,
"print_step": false,
"lm_path": null,
"lm_weight": 0.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_max_iter": 10,
"iter_decode_force_max_iter": false,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"retain_iter_history": false,
"retain_dropout": false,
"retain_dropout_modules": null,
"decoding_format": null,
"no_seed_provided": false
},
"eval_lm": {
"_name": null,
"output_word_probs": false,
"output_word_stats": false,
"context_window": 0,
"softmax_batch": 9223372036854775807
},
"interactive": {
"_name": null,
"buffer_size": 0,
"input": "-"
},
"model": {
"_name": "wav2vec_ctc",
"w2v_path": "/private/home/abaevski/models/wav2vec2/wav2vec_vox_new.pt",
"no_pretrained_weights": false,
"dropout_input": 0.0,
"final_dropout": 0.0,
"dropout": 0.0,
"attention_dropout": 0.0,
"activation_dropout": 0.1,
"apply_mask": true,
"mask_length": 10,
"mask_prob": 0.5,
"mask_selection": "static",
"mask_other": 0.0,
"no_mask_overlap": false,
"mask_channel_length": 64,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_channel_other": 0.0,
"no_mask_channel_overlap": false,
"freeze_finetune_updates": 10000,
"feature_grad_mult": 0.0,
"layerdrop": 0.1,
"normalize": true,
"data": "/private/home/qiantong/w2v/data/train_36s_short_960h_rescore",
"w2v_args": {
"common": {
"no_progress_bar": false,
"log_interval": 100,
"log_format": "json",
"tensorboard_logdir": "",
"wandb_project": null,
"seed": 5,
"cpu": false,
"tpu": false,
"bf16": false,
"memory_efficient_bf16": false,
"fp16": true,
"memory_efficient_fp16": false,
"fp16_no_flatten_grads": false,
"fp16_init_scale": 128,
"fp16_scale_window": null,
"fp16_scale_tolerance": 0.0,
"min_loss_scale": 0.0001,
"threshold_loss_scale": null,
"user_dir": null,
"empty_cache_freq": 0,
"all_gather_list_size": 16384,
"model_parallel_size": 1,
"quantization_config_path": null,
"profile": false,
"reset_logging": true
},
"common_eval": {
"path": null,
"post_process": null,
"quiet": false,
"model_overrides": "{}",
"results_path": null
},
"distributed_training": {
"distributed_world_size": 128,
"distributed_rank": 0,
"distributed_backend": "nccl",
"distributed_init_method": "tcp://learnfair1331:55498",
"distributed_port": 55498,
"device_id": 0,
"distributed_no_spawn": true,
"ddp_backend": "c10d",
"bucket_cap_mb": 25,
"fix_batches_to_gpus": false,
"find_unused_parameters": true,
"fast_stat_sync": false,
"broadcast_buffers": false,
"distributed_wrapper": "DDP",
"slowmo_momentum": null,
"slowmo_algorithm": "LocalSGD",
"localsgd_frequency": 3,
"nprocs_per_node": 3,
"pipeline_model_parallel": false,
"pipeline_balance": null,
"pipeline_devices": null,
"pipeline_chunks": 0,
"pipeline_encoder_balance": null,
"pipeline_encoder_devices": null,
"pipeline_decoder_balance": null,
"pipeline_decoder_devices": null,
"pipeline_checkpoint": "never",
"zero_sharding": "none",
"tpu": false
},
"dataset": {
"num_workers": 6,
"skip_invalid_size_inputs_valid_test": true,
"max_tokens": 1200000,
"batch_size": null,
"required_batch_size_multiple": 8,
"required_seq_len_multiple": 1,
"dataset_impl": null,
"data_buffer_size": 10,
"train_subset": "train",
"valid_subset": "valid",
"validate_interval": 1,
"validate_interval_updates": 0,
"validate_after_updates": 0,
"fixed_validation_seed": null,
"disable_validation": false,
"max_tokens_valid": 1200000,
"batch_size_valid": null,
"curriculum": 0,
"gen_subset": "test",
"num_shards": 1,
"shard_id": 0
},
"optimization": {
"max_epoch": 0,
"max_update": 1000000,
"stop_time_hours": 0,
"clip_norm": 25,
"sentence_avg": false,
"update_freq": [
1
],
"lr": [
0.005
],
"min_lr": -1,
"use_bmuf": false
},
"checkpoint": {
"save_dir": "/checkpoint/abaevski/asr/speechbert_raw_big_dbg/prenorm_ln_stable_repr_lr.qtz.lnfrst.lnr.cb.nrm.mlp768.pq.lv320.lvb2.ab0.9_0.98.lr0.005.wu32000.mask10.mprob0.65.mstd0.drp_i0.1.drp_f0.1.in0.0.nt_gaus.lnf-1.ng512.fgm1.0.nep.qini.qini1.pen0_0_0.1_0.cpl1.ld0.0.wd0.01.uf1.mu1000000.s5.ngpu128",
"restore_file": "checkpoint_last.pt",
"finetune_from_model": null,
"reset_dataloader": false,
"reset_lr_scheduler": false,
"reset_meters": false,
"reset_optimizer": false,
"optimizer_overrides": "{}",
"save_interval": 1,
"save_interval_updates": 25000,
"keep_interval_updates": 1,
"keep_last_epochs": -1,
"keep_best_checkpoints": -1,
"no_save": false,
"no_epoch_checkpoints": true,
"no_last_checkpoints": false,
"no_save_optimizer_state": false,
"best_checkpoint_metric": "loss",
"maximize_best_checkpoint_metric": false,
"patience": -1,
"checkpoint_suffix": "",
"checkpoint_shard_count": 1,
"model_parallel_size": 1,
"distributed_rank": 0
},
"bmuf": {
"block_lr": 1,
"block_momentum": 0.875,
"global_sync_iter": 50,
"warmup_iterations": 500,
"use_nbm": false,
"average_sync": false,
"distributed_world_size": 128
},
"generation": {
"beam": 5,
"nbest": 1,
"max_len_a": 0,
"max_len_b": 200,
"min_len": 1,
"match_source_len": false,
"unnormalized": false,
"no_early_stop": false,
"no_beamable_mm": false,
"lenpen": 1,
"unkpen": 0,
"replace_unk": null,
"sacrebleu": false,
"score_reference": false,
"prefix_size": 0,
"no_repeat_ngram_size": 0,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"constraints": null,
"temperature": 1.0,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"print_alignment": false,
"print_step": false,
"lm_path": null,
"lm_weight": 0.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_max_iter": 10,
"iter_decode_force_max_iter": false,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"retain_iter_history": false,
"retain_dropout": false,
"retain_dropout_modules": null,
"decoding_format": null,
"no_seed_provided": false
},
"eval_lm": {
"output_word_probs": false,
"output_word_stats": false,
"context_window": 0,
"softmax_batch": 9223372036854775807
},
"interactive": {
"buffer_size": 0,
"input": "-"
},
"task": {
"_name": "audio_pretraining",
"data": "/private/home/abaevski/data/librivox/no_silence",
"labels": null,
"sample_rate": 16000,
"normalize": true,
"enable_padding": false,
"max_sample_size": 320000,
"min_sample_size": 32000,
"eval_wer": false,
"eval_wer_config": {
"beam": 5,
"nbest": 1,
"max_len_a": 0,
"max_len_b": 200,
"min_len": 1,
"match_source_len": false,
"unnormalized": false,
"no_early_stop": false,
"no_beamable_mm": false,
"lenpen": 1,
"unkpen": 0,
"replace_unk": null,
"sacrebleu": false,
"score_reference": false,
"prefix_size": 0,
"no_repeat_ngram_size": 0,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"constraints": null,
"temperature": 1.0,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"print_alignment": false,
"print_step": false,
"lm_path": null,
"lm_weight": 0.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_max_iter": 10,
"iter_decode_force_max_iter": false,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"retain_iter_history": false,
"retain_dropout": false,
"retain_dropout_modules": null,
"decoding_format": null,
"no_seed_provided": false
},
"eval_wer_tokenizer": null,
"eval_wer_post_process": "letter",
"autoregressive": false
},
"criterion": {
"_name": "wav2vec",
"infonce": false,
"loss_weights": null,
"log_keys": []
},
"optimizer": {
"_name": "adam",
"adam_betas": "(0.9,0.98)",
"adam_eps": 1e-06,
"weight_decay": 0.01,
"use_old_adam": false,
"tpu": false,
"lr": [
0.005
]
},
"lr_scheduler": {
"_name": "polynomial_decay",
"warmup_updates": 32000,
"force_anneal": null,
"end_learning_rate": 0.0,
"power": 1.0,
"total_num_update": 1000000,
"lr": [
0.005
]
},
"model": {
"_name": "wav2vec2",
"extractor_mode": "layer_norm",
"encoder_layers": 24,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_attention_heads": 16,
"activation_fn": "gelu",
"dropout": 0.0,
"attention_dropout": 0.1,
"activation_dropout": 0.0,
"encoder_layerdrop": 0.0,
"dropout_input": 0.1,
"dropout_features": 0.1,
"final_dim": 768,
"layer_norm_first": true,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_bias": true,
"logit_temp": 0.1,
"quantize_targets": true,
"quantize_input": false,
"same_quantizer": false,
"target_glu": false,
"feature_grad_mult": 1.0,
"latent_vars": 320,
"latent_groups": 2,
"latent_dim": 0,
"mask_length": 10,
"mask_prob": 0.65,
"mask_selection": "static",
"mask_other": 0,
"no_mask_overlap": false,
"mask_min_space": 1,
"mask_channel_length": 10,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_channel_other": 0,
"no_mask_channel_overlap": false,
"mask_channel_min_space": 1,
"num_negatives": 100,
"negatives_from_everywhere": false,
"cross_sample_negatives": 0,
"codebook_negatives": 0,
"conv_pos": 128,
"conv_pos_groups": 16,
"latent_temp": "(2.0,0.1,0.999995)"
}
}
},
"task": {
"_name": "audio_pretraining",
"data": "/private/home/qiantong/w2v/data/train_36s_short_960h_rescore",
"labels": "ltr",
"sample_rate": 16000,
"normalize": true,
"enable_padding": false,
"max_sample_size": null,
"min_sample_size": null,
"eval_wer": false,
"eval_wer_config": {
"_name": null,
"beam": 5,
"nbest": 1,
"max_len_a": 0.0,
"max_len_b": 200,
"min_len": 1,
"match_source_len": false,
"unnormalized": false,
"no_early_stop": false,
"no_beamable_mm": false,
"lenpen": 1.0,
"unkpen": 0.0,
"replace_unk": null,
"sacrebleu": false,
"score_reference": false,
"prefix_size": 0,
"no_repeat_ngram_size": 0,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"constraints": null,
"temperature": 1.0,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"print_alignment": false,
"print_step": false,
"lm_path": null,
"lm_weight": 0.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_max_iter": 10,
"iter_decode_force_max_iter": false,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"retain_iter_history": false,
"retain_dropout": false,
"retain_dropout_modules": null,
"decoding_format": null,
"no_seed_provided": false
},
"eval_wer_tokenizer": null,
"eval_wer_post_process": "letter",
"autoregressive": false
},
"criterion": {
"_name": "wav2vec",
"infonce": false,
"loss_weights": null,
"log_keys": []
},
"optimizer": {
"_name": "adam",
"adam_betas": "(0.9, 0.98)",
"adam_eps": 1e-08,
"weight_decay": 0.0,
"use_old_adam": false,
"tpu": false,
"lr": [
3e-05
]
},
"lr_scheduler": {
"_name": "tri_stage",
"warmup_steps": 40000,
"hold_steps": 210000,
"decay_steps": 250000,
"phase_ratio": null,
"init_lr_scale": 0.01,
"final_lr_scale": 0.05,
"max_update": 500000.0,
"lr": [
3e-05
]
},
"scoring": {
"_name": "bleu",
"pad": 1,
"eos": 2,
"unk": 3
},
"bpe": null,
"tokenizer": null
}
From this dump you can see the model has an obvious nested, repeated-loading problem. The top-level keys:
>>> print(json.dumps(list(eval(str(state.get("cfg", None) )).keys()), indent=4))
[
"_name",
"common",
"common_eval",
"distributed_training",
"dataset",
"optimization",
"checkpoint",
"bmuf",
"generation",
"eval_lm",
"interactive",
"model",
"task",
"criterion",
"optimizer",
"lr_scheduler",
"scoring",
"bpe",
"tokenizer"
]
Among them, checkpoint, model, and task all store similar data, which is probably why things get loaded again later: cfg -> w2v_args -> task -> model.
This repeated loading likely produces both redundant loads and inconsistencies!
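A sketch (reusing the state object loaded earlier) that makes the duplication visible by intersecting the top-level sections with those nested under model.w2v_args:

# Convert the OmegaConf tree into a plain dict, the same trick as the dumps above.
cfg_dict = eval(str(state.get("cfg", None)))

top_level = set(cfg_dict.keys())
nested = set(cfg_dict["model"]["w2v_args"].keys())

# Sections stored both at the top level and again inside the nested w2v_args:
print(sorted(top_level & nested))
# e.g. ['bmuf', 'checkpoint', 'common', ..., 'model', 'optimizer', 'task']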
>>> print(json.dumps(list(eval(str(state.get("cfg", None)['model'] )).keys()), indent=4))
[
"_name",
"w2v_path",
"no_pretrained_weights",
"dropout_input",
"final_dropout",
"dropout",
"attention_dropout",
"activation_dropout",
"apply_mask",
"mask_length",
"mask_prob",
"mask_selection",
"mask_other",
"no_mask_overlap",
"mask_channel_length",
"mask_channel_prob",
"mask_channel_selection",
"mask_channel_other",
"no_mask_channel_overlap",
"freeze_finetune_updates",
"feature_grad_mult",
"layerdrop",
"normalize",
"data",
"w2v_args"
]
>>> print(json.dumps(list(eval(str(state.get("cfg", None)['task'] )).keys()), indent=4))
[
"_name",
"data",
"labels",
"sample_rate",
"normalize",
"enable_padding",
"max_sample_size",
"min_sample_size",
"eval_wer",
"eval_wer_config",
"eval_wer_tokenizer",
"eval_wer_post_process",
"autoregressive"
]
And this w2v_args.task is that same table of settings you are thoroughly sick of seeing by now:
print(json.dumps(eval(str(w2v_args.task)), indent=4))
@register_task("audio_pretraining", dataclass=AudioPretrainingConfig)
class AudioPretrainingTask(FairseqTask):
    """"""

    cfg: AudioPretrainingConfig

    def __init__(
        self,
        cfg: AudioPretrainingConfig,
        source_dictionary=None,
        target_dictionary=None,
    ):
        super().__init__(cfg)
        self._target_dictionary = target_dictionary
        self._source_dictionary = source_dictionary
        if cfg.eval_wer:
            assert cfg.labels is not None, "eval_wer can only be set during fine-tuning"

    @classmethod
    def setup_task(cls, cfg: AudioPretrainingConfig, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            cfg (AudioPretrainingConfig): configuration of this task
        """
        if cfg.labels:
            dict_path = os.path.join(cfg.data, f"dict.{cfg.labels}.txt")
            target_dictionary = Dictionary.load(dict_path)
        else:
            target_dictionary = None

        return cls(cfg, target_dictionary=target_dictionary)
# In effect this is a constructor wrapped in a classmethod factory — why not just construct the object directly?
# A strange wrapper that adds no value; wouldn't overriding __init__ and calling super() be sweeter?
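For contrast, a sketch of the direct style those comments are asking for (illustrative only, reusing the names from the snippet above; not a drop-in replacement):

class AudioPretrainingTask(FairseqTask):
    # Same behavior, but the dictionary is loaded in the constructor
    # instead of behind a classmethod factory.
    def __init__(self, cfg: AudioPretrainingConfig, **kwargs):
        super().__init__(cfg)
        self._source_dictionary = None
        if cfg.labels:
            dict_path = os.path.join(cfg.data, f"dict.{cfg.labels}.txt")
            self._target_dictionary = Dictionary.load(dict_path)
        else:
            self._target_dictionary = None
        if cfg.eval_wer:
            assert cfg.labels is not None, "eval_wer can only be set during fine-tuning"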