-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathnon_lexical_labeler.yaml
More file actions
94 lines (78 loc) · 2.31 KB
/
non_lexical_labeler.yaml
File metadata and controls
94 lines (78 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
---
model_name: 1218_nll
float32_matmul_precision: high
random_seed: 123456
# Phonemes that carry no lexical meaning, need no manual annotation, and are
# recognized automatically — e.g. AP (breath), EP (tail breath).
non_lexical_phonemes:
- AP
- EP
datasets_config_paths: # Paths to dataset configuration files
- configs/datasets_config.yaml
binary_folder: data/binary_nll # Folder for preprocessing results
valid_set_size: 20 # When valid_set_size > 0, that many samples are selected at random for validation; otherwise the validation set defined in the dataset is used
max_length: 45 # Maximum duration of a single audio clip (presumably seconds — confirm against preprocessing code)
# Preprocessing multiprocessing (worker count and temporary-queue size below).
# Not recommended when the dataset is small (under roughly 5 hours of audio).
# Budget roughly 6-8 GB of RAM and 6 GB of VRAM per worker.
multiprocess_works: 0
multiprocess_max_size: 200 # Queue capacity scales as size * num_workers; keep it small enough that memory stays sufficient
multiprocess_start_size: 100 # Minimum amount of queued data before processing starts
# Number of training data-loading worker processes
dataloader_workers: 2
dataloader_prefetch_factor: 2
batch_max_length: 200
binning_length: 1000
drop_last: false
num_valid_plots: 20 # Maximum number of plots drawn for val and evaluate items
draw_evaluate: true
augmentation_args: # Known code defect — do not enable
  enabled: false
  random_pitch_shifting:
    range: [-5.0, 5.0] # semitones — presumably; confirm against augmentation code
    num: 0
  blank_padding:
    range: [0, 5] # seconds
    num: 2
# Model hyperparameters — key names suggest a conformer encoder; confirm against model code
cvnt_arg:
  mask_ratio: 0.3 # NOTE(review): presumably an input-masking ratio — confirm
  encoder_conform_attention_drop: 0.05
  num_layers: 4
  encoder_conform_dim: 128
  encoder_conform_ffn_latent_drop: 0.05
  encoder_conform_ffn_out_drop: 0.05
  encoder_conform_kernel_size: 31
optimizer_config:
  lr: 0.0003
  gamma: 0.9999 # presumably a per-step LR-decay factor — confirm against the scheduler
  total_steps: 10000
  muon_args: # settings for the Muon optimizer
    weight_decay: 0.1
  adamw_args: # settings for the AdamW optimizer
    weight_decay: 0.0
loss_config:
  losses:
    # Per-loss weighting; the order and meaning of the four entries are defined by the loss code — confirm there
    weights: [ 0.5, 0.3, 0.2, 1.0 ]
# trainer — key names appear to match PyTorch Lightning Trainer arguments; confirm
accelerator: auto
devices: auto # num_devices
precision: bf16-mixed # options: bf16-mixed, 32-true
gradient_clip_val: 0.5
gradient_clip_algorithm: norm
val_check_interval: 1000 # run validation every N steps
save_top_k: 5 # number of checkpoints to keep
save_every_steps: 1000
# Normally not changed
hubert_config:
  encoder: cn_hubert
  model_path: dependencies/chinese-hubert-base
  sample_rate: 16000 # input sample rate for the HuBERT encoder (Hz)
  hop_size: 320
  channel: 768 # feature dimension of the encoder output
mel_spec_config:
  n_mels: 128
  sample_rate: 44100 # audio sample rate for mel extraction (Hz)
  window_size: 882
  hop_size: 441
  n_fft: 1764
  f_min: 40
  f_max: 16000
  clamp: 0.00001 # lower bound applied to mel values — presumably before taking the log; confirm