-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathnon_lexical_labeler.yaml
More file actions
94 lines (78 loc) · 2.31 KB
/
non_lexical_labeler.yaml
File metadata and controls
94 lines (78 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
---
model_name: 1218_nll
float32_matmul_precision: high
random_seed: 123456
# Phonemes that carry no lexical meaning, need no manual annotation, and are
# recognized automatically — e.g. AP (breath), EP (tail breath).
non_lexical_phonemes:
- AP
- EP
datasets_config_paths: # Paths to dataset configuration files
- configs/datasets_config.yaml
binary_folder: data/binary_nll # Folder for preprocessing results
valid_set_size: 20 # When valid_set_size > 0, that many samples are selected at random for validation; otherwise the validation set defined in the dataset is used
max_length: 45 # Maximum duration of a single audio clip (presumably seconds — confirm against preprocessing code)
# Preprocessing multiprocessing (worker count and temporary-queue size below).
# Not recommended when the dataset is small (under roughly 5 hours of audio).
# Budget roughly 6-8 GB of RAM and 6 GB of VRAM per worker.
multiprocess_works: 0
multiprocess_max_size: 200 # Queue capacity scales as size * num_workers; keep it small enough that memory stays sufficient
multiprocess_start_size: 100 # Minimum amount of queued data before processing starts
# Number of training data-loading worker processes
dataloader_workers: 2
dataloader_prefetch_factor: 2
batch_max_length: 200
binning_length: 1000
drop_last: false
num_valid_plots: 20 # Maximum number of plots drawn for val and evaluate items
draw_evaluate: true
augmentation_args: # Known code defect — do not enable
  enabled: false
  random_pitch_shifting:
    range: [-5.0, 5.0] # semitones — presumably; confirm against augmentation code
    num: 0
  blank_padding:
    range: [0, 5] # seconds
    num: 2
# Model hyperparameters — key names suggest a conformer encoder; confirm against model code
cvnt_arg:
  mask_ratio: 0.3 # NOTE(review): presumably an input-masking ratio — confirm
  encoder_conform_attention_drop: 0.05
  num_layers: 4
  encoder_conform_dim: 128
  encoder_conform_ffn_latent_drop: 0.05
  encoder_conform_ffn_out_drop: 0.05
  encoder_conform_kernel_size: 31
optimizer_config:
  lr: 0.0003
  gamma: 0.9999 # presumably a per-step LR-decay factor — confirm against the scheduler
  total_steps: 10000
  muon_args: # settings for the Muon optimizer
    weight_decay: 0.1
  adamw_args: # settings for the AdamW optimizer
    weight_decay: 0.0
loss_config:
  losses:
    # Per-loss weighting; the order and meaning of the four entries are defined by the loss code — confirm there
    weights: [ 0.5, 0.3, 0.2, 1.0 ]
# trainer — key names appear to match PyTorch Lightning Trainer arguments; confirm
accelerator: auto
devices: auto # num_devices
precision: bf16-mixed # options: bf16-mixed, 32-true
gradient_clip_val: 0.5
gradient_clip_algorithm: norm
val_check_interval: 1000 # run validation every N steps
save_top_k: 5 # number of checkpoints to keep
save_every_steps: 1000
# Normally not changed
hubert_config:
  encoder: cn_hubert
  model_path: dependencies/chinese-hubert-base
  sample_rate: 16000 # input sample rate for the HuBERT encoder (Hz)
  hop_size: 320
  channel: 768 # feature dimension of the encoder output
mel_spec_config:
  n_mels: 128
  sample_rate: 44100 # audio sample rate for mel extraction (Hz)
  window_size: 882
  hop_size: 441
  n_fft: 1764
  f_min: 40
  f_max: 16000
  clamp: 0.00001 # lower bound applied to mel values — presumably before taking the log; confirm