This repository was archived by the owner on Jul 31, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconfig.yaml
More file actions
28 lines (28 loc) · 1.65 KB
/
config.yaml
File metadata and controls
28 lines (28 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# script parameters
model_id: "meta-llama/Meta-Llama-3-70b" # Hugging Face model id
dataset_path: "." # path to dataset
max_seq_len: 3072 # 2048 # max sequence length for model and packing of the dataset
# training parameters
output_dir: "./llama-3-70b-hf-no-robot" # Temporary output directory for model checkpoints
report_to: "tensorboard" # report metrics to tensorboard
learning_rate: 0.0002 # learning rate 2e-4
lr_scheduler_type: "constant" # learning rate scheduler
num_train_epochs: 3 # number of training epochs
per_device_train_batch_size: 1 # batch size per device during training
per_device_eval_batch_size: 1 # batch size for evaluation
gradient_accumulation_steps: 2 # number of steps before performing a backward/update pass
optim: adamw_torch # use torch adamw optimizer
logging_steps: 10 # log every 10 steps
save_strategy: epoch # save checkpoint every epoch
evaluation_strategy: epoch # evaluate every epoch
max_grad_norm: 0.3 # max gradient norm
warmup_ratio: 0.03 # warmup ratio
bf16: true # use bfloat16 precision
tf32: true # use tf32 precision
gradient_checkpointing: true # use gradient checkpointing to save memory
# FSDP parameters: https://huggingface.co/docs/transformers/main/en/fsdp
fsdp: "full_shard auto_wrap offload" # remove offload if enough GPU memory
fsdp_config:
- backward_prefetch: "backward_pre"
- forward_prefetch: "false"
- use_orig_params: "false"