Created July 3, 2024 15:22
// float8linear on, tp=1
(pytorch) [[email protected] ~/local/torchtitan (20240702_float8_linear)]$ with-proxy NGPU=4 CUDA_VISIBLE_DEVICES=4,5,6,7 CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh
+ export USE_LIBUV=1
+ USE_LIBUV=1
+ TRAINER_DIR=/home/vasiliy/local/torchtitan
+ NGPU=4
+ NNODES=1
+ LOG_RANK=0
+ CONFIG_FILE=./train_configs/llama3_8b.toml
+ overrides=
+ '[' 0 -ne 0 ']'
+ echo ''
+ grep -q -- --memory_estimation.enabled
+ torchrun --nproc_per_node=4 --rdzv_backend c10d --rdzv_endpoint=localhost:0 --local-ranks-filter 0 --role rank --tee 3 train.py --job.config_file ./train_configs/llama3_8b.toml
W0703 08:12:30.763000 140521685505024 torch/distributed/run.py:793]
W0703 08:12:30.763000 140521685505024 torch/distributed/run.py:793] *****************************************
W0703 08:12:30.763000 140521685505024 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0703 08:12:30.763000 140521685505024 torch/distributed/run.py:793] *****************************************
[rank0]:2024-07-03 08:12:32,874 - root - INFO - Starting job: Llama 3 8B training
[rank0]:2024-07-03 08:12:35,637 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[rank0]:2024-07-03 08:12:35,638 - root - INFO - Building 1-D device mesh with ['dp'], [4]
[rank0]:2024-07-03 08:12:35,641 - root - INFO - Building tiktoken tokenizer locally from ./torchtitan/datasets/tokenizer/original/tokenizer.model
[rank0]:2024-07-03 08:12:35,787 - root - INFO - TikTokenizer built: #words 128256, BOS ID 128000, EOS ID 128001
[rank0]:2024-07-03 08:12:35,787 - root - INFO - Preparing c4 dataset from allenai/c4
[rank0]:2024-07-03 08:12:49,070 - root - INFO - Building llama3 8B with ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, max_batch_size=32, max_seq_len=8192, depth_init=True, norm_type='rmsnorm')
[rank0]:2024-07-03 08:12:49,201 - root - INFO - Swapped to Float8Linear layers
[rank0]:2024-07-03 08:12:49,202 - root - INFO - Model llama3 8B size: 8,030,261,248 total parameters
[rank0]:2024-07-03 08:12:49,203 - root - INFO - GPU capacity: NVIDIA H100 (0) with 95.04GiB memory
[rank0]:2024-07-03 08:12:49,203 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:2024-07-03 08:12:49,492 - root - INFO - Compiled each TransformerBlock with torch.compile
[rank0]:2024-07-03 08:12:49,540 - root - INFO - Applied FSDP to the model
[rank0]:NCCL version 2.20.5+cuda12.2
[rank0]:2024-07-03 08:13:01,304 - root - INFO - GPU memory usage for model: 7.51GiB(7.90%)
[rank0]:2024-07-03 08:13:01,305 - root - INFO - Metrics logging active. Tensorboard logs will be saved at ./outputs/tb/20240703-0813
[rank0]:2024-07-03 08:13:01,386 - root - INFO - Training starts at step 1
[rank0]:2024-07-03 08:13:01,386 - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace
[rank0]:[rank0]:W0703 08:13:03.346000 139636577391616 torch/_logging/_internal.py:1040] [0/0]
[rank0]:[rank0]:W0703 08:13:03.346000 139636577391616 torch/_logging/_internal.py:1040] [0/0] Detected that context_fn is passed to torch.utils.checkpoint under torch.compile.
[rank0]:[rank0]:W0703 08:13:03.346000 139636577391616 torch/_logging/_internal.py:1040] [0/0] Please make sure the checkpointed region does not contain in-place ops (e.g. torch.relu_).
[rank0]:[rank0]:W0703 08:13:03.346000 139636577391616 torch/_logging/_internal.py:1040] [0/0]
[rank0]:/data/users/vasiliy/pytorch/torch/_inductor/lowering.py:1624: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
[rank0]: warnings.warn(
[rank0]:2024-07-03 08:13:15,111 - root - INFO - step: 1 loss: 12.2473 memory: 55.83GiB(58.75%) wps: 597 mfu: 3.50%
[rank0]:2024-07-03 08:13:15,112 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:2024-07-03 08:13:26,148 - root - INFO - step: 10 loss: 10.8121 memory: 70.51GiB(74.19%) wps: 6,681 mfu: 39.12%
[rank0]:2024-07-03 08:13:38,409 - root - INFO - step: 20 loss: 9.0833 memory: 70.51GiB(74.19%) wps: 6,683 mfu: 39.13%
[rank0]:2024-07-03 08:13:50,693 - root - INFO - step: 30 loss: 8.0871 memory: 70.51GiB(74.19%) wps: 6,671 mfu: 39.06%
[rank0]:2024-07-03 08:14:03,002 - root - INFO - step: 40 loss: 7.4720 memory: 70.51GiB(74.19%) wps: 6,657 mfu: 38.98%
[rank0]:2024-07-03 08:14:15,603 - root - INFO - step: 50 loss: 7.3636 memory: 70.51GiB(74.19%) wps: 6,502 mfu: 38.08%
[rank0]:2024-07-03 08:14:15,606 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:2024-07-03 08:14:17,606 - root - INFO - Training completed
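// Back-of-the-envelope check on the logged mfu for the tp=1 run above. This is a sketch, not
// torchtitan's exact accounting: it assumes the common 6*N + 12*L*H*d_head*s FLOPs-per-token
// estimate, a ~989 TFLOPS H100 dense BF16 peak, and that the token embedding is excluded from
// the parameter count. All other numbers are taken directly from the log.
n_params = 8_030_261_248 - 128_256 * 4_096   # total params minus token embedding (assumption)
n_layers, n_heads, seq_len = 32, 32, 8192
head_dim = 4096 // n_heads
flops_per_token = 6 * n_params + 12 * n_layers * n_heads * head_dim * seq_len
wps = 6681                                   # logged wps at step 10
peak_flops = 989e12                          # assumed H100 SXM dense BF16 peak
print(f"estimated mfu: {flops_per_token * wps / peak_flops:.2%}")  # ~39.1%, close to the logged 39.12%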
// float8linear on, tp=1, fsdp_fp8_allgather=true
(pytorch) [[email protected] ~/local/torchtitan (20240702_float8_linear)]$ with-proxy NGPU=4 CUDA_VISIBLE_DEVICES=4,5,6,7 CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh
+ export USE_LIBUV=1
+ USE_LIBUV=1
+ TRAINER_DIR=/home/vasiliy/local/torchtitan
+ NGPU=4
+ NNODES=1
+ LOG_RANK=0
+ CONFIG_FILE=./train_configs/llama3_8b.toml
+ overrides=
+ '[' 0 -ne 0 ']'
+ echo ''
+ grep -q -- --memory_estimation.enabled
+ torchrun --nproc_per_node=4 --rdzv_backend c10d --rdzv_endpoint=localhost:0 --local-ranks-filter 0 --role rank --tee 3 train.py --job.config_file ./train_configs/llama3_8b.toml
W0703 08:15:59.107000 139669474837504 torch/distributed/run.py:793]
W0703 08:15:59.107000 139669474837504 torch/distributed/run.py:793] *****************************************
W0703 08:15:59.107000 139669474837504 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0703 08:15:59.107000 139669474837504 torch/distributed/run.py:793] *****************************************
[rank0]:2024-07-03 08:16:00,976 - root - INFO - Starting job: Llama 3 8B training
[rank0]:2024-07-03 08:16:02,538 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[rank0]:2024-07-03 08:16:02,540 - root - INFO - Building 1-D device mesh with ['dp'], [4]
[rank0]:2024-07-03 08:16:02,543 - root - INFO - Building tiktoken tokenizer locally from ./torchtitan/datasets/tokenizer/original/tokenizer.model
[rank0]:2024-07-03 08:16:02,692 - root - INFO - TikTokenizer built: #words 128256, BOS ID 128000, EOS ID 128001
[rank0]:2024-07-03 08:16:02,692 - root - INFO - Preparing c4 dataset from allenai/c4
[rank0]:2024-07-03 08:16:15,116 - root - INFO - Building llama3 8B with ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, max_batch_size=32, max_seq_len=8192, depth_init=True, norm_type='rmsnorm')
[rank0]:2024-07-03 08:16:15,256 - root - INFO - Swapped to Float8Linear layers
[rank0]:2024-07-03 08:16:15,257 - root - INFO - Model llama3 8B size: 8,030,261,248 total parameters
[rank0]:2024-07-03 08:16:15,257 - root - INFO - GPU capacity: NVIDIA H100 (0) with 95.04GiB memory
[rank0]:2024-07-03 08:16:15,258 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:2024-07-03 08:16:15,531 - root - INFO - Compiled each TransformerBlock with torch.compile
[rank0]:2024-07-03 08:16:15,634 - root - INFO - Applied FSDP to the model
[rank0]:NCCL version 2.20.5+cuda12.2
[rank0]:2024-07-03 08:16:21,383 - root - INFO - GPU memory usage for model: 7.51GiB(7.90%)
[rank0]:2024-07-03 08:16:21,384 - root - INFO - Metrics logging active. Tensorboard logs will be saved at ./outputs/tb/20240703-0816
[rank0]:2024-07-03 08:16:21,386 - root - INFO - Training starts at step 1
[rank0]:2024-07-03 08:16:21,386 - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace
[rank0]:[rank0]:W0703 08:16:23.409000 140432021210112 torch/_logging/_internal.py:1040] [0/0]
[rank0]:[rank0]:W0703 08:16:23.409000 140432021210112 torch/_logging/_internal.py:1040] [0/0] Detected that context_fn is passed to torch.utils.checkpoint under torch.compile.
[rank0]:[rank0]:W0703 08:16:23.409000 140432021210112 torch/_logging/_internal.py:1040] [0/0] Please make sure the checkpointed region does not contain in-place ops (e.g. torch.relu_).
[rank0]:[rank0]:W0703 08:16:23.409000 140432021210112 torch/_logging/_internal.py:1040] [0/0]
[rank0]:/data/users/vasiliy/pytorch/torch/_inductor/lowering.py:1624: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
[rank0]: warnings.warn(
[rank0]:2024-07-03 08:16:33,563 - root - INFO - step: 1 loss: 12.2590 memory: 52.66GiB(55.41%) wps: 673 mfu: 3.94%
[rank0]:2024-07-03 08:16:33,563 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:2024-07-03 08:16:44,316 - root - INFO - step: 10 loss: 10.8750 memory: 67.25GiB(70.76%) wps: 6,857 mfu: 40.15%
[rank0]:2024-07-03 08:16:56,284 - root - INFO - step: 20 loss: 9.2167 memory: 67.25GiB(70.76%) wps: 6,847 mfu: 40.09%
[rank0]:2024-07-03 08:17:08,276 - root - INFO - step: 30 loss: 8.1993 memory: 67.25GiB(70.76%) wps: 6,833 mfu: 40.01%
[rank0]:2024-07-03 08:17:20,242 - root - INFO - step: 40 loss: 7.5238 memory: 67.25GiB(70.76%) wps: 6,848 mfu: 40.10%
[rank0]:2024-07-03 08:17:32,369 - root - INFO - step: 50 loss: 7.3101 memory: 67.25GiB(70.76%) wps: 6,757 mfu: 39.57%
[rank0]:2024-07-03 08:17:32,371 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:2024-07-03 08:17:34,372 - root - INFO - Training completed
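// Compared to the run above (tp=1 without fp8 all-gather): step-50 throughput improves from
// 6,502 to 6,757 wps (~+3.9%), peak memory drops from 70.51GiB to 67.25GiB, and the step-50
// loss is comparable (7.3636 vs 7.3101).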
// float8linear on, tp=2, fsdp_fp8_allgather=true
(pytorch) [[email protected] ~/local/torchtitan (20240702_float8_linear)]$ with-proxy NGPU=4 CUDA_VISIBLE_DEVICES=4,5,6,7 CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh
+ export USE_LIBUV=1
+ USE_LIBUV=1
+ TRAINER_DIR=/home/vasiliy/local/torchtitan
+ NGPU=4
+ NNODES=1
+ LOG_RANK=0
+ CONFIG_FILE=./train_configs/llama3_8b.toml
+ overrides=
+ '[' 0 -ne 0 ']'
+ echo ''
+ grep -q -- --memory_estimation.enabled
+ torchrun --nproc_per_node=4 --rdzv_backend c10d --rdzv_endpoint=localhost:0 --local-ranks-filter 0 --role rank --tee 3 train.py --job.config_file ./train_configs/llama3_8b.toml
W0703 08:18:05.609000 139910487057408 torch/distributed/run.py:793]
W0703 08:18:05.609000 139910487057408 torch/distributed/run.py:793] *****************************************
W0703 08:18:05.609000 139910487057408 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0703 08:18:05.609000 139910487057408 torch/distributed/run.py:793] *****************************************
[rank0]:2024-07-03 08:18:07,487 - root - INFO - Starting job: Llama 3 8B training
[rank0]:2024-07-03 08:18:09,046 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[rank0]:2024-07-03 08:18:09,048 - root - INFO - Building 2-D device mesh with ['dp', 'tp'], [2, 2]
[rank0]:2024-07-03 08:18:09,052 - root - INFO - Building tiktoken tokenizer locally from ./torchtitan/datasets/tokenizer/original/tokenizer.model
[rank0]:2024-07-03 08:18:09,205 - root - INFO - TikTokenizer built: #words 128256, BOS ID 128000, EOS ID 128001
[rank0]:2024-07-03 08:18:09,205 - root - INFO - Preparing c4 dataset from allenai/c4
[rank0]:2024-07-03 08:18:24,421 - root - INFO - Building llama3 8B with ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, max_batch_size=32, max_seq_len=8192, depth_init=True, norm_type='rmsnorm')
[rank0]:2024-07-03 08:18:24,556 - root - INFO - Swapped to Float8Linear layers
[rank0]:2024-07-03 08:18:24,558 - root - INFO - Model llama3 8B size: 8,030,261,248 total parameters
[rank0]:2024-07-03 08:18:24,558 - root - INFO - GPU capacity: NVIDIA H100 (0) with 95.04GiB memory
[rank0]:2024-07-03 08:18:24,621 - root - INFO - Applied Tensor Parallelism to the model
[rank0]:2024-07-03 08:18:24,622 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:2024-07-03 08:18:24,891 - root - INFO - Compiled each TransformerBlock with torch.compile
[rank0]:2024-07-03 08:18:24,937 - root - INFO - Applied FSDP to the model
[rank0]:2024-07-03 08:18:25,309 - root - INFO - GPU memory usage for model: 7.51GiB(7.90%)
[rank0]:2024-07-03 08:18:25,310 - root - INFO - Metrics logging active. Tensorboard logs will be saved at ./outputs/tb/20240703-0818
[rank0]:2024-07-03 08:18:25,312 - root - INFO - Training starts at step 1
[rank0]:2024-07-03 08:18:25,312 - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace
[rank0]:NCCL version 2.20.5+cuda12.2
[rank0]:[rank0]:W0703 08:19:13.928000 139805724644352 torch/_logging/_internal.py:1040] [0/0]
[rank0]:[rank0]:W0703 08:19:13.928000 139805724644352 torch/_logging/_internal.py:1040] [0/0] Detected that context_fn is passed to torch.utils.checkpoint under torch.compile.
[rank0]:[rank0]:W0703 08:19:13.928000 139805724644352 torch/_logging/_internal.py:1040] [0/0] Please make sure the checkpointed region does not contain in-place ops (e.g. torch.relu_).
[rank0]:[rank0]:W0703 08:19:13.928000 139805724644352 torch/_logging/_internal.py:1040] [0/0]
[rank0]:/data/users/vasiliy/pytorch/torch/_inductor/lowering.py:1624: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
[rank0]: warnings.warn(
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] torch._dynamo hit config.cache_size_limit (8)
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] function: 'forward' (/data/users/vasiliy/pytorch/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py:144)
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] last reason: ___check_type_id(L['args'][0], 81104784)
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
[rank0]:2024-07-03 08:20:17,389 - root - INFO - step: 1 loss: 12.2606 memory: 43.35GiB(45.61%) wps: 37 mfu: 0.21%
[rank0]:2024-07-03 08:20:17,389 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:2024-07-03 08:20:33,417 - root - INFO - step: 10 loss: 11.0525 memory: 58.31GiB(61.35%) wps: 2,300 mfu: 13.47%
[rank0]:2024-07-03 08:20:48,504 - root - INFO - step: 20 loss: 9.1903 memory: 58.31GiB(61.35%) wps: 2,719 mfu: 15.92%
[rank0]:2024-07-03 08:21:03,515 - root - INFO - step: 30 loss: 8.1651 memory: 58.31GiB(61.35%) wps: 2,734 mfu: 16.01%
[rank0]:2024-07-03 08:21:18,552 - root - INFO - step: 40 loss: 7.5686 memory: 58.31GiB(61.35%) wps: 2,730 mfu: 15.98%
[rank0]:2024-07-03 08:21:36,587 - root - INFO - step: 50 loss: 7.5383 memory: 58.31GiB(61.35%) wps: 2,275 mfu: 13.32%
[rank0]:2024-07-03 08:21:36,623 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:2024-07-03 08:21:38,623 - root - INFO - Training completed
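// Rough side-by-side of the three runs using the step-50 numbers logged above. The run labels
// and the small comparison script below are a sketch for readability, not torchtitan output;
// every metric is copied from the corresponding log line.
runs = {
    "fp8 linear, tp=1":                 dict(wps=6502, mfu=38.08, peak_mem_gib=70.51, loss=7.3636),
    "fp8 linear, tp=1, fp8 all-gather": dict(wps=6757, mfu=39.57, peak_mem_gib=67.25, loss=7.3101),
    "fp8 linear, tp=2, fp8 all-gather": dict(wps=2275, mfu=13.32, peak_mem_gib=58.31, loss=7.5383),
}
base_wps = runs["fp8 linear, tp=1"]["wps"]
for name, r in runs.items():
    print(f"{name}: {r['wps']} wps ({r['wps'] / base_wps - 1:+.1%} vs tp=1 baseline), "
          f"mfu {r['mfu']}%, peak mem {r['peak_mem_gib']} GiB, step-50 loss {r['loss']}")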