
@vkuzo
Created July 3, 2024 15:22
// float8linear on, tp=1
(pytorch) [[email protected] ~/local/torchtitan (20240702_float8_linear)]$ with-proxy NGPU=4 CUDA_VISIBLE_DEVICES=4,5,6,7 CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh
+ export USE_LIBUV=1
+ USE_LIBUV=1
+ TRAINER_DIR=/home/vasiliy/local/torchtitan
+ NGPU=4
+ NNODES=1
+ LOG_RANK=0
+ CONFIG_FILE=./train_configs/llama3_8b.toml
+ overrides=
+ '[' 0 -ne 0 ']'
+ echo ''
+ grep -q -- --memory_estimation.enabled
+ torchrun --nproc_per_node=4 --rdzv_backend c10d --rdzv_endpoint=localhost:0 --local-ranks-filter 0 --role rank --tee 3 train.py --job.config_file ./train_configs/llama3_8b.toml
W0703 08:12:30.763000 140521685505024 torch/distributed/run.py:793]
W0703 08:12:30.763000 140521685505024 torch/distributed/run.py:793] *****************************************
W0703 08:12:30.763000 140521685505024 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0703 08:12:30.763000 140521685505024 torch/distributed/run.py:793] *****************************************
[rank0]:2024-07-03 08:12:32,874 - root - INFO - Starting job: Llama 3 8B training
[rank0]:2024-07-03 08:12:35,637 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[rank0]:2024-07-03 08:12:35,638 - root - INFO - Building 1-D device mesh with ['dp'], [4]
[rank0]:2024-07-03 08:12:35,641 - root - INFO - Building tiktoken tokenizer locally from ./torchtitan/datasets/tokenizer/original/tokenizer.model
[rank0]:2024-07-03 08:12:35,787 - root - INFO - TikTokenizer built: #words 128256, BOS ID 128000, EOS ID 128001
[rank0]:2024-07-03 08:12:35,787 - root - INFO - Preparing c4 dataset from allenai/c4
[rank0]:2024-07-03 08:12:49,070 - root - INFO - Building llama3 8B with ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, max_batch_size=32, max_seq_len=8192, depth_init=True, norm_type='rmsnorm')
[rank0]:2024-07-03 08:12:49,201 - root - INFO - Swapped to Float8Linear layers
[rank0]:2024-07-03 08:12:49,202 - root - INFO - Model llama3 8B size: 8,030,261,248 total parameters
[rank0]:2024-07-03 08:12:49,203 - root - INFO - GPU capacity: NVIDIA H100 (0) with 95.04GiB memory
[rank0]:2024-07-03 08:12:49,203 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:2024-07-03 08:12:49,492 - root - INFO - Compiled each TransformerBlock with torch.compile
[rank0]:2024-07-03 08:12:49,540 - root - INFO - Applied FSDP to the model
[rank0]:NCCL version 2.20.5+cuda12.2
[rank0]:2024-07-03 08:13:01,304 - root - INFO - GPU memory usage for model: 7.51GiB(7.90%)
[rank0]:2024-07-03 08:13:01,305 - root - INFO - Metrics logging active. Tensorboard logs will be saved at ./outputs/tb/20240703-0813
[rank0]:2024-07-03 08:13:01,386 - root - INFO - Training starts at step 1
[rank0]:2024-07-03 08:13:01,386 - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace
[rank0]:[rank0]:W0703 08:13:03.346000 139636577391616 torch/_logging/_internal.py:1040] [0/0]
[rank0]:[rank0]:W0703 08:13:03.346000 139636577391616 torch/_logging/_internal.py:1040] [0/0] Detected that context_fn is passed to torch.utils.checkpoint under torch.compile.
[rank0]:[rank0]:W0703 08:13:03.346000 139636577391616 torch/_logging/_internal.py:1040] [0/0] Please make sure the checkpointed region does not contain in-place ops (e.g. torch.relu_).
[rank0]:[rank0]:W0703 08:13:03.346000 139636577391616 torch/_logging/_internal.py:1040] [0/0]
[rank0]:/data/users/vasiliy/pytorch/torch/_inductor/lowering.py:1624: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
[rank0]: warnings.warn(
[rank0]:2024-07-03 08:13:15,111 - root - INFO - step: 1 loss: 12.2473 memory: 55.83GiB(58.75%) wps: 597 mfu: 3.50%
[rank0]:2024-07-03 08:13:15,112 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:2024-07-03 08:13:26,148 - root - INFO - step: 10 loss: 10.8121 memory: 70.51GiB(74.19%) wps: 6,681 mfu: 39.12%
[rank0]:2024-07-03 08:13:38,409 - root - INFO - step: 20 loss: 9.0833 memory: 70.51GiB(74.19%) wps: 6,683 mfu: 39.13%
[rank0]:2024-07-03 08:13:50,693 - root - INFO - step: 30 loss: 8.0871 memory: 70.51GiB(74.19%) wps: 6,671 mfu: 39.06%
[rank0]:2024-07-03 08:14:03,002 - root - INFO - step: 40 loss: 7.4720 memory: 70.51GiB(74.19%) wps: 6,657 mfu: 38.98%
[rank0]:2024-07-03 08:14:15,603 - root - INFO - step: 50 loss: 7.3636 memory: 70.51GiB(74.19%) wps: 6,502 mfu: 38.08%
[rank0]:2024-07-03 08:14:15,606 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:2024-07-03 08:14:17,606 - root - INFO - Training completed
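
// note: the "Swapped to Float8Linear layers" log line above is torchtitan's float8 path. The
// sketch below only illustrates the general swap-nn.Linear-for-a-float8-variant idea with
// dynamic per-tensor scaling; it is a toy stand-in, not the torchtitan / float8_experimental API.

import torch
import torch.nn as nn
import torch.nn.functional as F

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn


def fake_quant_fp8(t: torch.Tensor) -> torch.Tensor:
    # round-trip a tensor through float8_e4m3fn with a dynamic per-tensor scale;
    # straight-through estimator so gradients keep flowing in high precision
    scale = FP8_MAX / t.abs().max().clamp(min=1e-12)
    t_dq = (t * scale).to(torch.float8_e4m3fn).to(t.dtype) / scale
    return t + (t_dq - t).detach()


class ToyFloat8Linear(nn.Linear):
    # toy stand-in: quantize/dequantize input and weight, then do a regular matmul;
    # a real Float8Linear keeps the tensors in float8 and uses a scaled fp8 matmul
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.linear(fake_quant_fp8(x), fake_quant_fp8(self.weight), self.bias)


def swap_linears_with_toy_float8(module: nn.Module) -> None:
    # recursively replace every nn.Linear with the toy float8 variant, in place
    for name, child in module.named_children():
        if isinstance(child, nn.Linear) and not isinstance(child, ToyFloat8Linear):
            new = ToyFloat8Linear(
                child.in_features,
                child.out_features,
                bias=child.bias is not None,
                device=child.weight.device,
                dtype=child.weight.dtype,
            )
            new.load_state_dict(child.state_dict())
            setattr(module, name, new)
        else:
            swap_linears_with_toy_float8(child)
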
// float8linear on, tp=1, fsdp_fp8_allgather=true
(pytorch) [[email protected] ~/local/torchtitan (20240702_float8_linear)]$ with-proxy NGPU=4 CUDA_VISIBLE_DEVICES=4,5,6,7 CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh
+ export USE_LIBUV=1
+ USE_LIBUV=1
+ TRAINER_DIR=/home/vasiliy/local/torchtitan
+ NGPU=4
+ NNODES=1
+ LOG_RANK=0
+ CONFIG_FILE=./train_configs/llama3_8b.toml
+ overrides=
+ '[' 0 -ne 0 ']'
+ echo ''
+ grep -q -- --memory_estimation.enabled
+ torchrun --nproc_per_node=4 --rdzv_backend c10d --rdzv_endpoint=localhost:0 --local-ranks-filter 0 --role rank --tee 3 train.py --job.config_file ./train_configs/llama3_8b.toml
W0703 08:15:59.107000 139669474837504 torch/distributed/run.py:793]
W0703 08:15:59.107000 139669474837504 torch/distributed/run.py:793] *****************************************
W0703 08:15:59.107000 139669474837504 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0703 08:15:59.107000 139669474837504 torch/distributed/run.py:793] *****************************************
[rank0]:2024-07-03 08:16:00,976 - root - INFO - Starting job: Llama 3 8B training
[rank0]:2024-07-03 08:16:02,538 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[rank0]:2024-07-03 08:16:02,540 - root - INFO - Building 1-D device mesh with ['dp'], [4]
[rank0]:2024-07-03 08:16:02,543 - root - INFO - Building tiktoken tokenizer locally from ./torchtitan/datasets/tokenizer/original/tokenizer.model
[rank0]:2024-07-03 08:16:02,692 - root - INFO - TikTokenizer built: #words 128256, BOS ID 128000, EOS ID 128001
[rank0]:2024-07-03 08:16:02,692 - root - INFO - Preparing c4 dataset from allenai/c4
[rank0]:2024-07-03 08:16:15,116 - root - INFO - Building llama3 8B with ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, max_batch_size=32, max_seq_len=8192, depth_init=True, norm_type='rmsnorm')
[rank0]:2024-07-03 08:16:15,256 - root - INFO - Swapped to Float8Linear layers
[rank0]:2024-07-03 08:16:15,257 - root - INFO - Model llama3 8B size: 8,030,261,248 total parameters
[rank0]:2024-07-03 08:16:15,257 - root - INFO - GPU capacity: NVIDIA H100 (0) with 95.04GiB memory
[rank0]:2024-07-03 08:16:15,258 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:2024-07-03 08:16:15,531 - root - INFO - Compiled each TransformerBlock with torch.compile
[rank0]:2024-07-03 08:16:15,634 - root - INFO - Applied FSDP to the model
[rank0]:NCCL version 2.20.5+cuda12.2
[rank0]:2024-07-03 08:16:21,383 - root - INFO - GPU memory usage for model: 7.51GiB(7.90%)
[rank0]:2024-07-03 08:16:21,384 - root - INFO - Metrics logging active. Tensorboard logs will be saved at ./outputs/tb/20240703-0816
[rank0]:2024-07-03 08:16:21,386 - root - INFO - Training starts at step 1
[rank0]:2024-07-03 08:16:21,386 - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace
[rank0]:[rank0]:W0703 08:16:23.409000 140432021210112 torch/_logging/_internal.py:1040] [0/0]
[rank0]:[rank0]:W0703 08:16:23.409000 140432021210112 torch/_logging/_internal.py:1040] [0/0] Detected that context_fn is passed to torch.utils.checkpoint under torch.compile.
[rank0]:[rank0]:W0703 08:16:23.409000 140432021210112 torch/_logging/_internal.py:1040] [0/0] Please make sure the checkpointed region does not contain in-place ops (e.g. torch.relu_).
[rank0]:[rank0]:W0703 08:16:23.409000 140432021210112 torch/_logging/_internal.py:1040] [0/0]
[rank0]:/data/users/vasiliy/pytorch/torch/_inductor/lowering.py:1624: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
[rank0]: warnings.warn(
[rank0]:2024-07-03 08:16:33,563 - root - INFO - step: 1 loss: 12.2590 memory: 52.66GiB(55.41%) wps: 673 mfu: 3.94%
[rank0]:2024-07-03 08:16:33,563 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:2024-07-03 08:16:44,316 - root - INFO - step: 10 loss: 10.8750 memory: 67.25GiB(70.76%) wps: 6,857 mfu: 40.15%
[rank0]:2024-07-03 08:16:56,284 - root - INFO - step: 20 loss: 9.2167 memory: 67.25GiB(70.76%) wps: 6,847 mfu: 40.09%
[rank0]:2024-07-03 08:17:08,276 - root - INFO - step: 30 loss: 8.1993 memory: 67.25GiB(70.76%) wps: 6,833 mfu: 40.01%
[rank0]:2024-07-03 08:17:20,242 - root - INFO - step: 40 loss: 7.5238 memory: 67.25GiB(70.76%) wps: 6,848 mfu: 40.10%
[rank0]:2024-07-03 08:17:32,369 - root - INFO - step: 50 loss: 7.3101 memory: 67.25GiB(70.76%) wps: 6,757 mfu: 39.57%
[rank0]:2024-07-03 08:17:32,371 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:2024-07-03 08:17:34,372 - root - INFO - Training completed
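
// note: with fsdp_fp8_allgather=true the weights are cast to float8 before the FSDP
// all-gather, so parameter communication happens in 8 bits instead of bf16; that is
// consistent with the lower peak memory (67.25 vs 70.51 GiB) and slightly higher wps/mfu
// compared to the run above. A quick back-of-the-envelope on all-gather traffic, using
// the parameter count from the log (my arithmetic, not part of the log output):

# bytes moved per full parameter all-gather for the 8,030,261,248-parameter model above
n_params = 8_030_261_248
bf16_bytes = n_params * 2   # bf16 all-gather: 2 bytes per element
fp8_bytes = n_params * 1    # float8 all-gather: 1 byte per element (plus small scale tensors)
print(f"bf16: {bf16_bytes / 2**30:.2f} GiB, fp8: {fp8_bytes / 2**30:.2f} GiB "
      f"({fp8_bytes / bf16_bytes:.0%} of bf16)")
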
// float8linear on, tp=2, fsdp_fp8_allgather=true
(pytorch) [[email protected] ~/local/torchtitan (20240702_float8_linear)]$ with-proxy NGPU=4 CUDA_VISIBLE_DEVICES=4,5,6,7 CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh
+ export USE_LIBUV=1
+ USE_LIBUV=1
+ TRAINER_DIR=/home/vasiliy/local/torchtitan
+ NGPU=4
+ NNODES=1
+ LOG_RANK=0
+ CONFIG_FILE=./train_configs/llama3_8b.toml
+ overrides=
+ '[' 0 -ne 0 ']'
+ echo ''
+ grep -q -- --memory_estimation.enabled
+ torchrun --nproc_per_node=4 --rdzv_backend c10d --rdzv_endpoint=localhost:0 --local-ranks-filter 0 --role rank --tee 3 train.py --job.config_file ./train_configs/llama3_8b.toml
W0703 08:18:05.609000 139910487057408 torch/distributed/run.py:793]
W0703 08:18:05.609000 139910487057408 torch/distributed/run.py:793] *****************************************
W0703 08:18:05.609000 139910487057408 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0703 08:18:05.609000 139910487057408 torch/distributed/run.py:793] *****************************************
[rank0]:2024-07-03 08:18:07,487 - root - INFO - Starting job: Llama 3 8B training
[rank0]:2024-07-03 08:18:09,046 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[rank0]:2024-07-03 08:18:09,048 - root - INFO - Building 2-D device mesh with ['dp', 'tp'], [2, 2]
[rank0]:2024-07-03 08:18:09,052 - root - INFO - Building tiktoken tokenizer locally from ./torchtitan/datasets/tokenizer/original/tokenizer.model
[rank0]:2024-07-03 08:18:09,205 - root - INFO - TikTokenizer built: #words 128256, BOS ID 128000, EOS ID 128001
[rank0]:2024-07-03 08:18:09,205 - root - INFO - Preparing c4 dataset from allenai/c4
[rank0]:2024-07-03 08:18:24,421 - root - INFO - Building llama3 8B with ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, max_batch_size=32, max_seq_len=8192, depth_init=True, norm_type='rmsnorm')
[rank0]:2024-07-03 08:18:24,556 - root - INFO - Swapped to Float8Linear layers
[rank0]:2024-07-03 08:18:24,558 - root - INFO - Model llama3 8B size: 8,030,261,248 total parameters
[rank0]:2024-07-03 08:18:24,558 - root - INFO - GPU capacity: NVIDIA H100 (0) with 95.04GiB memory
[rank0]:2024-07-03 08:18:24,621 - root - INFO - Applied Tensor Parallelism to the model
[rank0]:2024-07-03 08:18:24,622 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:2024-07-03 08:18:24,891 - root - INFO - Compiled each TransformerBlock with torch.compile
[rank0]:2024-07-03 08:18:24,937 - root - INFO - Applied FSDP to the model
[rank0]:2024-07-03 08:18:25,309 - root - INFO - GPU memory usage for model: 7.51GiB(7.90%)
[rank0]:2024-07-03 08:18:25,310 - root - INFO - Metrics logging active. Tensorboard logs will be saved at ./outputs/tb/20240703-0818
[rank0]:2024-07-03 08:18:25,312 - root - INFO - Training starts at step 1
[rank0]:2024-07-03 08:18:25,312 - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace
[rank0]:NCCL version 2.20.5+cuda12.2
[rank0]:[rank0]:W0703 08:19:13.928000 139805724644352 torch/_logging/_internal.py:1040] [0/0]
[rank0]:[rank0]:W0703 08:19:13.928000 139805724644352 torch/_logging/_internal.py:1040] [0/0] Detected that context_fn is passed to torch.utils.checkpoint under torch.compile.
[rank0]:[rank0]:W0703 08:19:13.928000 139805724644352 torch/_logging/_internal.py:1040] [0/0] Please make sure the checkpointed region does not contain in-place ops (e.g. torch.relu_).
[rank0]:[rank0]:W0703 08:19:13.928000 139805724644352 torch/_logging/_internal.py:1040] [0/0]
[rank0]:/data/users/vasiliy/pytorch/torch/_inductor/lowering.py:1624: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
[rank0]: warnings.warn(
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] torch._dynamo hit config.cache_size_limit (8)
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] function: 'forward' (/data/users/vasiliy/pytorch/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py:144)
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] last reason: ___check_type_id(L['args'][0], 81104784)
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
[rank0]:[rank0]:W0703 08:19:49.304000 139805724644352 torch/_dynamo/convert_frame.py:762] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
[rank0]:2024-07-03 08:20:17,389 - root - INFO - step: 1 loss: 12.2606 memory: 43.35GiB(45.61%) wps: 37 mfu: 0.21%
[rank0]:2024-07-03 08:20:17,389 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:2024-07-03 08:20:33,417 - root - INFO - step: 10 loss: 11.0525 memory: 58.31GiB(61.35%) wps: 2,300 mfu: 13.47%
[rank0]:2024-07-03 08:20:48,504 - root - INFO - step: 20 loss: 9.1903 memory: 58.31GiB(61.35%) wps: 2,719 mfu: 15.92%
[rank0]:2024-07-03 08:21:03,515 - root - INFO - step: 30 loss: 8.1651 memory: 58.31GiB(61.35%) wps: 2,734 mfu: 16.01%
[rank0]:2024-07-03 08:21:18,552 - root - INFO - step: 40 loss: 7.5686 memory: 58.31GiB(61.35%) wps: 2,730 mfu: 15.98%
[rank0]:2024-07-03 08:21:36,587 - root - INFO - step: 50 loss: 7.5383 memory: 58.31GiB(61.35%) wps: 2,275 mfu: 13.32%
[rank0]:2024-07-03 08:21:36,623 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:2024-07-03 08:21:38,623 - root - INFO - Training completed
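
// step-50 summary across the three runs above:
//   float8linear on, tp=1:                          loss 7.3636, memory 70.51GiB, wps 6,502, mfu 38.08%
//   float8linear on, tp=1, fsdp_fp8_allgather=true: loss 7.3101, memory 67.25GiB, wps 6,757, mfu 39.57%
//   float8linear on, tp=2, fsdp_fp8_allgather=true: loss 7.5383, memory 58.31GiB, wps 2,275, mfu 13.32%
// note: the tp=2 run also hits torch._dynamo cache_size_limit (8) and recompiles, which
// likely contributes to its much lower wps/mfu.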