- 检查device支持的metrics
# ncu check bank conflicts
# 先查看当前device支持的bank conflicts相关metrics有哪些
ncu --query-metrics | grep data | grep bank | grep l1tex
metrics:
ncu --query-metrics | grep data | grep bank | grep l1tex
l1tex__data_bank_conflicts_pipe_lsu Counter # of data bank conflicts generated by LSU pipe
l1tex__data_bank_conflicts_pipe_lsu_cmd_read Counter # of data bank conflicts generated by LSU reads
l1tex__data_bank_conflicts_pipe_lsu_cmd_write Counter # of data bank conflicts generated by LSU writes
l1tex__data_bank_conflicts_pipe_lsu_mem_global Counter # of data bank conflicts generated by global ops
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_atom Counter # of data bank conflicts generated by global atomics
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_ld Counter # of data bank conflicts generated by global loads
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_red Counter # of data bank conflicts generated by global reductions
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_st Counter # of data bank conflicts generated by global stores
l1tex__data_bank_conflicts_pipe_lsu_mem_shared Counter # of shared memory data bank conflicts generated by LDS, LD, 3D
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_atom Counter # of shared memory data bank conflicts generated by ATOMS, ATOM
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld Counter # of shared memory data bank conflicts generated by LDS, LD, 3D
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts Counter # of data bank conflicts generated by shared ldgsts ops
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st Counter # of shared memory data bank conflicts generated by STS, ST, 3D
l1tex__data_bank_reads Counter # of data bank reads
l1tex__data_bank_writes Counter # of data bank writes
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts Counter # of shared memory data bank conflicts generated by LDGSTS
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts_cache_access Counter # of shared memory data bank conflicts generated by LDGSTS.ACCESS
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts_cache_bypass Counter # of shared memory data bank conflicts generated by LDGSTS.BYPASS
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm Counter # of shared memory data bank conflicts generated by LDSM
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_st Counter # of shared memory data bank conflicts generated by STS, ST
sm__sass_l1tex_data_bank_writes_pipe_lsu_mem_shared_op_ldgsts_cache_access Counter # of LDGSTS.ACCESS shared data bank writes
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts Counter # of shared memory data bank conflicts generated by LDGSTS
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts_cache_access Counter # of shared memory data bank conflicts generated by LDGSTS.ACCESS
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts_cache_bypass Counter # of shared memory data bank conflicts generated by LDGSTS.BYPASS
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm Counter # of shared memory data bank conflicts generated by LDSM
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_st Counter # of shared memory data bank conflicts generated by STS, ST
smsp__sass_l1tex_data_bank_writes_pipe_lsu_mem_shared_op_ldgsts_cache_access Counter # of LDGSTS.ACCESS shared data bank writes
- 由LDS/LD指令产生的shared memory bank conflicts
# profile l1tex smem data bank conflicts
# 由LDS, LD指令产生的bank conflicts
ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum hgemm_mma_stage.89.bin
ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum hgemm_cute.89.debug.bin
ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld \
python3 flash_attn_mma.py --B 1 --H 1 --D 64 --N 4096 --w 0 --i 1
log(metric名未加.sum等后缀时,ncu会同时输出.avg/.max/.min/.sum四个子指标):
void flash_fwd_splitkv_combine_kernel<Flash_fwd_kernel_traits<64, 64, 256, 4, 0, 0, cutlass::half_t, Flash_kernel_traits<64, 64, 256, 4, cutlass::half_t>>, 8, 3, 1>(Flash_fwd_params) (512, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
Section: Command line profiler metrics
-------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
-------------------------------------------------------- ----------- ------------
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.avg 11.18
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.max 13
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.min 10
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 1029
-------------------------------------------------------- ----------- ------------
- 由LDSM指令产生的bank conflicts
# 由LDSM(ldmatrix)指令产生的bank conflicts
ncu --metrics sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm \
python3 flash_attn_mma.py --B 1 --H 1 --D 64 --N 4096 --w 0 --i 1
ncu --metrics smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm \
python3 flash_attn_mma.py --B 1 --H 1 --D 64 --N 4096 --w 0 --i 1
log:
void flash_fwd_splitkv_combine_kernel<Flash_fwd_kernel_traits<64, 64, 256, 4, 0, 0, cutlass::half_t, Flash_kernel_traits<64, 64, 256, 4, cutlass::half_t>>, 8, 3, 1>(Flash_fwd_params) (512, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
Section: Command line profiler metrics
------------------------------------------------------------------ ----------- ------------
Metric Name Metric Unit Metric Value
------------------------------------------------------------------ ----------- ------------
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm.avg 0
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm.max 0
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm.min 0
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm.sum 0
------------------------------------------------------------------ ----------- ------------