Skip to content

Latest commit

 

History

History
85 lines (81 loc) · 7.52 KB

bank_conflicts.md

File metadata and controls

85 lines (81 loc) · 7.52 KB

Check Bank Conflicts via NCU

  • 检查device支持的metrics
# ncu check bank conflicts
# 先查看当前devices支持的metrics有哪些
ncu --query-metrics | grep data | grep bank | grep l1tex

metrics:

ncu --query-metrics | grep data | grep bank | grep l1tex
l1tex__data_bank_conflicts_pipe_lsu                                         Counter                         # of data bank conflicts generated by LSU pipe
l1tex__data_bank_conflicts_pipe_lsu_cmd_read                                Counter                         # of data bank conflicts generated by LSU reads
l1tex__data_bank_conflicts_pipe_lsu_cmd_write                               Counter                         # of data bank conflicts generated by LSU writes
l1tex__data_bank_conflicts_pipe_lsu_mem_global                              Counter                         # of data bank conflicts generated by global ops
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_atom                      Counter                         # of data bank conflicts generated by global atomics
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_ld                        Counter                         # of data bank conflicts generated by global loads
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_red                       Counter                         # of data bank conflicts generated by global reductions
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_st                        Counter                         # of data bank conflicts generated by global stores
l1tex__data_bank_conflicts_pipe_lsu_mem_shared                              Counter                         # of shared memory data bank conflicts generated by LDS, LD, 3D
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_atom                      Counter                         # of shared memory data bank conflicts generated by ATOMS, ATOM
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld                        Counter                         # of shared memory data bank conflicts generated by LDS, LD, 3D
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts                    Counter                         # of data bank conflicts generated by shared ldgsts ops
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st                        Counter                         # of shared memory data bank conflicts generated by STS, ST, 3D
l1tex__data_bank_reads                                                      Counter                         # of data bank reads
l1tex__data_bank_writes                                                     Counter                         # of data bank writes
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts              Counter                         # of shared memory data bank conflicts generated by LDGSTS
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts_cache_access Counter                         # of shared memory data bank conflicts generated by LDGSTS.ACCESS
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts_cache_bypass Counter                         # of shared memory data bank conflicts generated by LDGSTS.BYPASS
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm                Counter                         # of shared memory data bank conflicts generated by LDSM
sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_st                  Counter                         # of shared memory data bank conflicts generated by STS, ST
sm__sass_l1tex_data_bank_writes_pipe_lsu_mem_shared_op_ldgsts_cache_access    Counter                         # of LDGSTS.ACCESS shared data bank writes
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts                        Counter                         # of shared memory data bank conflicts generated by LDGSTS
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts_cache_access           Counter                         # of shared memory data bank conflicts generated by LDGSTS.ACCESS
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts_cache_bypass           Counter                         # of shared memory data bank conflicts generated by LDGSTS.BYPASS
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm                          Counter                         # of shared memory data bank conflicts generated by LDSM
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_st                            Counter                         # of shared memory data bank conflicts generated by STS, ST
smsp__sass_l1tex_data_bank_writes_pipe_lsu_mem_shared_op_ldgsts_cache_access              Counter                         # of LDGSTS.ACCESS shared data bank writes
  • 由LDS, LD指令产生的bank conflicts
# profile l1tex smem data bank conflicts
# 由LDS, LD指令产生的bank conflicts
ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum hgemm_mma_stage.89.bin
ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum hgemm_cute.89.debug.bin
ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld \
    python3 flash_attn_mma.py --B 1 --H 1 --D 64 --N 4096 --w 0 --i 1

log（metric 名称未带 .sum 等后缀时，ncu 会同时输出 .avg/.max/.min/.sum 四种汇总值；指定 .sum 则只输出总和）:

void flash_fwd_splitkv_combine_kernel<Flash_fwd_kernel_traits<64, 64, 256, 4, 0, 0, cutlass::half_t, Flash_kernel_traits<64, 64, 256, 4, cutlass::half_t>>, 8, 3, 1>(Flash_fwd_params) (512, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
    Section: Command line profiler metrics
    -------------------------------------------------------- ----------- ------------
    Metric Name                                              Metric Unit Metric Value
    -------------------------------------------------------- ----------- ------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.avg                    11.18
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.max                       13
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.min                       10
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                     1029
    -------------------------------------------------------- ----------- ------------
  • 由LDSM指令产生的bank conflicts
# 由LDSM(ldmatrix)指令产生的bank conflicts
ncu --metrics sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm \
    python3 flash_attn_mma.py --B 1 --H 1 --D 64 --N 4096 --w 0 --i 1
ncu --metrics smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm \
    python3 flash_attn_mma.py --B 1 --H 1 --D 64 --N 4096 --w 0 --i 1

log（以下为 sm__ 指标的输出；smsp__ 指标按 SM sub-partition 统计，此处未贴出其输出）:

void flash_fwd_splitkv_combine_kernel<Flash_fwd_kernel_traits<64, 64, 256, 4, 0, 0, cutlass::half_t, Flash_kernel_traits<64, 64, 256, 4, cutlass::half_t>>, 8, 3, 1>(Flash_fwd_params) (512, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
    Section: Command line profiler metrics
    ------------------------------------------------------------------ ----------- ------------
    Metric Name                                                        Metric Unit Metric Value
    ------------------------------------------------------------------ ----------- ------------
    sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm.avg                        0
    sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm.max                        0
    sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm.min                        0
    sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm.sum                        0
    ------------------------------------------------------------------ ----------- ------------