Missing block sync after epilogue compute but before stmatrix (Correctness) #3960

Open
rdspring1 opened this issue Feb 25, 2025 · 0 comments


Why? The epilogue input and the TMA store output alias the same shared memory. A WAR (write-after-read) sync is required between the epilogue reads and the stmatrix/TMA writes to avoid incorrect results.
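As a minimal, self-contained illustration of the hazard (a hand-written sketch with hypothetical names, not the nvFuser-generated kernel): the same shared-memory tile is first read as the epilogue input and then reused as the store staging buffer, so without a block sync between the last read and the first write, a fast warp can overwrite elements that a slower warp has not consumed yet.

// Minimal WAR-hazard sketch (hypothetical names; not nvFuser-generated code).
// smem_tile is read as the epilogue input and then reused as the store
// staging buffer, mirroring the aliasing described above.
// Launch as: war_hazard_sketch<<<grid, block, block.x * sizeof(float)>>>(in, out);
__global__ void war_hazard_sketch(const float* in, float* out) {
  extern __shared__ float smem_tile[];
  const int tid = threadIdx.x;
  const int gid = blockIdx.x * blockDim.x + tid;

  smem_tile[tid] = in[gid];
  __syncthreads();  // RAW sync: make the writes visible before any reads

  // Epilogue-style read: each thread reads a neighbor's element,
  // so the data dependence crosses warps.
  float x = smem_tile[(tid + 1) % blockDim.x];
  float y = x * (1.0f / (1.0f + expf(-x)));  // silu(x)

  __syncthreads();  // WAR sync: the barrier at issue here. Without it, the
                    // write below can clobber smem_tile entries that slower
                    // warps have not read yet.

  // Reuse the same shared buffer as the store staging area (the alias).
  smem_tile[tid] = y;
  __syncthreads();
  out[gid] = smem_tile[tid];
}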

Where to look:

  • Understand why WarSyncInserter did not add a block sync between the epilogue computation and the stmatrix store.
  • Is it related to the warp-specialization IfThenElse?
  • Add shared-memory alias analysis to WarAsyncWaitInserter.

CUDA Code

    // Wait for wgmma to finish
    wgmma::wait<0LL>();

    // Epilogue computation - silu(a @ b) * c
    Array<__bfloat, 128, 8> T16;
#pragma unroll
    for (nvfuser_index_t i69 = 0; i69 < 32; ++i69) {
      nvfuser_index_t i70;
      i70 = 4 * i69;
      nvfuser_index_t i71;
      i71 = 8 * i69;
      nvfuser_index_t i72;
      i72 = i20 + i71;
      nvfuser_index_t i73;
      i73 = -i71;
#pragma unroll
      for (nvfuser_index_t i74 = 0; i74 < 2; ++i74) {
        nvfuser_index_t i75;
        i75 = i70 + (2 * i74);
        nvfuser_index_t i76;
        i76 = i19 + (512 * i74);
        bool b77;
        b77 = b30 && (i44 < (-(8 * i74)));
#pragma unroll
        for (nvfuser_index_t i78 = 0; i78 < 2; ++i78) {
          nvfuser_index_t i79;
          i79 = i75 + i78;
          nvfuser_index_t i80;
          i80 = i72 + i78;
          nvfuser_index_t i81;
          i81 = i80 % 64; 
          Array<float, 1, 1> T4;
          T4[0] = T3[i79];
          Array<float, 1, 1> T5;
          T5[0] = -T4[0]; 
          Array<float, 1, 1> T6;
          T6[0] = expf(T5[0]);
          Array<float, 1, 1> T7;
          T7[0] = 1.000000000e+00f + T6[0];
          Array<float, 1, 1> T8;
          T8[0] = reciprocal(T7[0]);
          Array<float, 1, 1> T9;
          T9[0] = T4[0] * T8[0];
          Array<__bfloat, 1, 1> T15;
          T15[0] = 0;
          if ((b77 && (i45 < (i73 - i78)))) {
            T15[0] = T17[(
                ((i76 + (8192 * (i80 / 64))) + (8 * (i21 ^ (i81 / 8)))) +
                (i81 % 8))];
          }
          Array<float, 1, 1> T10;
          T10[0] = __bfloat2float(T15[0]);
          Array<float, 1, 1> T11;
          T11[0] = T9[0] * T10[0];
          T16[i79] = __float2bfloat(T11[0]);
        }
      }
    }
    __syncthreads();  // <<<<<<< Missing block sync before stmatrix
    // stmatrix wgmma registers to swizzled shared memory
#pragma unroll
    for (nvfuser_index_t i82 = 0; i82 < 16; ++i82) {
      if ((b46 && (i47 < (-(16 * i82))))) {
        stmatrix4(
            (uint32_t)((
                toSmem(T18) +
                ((((nvfuser_index_t)threadIdx.y) * 32768) +
                 (((i82 / 4) * 8192) +
                  ((i22 * 128) +
                   (((((((nvfuser_index_t)threadIdx.x) % 32) / 16) +
                      ((i82 % 4) * 2)) ^
                     (i22 % 8)) *
                    16)))))),
            (*reinterpret_cast<Array<uint32_t, 4, 1>*>(&T16[(8 * i82)])));
      }
    }
    __syncthreads();
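For readability, the per-element math in the unrolled epilogue above (T4 through T11) reduces to silu of the wgmma accumulator times the broadcast bf16 input; a scalar sketch, with a hypothetical helper name:

// Scalar view of the generated epilogue (hypothetical helper, for reading the
// code above): T9 = acc * 1/(1 + expf(-acc)) and T11 = T9 * c, where c is the
// bf16 input T15 converted to float (T10).
__device__ __forceinline__ float epilogue_elem(float acc, float c) {
  float sig = 1.0f / (1.0f + expf(-acc));  // T6..T8: sigmoid(acc)
  float silu = acc * sig;                  // T9: silu(acc)
  return silu * c;                         // T11: silu(a @ b) * c
}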
rdspring1 self-assigned this on Feb 25, 2025