Missing block sync after epilogue compute but before stmatrix (Correctness) #3960

Open
rdspring1 opened this issue Feb 25, 2025 · 0 comments


Why? The epilogue input and the TMA store output alias the same shared memory. A WAR (write-after-read) sync is required between the epilogue reads and the stmatrix/TMA writes to avoid incorrect results.
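As a minimal, self-contained illustration of the hazard (a hand-written sketch with hypothetical names, not the nvFuser-generated kernel): the same shared-memory tile is first read as the epilogue input and then reused as the store staging buffer, so without a block sync between the last read and the first write, a fast warp can overwrite elements that a slower warp has not consumed yet.

// Minimal WAR-hazard sketch (hypothetical names; not nvFuser-generated code).
// smem_tile is read as the epilogue input and then reused as the store
// staging buffer, mirroring the aliasing described above.
// Launch as: war_hazard_sketch<<<grid, block, block.x * sizeof(float)>>>(in, out);
__global__ void war_hazard_sketch(const float* in, float* out) {
  extern __shared__ float smem_tile[];
  const int tid = threadIdx.x;
  const int gid = blockIdx.x * blockDim.x + tid;

  smem_tile[tid] = in[gid];
  __syncthreads();  // RAW sync: make the writes visible before any reads

  // Epilogue-style read: each thread reads a neighbor's element,
  // so the data dependence crosses warps.
  float x = smem_tile[(tid + 1) % blockDim.x];
  float y = x * (1.0f / (1.0f + expf(-x)));  // silu(x)

  __syncthreads();  // WAR sync: the barrier at issue here. Without it, the
                    // write below can clobber smem_tile entries that slower
                    // warps have not read yet.

  // Reuse the same shared buffer as the store staging area (the alias).
  smem_tile[tid] = y;
  __syncthreads();
  out[gid] = smem_tile[tid];
}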

Where to look:

  • Understand why WarSyncInserter did not add a block sync between the epilogue computation and the stmatrix store.
  • Is it related to the warp-specialization IfThenElse?
  • Add shared-memory alias analysis to WarAsyncWaitInserter.

CUDA Code

    // Wait for wgmma to finish
    wgmma::wait<0LL>();

    // Epilogue computation - silu(a @ b) * c
    Array<__bfloat, 128, 8> T16;
#pragma unroll
    for (nvfuser_index_t i69 = 0; i69 < 32; ++i69) {
      nvfuser_index_t i70;
      i70 = 4 * i69;
      nvfuser_index_t i71;
      i71 = 8 * i69;
      nvfuser_index_t i72;
      i72 = i20 + i71;
      nvfuser_index_t i73;
      i73 = -i71;
#pragma unroll
      for (nvfuser_index_t i74 = 0; i74 < 2; ++i74) {
        nvfuser_index_t i75;
        i75 = i70 + (2 * i74);
        nvfuser_index_t i76;
        i76 = i19 + (512 * i74);
        bool b77;
        b77 = b30 && (i44 < (-(8 * i74)));
#pragma unroll
        for (nvfuser_index_t i78 = 0; i78 < 2; ++i78) {
          nvfuser_index_t i79;
          i79 = i75 + i78;
          nvfuser_index_t i80;
          i80 = i72 + i78;
          nvfuser_index_t i81;
          i81 = i80 % 64; 
          Array<float, 1, 1> T4;
          T4[0] = T3[i79];
          Array<float, 1, 1> T5;
          T5[0] = -T4[0]; 
          Array<float, 1, 1> T6;
          T6[0] = expf(T5[0]);
          Array<float, 1, 1> T7;
          T7[0] = 1.000000000e+00f + T6[0];
          Array<float, 1, 1> T8;
          T8[0] = reciprocal(T7[0]);
          Array<float, 1, 1> T9;
          T9[0] = T4[0] * T8[0];
          Array<__bfloat, 1, 1> T15;
          T15[0] = 0;
          if ((b77 && (i45 < (i73 - i78)))) {
            T15[0] = T17[(
                ((i76 + (8192 * (i80 / 64))) + (8 * (i21 ^ (i81 / 8)))) +
                (i81 % 8))];
          }
          Array<float, 1, 1> T10;
          T10[0] = __bfloat2float(T15[0]);
          Array<float, 1, 1> T11;
          T11[0] = T9[0] * T10[0];
          T16[i79] = __float2bfloat(T11[0]);
        }
      }
    }
    __syncthreads();  // <<<<<<< Missing block sync before stmatrix
    // stmatrix wgmma registers to swizzled shared memory
#pragma unroll
    for (nvfuser_index_t i82 = 0; i82 < 16; ++i82) {
      if ((b46 && (i47 < (-(16 * i82))))) {
        stmatrix4(
            (uint32_t)((
                toSmem(T18) +
                ((((nvfuser_index_t)threadIdx.y) * 32768) +
                 (((i82 / 4) * 8192) +
                  ((i22 * 128) +
                   (((((((nvfuser_index_t)threadIdx.x) % 32) / 16) +
                      ((i82 % 4) * 2)) ^
                     (i22 % 8)) *
                    16)))))),
            (*reinterpret_cast<Array<uint32_t, 4, 1>*>(&T16[(8 * i82)])));
      }
    }
    __syncthreads();
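For readability, the per-element math in the unrolled epilogue above (T4 through T11) reduces to silu of the wgmma accumulator times the broadcast bf16 input; a scalar sketch, with a hypothetical helper name:

// Scalar view of the generated epilogue (hypothetical helper, for reading the
// code above): T9 = acc * 1/(1 + expf(-acc)) and T11 = T9 * c, where c is the
// bf16 input T15 converted to float (T10).
__device__ __forceinline__ float epilogue_elem(float acc, float c) {
  float sig = 1.0f / (1.0f + expf(-acc));  // T6..T8: sigmoid(acc)
  float silu = acc * sig;                  // T9: silu(acc)
  return silu * c;                         // T11: silu(a @ b) * c
}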
rdspring1 self-assigned this on Feb 25, 2025