We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Why? Aliased shared memory between epilogue input and TMA store output. A WAR sync is required to avoid incorrect results.
Where to look:
WarSyncInserter
WarAsyncWaitInserter
Cuda Code
// Wait for wgmma to finish wgmma::wait<0LL>(); // Epilogue computation - silu(a @ b) * c Array<__bfloat, 128, 8> T16; #pragma unroll for (nvfuser_index_t i69 = 0; i69 < 32; ++i69) { nvfuser_index_t i70; i70 = 4 * i69; nvfuser_index_t i71; i71 = 8 * i69; nvfuser_index_t i72; i72 = i20 + i71; nvfuser_index_t i73; i73 = -i71; #pragma unroll for (nvfuser_index_t i74 = 0; i74 < 2; ++i74) { nvfuser_index_t i75; i75 = i70 + (2 * i74); nvfuser_index_t i76; i76 = i19 + (512 * i74); bool b77; b77 = b30 && (i44 < (-(8 * i74))); #pragma unroll for (nvfuser_index_t i78 = 0; i78 < 2; ++i78) { nvfuser_index_t i79; i79 = i75 + i78; nvfuser_index_t i80; i80 = i72 + i78; nvfuser_index_t i81; i81 = i80 % 64; Array<float, 1, 1> T4; T4[0] = T3[i79]; Array<float, 1, 1> T5; T5[0] = -T4[0]; Array<float, 1, 1> T6; T6[0] = expf(T5[0]); Array<float, 1, 1> T7; T7[0] = 1.000000000e+00f + T6[0]; Array<float, 1, 1> T8; T8[0] = reciprocal(T7[0]); Array<float, 1, 1> T9; T9[0] = T4[0] * T8[0]; Array<__bfloat, 1, 1> T15; T15[0] = 0; if ((b77 && (i45 < (i73 - i78)))) { T15[0] = T17[( ((i76 + (8192 * (i80 / 64))) + (8 * (i21 ^ (i81 / 8)))) + (i81 % 8))]; } Array<float, 1, 1> T10; T10[0] = __bfloat2float(T15[0]); Array<float, 1, 1> T11; T11[0] = T9[0] * T10[0]; T16[i79] = __float2bfloat(T11[0]); } } } __syncthreads(); // <<<<<<< Missing block sync before stmatrix // stmatrix wgmma registers to swizzled shared memory #pragma unroll for (nvfuser_index_t i82 = 0; i82 < 16; ++i82) { if ((b46 && (i47 < (-(16 * i82))))) { stmatrix4( (uint32_t)(( toSmem(T18) + ((((nvfuser_index_t)threadIdx.y) * 32768) + (((i82 / 4) * 8192) + ((i22 * 128) + (((((((nvfuser_index_t)threadIdx.x) % 32) / 16) + ((i82 % 4) * 2)) ^ (i22 % 8)) * 16)))))), (*reinterpret_cast<Array<uint32_t, 4, 1>*>(&T16[(8 * i82)]))); } } __syncthreads();
The text was updated successfully, but these errors were encountered:
rdspring1
No branches or pull requests
Why? Aliased shared memory between epilogue input and TMA store output. A WAR sync is required to avoid incorrect results.
Where to look:
WarSyncInserter
did not add block sync between computation and stmatrix store.WarAsyncWaitInserter
Cuda Code
The text was updated successfully, but these errors were encountered: