-
Notifications
You must be signed in to change notification settings - Fork 12.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libclc] Move sqrt to CLC library #128748
base: main
Are you sure you want to change the base?
Conversation
This is fairly straightforward for most targets. AMDGPU provides its own implementation of sqrt for double types. This commit moves this into the implementation of CLC sqrt. It uses weak linkage on the 'default' CLC sqrt to allow AMDGPU to only override the builtin for the types it cares about. Since we don't yet have CLC ldexp, and AMDGPU prefers the builtin anyway, it also uses __builtin_ldexp. There are no changes to the codegen for any AMDGPU target. There is some minor code movement on NVIDIA targets.
@llvm/pr-subscribers-backend-amdgpu Author: Fraser Cormack (frasercrmck) ChangesThis is fairly straightforward for most targets. AMDGPU provides its own implementation of sqrt for double types. This commit moves this into the implementation of CLC sqrt. It uses weak linkage on the 'default' CLC sqrt to allow AMDGPU to only override the builtin for the types it cares about. Since we don't yet have CLC ldexp, and AMDGPU prefers the builtin over the software implementation anyway, it also uses __builtin_ldexp. There are no changes to the codegen for any AMDGPU target. There is some minor code movement on NVIDIA targets. Full diff: https://github.com/llvm/llvm-project/pull/128748.diff 12 Files Affected:
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 22c3075801785..abdc4a001d386 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -28,6 +28,7 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
spirv/lib/SOURCES;
# CLC internal libraries
clc/lib/generic/SOURCES;
+ clc/lib/amdgpu/SOURCES;
clc/lib/clspv/SOURCES;
clc/lib/spirv/SOURCES;
)
diff --git a/libclc/amdgpu/lib/SOURCES b/libclc/amdgpu/lib/SOURCES
index 24f099d049cd3..d7782a2ae14dc 100644
--- a/libclc/amdgpu/lib/SOURCES
+++ b/libclc/amdgpu/lib/SOURCES
@@ -10,4 +10,3 @@ math/half_log2.cl
math/half_recip.cl
math/half_rsqrt.cl
math/half_sqrt.cl
-math/sqrt.cl
diff --git a/libclc/clc/include/clc/float/definitions.h b/libclc/clc/include/clc/float/definitions.h
index 618d02ab1c090..82ae90155be1d 100644
--- a/libclc/clc/include/clc/float/definitions.h
+++ b/libclc/clc/include/clc/float/definitions.h
@@ -1,7 +1,6 @@
#define MAXFLOAT 0x1.fffffep127f
#define HUGE_VALF __builtin_huge_valf()
#define INFINITY __builtin_inff()
-#define NAN __builtin_nanf("")
#define FLT_DIG 6
#define FLT_MANT_DIG 24
@@ -13,6 +12,7 @@
#define FLT_MAX MAXFLOAT
#define FLT_MIN 0x1.0p-126f
#define FLT_EPSILON 0x1.0p-23f
+#define FLT_NAN __builtin_nanf("")
#define FP_ILOGB0 (-2147483647 - 1)
#define FP_ILOGBNAN 2147483647
@@ -46,6 +46,7 @@
#define DBL_MAX 0x1.fffffffffffffp1023
#define DBL_MIN 0x1.0p-1022
#define DBL_EPSILON 0x1.0p-52
+#define DBL_NAN __builtin_nan("")
#define M_E 0x1.5bf0a8b145769p+1
#define M_LOG2E 0x1.71547652b82fep+0
@@ -80,6 +81,7 @@
#define HALF_MAX 0x1.ffcp15h
#define HALF_MIN 0x1.0p-14h
#define HALF_EPSILON 0x1.0p-10h
+#define HALF_NAN __builtin_nanf16("")
#define M_LOG2E_H 0x1.714p+0h
diff --git a/libclc/generic/include/math/clc_sqrt.h b/libclc/clc/include/clc/math/clc_sqrt.h
similarity index 60%
rename from libclc/generic/include/math/clc_sqrt.h
rename to libclc/clc/include/clc/math/clc_sqrt.h
index 90a7c575c9bad..c16edf196d9f6 100644
--- a/libclc/generic/include/math/clc_sqrt.h
+++ b/libclc/clc/include/clc/math/clc_sqrt.h
@@ -1,8 +1,12 @@
-#include <clc/clcfunc.h>
-#include <clc/clctypes.h>
+#ifndef __CLC_MATH_CLC_SQRT_H__
+#define __CLC_MATH_CLC_SQRT_H__
-#define __CLC_FUNCTION __clc_sqrt
#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_sqrt
+
#include <clc/math/gentype.inc>
+
#undef __CLC_BODY
#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SQRT_H__
diff --git a/libclc/clc/lib/amdgpu/SOURCES b/libclc/clc/lib/amdgpu/SOURCES
new file mode 100644
index 0000000000000..fd64a862021e8
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/SOURCES
@@ -0,0 +1 @@
+math/clc_sqrt_fp64.cl
diff --git a/libclc/amdgpu/lib/math/sqrt.cl b/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl
similarity index 64%
rename from libclc/amdgpu/lib/math/sqrt.cl
rename to libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl
index 17d77e50d44d3..9ce48ae8b3122 100644
--- a/libclc/amdgpu/lib/math/sqrt.cl
+++ b/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl
@@ -20,52 +20,42 @@
* THE SOFTWARE.
*/
-#include "math/clc_sqrt.h"
-#include <clc/clc.h>
#include <clc/clcmacro.h>
-
-_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float)
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-_CLC_DEFINE_UNARY_BUILTIN(half, sqrt, __clc_sqrt, half)
-
-#endif
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fma.h>
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#ifdef __AMDGCN__
- #define __clc_builtin_rsq __builtin_amdgcn_rsq
+#define __clc_builtin_rsq __builtin_amdgcn_rsq
#else
- #define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee
+#define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee
#endif
-_CLC_OVERLOAD _CLC_DEF double sqrt(double x) {
-
+_CLC_OVERLOAD _CLC_DEF double __clc_sqrt(double x) {
uint vcc = x < 0x1p-767;
uint exp0 = vcc ? 0x100 : 0;
unsigned exp1 = vcc ? 0xffffff80 : 0;
- double v01 = ldexp(x, exp0);
+ double v01 = __builtin_ldexp(x, exp0);
double v23 = __clc_builtin_rsq(v01);
double v45 = v01 * v23;
v23 = v23 * 0.5;
- double v67 = fma(-v23, v45, 0.5);
- v45 = fma(v45, v67, v45);
- double v89 = fma(-v45, v45, v01);
- v23 = fma(v23, v67, v23);
- v45 = fma(v89, v23, v45);
- v67 = fma(-v45, v45, v01);
- v23 = fma(v67, v23, v45);
+ double v67 = __clc_fma(-v23, v45, 0.5);
+ v45 = __clc_fma(v45, v67, v45);
+ double v89 = __clc_fma(-v45, v45, v01);
+ v23 = __clc_fma(v23, v67, v23);
+ v45 = __clc_fma(v89, v23, v45);
+ v67 = __clc_fma(-v45, v45, v01);
+ v23 = __clc_fma(v67, v23, v45);
- v23 = ldexp(v23, exp1);
- return ((x == __builtin_inf()) || (x == 0.0)) ? v01 : v23;
+ v23 = __builtin_ldexp(v23, exp1);
+ return (x == __builtin_inf() || (x == 0.0)) ? v01 : v23;
}
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sqrt, double);
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_sqrt, double);
#endif
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index dbf3c1bd31a0d..19c0ffe405d56 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -32,6 +32,7 @@ math/clc_nan.cl
math/clc_nextafter.cl
math/clc_rint.cl
math/clc_round.cl
+math/clc_sqrt.cl
math/clc_sw_fma.cl
math/clc_trunc.cl
relational/clc_all.cl
diff --git a/libclc/generic/lib/math/clc_sqrt.cl b/libclc/clc/lib/generic/math/clc_sqrt.cl
similarity index 80%
rename from libclc/generic/lib/math/clc_sqrt.cl
rename to libclc/clc/lib/generic/math/clc_sqrt.cl
index 92c7f6e73b11e..620c367dd8510 100644
--- a/libclc/generic/lib/math/clc_sqrt.cl
+++ b/libclc/clc/lib/generic/math/clc_sqrt.cl
@@ -20,14 +20,8 @@
* THE SOFTWARE.
*/
-#include <clc/clc.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>
-// Map the llvm sqrt intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_llvm_intr_sqrt
-#define __CLC_INTRINSIC "llvm.sqrt"
-#include <clc/math/unary_intrin.inc>
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
-
-#define __CLC_BODY <clc_sqrt_impl.inc>
+#define __CLC_BODY <clc_sqrt.inc>
#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/clc_sqrt_impl.inc b/libclc/clc/lib/generic/math/clc_sqrt.inc
similarity index 80%
rename from libclc/generic/lib/math/clc_sqrt_impl.inc
rename to libclc/clc/lib/generic/math/clc_sqrt.inc
index fe724e8c14394..b33b47f26cfb3 100644
--- a/libclc/generic/lib/math/clc_sqrt_impl.inc
+++ b/libclc/clc/lib/generic/math/clc_sqrt.inc
@@ -21,19 +21,17 @@
*/
#if __CLC_FPSIZE == 64
-#define __CLC_NAN __builtin_nan("")
-#define ZERO 0.0
+#define __CLC_NAN DBL_NAN
#elif __CLC_FPSIZE == 32
-#define __CLC_NAN NAN
-#define ZERO 0.0f
+#define __CLC_NAN FLT_NAN
#elif __CLC_FPSIZE == 16
-#define __CLC_NAN (half)NAN
-#define ZERO 0.0h
+#define __CLC_NAN HALF_NAN
#endif
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sqrt(__CLC_GENTYPE val) {
- return val < ZERO ? __CLC_NAN : __clc_llvm_intr_sqrt(val);
+__attribute__((weak)) _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
+__clc_sqrt(__CLC_GENTYPE val) {
+ return val < __CLC_FP_LIT(0.0) ? (__CLC_GENTYPE)__CLC_NAN
+ : __builtin_elementwise_sqrt(val);
}
#undef __CLC_NAN
-#undef ZERO
diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES
index bea78adf2cf08..42bf695850ede 100644
--- a/libclc/generic/lib/SOURCES
+++ b/libclc/generic/lib/SOURCES
@@ -180,7 +180,6 @@ math/sincos.cl
math/sincos_helpers.cl
math/sinh.cl
math/sinpi.cl
-math/clc_sqrt.cl
math/sqrt.cl
math/clc_tan.cl
math/tan.cl
diff --git a/libclc/generic/lib/math/clc_hypot.cl b/libclc/generic/lib/math/clc_hypot.cl
index 5e6a99b70f22a..fdf1e7ffa1def 100644
--- a/libclc/generic/lib/math/clc_hypot.cl
+++ b/libclc/generic/lib/math/clc_hypot.cl
@@ -27,6 +27,7 @@
#include <clc/math/clc_mad.h>
#include <clc/math/clc_subnormal_config.h>
#include <clc/math/math.h>
+#include <clc/math/clc_sqrt.h>
#include <clc/relational/clc_isnan.h>
#include <clc/shared/clc_clamp.h>
#include <math/clc_hypot.h>
@@ -49,7 +50,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
float fx = as_float(ux) * fi_exp;
float fy = as_float(uy) * fi_exp;
- retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
+ retval = __clc_sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32
@@ -81,7 +82,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) {
double ay = y * preadjust;
// The post adjust may overflow, but this can't be avoided in any case
- double r = sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
+ double r = __clc_sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
// If the difference in exponents between x and y is large
double s = x + y;
diff --git a/libclc/generic/lib/math/sqrt.cl b/libclc/generic/lib/math/sqrt.cl
index a9192a9493d17..d60d304fa1e1f 100644
--- a/libclc/generic/lib/math/sqrt.cl
+++ b/libclc/generic/lib/math/sqrt.cl
@@ -21,7 +21,9 @@
*/
#include <clc/clc.h>
-#include "math/clc_sqrt.h"
+#include <clc/math/clc_sqrt.h>
-#define __CLC_FUNCTION sqrt
-#include <clc/math/unary_builtin.inc>
+#define FUNCTION sqrt
+#define __CLC_BODY <clc/shared/unary_def.inc>
+
+#include <clc/math/gentype.inc>
|
return val < __CLC_FP_LIT(0.0) ? (__CLC_GENTYPE)__CLC_NAN | ||
: __builtin_elementwise_sqrt(val); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This pre-filtering of the argument is unnecessary. Many years ago the sqrt intrinsic officially did not handle the nan cases, but that has been fixed. This can be the raw builtin call
This is fairly straightforward for most targets.
AMDGPU provides its own implementation of sqrt for double types. This commit moves this into the implementation of CLC sqrt. It uses weak linkage on the 'default' CLC sqrt to allow AMDGPU to only override the builtin for the types it cares about.
Since we don't yet have CLC ldexp, and AMDGPU prefers the builtin over the software implementation anyway, it also uses __builtin_ldexp.
There are no changes to the codegen for any AMDGPU target. There is some minor code movement on NVIDIA targets.