Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[libclc] Move sqrt to CLC library #128748

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

frasercrmck
Copy link
Contributor

This is fairly straightforward for most targets.

AMDGPU provides its own implementation of sqrt for double types. This commit moves this into the implementation of CLC sqrt. It uses weak linkage on the 'default' CLC sqrt to allow AMDGPU to only override the builtin for the types it cares about.

Since we don't yet have CLC ldexp, and AMDGPU prefers the builtin over the software implementation anyway, it also uses __builtin_ldexp.

There are no changes to the codegen for any AMDGPU target. There is some minor code movement on NVIDIA targets.

This is fairly straightforward for most targets.

AMDGPU provides its own implementation of sqrt for double types. This
commit moves this into the implementation of CLC sqrt. It uses weak
linkage on the 'default' CLC sqrt to allow AMDGPU to only override the
builtin for the types it cares about.

Since we don't yet have CLC ldexp, and AMDGPU prefers the builtin
anyway, it also uses __builtin_ldexp.

There are no changes to the codegen for any AMDGPU target. There is some
minor code movement on NVIDIA targets.
@frasercrmck frasercrmck added the libclc libclc OpenCL library label Feb 25, 2025
@frasercrmck frasercrmck requested a review from arsenm February 25, 2025 17:51
@llvmbot
Copy link
Member

llvmbot commented Feb 25, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Fraser Cormack (frasercrmck)

Changes

This is fairly straightforward for most targets.

AMDGPU provides its own implementation of sqrt for double types. This commit moves this into the implementation of CLC sqrt. It uses weak linkage on the 'default' CLC sqrt to allow AMDGPU to only override the builtin for the types it cares about.

Since we don't yet have CLC ldexp, and AMDGPU prefers the builtin over the software implementation anyway, it also uses __builtin_ldexp.

There are no changes to the codegen for any AMDGPU target. There is some minor code movement on NVIDIA targets.


Full diff: https://github.com/llvm/llvm-project/pull/128748.diff

12 Files Affected:

  • (modified) libclc/CMakeLists.txt (+1)
  • (modified) libclc/amdgpu/lib/SOURCES (-1)
  • (modified) libclc/clc/include/clc/float/definitions.h (+3-1)
  • (renamed) libclc/clc/include/clc/math/clc_sqrt.h (+7-3)
  • (added) libclc/clc/lib/amdgpu/SOURCES (+1)
  • (renamed) libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl (+16-26)
  • (modified) libclc/clc/lib/generic/SOURCES (+1)
  • (renamed) libclc/clc/lib/generic/math/clc_sqrt.cl (+3-9)
  • (renamed) libclc/clc/lib/generic/math/clc_sqrt.inc (+7-9)
  • (modified) libclc/generic/lib/SOURCES (-1)
  • (modified) libclc/generic/lib/math/clc_hypot.cl (+3-2)
  • (modified) libclc/generic/lib/math/sqrt.cl (+5-3)
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 22c3075801785..abdc4a001d386 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -28,6 +28,7 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
   spirv/lib/SOURCES;
   # CLC internal libraries
   clc/lib/generic/SOURCES;
+  clc/lib/amdgpu/SOURCES;
   clc/lib/clspv/SOURCES;
   clc/lib/spirv/SOURCES;
 )
diff --git a/libclc/amdgpu/lib/SOURCES b/libclc/amdgpu/lib/SOURCES
index 24f099d049cd3..d7782a2ae14dc 100644
--- a/libclc/amdgpu/lib/SOURCES
+++ b/libclc/amdgpu/lib/SOURCES
@@ -10,4 +10,3 @@ math/half_log2.cl
 math/half_recip.cl
 math/half_rsqrt.cl
 math/half_sqrt.cl
-math/sqrt.cl
diff --git a/libclc/clc/include/clc/float/definitions.h b/libclc/clc/include/clc/float/definitions.h
index 618d02ab1c090..82ae90155be1d 100644
--- a/libclc/clc/include/clc/float/definitions.h
+++ b/libclc/clc/include/clc/float/definitions.h
@@ -1,7 +1,6 @@
 #define MAXFLOAT 0x1.fffffep127f
 #define HUGE_VALF __builtin_huge_valf()
 #define INFINITY __builtin_inff()
-#define NAN __builtin_nanf("")
 
 #define FLT_DIG 6
 #define FLT_MANT_DIG 24
@@ -13,6 +12,7 @@
 #define FLT_MAX MAXFLOAT
 #define FLT_MIN 0x1.0p-126f
 #define FLT_EPSILON 0x1.0p-23f
+#define FLT_NAN __builtin_nanf("")
 
 #define FP_ILOGB0 (-2147483647 - 1)
 #define FP_ILOGBNAN 2147483647
@@ -46,6 +46,7 @@
 #define DBL_MAX 0x1.fffffffffffffp1023
 #define DBL_MIN 0x1.0p-1022
 #define DBL_EPSILON 0x1.0p-52
+#define DBL_NAN __builtin_nan("")
 
 #define M_E 0x1.5bf0a8b145769p+1
 #define M_LOG2E 0x1.71547652b82fep+0
@@ -80,6 +81,7 @@
 #define HALF_MAX 0x1.ffcp15h
 #define HALF_MIN 0x1.0p-14h
 #define HALF_EPSILON 0x1.0p-10h
+#define HALF_NAN __builtin_nanf16("")
 
 #define M_LOG2E_H 0x1.714p+0h
 
diff --git a/libclc/generic/include/math/clc_sqrt.h b/libclc/clc/include/clc/math/clc_sqrt.h
similarity index 60%
rename from libclc/generic/include/math/clc_sqrt.h
rename to libclc/clc/include/clc/math/clc_sqrt.h
index 90a7c575c9bad..c16edf196d9f6 100644
--- a/libclc/generic/include/math/clc_sqrt.h
+++ b/libclc/clc/include/clc/math/clc_sqrt.h
@@ -1,8 +1,12 @@
-#include <clc/clcfunc.h>
-#include <clc/clctypes.h>
+#ifndef __CLC_MATH_CLC_SQRT_H__
+#define __CLC_MATH_CLC_SQRT_H__
 
-#define __CLC_FUNCTION __clc_sqrt
 #define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_sqrt
+
 #include <clc/math/gentype.inc>
+
 #undef __CLC_BODY
 #undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SQRT_H__
diff --git a/libclc/clc/lib/amdgpu/SOURCES b/libclc/clc/lib/amdgpu/SOURCES
new file mode 100644
index 0000000000000..fd64a862021e8
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/SOURCES
@@ -0,0 +1 @@
+math/clc_sqrt_fp64.cl
diff --git a/libclc/amdgpu/lib/math/sqrt.cl b/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl
similarity index 64%
rename from libclc/amdgpu/lib/math/sqrt.cl
rename to libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl
index 17d77e50d44d3..9ce48ae8b3122 100644
--- a/libclc/amdgpu/lib/math/sqrt.cl
+++ b/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl
@@ -20,52 +20,42 @@
  * THE SOFTWARE.
  */
 
-#include "math/clc_sqrt.h"
-#include <clc/clc.h>
 #include <clc/clcmacro.h>
-
-_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float)
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-_CLC_DEFINE_UNARY_BUILTIN(half, sqrt, __clc_sqrt, half)
-
-#endif
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fma.h>
 
 #ifdef cl_khr_fp64
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 #ifdef __AMDGCN__
-  #define __clc_builtin_rsq __builtin_amdgcn_rsq
+#define __clc_builtin_rsq __builtin_amdgcn_rsq
 #else
-  #define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee
+#define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee
 #endif
 
-_CLC_OVERLOAD _CLC_DEF double sqrt(double x) {
-
+_CLC_OVERLOAD _CLC_DEF double __clc_sqrt(double x) {
   uint vcc = x < 0x1p-767;
   uint exp0 = vcc ? 0x100 : 0;
   unsigned exp1 = vcc ? 0xffffff80 : 0;
 
-  double v01 = ldexp(x, exp0);
+  double v01 = __builtin_ldexp(x, exp0);
   double v23 = __clc_builtin_rsq(v01);
   double v45 = v01 * v23;
   v23 = v23 * 0.5;
 
-  double v67 = fma(-v23, v45, 0.5);
-  v45 = fma(v45, v67, v45);
-  double v89 = fma(-v45, v45, v01);
-  v23 = fma(v23, v67, v23);
-  v45 = fma(v89, v23, v45);
-  v67 = fma(-v45, v45, v01);
-  v23 = fma(v67, v23, v45);
+  double v67 = __clc_fma(-v23, v45, 0.5);
+  v45 = __clc_fma(v45, v67, v45);
+  double v89 = __clc_fma(-v45, v45, v01);
+  v23 = __clc_fma(v23, v67, v23);
+  v45 = __clc_fma(v89, v23, v45);
+  v67 = __clc_fma(-v45, v45, v01);
+  v23 = __clc_fma(v67, v23, v45);
 
-  v23 = ldexp(v23, exp1);
-  return ((x == __builtin_inf()) || (x == 0.0)) ? v01 : v23;
+  v23 = __builtin_ldexp(v23, exp1);
+  return (x == __builtin_inf() || (x == 0.0)) ? v01 : v23;
 }
 
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sqrt, double);
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_sqrt, double);
 
 #endif
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index dbf3c1bd31a0d..19c0ffe405d56 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -32,6 +32,7 @@ math/clc_nan.cl
 math/clc_nextafter.cl
 math/clc_rint.cl
 math/clc_round.cl
+math/clc_sqrt.cl
 math/clc_sw_fma.cl
 math/clc_trunc.cl
 relational/clc_all.cl
diff --git a/libclc/generic/lib/math/clc_sqrt.cl b/libclc/clc/lib/generic/math/clc_sqrt.cl
similarity index 80%
rename from libclc/generic/lib/math/clc_sqrt.cl
rename to libclc/clc/lib/generic/math/clc_sqrt.cl
index 92c7f6e73b11e..620c367dd8510 100644
--- a/libclc/generic/lib/math/clc_sqrt.cl
+++ b/libclc/clc/lib/generic/math/clc_sqrt.cl
@@ -20,14 +20,8 @@
  * THE SOFTWARE.
  */
 
-#include <clc/clc.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>
 
-// Map the llvm sqrt intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_llvm_intr_sqrt
-#define __CLC_INTRINSIC "llvm.sqrt"
-#include <clc/math/unary_intrin.inc>
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
-
-#define __CLC_BODY <clc_sqrt_impl.inc>
+#define __CLC_BODY <clc_sqrt.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/clc_sqrt_impl.inc b/libclc/clc/lib/generic/math/clc_sqrt.inc
similarity index 80%
rename from libclc/generic/lib/math/clc_sqrt_impl.inc
rename to libclc/clc/lib/generic/math/clc_sqrt.inc
index fe724e8c14394..b33b47f26cfb3 100644
--- a/libclc/generic/lib/math/clc_sqrt_impl.inc
+++ b/libclc/clc/lib/generic/math/clc_sqrt.inc
@@ -21,19 +21,17 @@
  */
 
 #if __CLC_FPSIZE == 64
-#define __CLC_NAN __builtin_nan("")
-#define ZERO 0.0
+#define __CLC_NAN DBL_NAN
 #elif __CLC_FPSIZE == 32
-#define __CLC_NAN NAN
-#define ZERO 0.0f
+#define __CLC_NAN FLT_NAN
 #elif __CLC_FPSIZE == 16
-#define __CLC_NAN (half)NAN
-#define ZERO 0.0h
+#define __CLC_NAN HALF_NAN
 #endif
 
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sqrt(__CLC_GENTYPE val) {
-  return val < ZERO ? __CLC_NAN : __clc_llvm_intr_sqrt(val);
+__attribute__((weak)) _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
+__clc_sqrt(__CLC_GENTYPE val) {
+  return val < __CLC_FP_LIT(0.0) ? (__CLC_GENTYPE)__CLC_NAN
+                                 : __builtin_elementwise_sqrt(val);
 }
 
 #undef __CLC_NAN
-#undef ZERO
diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES
index bea78adf2cf08..42bf695850ede 100644
--- a/libclc/generic/lib/SOURCES
+++ b/libclc/generic/lib/SOURCES
@@ -180,7 +180,6 @@ math/sincos.cl
 math/sincos_helpers.cl
 math/sinh.cl
 math/sinpi.cl
-math/clc_sqrt.cl
 math/sqrt.cl
 math/clc_tan.cl
 math/tan.cl
diff --git a/libclc/generic/lib/math/clc_hypot.cl b/libclc/generic/lib/math/clc_hypot.cl
index 5e6a99b70f22a..fdf1e7ffa1def 100644
--- a/libclc/generic/lib/math/clc_hypot.cl
+++ b/libclc/generic/lib/math/clc_hypot.cl
@@ -27,6 +27,7 @@
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
+#include <clc/math/clc_sqrt.h>
 #include <clc/relational/clc_isnan.h>
 #include <clc/shared/clc_clamp.h>
 #include <math/clc_hypot.h>
@@ -49,7 +50,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
   float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
   float fx = as_float(ux) * fi_exp;
   float fy = as_float(uy) * fi_exp;
-  retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
+  retval = __clc_sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
 
   retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
   retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32
@@ -81,7 +82,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) {
   double ay = y * preadjust;
 
   // The post adjust may overflow, but this can't be avoided in any case
-  double r = sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
+  double r = __clc_sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
 
   // If the difference in exponents between x and y is large
   double s = x + y;
diff --git a/libclc/generic/lib/math/sqrt.cl b/libclc/generic/lib/math/sqrt.cl
index a9192a9493d17..d60d304fa1e1f 100644
--- a/libclc/generic/lib/math/sqrt.cl
+++ b/libclc/generic/lib/math/sqrt.cl
@@ -21,7 +21,9 @@
  */
 
 #include <clc/clc.h>
-#include "math/clc_sqrt.h"
+#include <clc/math/clc_sqrt.h>
 
-#define __CLC_FUNCTION sqrt
-#include <clc/math/unary_builtin.inc>
+#define FUNCTION sqrt
+#define __CLC_BODY <clc/shared/unary_def.inc>
+
+#include <clc/math/gentype.inc>

Comment on lines +33 to +34
return val < __CLC_FP_LIT(0.0) ? (__CLC_GENTYPE)__CLC_NAN
: __builtin_elementwise_sqrt(val);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This pre-filtering of the argument is unnecessary. Many years ago the sqrt intrinsic officially did not handle the nan cases, but that has been fixed. This can be the raw builtin call

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU libclc libclc OpenCL library
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants