Use __attribute__((target(...))) for AVX-512 support.
authorNathan Bossart <[email protected]>
Thu, 7 Nov 2024 19:58:43 +0000 (13:58 -0600)
committerNathan Bossart <[email protected]>
Thu, 7 Nov 2024 19:58:43 +0000 (13:58 -0600)
Presently, we check for compiler support for the required
intrinsics both with and without extra compiler flags (e.g.,
-mxsave), and then depending on the results of those checks, we
pick which files to compile with which flags.  This is tedious and
complicated, and it results in unsustainable coding patterns such
as separate files for each portion of code may need to be built
with different compiler flags.

This commit introduces support for __attribute__((target(...))) and
uses it for the AVX-512 code.  This simplifies both the
configure-time checks and the build scripts, and it allows us to
place the functions that use the intrinsics in files that we
otherwise do not want to build with special CPU instructions.  We
are careful to avoid using __attribute__((target(...))) on
compilers that do not understand it, but we still perform the
configure-time checks in case the compiler allows using the
intrinsics without it (e.g., MSVC).

A similar change could likely be made for some of the CRC-32C code,
but that is left as a future exercise.

Suggested-by: Andres Freund
Reviewed-by: Raghuveer Devulapalli, Andres Freund
Discussion: https://postgr.es/m/20240731205254.vfpap7uxwmebqeaf%40awork3.anarazel.de

config/c-compiler.m4
configure
configure.ac
meson.build
src/Makefile.global.in
src/include/c.h
src/makefiles/meson.build
src/port/Makefile
src/port/meson.build
src/port/pg_popcount_avx512.c
src/port/pg_popcount_avx512_choose.c[deleted file]

index 10f8c7bd0a9fbb12a2e7c0060560fe385dd9d01b..c7eb896f14a4782a74124a3a92052163e19cedac 100644 (file)
@@ -700,20 +700,22 @@ undefine([Ac_cachevar])dnl
 # Check if the compiler supports the XSAVE instructions using the _xgetbv
 # intrinsic function.
 #
-# An optional compiler flag can be passed as argument (e.g., -mxsave).  If the
-# intrinsic is supported, sets pgac_xsave_intrinsics and CFLAGS_XSAVE.
+# If the intrinsics are supported, sets pgac_xsave_intrinsics.
 AC_DEFUN([PGAC_XSAVE_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
-  [return _xgetbv(0) & 0xe0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics])])dnl
+AC_CACHE_CHECK([for _xgetbv], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("xsave")))
+    #endif
+    static int xsave_test(void)
+    {
+      return _xgetbv(0) & 0xe0;
+    }],
+  [return xsave_test();])],
   [Ac_cachevar=yes],
-  [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+  [Ac_cachevar=no])])
 if test x"$Ac_cachevar" = x"yes"; then
-  CFLAGS_XSAVE="$1"
   pgac_xsave_intrinsics=yes
 fi
 undefine([Ac_cachevar])dnl
@@ -725,29 +727,29 @@ undefine([Ac_cachevar])dnl
 # _mm512_setzero_si512, _mm512_maskz_loadu_epi8, _mm512_popcnt_epi64,
 # _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
 #
-# Optional compiler flags can be passed as argument (e.g., -mavx512vpopcntdq
-# -mavx512bw).  If the intrinsics are supported, sets
-# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT.
+# If the intrinsics are supported, sets pgac_avx512_popcnt_intrinsics.
 AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
-  [const char buf@<:@sizeof(__m512i)@:>@;
-   PG_INT64_TYPE popcnt = 0;
-   __m512i accum = _mm512_setzero_si512();
-   const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
-   const __m512i cnt = _mm512_popcnt_epi64(val);
-   accum = _mm512_add_epi64(accum, cnt);
-   popcnt = _mm512_reduce_add_epi64(accum);
-   /* return computed value, to prevent the above being optimized away */
-   return popcnt == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm512_popcnt_epi64], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx512vpopcntdq","avx512bw")))
+    #endif
+    static int popcount_test(void)
+    {
+      const char buf@<:@sizeof(__m512i)@:>@;
+      PG_INT64_TYPE popcnt = 0;
+      __m512i accum = _mm512_setzero_si512();
+      const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
+      const __m512i cnt = _mm512_popcnt_epi64(val);
+      accum = _mm512_add_epi64(accum, cnt);
+      popcnt = _mm512_reduce_add_epi64(accum);
+      return (int) popcnt;
+    }],
+  [return popcount_test();])],
   [Ac_cachevar=yes],
-  [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+  [Ac_cachevar=no])])
 if test x"$Ac_cachevar" = x"yes"; then
-  CFLAGS_POPCNT="$1"
   pgac_avx512_popcnt_intrinsics=yes
 fi
 undefine([Ac_cachevar])dnl
index 6e256b417b9c20af916ac2816a4208bf52d70ab7..3a7332f834943d23c414070d57a6118d48babc5a 100755 (executable)
--- a/configure
+++ b/configure
@@ -647,9 +647,6 @@ MSGFMT_FLAGS
 MSGFMT
 PG_CRC32C_OBJS
 CFLAGS_CRC
-PG_POPCNT_OBJS
-CFLAGS_POPCNT
-CFLAGS_XSAVE
 LIBOBJS
 OPENSSL
 ZSTD
 
 # Check for XSAVE intrinsics
 #
-CFLAGS_XSAVE=""
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=" >&5
-$as_echo_n "checking for _xgetbv with CFLAGS=... " >&6; }
-if ${pgac_cv_xsave_intrinsics_+:} false; then :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5
+$as_echo_n "checking for _xgetbv... " >&6; }
+if ${pgac_cv_xsave_intrinsics+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <immintrin.h>
-int
-main ()
-{
-return _xgetbv(0) & 0xe0;
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_xsave_intrinsics_=yes
-else
-  pgac_cv_xsave_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics_" >&5
-$as_echo "$pgac_cv_xsave_intrinsics_" >&6; }
-if test x"$pgac_cv_xsave_intrinsics_" = x"yes"; then
-  CFLAGS_XSAVE=""
-  pgac_xsave_intrinsics=yes
-fi
-
-if test x"$pgac_xsave_intrinsics" != x"yes"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=-mxsave" >&5
-$as_echo_n "checking for _xgetbv with CFLAGS=-mxsave... " >&6; }
-if ${pgac_cv_xsave_intrinsics__mxsave+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -mxsave"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <immintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("xsave")))
+    #endif
+    static int xsave_test(void)
+    {
+      return _xgetbv(0) & 0xe0;
+    }
 int
 main ()
 {
-return _xgetbv(0) & 0xe0;
+return xsave_test();
   ;
   return 0;
 }
 _ACEOF
 if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_xsave_intrinsics__mxsave=yes
+  pgac_cv_xsave_intrinsics=yes
 else
-  pgac_cv_xsave_intrinsics__mxsave=no
+  pgac_cv_xsave_intrinsics=no
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics__mxsave" >&5
-$as_echo "$pgac_cv_xsave_intrinsics__mxsave" >&6; }
-if test x"$pgac_cv_xsave_intrinsics__mxsave" = x"yes"; then
-  CFLAGS_XSAVE="-mxsave"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics" >&5
+$as_echo "$pgac_cv_xsave_intrinsics" >&6; }
+if test x"$pgac_cv_xsave_intrinsics" = x"yes"; then
   pgac_xsave_intrinsics=yes
 fi
 
-fi
 if test x"$pgac_xsave_intrinsics" = x"yes"; then
 
 $as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
 
 fi
 
-
 # Check for AVX-512 popcount intrinsics
 #
-CFLAGS_POPCNT=""
-PG_POPCNT_OBJS=""
 if test x"$host_cpu" = x"x86_64"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5
-$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
-if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <immintrin.h>
-int
-main ()
-{
-const char buf[sizeof(__m512i)];
-   PG_INT64_TYPE popcnt = 0;
-   __m512i accum = _mm512_setzero_si512();
-   const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
-   const __m512i cnt = _mm512_popcnt_epi64(val);
-   accum = _mm512_add_epi64(accum, cnt);
-   popcnt = _mm512_reduce_add_epi64(accum);
-   /* return computed value, to prevent the above being optimized away */
-   return popcnt == 0;
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_avx512_popcnt_intrinsics_=yes
-else
-  pgac_cv_avx512_popcnt_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5
-$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
-if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
-  CFLAGS_POPCNT=""
-  pgac_avx512_popcnt_intrinsics=yes
-fi
-
-  if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512bw" >&5
-$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512bw... " >&6; }
-if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512bw"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <immintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx512vpopcntdq","avx512bw")))
+    #endif
+    static int popcount_test(void)
+    {
+      const char buf[sizeof(__m512i)];
+      PG_INT64_TYPE popcnt = 0;
+      __m512i accum = _mm512_setzero_si512();
+      const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
+      const __m512i cnt = _mm512_popcnt_epi64(val);
+      accum = _mm512_add_epi64(accum, cnt);
+      popcnt = _mm512_reduce_add_epi64(accum);
+      return (int) popcnt;
+    }
 int
 main ()
 {
-const char buf[sizeof(__m512i)];
-   PG_INT64_TYPE popcnt = 0;
-   __m512i accum = _mm512_setzero_si512();
-   const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
-   const __m512i cnt = _mm512_popcnt_epi64(val);
-   accum = _mm512_add_epi64(accum, cnt);
-   popcnt = _mm512_reduce_add_epi64(accum);
-   /* return computed value, to prevent the above being optimized away */
-   return popcnt == 0;
+return popcount_test();
   ;
   return 0;
 }
 _ACEOF
 if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=yes
+  pgac_cv_avx512_popcnt_intrinsics=yes
 else
-  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=no
+  pgac_cv_avx512_popcnt_intrinsics=no
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&5
-$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&6; }
-if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" = x"yes"; then
-  CFLAGS_POPCNT="-mavx512vpopcntdq -mavx512bw"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics" = x"yes"; then
   pgac_avx512_popcnt_intrinsics=yes
 fi
 
-  fi
   if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
-    PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
 
 $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
   fi
 fi
 
-
-
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
index 3992694dacce06caf108faabc5f95faf46566bed..e7f4f0fc22ffe6ab48008a84b222393743b10996 100644 (file)
@@ -2050,32 +2050,19 @@ fi
 
 # Check for XSAVE intrinsics
 #
-CFLAGS_XSAVE=""
-PGAC_XSAVE_INTRINSICS([])
-if test x"$pgac_xsave_intrinsics" != x"yes"; then
-  PGAC_XSAVE_INTRINSICS([-mxsave])
-fi
+PGAC_XSAVE_INTRINSICS()
 if test x"$pgac_xsave_intrinsics" = x"yes"; then
   AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.])
 fi
-AC_SUBST(CFLAGS_XSAVE)
 
 # Check for AVX-512 popcount intrinsics
 #
-CFLAGS_POPCNT=""
-PG_POPCNT_OBJS=""
 if test x"$host_cpu" = x"x86_64"; then
-  PGAC_AVX512_POPCNT_INTRINSICS([])
-  if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
-    PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512bw])
-  fi
+  PGAC_AVX512_POPCNT_INTRINSICS()
   if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
-    PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
     AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 popcount instructions with a runtime check.])
   fi
 fi
-AC_SUBST(CFLAGS_POPCNT)
-AC_SUBST(PG_POPCNT_OBJS)
 
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
index 9a98f0c86a06f8f2d6a7bb6614dc6e6ac8623ce5..9eddd72a27ec9622fd5819a235e2551912fc8ec5 100644 (file)
@@ -2153,25 +2153,22 @@ endforeach
 # Check for the availability of XSAVE intrinsics.
 ###############################################################
 
-cflags_xsave = []
 if host_cpu == 'x86' or host_cpu == 'x86_64'
 
   prog = '''
 #include <immintrin.h>
 
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("xsave")))
+#endif
 int main(void)
 {
     return _xgetbv(0) & 0xe0;
 }
 '''
 
-  if cc.links(prog, name: 'XSAVE intrinsics without -mxsave',
-        args: test_c_args)
-    cdata.set('HAVE_XSAVE_INTRINSICS', 1)
-  elif cc.links(prog, name: 'XSAVE intrinsics with -mxsave',
-        args: test_c_args + ['-mxsave'])
+  if cc.links(prog, name: 'XSAVE intrinsics', args: test_c_args)
     cdata.set('HAVE_XSAVE_INTRINSICS', 1)
-    cflags_xsave += '-mxsave'
   endif
 
 endif
@@ -2181,12 +2178,14 @@ endif
 # Check for the availability of AVX-512 popcount intrinsics.
 ###############################################################
 
-cflags_popcnt = []
 if host_cpu == 'x86_64'
 
   prog = '''
 #include <immintrin.h>
 
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx512vpopcntdq","avx512bw")))
+#endif
 int main(void)
 {
     const char buf[sizeof(__m512i)];
@@ -2201,13 +2200,9 @@ int main(void)
 }
 '''
 
-  if cc.links(prog, name: 'AVX-512 popcount without -mavx512vpopcntdq -mavx512bw',
+  if cc.links(prog, name: 'AVX-512 popcount',
         args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))])
     cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
-  elif cc.links(prog, name: 'AVX-512 popcount with -mavx512vpopcntdq -mavx512bw',
-        args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq'] + ['-mavx512bw'])
-    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
-    cflags_popcnt += ['-mavx512vpopcntdq'] + ['-mavx512bw']
   endif
 
 endif
index 4859343153ba05135df7adbb28ee89a4324874ee..0f38d712d15bbd08e876c5a318157238626de31d 100644 (file)
@@ -262,9 +262,7 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
 CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
 CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
-CFLAGS_POPCNT = @CFLAGS_POPCNT@
 CFLAGS_CRC = @CFLAGS_CRC@
-CFLAGS_XSAVE = @CFLAGS_XSAVE@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
 PERMIT_MISSING_VARIABLE_DECLARATIONS = @PERMIT_MISSING_VARIABLE_DECLARATIONS@
 CXXFLAGS = @CXXFLAGS@
@@ -772,9 +770,6 @@ LIBOBJS = @LIBOBJS@
 # files needed for the chosen CRC-32C implementation
 PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
 
-# files needed for the chosen popcount implementation
-PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
-
 LIBS := -lpgcommon -lpgport $(LIBS)
 
 # to make ws2_32.lib the last library
index 55dec71a6d7fc6a0cf5ff08e87762ecda8653587..0a548d69d7f483b59aa22b0eef7f67eec7d32a0f 100644 (file)
 #define pg_attribute_nonnull(...)
 #endif
 
+/*
+ * pg_attribute_target allows specifying different target options that the
+ * function should be compiled with (e.g., for using special CPU instructions).
+ * Note that there still needs to be a configure-time check to verify that a
+ * specific target is understood by the compiler.
+ */
+#if __has_attribute (target)
+#define pg_attribute_target(...) __attribute__((target(__VA_ARGS__)))
+#else
+#define pg_attribute_target(...)
+#endif
+
 /*
  * Append PG_USED_FOR_ASSERTS_ONLY to definitions of variables that are only
  * used in assert-enabled builds, to avoid compiler warnings about unused
index e13938fe8adf96a3ec1b865bed20d976241e8af2..aba7411a1bed0e1d5516e0538658459744a92ed0 100644 (file)
@@ -102,10 +102,8 @@ pgxs_kv = {
     ' '.join(cflags_no_missing_var_decls),
 
   'CFLAGS_CRC': ' '.join(cflags_crc),
-  'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
   'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
   'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
-  'CFLAGS_XSAVE': ' '.join(cflags_xsave),
 
   'LDFLAGS': var_ldflags,
   'LDFLAGS_EX': var_ldflags_ex,
@@ -181,7 +179,7 @@ pgxs_empty = [
   'WANTED_LANGUAGES',
 
   # Not needed because we don't build the server / PLs with the generated makefile
-  'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
+  'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
   'PG_TEST_EXTRA',
   'DTRACEFLAGS', # only server has dtrace probes
 
index 9324ec2d9fcc79eed5c5f924226e87675b1631b4..366c814bd92e42d59a462bc3a1c9458157ce7cc4 100644 (file)
@@ -38,13 +38,13 @@ LIBS += $(PTHREAD_LIBS)
 OBJS = \
    $(LIBOBJS) \
    $(PG_CRC32C_OBJS) \
-   $(PG_POPCNT_OBJS) \
    bsearch_arg.o \
    chklocale.o \
    inet_net_ntop.o \
    noblock.o \
    path.o \
    pg_bitutils.o \
+   pg_popcount_avx512.o \
    pg_strong_random.o \
    pgcheckdir.o \
    pgmkdirp.o \
@@ -92,16 +92,6 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
-# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE
-pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
-pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
-pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE)
-
-# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
-pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
-
 #
 # Shared library versions of object files
 #
index 1150966ab711ccd76695c66c72f3be39e9dbd617..83a063252095ca2a36e0e40b970639446fdd23e7 100644 (file)
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_popcount_avx512.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
   'pgmkdirp.c',
@@ -84,8 +85,6 @@ replace_funcs_pos = [
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
-  ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
-  ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'],
 
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
@@ -100,8 +99,8 @@ replace_funcs_pos = [
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
 
-pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave}
-pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []}
+pgport_cflags = {'crc': cflags_crc}
+pgport_sources_cflags = {'crc': []}
 
 foreach f : replace_funcs_neg
   func = f.get(0)
index 9d3149e2d002b10dd76effefade6e2b15f4fe076..b598e86554974157fa5e9442249e1f04da11c63d 100644 (file)
  */
 #include "c.h"
 
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 #include <immintrin.h>
+#endif
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
 
 #include "port/pg_bitutils.h"
 
  * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
  * the function pointers that are only used when TRY_POPCNT_FAST is set.
  */
-#ifdef TRY_POPCNT_FAST
+#if defined(TRY_POPCNT_FAST) && defined(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK)
+
+/*
+ * Does CPUID say there's support for XSAVE instructions?
+ */
+static inline bool
+xsave_available(void)
+{
+   unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+   __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+   __cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+   return (exx[2] & (1 << 27)) != 0;   /* osxsave */
+}
+
+/*
+ * Does XGETBV say the ZMM registers are enabled?
+ *
+ * NB: Caller is responsible for verifying that xsave_available() returns true
+ * before calling this.
+ */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
+static inline bool
+zmm_regs_available(void)
+{
+#ifdef HAVE_XSAVE_INTRINSICS
+   return (_xgetbv(0) & 0xe6) == 0xe6;
+#else
+   return false;
+#endif
+}
+
+/*
+ * Does CPUID say there's support for AVX-512 popcount and byte-and-word
+ * instructions?
+ */
+static inline bool
+avx512_popcnt_available(void)
+{
+   unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID_COUNT)
+   __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+   __cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+   return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
+       (exx[1] & (1 << 30)) != 0;  /* avx512-bw */
+}
+
+/*
+ * Returns true if the CPU supports the instructions required for the AVX-512
+ * pg_popcount() implementation.
+ */
+bool
+pg_popcount_avx512_available(void)
+{
+   return xsave_available() &&
+       zmm_regs_available() &&
+       avx512_popcnt_available();
+}
 
 /*
  * pg_popcount_avx512
  *     Returns the number of 1-bits in buf
  */
+pg_attribute_target("avx512vpopcntdq", "avx512bw")
 uint64
 pg_popcount_avx512(const char *buf, int bytes)
 {
@@ -82,6 +162,7 @@ pg_popcount_avx512(const char *buf, int bytes)
  * pg_popcount_masked_avx512
  *     Returns the number of 1-bits in buf after applying the mask to each byte
  */
+pg_attribute_target("avx512vpopcntdq", "avx512bw")
 uint64
 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
 {
@@ -138,4 +219,5 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
    return _mm512_reduce_add_epi64(accum);
 }
 
-#endif                         /* TRY_POPCNT_FAST */
+#endif                         /* TRY_POPCNT_FAST &&
+                                * USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c
deleted file mode 100644 (file)
index b371078..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * pg_popcount_avx512_choose.c
- *    Test whether we can use the AVX-512 pg_popcount() implementation.
- *
- * Copyright (c) 2024, PostgreSQL Global Development Group
- *
- * IDENTIFICATION
- *    src/port/pg_popcount_avx512_choose.c
- *
- *-------------------------------------------------------------------------
- */
-#include "c.h"
-
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
-#include <cpuid.h>
-#endif
-
-#ifdef HAVE_XSAVE_INTRINSICS
-#include <immintrin.h>
-#endif
-
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
-#include <intrin.h>
-#endif
-
-#include "port/pg_bitutils.h"
-
-/*
- * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
- * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
- * the function pointers that are only used when TRY_POPCNT_FAST is set.
- */
-#ifdef TRY_POPCNT_FAST
-
-/*
- * Does CPUID say there's support for XSAVE instructions?
- */
-static inline bool
-xsave_available(void)
-{
-   unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
-   __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
-   __cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-   return (exx[2] & (1 << 27)) != 0;   /* osxsave */
-}
-
-/*
- * Does XGETBV say the ZMM registers are enabled?
- *
- * NB: Caller is responsible for verifying that xsave_available() returns true
- * before calling this.
- */
-static inline bool
-zmm_regs_available(void)
-{
-#ifdef HAVE_XSAVE_INTRINSICS
-   return (_xgetbv(0) & 0xe6) == 0xe6;
-#else
-   return false;
-#endif
-}
-
-/*
- * Does CPUID say there's support for AVX-512 popcount and byte-and-word
- * instructions?
- */
-static inline bool
-avx512_popcnt_available(void)
-{
-   unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID_COUNT)
-   __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUIDEX)
-   __cpuidex(exx, 7, 0);
-#else
-#error cpuid instruction not available
-#endif
-   return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
-       (exx[1] & (1 << 30)) != 0;  /* avx512-bw */
-}
-
-/*
- * Returns true if the CPU supports the instructions required for the AVX-512
- * pg_popcount() implementation.
- */
-bool
-pg_popcount_avx512_available(void)
-{
-   return xsave_available() &&
-       zmm_regs_available() &&
-       avx512_popcnt_available();
-}
-
-#endif                         /* TRY_POPCNT_FAST */