linux/arch/x86/lib/crc-pclmul-template.h
Eric Biggers acf9f8da5e x86/crc: drop the avx10_256 functions and rename avx10_512 to avx512
Intel made a late change to the AVX10 specification that removes support
for a 256-bit maximum vector length and enumeration of the maximum
vector length.  AVX10 will imply a maximum vector length of 512 bits.
I.e. there won't be any such thing as AVX10/256 or AVX10/512; there will
just be AVX10, and it will essentially just consolidate AVX512 features.

As a result of this new development, my strategy of providing both
*_avx10_256 and *_avx10_512 functions didn't turn out to be that useful.
The only remaining motivation for the 256-bit AVX512 / AVX10 functions
is to avoid downclocking on older Intel CPUs.  But I already wrote
*_avx2 code too (primarily to support CPUs without AVX512), which
performs almost as well as *_avx10_256.  So we should just use that.

Therefore, remove the *_avx10_256 CRC functions, and rename the
*_avx10_512 CRC functions to *_avx512.  Make Ice Lake and Tiger Lake use
the *_avx2 functions instead of *_avx10_256 which they previously used.

Link: https://lore.kernel.org/r/20250319181316.91271-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
2025-03-19 12:22:00 -07:00


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
 * instantiated by crc-pclmul-template.S
 *
 * Copyright 2025 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */
#ifndef _CRC_PCLMUL_TEMPLATE_H
#define _CRC_PCLMUL_TEMPLATE_H

#include <asm/cpufeatures.h>
#include <asm/simd.h>
#include <crypto/internal/simd.h>
#include <linux/static_call.h>
#include "crc-pclmul-consts.h"

#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t)			\
crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len,		\
			  const void *consts_ptr);			\
crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len,		\
			    const void *consts_ptr);			\
crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len,	\
			      const void *consts_ptr);			\
DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
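
/*
 * Example usage (a sketch, not part of this header): a glue file for one
 * CRC variant declares its function family once.  The "crc32_lsb" prefix
 * here is illustrative:
 *
 *	static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 *	DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
 *
 * This declares crc32_lsb_pclmul_sse(), crc32_lsb_vpclmul_avx2(), and
 * crc32_lsb_vpclmul_avx512(), and defines a static call that initially
 * points at the SSE implementation.
 */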

#define INIT_CRC_PCLMUL(prefix)					\
do {									\
	if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) &&			\
	    boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&			\
	    boot_cpu_has(X86_FEATURE_AVX2) &&				\
	    cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) {		\
		if (boot_cpu_has(X86_FEATURE_AVX512BW) &&		\
		    boot_cpu_has(X86_FEATURE_AVX512VL) &&		\
		    !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&		\
		    cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) {	\
			static_call_update(prefix##_pclmul,		\
					   prefix##_vpclmul_avx512);	\
		} else {						\
			static_call_update(prefix##_pclmul,		\
					   prefix##_vpclmul_avx2);	\
		}							\
	}								\
} while (0)
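
/*
 * Example usage (a sketch): the glue code's init path enables the static
 * branch when PCLMULQDQ exists and lets INIT_CRC_PCLMUL() upgrade the
 * static call to the widest usable implementation:
 *
 *	static int __init crc32_x86_init(void)
 *	{
 *		if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
 *			static_branch_enable(&have_pclmulqdq);
 *			INIT_CRC_PCLMUL(crc32_lsb);
 *		}
 *		return 0;
 *	}
 *
 * The static call stays on the SSE implementation unless VPCLMULQDQ,
 * AVX2, and (for the AVX512 variant) the AVX512 features and xfeatures
 * checked above are all available.
 */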

/*
 * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
 * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
 *
 * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
 * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
 * varying by CPU and factors such as which parts of the "FPU" state userspace
 * has touched, which could result in a larger cutoff being better.  Indeed, a
 * larger cutoff is usually better for a *single* message.  However, the
 * overhead of the FPU section gets amortized if multiple FPU sections get
 * executed before returning to userspace, since the XSAVE and XRSTOR occur
 * only once.  Considering that, and the fact that the [V]PCLMULQDQ code is
 * lighter on the dcache than the table-based code is, a 16-byte cutoff seems
 * to work well.
 */
#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq)	\
do {									\
	if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) &&	\
	    crypto_simd_usable()) {					\
		const void *consts_ptr;				\
									\
		consts_ptr = (consts).fold_across_128_bits_consts;	\
		kernel_fpu_begin();					\
		crc = static_call(prefix##_pclmul)((crc), (p), (len),	\
						   consts_ptr);		\
		kernel_fpu_end();					\
		return crc;						\
	}								\
} while (0)
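
/*
 * Example usage (a sketch): an arch CRC entry point tries the PCLMULQDQ
 * path first and falls back to the generic table-based implementation.
 * The consts symbol name is illustrative:
 *
 *	u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 *	{
 *		CRC_PCLMUL(crc, p, len, crc32_lsb,
 *			   crc32_lsb_0xedb88320_consts, have_pclmulqdq);
 *		return crc32_le_base(crc, p, len);
 *	}
 *
 * Note that CRC_PCLMUL() expands to a "return" statement on the fast
 * path, so the fallback call must come after it.
 */
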
#endif /* _CRC_PCLMUL_TEMPLATE_H */