
[v2,02/18] lib/crc32: improve support for arch-specific overrides

Message ID 20241025191454.72616-3-ebiggers@kernel.org
State Superseded
Series Wire up CRC32 library functions to arch-optimized code

Commit Message

Eric Biggers Oct. 25, 2024, 7:14 p.m. UTC
From: Eric Biggers <ebiggers@google.com>

Currently the CRC32 library functions are defined as weak symbols, and
the arm64 and riscv architectures override them.

This method of arch-specific overrides has the limitation that it only
works when both the base and arch code are built-in.  It also causes
the arch-specific code to be silently ignored if it is accidentally
built with lib-y instead of obj-y; unfortunately, the RISC-V code does
exactly that.

This commit reorganizes the code to have explicit *_arch() functions
that are called when they are enabled, similar to how some of the crypto
library code works (e.g. chacha_crypt() calls chacha_crypt_arch()).
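
In outline, the dispatch in include/linux/crc32.h (see the hunk below)
becomes a static inline whose IS_ENABLED() check is resolved at compile
time, so each call site compiles down to a direct call to either the
arch or the generic function:

	static inline u32 __pure crc32_le(u32 crc, const u8 *p, size_t len)
	{
		if (IS_ENABLED(CONFIG_CRC32_ARCH))
			return crc32_le_arch(crc, p, len);
		return crc32_le_base(crc, p, len);
	}

crc32_be() and __crc32c_le() follow the same pattern.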

Make the existing kconfig choice for the CRC32 implementation also
control whether the arch-optimized implementation (if one is available)
is enabled or not.  Make it enabled by default if CRC32 is also enabled.

The result is that arch-optimized CRC32 library functions will be
included automatically when appropriate, but it is now possible to
disable them.  They can also now be built as a loadable module if the
CRC32 library functions happen to be used only by loadable modules, in
which case the arch and base CRC32 modules are loaded automatically
via direct symbol dependency.
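
For example, a minimal module (hypothetical, for illustration only)
that uses nothing but the library interface:

	#include <linux/module.h>
	#include <linux/crc32.h>

	static int __init crc32_user_init(void)
	{
		static const u8 buf[] = "123456789";

		pr_info("crc32: %08x\n", crc32_le(~0, buf, sizeof(buf) - 1));
		return 0;
	}
	module_init(crc32_user_init);

	MODULE_LICENSE("GPL");

With CONFIG_CRC32=m and an arch-optimized implementation selected (so
CONFIG_CRC32_ARCH=m), loading such a module pulls in the arch module
(e.g. crc32-riscv.ko) and the base crc32 module through its undefined
symbol references; no explicit request_module() calls are needed.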

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/arm64/Kconfig                        |  1 +
 arch/arm64/lib/Makefile                   |  3 +-
 arch/arm64/lib/crc32-glue.c               | 13 ++++-
 arch/riscv/Kconfig                        |  1 +
 arch/riscv/lib/Makefile                   |  3 +-
 arch/riscv/lib/{crc32.c => crc32-riscv.c} | 13 ++++-
 crypto/crc32_generic.c                    |  4 +-
 crypto/crc32c_generic.c                   |  4 +-
 include/linux/crc32.h                     | 35 +++++++++---
 lib/Kconfig                               | 70 +++++++++++++++++------
 lib/crc32.c                               | 22 +++----
 11 files changed, 118 insertions(+), 51 deletions(-)
 rename arch/riscv/lib/{crc32.c => crc32-riscv.c} (94%)

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index fd9df6dcc593..1e48f40f654e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -19,10 +19,11 @@  config ARM64
 	select ARCH_ENABLE_MEMORY_HOTPLUG
 	select ARCH_ENABLE_MEMORY_HOTREMOVE
 	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_HAS_CACHE_LINE_SIZE
+	select ARCH_HAS_CRC32
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEBUG_VM_PGTABLE
 	select ARCH_HAS_DMA_OPS if XEN
 	select ARCH_HAS_DMA_PREP_COHERENT
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 8e882f479d98..5fbcf0d56665 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -11,11 +11,12 @@  CFLAGS_xor-neon.o		+= $(CC_FLAGS_FPU)
 CFLAGS_REMOVE_xor-neon.o	+= $(CC_FLAGS_NO_FPU)
 endif
 
 lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
 
-obj-$(CONFIG_CRC32) += crc32.o crc32-glue.o
+obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
+crc32-arm64-y := crc32.o crc32-glue.o
 
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_ARM64_MTE) += mte.o
 
diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c
index ad015223d15d..d7f6e1cbf0d2 100644
--- a/arch/arm64/lib/crc32-glue.c
+++ b/arch/arm64/lib/crc32-glue.c
@@ -1,9 +1,10 @@ 
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/crc32.h>
 #include <linux/linkage.h>
+#include <linux/module.h>
 
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
 #include <asm/neon.h>
 #include <asm/simd.h>
@@ -19,11 +20,11 @@  asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
 
 asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
 asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
 asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
 
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
 		return crc32_le_base(crc, p, len);
 
 	if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
@@ -38,12 +39,13 @@  u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
 			return crc;
 	}
 
 	return crc32_le_arm64(crc, p, len);
 }
+EXPORT_SYMBOL(crc32_le_arch);
 
-u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
 		return crc32c_le_base(crc, p, len);
 
 	if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
@@ -58,12 +60,13 @@  u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
 			return crc;
 	}
 
 	return crc32c_le_arm64(crc, p, len);
 }
+EXPORT_SYMBOL(crc32c_le_arch);
 
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
 {
 	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
 		return crc32_be_base(crc, p, len);
 
 	if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
@@ -78,5 +81,9 @@  u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
 			return crc;
 	}
 
 	return crc32_be_arm64(crc, p, len);
 }
+EXPORT_SYMBOL(crc32_be_arch);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("arm64-optimized CRC32 functions");
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 62545946ecf4..1c32e51eb3a4 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -22,10 +22,11 @@  config RISCV
 	select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM_VMEMMAP
 	select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
 	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_HAS_BINFMT_FLAT
+	select ARCH_HAS_CRC32 if RISCV_ISA_ZBC
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
 	select ARCH_HAS_DEBUG_VM_PGTABLE
 	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAS_FAST_MULTIPLIER
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 8eec6b69a875..79368a895fee 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -13,10 +13,9 @@  ifeq ($(CONFIG_MMU), y)
 lib-$(CONFIG_RISCV_ISA_V)	+= uaccess_vector.o
 endif
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o
-lib-$(CONFIG_RISCV_ISA_ZBC)	+= crc32.o
-
+obj-$(CONFIG_CRC32_ARCH)	+= crc32-riscv.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 lib-$(CONFIG_RISCV_ISA_V)	+= xor.o
 lib-$(CONFIG_RISCV_ISA_V)	+= riscv_v_helpers.o
diff --git a/arch/riscv/lib/crc32.c b/arch/riscv/lib/crc32-riscv.c
similarity index 94%
rename from arch/riscv/lib/crc32.c
rename to arch/riscv/lib/crc32-riscv.c
index 333fb7af1192..a3ff7db2a1ce 100644
--- a/arch/riscv/lib/crc32.c
+++ b/arch/riscv/lib/crc32-riscv.c
@@ -12,10 +12,11 @@ 
 #include <linux/types.h>
 #include <linux/minmax.h>
 #include <linux/crc32poly.h>
 #include <linux/crc32.h>
 #include <linux/byteorder/generic.h>
+#include <linux/module.h>
 
 /*
  * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for
  * better understanding of how this math works.
  *
@@ -215,21 +216,23 @@  static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
 
 legacy:
 	return crc_fb(crc, p, len);
 }
 
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE,
 				crc32_le_base);
 }
+EXPORT_SYMBOL(crc32_le_arch);
 
-u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, CRC32C_POLY_LE,
 				CRC32C_POLY_QT_LE, crc32c_le_base);
 }
+EXPORT_SYMBOL(crc32c_le_arch);
 
 static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
 				     size_t len)
 {
 	size_t bits = len * 8;
@@ -251,11 +254,11 @@  static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
 	crc ^= crc_low;
 
 	return crc;
 }
 
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
 {
 	size_t offset, head_len, tail_len;
 	unsigned long const *p_ul;
 	unsigned long s;
 
@@ -290,5 +293,9 @@  u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
 	return crc;
 
 legacy:
 	return crc32_be_base(crc, p, len);
 }
+EXPORT_SYMBOL(crc32_be_arch);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Accelerated CRC32 implementation with Zbc extension");
diff --git a/crypto/crc32_generic.c b/crypto/crc32_generic.c
index 6a55d206fab3..cc064ea8240e 100644
--- a/crypto/crc32_generic.c
+++ b/crypto/crc32_generic.c
@@ -158,16 +158,16 @@  static struct shash_alg algs[] = {{
 }};
 
 static int __init crc32_mod_init(void)
 {
 	/* register the arch flavor only if it differs from the generic one */
-	return crypto_register_shashes(algs, 1 + (&crc32_le != &crc32_le_base));
+	return crypto_register_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
 }
 
 static void __exit crc32_mod_fini(void)
 {
-	crypto_unregister_shashes(algs, 1 + (&crc32_le != &crc32_le_base));
+	crypto_unregister_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
 }
 
 subsys_initcall(crc32_mod_init);
 module_exit(crc32_mod_fini);
 
diff --git a/crypto/crc32c_generic.c b/crypto/crc32c_generic.c
index 635599b255ec..04b03d825cf4 100644
--- a/crypto/crc32c_generic.c
+++ b/crypto/crc32c_generic.c
@@ -198,16 +198,16 @@  static struct shash_alg algs[] = {{
 }};
 
 static int __init crc32c_mod_init(void)
 {
 	/* register the arch flavor only if it differs from the generic one */
-	return crypto_register_shashes(algs, 1 + (&__crc32c_le != &crc32c_le_base));
+	return crypto_register_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
 }
 
 static void __exit crc32c_mod_fini(void)
 {
-	crypto_unregister_shashes(algs, 1 + (&__crc32c_le != &crc32c_le_base));
+	crypto_unregister_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
 }
 
 subsys_initcall(crc32c_mod_init);
 module_exit(crc32c_mod_fini);
 
diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 5b07fc9081c4..58c632533b08 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -6,14 +6,38 @@ 
 #define _LINUX_CRC32_H
 
 #include <linux/types.h>
 #include <linux/bitrev.h>
 
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len);
-u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len);
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len);
-u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len);
+u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len);
+
+static inline u32 __pure crc32_le(u32 crc, const u8 *p, size_t len)
+{
+	if (IS_ENABLED(CONFIG_CRC32_ARCH))
+		return crc32_le_arch(crc, p, len);
+	return crc32_le_base(crc, p, len);
+}
+
+static inline u32 __pure crc32_be(u32 crc, const u8 *p, size_t len)
+{
+	if (IS_ENABLED(CONFIG_CRC32_ARCH))
+		return crc32_be_arch(crc, p, len);
+	return crc32_be_base(crc, p, len);
+}
+
+/* TODO: leading underscores should be dropped once callers have been updated */
+static inline u32 __pure __crc32c_le(u32 crc, const u8 *p, size_t len)
+{
+	if (IS_ENABLED(CONFIG_CRC32_ARCH))
+		return crc32c_le_arch(crc, p, len);
+	return crc32c_le_base(crc, p, len);
+}
 
 /**
  * crc32_le_combine - Combine two crc32 check values into one. For two
  * 		      sequences of bytes, seq1 and seq2 with lengths len1
  * 		      and len2, crc32_le() check values were calculated
@@ -36,13 +60,10 @@  u32 __attribute_const__ crc32_le_shift(u32 crc, size_t len);
 static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2)
 {
 	return crc32_le_shift(crc1, len2) ^ crc2;
 }
 
-u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len);
-u32 __pure crc32c_le_base(u32 crc, unsigned char const *p, size_t len);
-
 /**
  * __crc32c_le_combine - Combine two crc32c check values into one. For two
  * 			 sequences of bytes, seq1 and seq2 with lengths len1
  * 			 and len2, __crc32c_le() check values were calculated
  * 			 for each, crc1 and crc2.
diff --git a/lib/Kconfig b/lib/Kconfig
index b38849af6f13..07afcf214f35 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -176,10 +176,13 @@  config CRC32
 	  This option is provided for the case where no in-kernel-tree
 	  modules require CRC32/CRC32c functions, but a module built outside
 	  the kernel tree does. Such modules that use library CRC32/CRC32c
 	  functions require M here.
 
+config ARCH_HAS_CRC32
+	bool
+
 config CRC32_SELFTEST
 	tristate "CRC32 perform self test on init"
 	depends on CRC32
 	help
 	  This option enables the CRC32 library functions to perform a
@@ -188,54 +191,89 @@  config CRC32_SELFTEST
 	  and computes the total elapsed time and number of bytes processed.
 
 choice
 	prompt "CRC32 implementation"
 	depends on CRC32
-	default CRC32_SLICEBY8
+	default CRC32_IMPL_ARCH_PLUS_SLICEBY8 if ARCH_HAS_CRC32
+	default CRC32_IMPL_SLICEBY8 if !ARCH_HAS_CRC32
 	help
-	  This option allows a kernel builder to override the default choice
-	  of CRC32 algorithm.  Choose the default ("slice by 8") unless you
-	  know that you need one of the others.
+	  This option allows you to override the default choice of CRC32
+	  implementation.  Choose the default unless you know that you need one
+	  of the others.
 
-config CRC32_SLICEBY8
+config CRC32_IMPL_ARCH_PLUS_SLICEBY8
+	bool "Arch-optimized, with fallback to slice-by-8" if ARCH_HAS_CRC32
+	help
+	  Use architecture-optimized implementation of CRC32.  Fall back to
+	  slice-by-8 in cases where the arch-optimized implementation cannot be
+	  used, e.g. if the CPU lacks support for the needed instructions.
+
+	  This is the default when an arch-optimized implementation exists.
+
+config CRC32_IMPL_ARCH_PLUS_SLICEBY1
+	bool "Arch-optimized, with fallback to slice-by-1" if ARCH_HAS_CRC32
+	help
+	  Use architecture-optimized implementation of CRC32, but fall back to
+	  slice-by-1 instead of slice-by-8 in order to reduce the binary size.
+
+config CRC32_IMPL_SLICEBY8
 	bool "Slice by 8 bytes"
 	help
 	  Calculate checksum 8 bytes at a time with a clever slicing algorithm.
-	  This is the fastest algorithm, but comes with a 8KiB lookup table.
-	  Most modern processors have enough cache to hold this table without
-	  thrashing the cache.
-
-	  This is the default implementation choice.  Choose this one unless
-	  you have a good reason not to.
+	  This is much slower than the architecture-optimized implementation of
+	  CRC32 (if the selected arch has one), but it is portable and is the
+	  fastest implementation when no arch-optimized implementation is
+	  available.  It uses an 8KiB lookup table.  Most modern processors have
+	  enough cache to hold this table without thrashing the cache.
 
-config CRC32_SLICEBY4
+config CRC32_IMPL_SLICEBY4
 	bool "Slice by 4 bytes"
 	help
 	  Calculate checksum 4 bytes at a time with a clever slicing algorithm.
 	  This is a bit slower than slice by 8, but has a smaller 4KiB lookup
 	  table.
 
 	  Only choose this option if you know what you are doing.
 
-config CRC32_SARWATE
-	bool "Sarwate's Algorithm (one byte at a time)"
+config CRC32_IMPL_SLICEBY1
+	bool "Slice by 1 byte (Sarwate's algorithm)"
 	help
 	  Calculate checksum a byte at a time using Sarwate's algorithm.  This
-	  is not particularly fast, but has a small 256 byte lookup table.
+	  is not particularly fast, but has a small 1KiB lookup table.
 
 	  Only choose this option if you know what you are doing.
 
-config CRC32_BIT
+config CRC32_IMPL_BIT
 	bool "Classic Algorithm (one bit at a time)"
 	help
 	  Calculate checksum one bit at a time.  This is VERY slow, but has
 	  no lookup table.  This is provided as a debugging option.
 
 	  Only choose this option if you are debugging crc32.
 
 endchoice
 
+config CRC32_ARCH
+	tristate
+	default CRC32 if CRC32_IMPL_ARCH_PLUS_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY1
+
+config CRC32_SLICEBY8
+	bool
+	default y if CRC32_IMPL_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY8
+
+config CRC32_SLICEBY4
+	bool
+	default y if CRC32_IMPL_SLICEBY4
+
+config CRC32_SARWATE
+	bool
+	default y if CRC32_IMPL_SLICEBY1 || CRC32_IMPL_ARCH_PLUS_SLICEBY1
+
+config CRC32_BIT
+	bool
+	default y if CRC32_IMPL_BIT
+
 config CRC64
 	tristate "CRC64 functions"
 	help
 	  This option is provided for the case where no in-kernel-tree
 	  modules require CRC64 functions, but a module built outside
diff --git a/lib/crc32.c b/lib/crc32.c
index c67059b0082b..47151624332e 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -181,39 +181,31 @@  static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
 #endif
 	return crc;
 }
 
 #if CRC_LE_BITS == 1
-u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE);
 }
-u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE);
 }
 #else
-u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, crc32table_le, CRC32_POLY_LE);
 }
-u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
 }
 #endif
-EXPORT_SYMBOL(crc32_le);
-EXPORT_SYMBOL(__crc32c_le);
-
-u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le);
 EXPORT_SYMBOL(crc32_le_base);
-
-u32 __pure crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le);
 EXPORT_SYMBOL(crc32c_le_base);
 
-u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be);
-
 /*
  * This multiplies the polynomials x and y modulo the given modulus.
  * This follows the "little-endian" CRC convention that the lsbit
  * represents the highest power of x, and the msbit represents x^0.
  */
@@ -333,16 +325,16 @@  static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
 # endif
 	return crc;
 }
 
 #if CRC_BE_BITS == 1
-u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE);
 }
 #else
-u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_be_generic(crc, p, len, crc32table_be, CRC32_POLY_BE);
 }
 #endif
-EXPORT_SYMBOL(crc32_be);
+EXPORT_SYMBOL(crc32_be_base);