From 730f4f45ef9205a1597df4040060ce802861a16e Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Fri, 3 Jun 2022 15:54:45 +0200 Subject: [PATCH] Tidy up hardware crc32 implementations --- include/wrenpp/detail/crc32.hpp | 2 +- meson.build | 1 - meson_options.txt | 1 - src/crc32/crc32.cpp | 123 ++++++++-------------------- src/crc32/crc32_neon.cpp | 100 ++++++++++++++++++++++ src/crc32/crc32_sse42.cpp | 66 +++++++++++++++ src/crc32/meson.build | 19 ++++- src/crc32/sse2neon.h | 141 -------------------------------- src/meson.build | 1 - src/pvt_config.h.in | 2 - 10 files changed, 217 insertions(+), 239 deletions(-) create mode 100644 src/crc32/crc32_neon.cpp create mode 100644 src/crc32/crc32_sse42.cpp delete mode 100644 src/crc32/sse2neon.h diff --git a/include/wrenpp/detail/crc32.hpp b/include/wrenpp/detail/crc32.hpp index 00a1fa0..6199d8d 100644 --- a/include/wrenpp/detail/crc32.hpp +++ b/include/wrenpp/detail/crc32.hpp @@ -69,7 +69,7 @@ namespace wren { constexpr inline auto g_polynomial_table = PolynomialTable::table; template - constexpr std::uint32_t crc32 (const char* data, std::size_t len, std::uint32_t crc) { + constexpr std::uint32_t crc32 (const char* data, std::size_t len, std::uint32_t crc) noexcept { //static_assert(g_polynomial_table[0b10000000] == Polynomial); crc ^= XorIn; for (std::size_t z = 0; z < len; ++z) { diff --git a/meson.build b/meson.build index c57f4ab..535e30b 100644 --- a/meson.build +++ b/meson.build @@ -48,7 +48,6 @@ endif conf.set('POINTER_SIZE', ptr_size) conf.set('FUNC_POINTER_SIZE', func_ptr_size) conf.set('WRENPP_NAME', meson.project_name()) -conf.set('WRENPP_WITH_SSE42', get_option('wrenpp_with_sse42')) subdir('include') subdir('src') diff --git a/meson_options.txt b/meson_options.txt index ee47212..2a9a580 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -3,4 +3,3 @@ option('build_examples', type: 'boolean', value: false) option('wren_with_rand', type: 'boolean', value: false) option('wren_with_meta', type: 'boolean', 
value: false) option('wrenpp_with_name_guessing', type: 'boolean', value: true) -option('wrenpp_with_sse42', type: 'boolean', value: true) diff --git a/src/crc32/crc32.cpp b/src/crc32/crc32.cpp index c892de8..1c9270a 100644 --- a/src/crc32/crc32.cpp +++ b/src/crc32/crc32.cpp @@ -16,103 +16,44 @@ */ #include "wrenpp/detail/crc32.hpp" -#include "pvt_config.h" -#if defined(WRENPP_WITH_SSE42) -# if defined(__aarch64__) -extern "C" { -# include "sse2neon.h" -# include -# include -} // extern C -# else -# include -# include -# endif -#endif -#include +#include "crc32_config.h" -#if defined(WRENPP_WITH_SSE42) - // Byte-boundary alignment issues -# define CALC_CRC(op, crc, type, buf, len) \ - do { \ - for (; (len) >= sizeof (type); (len) -= sizeof(type), buf += sizeof (type)) { \ - (crc) = op((crc), *(type *) (buf)); \ - } \ - } while(0) +namespace wren::detail { +#if HAVE_NEON +[[gnu::const]] bool has_crc32_neon() noexcept; +[[gnu::pure]] std::uint32_t crc32c_neon(const char*, std::size_t, std::uint32_t) noexcept; #endif +#if HAVE_SSE42 +[[gnu::const]] bool has_crc32_sse42() noexcept; +[[gnu::pure]] std::uint32_t crc32c_sse42(const char*, std::size_t, std::uint32_t) noexcept; +#endif +} //namespace wren::detail + +namespace wren::detail { + //Some useful polynomials: + // zlib: 0x04C11DB7 + // castagnoli (intel): 0x1EDC6F41 -namespace wren { namespace { - //Some useful polynomials: - // zlib: 0x04C11DB7 - // castagnoli (intel): 0x1EDC6F41 - - /* Compute CRC-32C using the Intel hardware instruction. 
*/ - /* for better parallelization with bigger buffers see - http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */ - std::uint32_t crc32c_hw(const char* input, std::size_t len, std::uint32_t crc) - { -#if defined(WRENPP_WITH_SSE42) - //see https://github.com/rurban/smhasher/blob/master/crc32_hw.c - constexpr std::size_t align_size = alignof(std::uint64_t); - constexpr std::size_t align_mask = align_size - 1; - - // XOR the initial CRC with INT_MAX - //crc ^= 0xFFFFFFFF; - crc = ~crc; - - // Align the input to the word boundary - for (; (len > 0) && (reinterpret_cast(input) & align_mask); len--, input++) { - crc = _mm_crc32_u8(crc, *input); - } - - // Blast off the CRC32 calculation -#if defined(__x86_64__) || defined(__aarch64__) - CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len); -#endif - CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len); - CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len); - CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len); - - // Post-process the crc - return ~crc; -#else - static_cast(input); - static_cast(crc); - static_cast(len); - assert(false); //not available in this build, this code should be unreachable -#endif - } - [[gnu::const]] - bool has_hw_crc32() { -#if defined(WRENPP_WITH_SSE42) -# if defined(__amd64__) || defined(_M_AMD64) - //if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) { - if (__builtin_cpu_supports("sse4.2")) - return true; - else -# elif defined(__aarch64__) - //see - //https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu - if (getauxval(AT_HWCAP) & HWCAP_CRC32) - return true; - else -# endif + auto best_crc32_function() { +#if HAVE_NEON + if (has_crc32_neon()) + return &crc32c_neon; #endif - return false; + +#if HAVE_SSE42 + if (has_crc32_sse42()) + return &crc32c_sse42; +#endif + + return &crc32; } } //unnamed namespace - namespace detail { - [[gnu::const]] - std::uint32_t runtime_crc32c 
(const char* data, std::size_t size, std::uint32_t crc) {
-			static const auto crc32c_implem = (has_hw_crc32() ?
-				&crc32c_hw
-				:
-				&detail::crc32
-			);
-			return (*crc32c_implem)(data, size, crc);
-		}
-	} //namespace detail
-} //namespace wren
+	[[gnu::const]] //NOTE(review): the routines this forwards to read the buffer behind data; GCC's attribute docs reserve gnu::const for functions that do not examine pointed-to data, so gnu::pure looks like the right attribute - confirm, and change it together with any declaration in crc32.hpp
+	std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) { //forwards to the routine returned by best_crc32_function()
+		static const auto crc32c_implem = best_crc32_function(); //function-local static: selected on the first call, reused afterwards
+		return (*crc32c_implem)(data, size, crc);
+	}
+} //namespace wren::detail
diff --git a/src/crc32/crc32_neon.cpp b/src/crc32/crc32_neon.cpp
new file mode 100644
index 0000000..c0131a5
--- /dev/null
+++ b/src/crc32/crc32_neon.cpp
@@ -0,0 +1,100 @@
+/* Copyright 2020-2022, Michele Santullo
+ * This file is part of wrenpp.
+ *
+ * Wrenpp is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Wrenpp is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with wrenpp. If not, see .
+ */
+
+#include <cstdint> //std::uint8_t ... std::uint64_t
+#include <sys/auxv.h> //getauxval(), AT_HWCAP
+#include <asm/hwcap.h> //HWCAP_CRC32
+
+// Byte-boundary alignment issues
+#define CALC_CRC(op, crc, type, buf, len) \
+	do { \
+		for (; (len) >= sizeof (type); (len) -= sizeof(type), buf += sizeof (type)) { \
+			(crc) = op((crc), *(type *) (buf)); \
+		} \
+	} while(0)
+
+namespace wren::detail {
+	//King_DuckZ - adapted from
+	//https://github.com/rurban/smhasher/blob/master/sse2neon.h
+	namespace {
+		[[gnu::always_inline]] //crc32cb: folds the 8-bit value v into crc
+		inline std::uint32_t neon_crc32cb(std::uint32_t crc, std::uint8_t v) {
+			__asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+				: [c] "+r"(crc) : [v] "r"(v));
+			return crc; //crc is the asm's read-write operand; without this return the function has no defined result
+		}
+
+		[[gnu::always_inline]] //crc32ch: folds the 16-bit value v into crc
+		inline std::uint32_t neon_crc32ch (std::uint32_t crc, std::uint16_t v) {
+			__asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+				: [c] "+r"(crc) : [v] "r"(v));
+			return crc;
+		}
+
+		[[gnu::always_inline]] //crc32cw: folds the 32-bit value v into crc
+		inline std::uint32_t neon_crc32cw(std::uint32_t crc, std::uint32_t v) {
+			__asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+				: [c] "+r"(crc) : [v] "r"(v));
+			return crc;
+		}
+
+		[[gnu::always_inline]] //crc32cx: folds the 64-bit value v into crc (crc is accessed through its 32-bit %w view)
+		inline std::uint64_t neon_crc32cx (std::uint64_t crc, std::uint64_t v) {
+			__asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+				: [c] "+r"(crc) : [v] "r"(v));
+			return crc;
+		}
+	} //unnamed namespace
+
+	[[gnu::const]] //true when the HWCAP_CRC32 bit is set in getauxval(AT_HWCAP)
+	bool has_crc32_neon() noexcept {
+		//see
+		//https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu
+		const bool has_hw_crc32 = (getauxval(AT_HWCAP) & HWCAP_CRC32 ? true : false);
+		return has_hw_crc32;
+	}
+
+	/* Compute CRC-32C using the AArch64 crc32cb/crc32ch/crc32cw/crc32cx instructions.
*/
+	/* for better parallelization with bigger buffers see
+	   http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
+	[[gnu::pure]] //input: buffer to checksum, len: its size in bytes, crc: starting value; returns the updated checksum
+	std::uint32_t crc32c_neon(const char* input, std::size_t len, std::uint32_t crc) noexcept {
+		//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
+		constexpr std::size_t align_size = alignof(std::uint64_t);
+		constexpr std::size_t align_mask = align_size - 1;
+
+		// Invert every bit of the initial CRC (same as XOR with 0xFFFFFFFF)
+		//crc ^= 0xFFFFFFFF;
+		crc = ~crc;
+
+		// Consume single bytes until input is aligned for std::uint64_t loads
+		for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
+			crc = neon_crc32cb(crc, *input);
+		}
+
+		// Widest chunks first; each narrower pass mops up what the previous one left
+#if defined(__x86_64__) || defined(__aarch64__)
+		CALC_CRC(neon_crc32cx, crc, std::uint64_t, input, len);
+#endif
+		CALC_CRC(neon_crc32cw, crc, std::uint32_t, input, len);
+		CALC_CRC(neon_crc32ch, crc, std::uint16_t, input, len);
+		CALC_CRC(neon_crc32cb, crc, std::uint8_t, input, len);
+
+		// Post-process the crc: undo the initial inversion
+		return ~crc;
+	}
+} //namespace wren::detail
diff --git a/src/crc32/crc32_sse42.cpp b/src/crc32/crc32_sse42.cpp
new file mode 100644
index 0000000..2894b8c
--- /dev/null
+++ b/src/crc32/crc32_sse42.cpp
@@ -0,0 +1,66 @@
+/* Copyright 2020-2022, Michele Santullo
+ * This file is part of wrenpp.
+ *
+ * Wrenpp is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Wrenpp is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with wrenpp. If not, see .
+ */
+
+#include <cstddef> //std::size_t
+#include <cstdint> //std::uint8_t ... std::uint64_t, std::uintptr_t
+#include <nmmintrin.h> //_mm_crc32_u8/_u16/_u32/_u64
+
+// Byte-boundary alignment issues
+#define CALC_CRC(op, crc, type, buf, len) \
+	do { \
+		for (; (len) >= sizeof (type); (len) -= sizeof(type), buf += sizeof (type)) { \
+			(crc) = op((crc), *(type *) (buf)); \
+		} \
+	} while(0)
+
+namespace wren::detail {
+	[[gnu::const]] //true when __builtin_cpu_supports reports "sse4.2"
+	bool has_crc32_sse42() noexcept {
+		const bool has_hw_crc32 = (__builtin_cpu_supports("sse4.2") ? true : false);
+		return has_hw_crc32;
+	}
+
+	/* Compute CRC-32C using the Intel hardware instruction. */
+	/* for better parallelization with bigger buffers see
+	   http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
+	[[gnu::pure]] //input: buffer to checksum, len: its size in bytes, crc: starting value; returns the updated checksum
+	std::uint32_t crc32c_sse42(const char* input, std::size_t len, std::uint32_t crc) noexcept {
+		//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
+		constexpr std::size_t align_size = alignof(std::uint64_t);
+		constexpr std::size_t align_mask = align_size - 1;
+
+		// Invert every bit of the initial CRC (same as XOR with 0xFFFFFFFF)
+		//crc ^= 0xFFFFFFFF;
+		crc = ~crc;
+
+		// Consume single bytes until input is aligned for std::uint64_t loads
+		for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
+			crc = _mm_crc32_u8(crc, *input);
+		}
+
+		// Widest chunks first; each narrower pass mops up what the previous one left
+#if defined(__x86_64__) || defined(__aarch64__)
+		CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len);
+#endif
+		CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len);
+		CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len);
+		CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len);
+
+		// Post-process the crc: undo the initial inversion
+		return ~crc;
+	}
+} //namespace wren::detail
diff --git a/src/crc32/meson.build b/src/crc32/meson.build
index 88b408a..4ca156c 100644
--- a/src/crc32/meson.build
+++ b/src/crc32/meson.build
@@ -1,5 +1,21 @@
+simd = import('unstable-simd')
 compiler_opts = []
 
+cpp = meson.get_compiler('cpp')
+crc32_simd = simd.check('crc32_hw',
+	sse42: 'crc32_sse42.cpp',
+	neon: 'crc32_neon.cpp',
+	compiler: cpp,
+)
+
+crc32_objs = crc32_simd[0]
+crc32_config
= crc32_simd[1] + +project_config_file = configure_file( + output: 'crc32_config.h', + configuration: crc32_config +) + if get_option('wrenpp_with_sse42') if arch == 'amd64' compiler_opts += ['-msse4.2'] @@ -12,9 +28,10 @@ endif crc32 = static_library('crc32', 'crc32.cpp', - include_directories: [public_incl, src_incl], + include_directories: [public_incl], install: false, cpp_args: compiler_opts + global_compiler_opts, + link_with: crc32_objs, ) crc32_dep = declare_dependency( diff --git a/src/crc32/sse2neon.h b/src/crc32/sse2neon.h deleted file mode 100644 index 9a4afb9..0000000 --- a/src/crc32/sse2neon.h +++ /dev/null @@ -1,141 +0,0 @@ -#ifndef SSE2NEON_H -#define SSE2NEON_H - -// This header file provides a simple API translation layer -// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions -// -// This header file does not yet translate all of the SSE intrinsics. -// -// Contributors to this work are: -// John W. Ratcliff -// Brandon Rowlett -// Ken Fast -// Eric van Beurden -// Alexander Potylitsin -// Hasindu Gamaarachchi -// Jim Huang -// Mark Cheng -// Malcolm James MacLeod -// Devin Hussey (easyaspi314) -// Sebastian Pop -// Developer Ecosystem Engineering -// Danila Kutenin -// François Turban (JishinMaster) -// Pei-Hsuan Hung -// Yang-Hao Yuan -// Syoyo Fujita -// Brecht Van Lommel - -/* - * sse2neon is freely redistributable under the MIT License. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -//King_DuckZ -//trimmed down version of -//https://github.com/rurban/smhasher/blob/master/sse2neon.h - -#if defined(__GNUC__) || defined(__clang__) -#pragma push_macro("FORCE_INLINE") -#pragma push_macro("ALIGN_STRUCT") -#ifndef FORCE_INLINE -#define FORCE_INLINE static inline __attribute__((always_inline)) -#endif -#else -#error "Macro name collisions may happen with unsupported compiler." -#ifdef FORCE_INLINE -#undef FORCE_INLINE -#endif -#define FORCE_INLINE static inline -#endif - -#include -#include - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 8-bit integer v. 
-// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc ^= v; - for (int bit = 0; bit < 8; bit++) { - if (crc & 1) - crc = (crc >> 1) ^ UINT32_C(0x82f63b78); - else - crc = (crc >> 1); - } -#endif - return crc; -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 16-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc = _mm_crc32_u8(crc, v & 0xff); - crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); -#endif - return crc; -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 32-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc = _mm_crc32_u16(crc, v & 0xffff); - crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); -#endif - return crc; -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 64-bit integer v. 
-// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) -FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); - crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); -#endif - return crc; -} - -#endif diff --git a/src/meson.build b/src/meson.build index e6bc345..28627c7 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,4 +1,3 @@ -src_incl = include_directories('.') subdir('crc32') project_config_file = configure_file( diff --git a/src/pvt_config.h.in b/src/pvt_config.h.in index 259dfa4..15e1d98 100644 --- a/src/pvt_config.h.in +++ b/src/pvt_config.h.in @@ -27,5 +27,3 @@ static_assert(sizeof(void*) == ASM_PTR_SIZE, "Build system reports an unexpected static_assert(sizeof(void(*)(int)) == ASM_FUNC_PTR_SIZE, "Build system reports an unexpected function pointer size, please ensure assembly code is correct"); #endif - -#mesondefine WRENPP_WITH_SSE42