From e97c7bad109fb2df520ee05007dafc5fc34d6ac3 Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Tue, 24 May 2022 13:39:45 +0200 Subject: [PATCH] Improve runtime crc32 code Only check for sse4.2 once and store the best crc32 implementation available into a static function pointer. Also other minor improvements. --- include/wrenpp/detail/crc32.hpp | 8 +++-- src/crc32.cpp | 56 ++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/include/wrenpp/detail/crc32.hpp b/include/wrenpp/detail/crc32.hpp index 8d38624..00a1fa0 100644 --- a/include/wrenpp/detail/crc32.hpp +++ b/include/wrenpp/detail/crc32.hpp @@ -29,11 +29,13 @@ namespace wren { constexpr std::uint32_t g_castagnoli_polynomial = 0x1EDC6F41; + [[gnu::const]] constexpr std::uint8_t reverse (std::uint8_t b) { //see https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits return ((b * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; } + [[gnu::const]] constexpr std::uint32_t reverse (std::uint32_t val) { return (reverse(static_cast(val & 0xff)) << 24) | (reverse(static_cast(val >> 8 & 0xff)) << 16) | @@ -87,9 +89,9 @@ namespace wren { } } - template + template [[gnu::const]] - constexpr std::uint32_t crc32c (const char (&data)[N]) { - return crc32c(data, N); + constexpr std::uint32_t crc32c (const T (&data)[N]) { + return crc32c(static_cast(data), N * sizeof(T)); } } //namespace wren diff --git a/src/crc32.cpp b/src/crc32.cpp index f8962bb..f8dde26 100644 --- a/src/crc32.cpp +++ b/src/crc32.cpp @@ -25,6 +25,7 @@ # include # endif #endif +#include #if defined(WRENPP_WITH_SSE42) // Byte-boundary alignment issues @@ -42,53 +43,64 @@ namespace wren { // zlib: 0x04C11DB7 // castagnoli (intel): 0x1EDC6F41 -#if defined(WRENPP_WITH_SSE42) /* Compute CRC-32C using the Intel hardware instruction. */ /* for better parallelization with bigger buffers see http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */ - std::uint32_t crc32c_hw(const void *input, std::size_t len, std::uint32_t crc) + std::uint32_t crc32c_hw(const char* input, std::size_t len, std::uint32_t crc) { +#if defined(WRENPP_WITH_SSE42) //see https://github.com/rurban/smhasher/blob/master/crc32_hw.c constexpr std::size_t align_size = alignof(std::uint64_t); constexpr std::size_t align_mask = align_size - 1; - const char* buf = static_cast(input); - // XOR the initial CRC with INT_MAX - crc ^= 0xFFFFFFFF; + //crc ^= 0xFFFFFFFF; + crc = ~crc; // Align the input to the word boundary - for (; (len > 0) && (reinterpret_cast(buf) & align_mask); len--, buf++) { - crc = _mm_crc32_u8(crc, *buf); + for (; (len > 0) && (reinterpret_cast(input) & align_mask); len--, input++) { + crc = _mm_crc32_u8(crc, *input); } // Blast off the CRC32 calculation #if defined(__x86_64__) || defined(__aarch64__) - CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, buf, len); + CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len); #endif - CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, buf, len); - CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, buf, len); - CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, buf, len); + CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len); + CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len); + CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len); // Post-process the crc return ~crc; - } +#else + static_cast(input); + static_cast(crc); + static_cast(len); + assert(false); //not available in this build, this code should be unreachable #endif + } + + [[gnu::const]] + bool has_hw_crc32() { +#if defined(WRENPP_WITH_SSE42) + //if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) { + if (__builtin_cpu_supports("sse4.2")) + return true; + else +#endif + return false; + } } //unnamed namespace namespace detail { [[gnu::const]] std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) { -#if defined(WRENPP_WITH_SSE42) - //if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) { - if (__builtin_cpu_supports("sse4.2")) { - return crc32c_hw(data, size, crc); - } - else -#endif - { - return detail::crc32(data, size, crc); - } + static const auto crc32c_implem = (has_hw_crc32() ? + &crc32c_hw + : + &detail::crc32 + ); + return (*crc32c_implem)(data, size, crc); } } //namespace detail } //namespace wren