Improve runtime crc32 code

Check for SSE4.2 support only once and store the best
available crc32 implementation in a static
function pointer.
Also includes other minor improvements.
This commit is contained in:
King_DuckZ 2022-05-24 13:39:45 +02:00
parent e200288d06
commit e97c7bad10
2 changed files with 39 additions and 25 deletions

View file

@ -29,11 +29,13 @@ namespace wren {
//CRC-32C (Castagnoli) polynomial, passed as the template argument that
//selects the CRC flavour
constexpr std::uint32_t g_castagnoli_polynomial = 0x1EDC6F41;
//Returns b with the order of its 8 bits reversed (bit 0 swaps with bit 7,
//bit 1 with bit 6 and so on).
[[gnu::const]]
constexpr std::uint8_t reverse (std::uint8_t b) {
	//see https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
	//The 64-bit multiply/mask/multiply sequence leaves the reversed byte in
	//the low 8 bits after the shift; anything above that is leftover from
	//the multiplication and is dropped by the conversion to uint8_t.
	return static_cast<std::uint8_t>(
		((b * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
	);
}
[[gnu::const]]
constexpr std::uint32_t reverse (std::uint32_t val) { constexpr std::uint32_t reverse (std::uint32_t val) {
return (reverse(static_cast<std::uint8_t>(val & 0xff)) << 24) | return (reverse(static_cast<std::uint8_t>(val & 0xff)) << 24) |
(reverse(static_cast<std::uint8_t>(val >> 8 & 0xff)) << 16) | (reverse(static_cast<std::uint8_t>(val >> 8 & 0xff)) << 16) |
@ -87,9 +89,9 @@ namespace wren {
} }
} }
template <std::size_t N> template <typename T, std::size_t N>
[[gnu::const]] [[gnu::const]]
constexpr std::uint32_t crc32c (const char (&data)[N]) { constexpr std::uint32_t crc32c (const T (&data)[N]) {
return crc32c(data, N); return crc32c(static_cast<const char*>(data), N * sizeof(T));
} }
} //namespace wren } //namespace wren

View file

@ -25,6 +25,7 @@
# include <immintrin.h> # include <immintrin.h>
# endif # endif
#endif #endif
#include <cassert>
#if defined(WRENPP_WITH_SSE42) #if defined(WRENPP_WITH_SSE42)
// Byte-boundary alignment issues // Byte-boundary alignment issues
@ -42,53 +43,64 @@ namespace wren {
// zlib: 0x04C11DB7 // zlib: 0x04C11DB7
// castagnoli (intel): 0x1EDC6F41 // castagnoli (intel): 0x1EDC6F41
/* Compute CRC-32C using the Intel hardware instruction. */
/* for better parallelization with bigger buffers see
   http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
// input: first byte to checksum
// len:   number of bytes to read starting at input
// crc:   initial CRC value
// The function is always defined so its address can be taken, but the real
// implementation only exists when built with WRENPP_WITH_SSE42. In other
// builds it must never be called.
std::uint32_t crc32c_hw(const char* input, std::size_t len, std::uint32_t crc)
{
#if defined(WRENPP_WITH_SSE42)
	//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
	constexpr std::size_t align_size = alignof(std::uint64_t);
	constexpr std::size_t align_mask = align_size - 1;

	// Invert every bit of the initial CRC (same as crc ^ 0xFFFFFFFF)
	crc = ~crc;

	// Consume single bytes until input sits on a uint64_t boundary
	for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
		crc = _mm_crc32_u8(crc, *input);
	}

	// Blast off the CRC32 calculation, widest word first.
	// CALC_CRC is a macro defined elsewhere in this file; presumably each
	// invocation advances input and decreases len - TODO confirm
#if defined(__x86_64__) || defined(__aarch64__)
	CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len);
#endif
	CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len);
	CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len);
	CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len);

	// Post-process the crc
	return ~crc;
#else
	static_cast<void>(input);
	static_cast<void>(len);
	assert(false); //not available in this build, this code should be unreachable
	// With NDEBUG the assert above disappears; return a value so that the
	// function never flows off its end, which would be undefined behaviour.
	return crc;
#endif
}
//Tells whether the hardware CRC32 path can be used: true only when the
//build has WRENPP_WITH_SSE42 and the running CPU reports sse4.2.
[[gnu::const]]
bool has_hw_crc32() {
#if defined(WRENPP_WITH_SSE42)
	//alternative check: _may_i_use_cpu_feature(_FEATURE_SSE4_2)
	const bool sse42_reported = (__builtin_cpu_supports("sse4.2") != 0);
	return sse42_reported;
#else
	return false;
#endif
}
} //unnamed namespace } //unnamed namespace
namespace detail { namespace detail {
[[gnu::const]] [[gnu::const]]
std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) { std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) {
#if defined(WRENPP_WITH_SSE42) static const auto crc32c_implem = (has_hw_crc32() ?
//if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) { &crc32c_hw
if (__builtin_cpu_supports("sse4.2")) { :
return crc32c_hw(data, size, crc); &detail::crc32<detail::g_castagnoli_polynomial>
} );
else return (*crc32c_implem)(data, size, crc);
#endif
{
return detail::crc32<detail::g_castagnoli_polynomial>(data, size, crc);
}
} }
} //namespace detail } //namespace detail
} //namespace wren } //namespace wren