Improve runtime crc32 code

Check for SSE4.2 support only once and store the best
available crc32 implementation in a static
function pointer.
Also includes other minor improvements.
This commit is contained in:
King_DuckZ 2022-05-24 13:39:45 +02:00
parent e200288d06
commit e97c7bad10
2 changed files with 39 additions and 25 deletions

View file

@ -29,11 +29,13 @@ namespace wren {
constexpr std::uint32_t g_castagnoli_polynomial = 0x1EDC6F41;
[[gnu::const]]
constexpr std::uint8_t reverse (std::uint8_t b) {
	// Reverses the bit order of a single byte using three 64-bit operations
	// (multiply, mask, multiply), as described at
	// https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
	const unsigned long long fanned_out = b * 0x80200802ULL;
	const unsigned long long selected = fanned_out & 0x0884422110ULL;
	const unsigned long long gathered = selected * 0x0101010101ULL;

	// The reversed byte sits in bits 32..39; the cast discards everything above.
	return static_cast<std::uint8_t>(gathered >> 32);
}
[[gnu::const]]
constexpr std::uint32_t reverse (std::uint32_t val) {
return (reverse(static_cast<std::uint8_t>(val & 0xff)) << 24) |
(reverse(static_cast<std::uint8_t>(val >> 8 & 0xff)) << 16) |
@ -87,9 +89,9 @@ namespace wren {
}
}
template <std::size_t N>
template <typename T, std::size_t N>
[[gnu::const]]
constexpr std::uint32_t crc32c (const char (&data)[N]) {
return crc32c(data, N);
constexpr std::uint32_t crc32c (const T (&data)[N]) {
return crc32c(static_cast<const char*>(data), N * sizeof(T));
}
} //namespace wren

View file

@ -25,6 +25,7 @@
# include <immintrin.h>
# endif
#endif
#include <cassert>
#if defined(WRENPP_WITH_SSE42)
// Byte-boundary alignment issues
@ -42,53 +43,64 @@ namespace wren {
// zlib: 0x04C11DB7
// castagnoli (intel): 0x1EDC6F41
#if defined(WRENPP_WITH_SSE42)
/* Compute CRC-32C using the Intel hardware instruction. */
/* for better parallelization with bigger buffers see
http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
std::uint32_t crc32c_hw(const void *input, std::size_t len, std::uint32_t crc)
std::uint32_t crc32c_hw(const char* input, std::size_t len, std::uint32_t crc)
{
#if defined(WRENPP_WITH_SSE42)
//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
constexpr std::size_t align_size = alignof(std::uint64_t);
constexpr std::size_t align_mask = align_size - 1;
const char* buf = static_cast<const char*>(input);
// Invert the initial CRC (equivalent to XOR with 0xFFFFFFFF)
crc ^= 0xFFFFFFFF;
//crc ^= 0xFFFFFFFF;
crc = ~crc;
// Align the input to the word boundary
for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(buf) & align_mask); len--, buf++) {
crc = _mm_crc32_u8(crc, *buf);
for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
crc = _mm_crc32_u8(crc, *input);
}
// Blast off the CRC32 calculation
#if defined(__x86_64__) || defined(__aarch64__)
CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, buf, len);
CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len);
#endif
CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, buf, len);
CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, buf, len);
CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, buf, len);
CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len);
CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len);
CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len);
// Post-process the crc
return ~crc;
}
#else
static_cast<void>(input);
static_cast<void>(crc);
static_cast<void>(len);
assert(false); //not available in this build, this code should be unreachable
#endif
}
[[gnu::const]]
bool has_hw_crc32() {
	// Reports whether the hardware crc32 path may be used: true only when the
	// build defines WRENPP_WITH_SSE42 and the compiler builtin reports
	// "sse4.2" support; false otherwise.
#if defined(WRENPP_WITH_SSE42)
	//alternative check: _may_i_use_cpu_feature(_FEATURE_SSE4_2)
	return __builtin_cpu_supports("sse4.2") != 0;
#else
	return false;
#endif
}
} //unnamed namespace
namespace detail {
[[gnu::const]]
std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) {
#if defined(WRENPP_WITH_SSE42)
//if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) {
if (__builtin_cpu_supports("sse4.2")) {
return crc32c_hw(data, size, crc);
}
else
#endif
{
return detail::crc32<detail::g_castagnoli_polynomial>(data, size, crc);
}
static const auto crc32c_implem = (has_hw_crc32() ?
&crc32c_hw
:
&detail::crc32<detail::g_castagnoli_polynomial>
);
return (*crc32c_implem)(data, size, crc);
}
} //namespace detail
} //namespace wren