Improve runtime crc32 code
Check for SSE4.2 support only once and store the best available crc32 implementation in a static function pointer. Also includes other minor improvements.
This commit is contained in:
parent
e200288d06
commit
e97c7bad10
2 changed files with 39 additions and 25 deletions
|
@ -29,11 +29,13 @@ namespace wren {
|
|||
|
||||
// Generator polynomial selected for the crc32c (Castagnoli) routines; it is
// passed as the template argument to detail::crc32<> by the runtime code.
constexpr std::uint32_t g_castagnoli_polynomial = 0x1EDC6F41;
|
||||
|
||||
/// Reverses the order of the 8 bits in a byte (bit 0 swaps with bit 7,
/// bit 1 with bit 6, and so on).
///
/// @param b  byte whose bits are to be mirrored
/// @return   the bit-mirrored byte
[[gnu::const]]
constexpr std::uint8_t reverse (std::uint8_t b) {
	//see https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
	// The 64-bit product/mask/product sequence leaves the mirrored byte in
	// bits 32..39; the shift brings it down and the cast deliberately drops
	// everything above the low 8 bits.
	return static_cast<std::uint8_t>(
		((b * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
	);
}
|
||||
|
||||
[[gnu::const]]
|
||||
constexpr std::uint32_t reverse (std::uint32_t val) {
|
||||
return (reverse(static_cast<std::uint8_t>(val & 0xff)) << 24) |
|
||||
(reverse(static_cast<std::uint8_t>(val >> 8 & 0xff)) << 16) |
|
||||
|
@ -87,9 +89,9 @@ namespace wren {
|
|||
}
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
template <typename T, std::size_t N>
|
||||
[[gnu::const]]
|
||||
constexpr std::uint32_t crc32c (const char (&data)[N]) {
|
||||
return crc32c(data, N);
|
||||
constexpr std::uint32_t crc32c (const T (&data)[N]) {
|
||||
return crc32c(static_cast<const char*>(data), N * sizeof(T));
|
||||
}
|
||||
} //namespace wren
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
# include <immintrin.h>
|
||||
# endif
|
||||
#endif
|
||||
#include <cassert>
|
||||
|
||||
#if defined(WRENPP_WITH_SSE42)
|
||||
// Byte-boundary alignment issues
|
||||
|
@ -42,53 +43,64 @@ namespace wren {
|
|||
// zlib: 0x04C11DB7
|
||||
// castagnoli (intel): 0x1EDC6F41
|
||||
|
||||
/* Compute CRC-32C using the Intel hardware instruction. */
/* for better parallelization with bigger buffers see
   http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
/* input: first byte of the buffer to checksum
   len:   number of bytes to read from input
   crc:   starting CRC value; it is bit-inverted before and after processing
   Returns the updated CRC. In builds without WRENPP_WITH_SSE42 this function
   must never be called: it asserts, and returns crc unchanged if assertions
   are disabled. */
std::uint32_t crc32c_hw(const char* input, std::size_t len, std::uint32_t crc)
{
#if defined(WRENPP_WITH_SSE42)
	//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
	constexpr std::size_t align_size = alignof(std::uint64_t);
	constexpr std::size_t align_mask = align_size - 1;

	// Pre-invert the incoming CRC (same as XOR with 0xFFFFFFFF)
	crc = ~crc;

	// Consume single bytes until input sits on a uint64_t boundary
	for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
		crc = _mm_crc32_u8(crc, *input);
	}

	// Blast off the CRC32 calculation, widest word first.
	// NOTE(review): CALC_CRC is defined outside this block; presumably each
	// invocation consumes as many whole words of the given type as remain in
	// len and advances input accordingly -- confirm against the macro.
#if defined(__x86_64__) || defined(__aarch64__)
	CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len);
#endif
	CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len);
	CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len);
	CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len);

	// Post-process the crc
	return ~crc;
#else
	static_cast<void>(input);
	static_cast<void>(len);
	assert(false); //not available in this build, this code should be unreachable
	// Still return a value: with NDEBUG the assert vanishes, and flowing off
	// the end of a non-void function is undefined behaviour.
	return crc;
#endif
}
|
||||
|
||||
/// Tells whether the hardware CRC32 path may be used.
///
/// @return true only when the build defines WRENPP_WITH_SSE42 and the
///         compiler's CPU feature query reports SSE4.2; false in every
///         other case.
[[gnu::const]]
bool has_hw_crc32() {
#if defined(WRENPP_WITH_SSE42)
	// Intel compiler equivalent: _may_i_use_cpu_feature(_FEATURE_SSE4_2)
	return __builtin_cpu_supports("sse4.2") != 0;
#else
	return false;
#endif
}
|
||||
} //unnamed namespace
|
||||
|
||||
namespace detail {
|
||||
[[gnu::const]]
|
||||
std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) {
|
||||
#if defined(WRENPP_WITH_SSE42)
|
||||
//if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) {
|
||||
if (__builtin_cpu_supports("sse4.2")) {
|
||||
return crc32c_hw(data, size, crc);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
return detail::crc32<detail::g_castagnoli_polynomial>(data, size, crc);
|
||||
}
|
||||
static const auto crc32c_implem = (has_hw_crc32() ?
|
||||
&crc32c_hw
|
||||
:
|
||||
&detail::crc32<detail::g_castagnoli_polynomial>
|
||||
);
|
||||
return (*crc32c_implem)(data, size, crc);
|
||||
}
|
||||
} //namespace detail
|
||||
} //namespace wren
|
||||
|
|
Loading…
Reference in a new issue