Improve runtime crc32 code
Check for SSE4.2 support only once and store the best available crc32 implementation in a static function pointer. Also includes other minor improvements.
This commit is contained in:
parent
e200288d06
commit
e97c7bad10
2 changed files with 39 additions and 25 deletions
|
@ -29,11 +29,13 @@ namespace wren {
|
||||||
|
|
||||||
constexpr std::uint32_t g_castagnoli_polynomial = 0x1EDC6F41;
|
constexpr std::uint32_t g_castagnoli_polynomial = 0x1EDC6F41;
|
||||||
|
|
||||||
|
[[gnu::const]]
|
||||||
constexpr std::uint8_t reverse (std::uint8_t b) {
|
constexpr std::uint8_t reverse (std::uint8_t b) {
|
||||||
//see https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
|
//see https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
|
||||||
return ((b * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
|
return ((b * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[[gnu::const]]
|
||||||
constexpr std::uint32_t reverse (std::uint32_t val) {
|
constexpr std::uint32_t reverse (std::uint32_t val) {
|
||||||
return (reverse(static_cast<std::uint8_t>(val & 0xff)) << 24) |
|
return (reverse(static_cast<std::uint8_t>(val & 0xff)) << 24) |
|
||||||
(reverse(static_cast<std::uint8_t>(val >> 8 & 0xff)) << 16) |
|
(reverse(static_cast<std::uint8_t>(val >> 8 & 0xff)) << 16) |
|
||||||
|
@ -87,9 +89,9 @@ namespace wren {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <std::size_t N>
|
template <typename T, std::size_t N>
|
||||||
[[gnu::const]]
|
[[gnu::const]]
|
||||||
constexpr std::uint32_t crc32c (const char (&data)[N]) {
|
constexpr std::uint32_t crc32c (const T (&data)[N]) {
|
||||||
return crc32c(data, N);
|
return crc32c(static_cast<const char*>(data), N * sizeof(T));
|
||||||
}
|
}
|
||||||
} //namespace wren
|
} //namespace wren
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
# include <immintrin.h>
|
# include <immintrin.h>
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
#include <cassert>
|
||||||
|
|
||||||
#if defined(WRENPP_WITH_SSE42)
|
#if defined(WRENPP_WITH_SSE42)
|
||||||
// Byte-boundary alignment issues
|
// Byte-boundary alignment issues
|
||||||
|
@ -42,53 +43,64 @@ namespace wren {
|
||||||
// zlib: 0x04C11DB7
|
// zlib: 0x04C11DB7
|
||||||
// castagnoli (intel): 0x1EDC6F41
|
// castagnoli (intel): 0x1EDC6F41
|
||||||
|
|
||||||
#if defined(WRENPP_WITH_SSE42)
|
|
||||||
/* Compute CRC-32C using the Intel hardware instruction. */
|
/* Compute CRC-32C using the Intel hardware instruction. */
|
||||||
/* for better parallelization with bigger buffers see
|
/* for better parallelization with bigger buffers see
|
||||||
http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
|
http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
|
||||||
std::uint32_t crc32c_hw(const void *input, std::size_t len, std::uint32_t crc)
|
std::uint32_t crc32c_hw(const char* input, std::size_t len, std::uint32_t crc)
|
||||||
{
|
{
|
||||||
|
#if defined(WRENPP_WITH_SSE42)
|
||||||
//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
|
//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
|
||||||
constexpr std::size_t align_size = alignof(std::uint64_t);
|
constexpr std::size_t align_size = alignof(std::uint64_t);
|
||||||
constexpr std::size_t align_mask = align_size - 1;
|
constexpr std::size_t align_mask = align_size - 1;
|
||||||
|
|
||||||
const char* buf = static_cast<const char*>(input);
|
|
||||||
|
|
||||||
// XOR the initial CRC with INT_MAX
|
// XOR the initial CRC with INT_MAX
|
||||||
crc ^= 0xFFFFFFFF;
|
//crc ^= 0xFFFFFFFF;
|
||||||
|
crc = ~crc;
|
||||||
|
|
||||||
// Align the input to the word boundary
|
// Align the input to the word boundary
|
||||||
for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(buf) & align_mask); len--, buf++) {
|
for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
|
||||||
crc = _mm_crc32_u8(crc, *buf);
|
crc = _mm_crc32_u8(crc, *input);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Blast off the CRC32 calculation
|
// Blast off the CRC32 calculation
|
||||||
#if defined(__x86_64__) || defined(__aarch64__)
|
#if defined(__x86_64__) || defined(__aarch64__)
|
||||||
CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, buf, len);
|
CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len);
|
||||||
#endif
|
#endif
|
||||||
CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, buf, len);
|
CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len);
|
||||||
CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, buf, len);
|
CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len);
|
||||||
CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, buf, len);
|
CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len);
|
||||||
|
|
||||||
// Post-process the crc
|
// Post-process the crc
|
||||||
return ~crc;
|
return ~crc;
|
||||||
}
|
#else
|
||||||
|
static_cast<void>(input);
|
||||||
|
static_cast<void>(crc);
|
||||||
|
static_cast<void>(len);
|
||||||
|
assert(false); //not available in this build, this code should be unreachable
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
[[gnu::const]]
|
||||||
|
bool has_hw_crc32() {
|
||||||
|
#if defined(WRENPP_WITH_SSE42)
|
||||||
|
//if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) {
|
||||||
|
if (__builtin_cpu_supports("sse4.2"))
|
||||||
|
return true;
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
return false;
|
||||||
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
namespace detail {
|
namespace detail {
|
||||||
[[gnu::const]]
|
[[gnu::const]]
|
||||||
std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) {
|
std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) {
|
||||||
#if defined(WRENPP_WITH_SSE42)
|
static const auto crc32c_implem = (has_hw_crc32() ?
|
||||||
//if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) {
|
&crc32c_hw
|
||||||
if (__builtin_cpu_supports("sse4.2")) {
|
:
|
||||||
return crc32c_hw(data, size, crc);
|
&detail::crc32<detail::g_castagnoli_polynomial>
|
||||||
}
|
);
|
||||||
else
|
return (*crc32c_implem)(data, size, crc);
|
||||||
#endif
|
|
||||||
{
|
|
||||||
return detail::crc32<detail::g_castagnoli_polynomial>(data, size, crc);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} //namespace detail
|
} //namespace detail
|
||||||
} //namespace wren
|
} //namespace wren
|
||||||
|
|
Loading…
Reference in a new issue