Improve runtime crc32 code

Check for SSE4.2 only once and store the best available crc32 implementation in a static function pointer. Also includes other minor improvements.
2022-05-24 13:39:45 +02:00 · 2022-05-24 13:39:45 +02:00 · e97c7bad10
commit e97c7bad10
parent e200288d06
2 changed files with 39 additions and 25 deletions
--- a/include/wrenpp/detail/crc32.hpp
+++ b/include/wrenpp/detail/crc32.hpp
@ -29,11 +29,13 @@ namespace wren {
 		constexpr std::uint32_t g_castagnoli_polynomial = 0x1EDC6F41;
 		/// Returns the byte `b` with its eight bits in reversed order
 		/// (bit 0 trades places with bit 7, bit 1 with bit 6, and so on).
 		[[gnu::const]]
 		constexpr std::uint8_t reverse (std::uint8_t b) {
 			//technique from https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
 			constexpr std::uint64_t spread = 0x80200802ULL;
 			constexpr std::uint64_t select = 0x0884422110ULL;
 			constexpr std::uint64_t gather = 0x0101010101ULL;
 			const std::uint64_t scattered = (b * spread) & select;
 			//after the shift the result sits in the low byte; the cast drops everything above it
 			return static_cast<std::uint8_t>((scattered * gather) >> 32);
 		}
 		[[gnu::const]]
 		constexpr std::uint32_t reverse (std::uint32_t val) {
 			return (reverse(static_cast<std::uint8_t>(val & 0xff)) << 24) |
 				(reverse(static_cast<std::uint8_t>(val >> 8 & 0xff)) << 16) |
@ -87,9 +89,9 @@ namespace wren {
 		}
 	}
-	template <std::size_t N>
+	template <typename T, std::size_t N>
 	[[gnu::const]]
-	constexpr std::uint32_t crc32c (const char (&data)[N]) {
+	constexpr std::uint32_t crc32c (const T (&data)[N]) {
-		return crc32c(data, N);
+		return crc32c(static_cast<const char*>(data), N * sizeof(T));
 	}
 } //namespace wren
--- a/src/crc32.cpp
+++ b/src/crc32.cpp
@ -25,6 +25,7 @@
 #		include <immintrin.h>
 #	endif
 #endif
 #include <cassert>
 #if defined(WRENPP_WITH_SSE42)
 	// Byte-boundary alignment issues
@ -42,53 +43,64 @@ namespace wren {
 		// zlib: 0x04C11DB7
 		// castagnoli (intel):  0x1EDC6F41
 #if defined(WRENPP_WITH_SSE42)
 		/* Compute CRC-32C using the Intel hardware instruction. */
 		/* for better parallelization with bigger buffers see
 		   http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
-		std::uint32_t crc32c_hw(const void *input, std::size_t len, std::uint32_t crc)
+		std::uint32_t crc32c_hw(const char* input, std::size_t len, std::uint32_t crc)
 		{
 #if defined(WRENPP_WITH_SSE42)
 			//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
 			constexpr std::size_t align_size = alignof(std::uint64_t);
 			constexpr std::size_t align_mask = align_size - 1;
 			const char* buf = static_cast<const char*>(input);
 			// Invert the initial CRC (XOR with 0xFFFFFFFF, i.e. all 32 bits set)
-			crc ^= 0xFFFFFFFF;
+			//crc ^= 0xFFFFFFFF;
 			crc = ~crc;
 			// Align the input to the word boundary
-			for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(buf) & align_mask); len--, buf++) {
+			for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
-				crc = _mm_crc32_u8(crc, *buf);
+				crc = _mm_crc32_u8(crc, *input);
 			}
 			// Blast off the CRC32 calculation
 #if defined(__x86_64__) || defined(__aarch64__)
-			CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, buf, len);
+			CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len);
 #endif
-			CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, buf, len);
+			CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len);
-			CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, buf, len);
+			CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len);
-			CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, buf, len);
+			CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len);
 			// Post-process the crc
 			return ~crc;
-		}
+#else
 			static_cast<void>(input);
 			static_cast<void>(crc);
 			static_cast<void>(len);
 			assert(false); //not available in this build, this code should be unreachable
 #endif
 		}
 		/// Tells whether the hardware CRC32 path may be used: true only when
 		/// WRENPP_WITH_SSE42 is defined and __builtin_cpu_supports("sse4.2")
 		/// returns non-zero; false in every other case.
 		[[gnu::const]]
 		bool has_hw_crc32() {
 #if defined(WRENPP_WITH_SSE42)
 			//see also: _may_i_use_cpu_feature(_FEATURE_SSE4_2)
 			return __builtin_cpu_supports("sse4.2") != 0;
 #else
 			//no SSE4.2 code in this build, so there is nothing to probe for
 			return false;
 #endif
 		}
 	} //unnamed namespace
 	namespace detail {
 		[[gnu::const]]
 		std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) {
-#if defined(WRENPP_WITH_SSE42)
+			static const auto crc32c_implem = (has_hw_crc32() ?
-			//if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) {
+				&crc32c_hw
-			if (__builtin_cpu_supports("sse4.2")) {
+			:
-				return crc32c_hw(data, size, crc);
+				&detail::crc32<detail::g_castagnoli_polynomial>
-			}
+			);
-			else
+			return (*crc32c_implem)(data, size, crc);
 #endif
 			{
 				return detail::crc32<detail::g_castagnoli_polynomial>(data, size, crc);
 			}
 		}
 	} //namespace detail
 } //namespace wren