Tidy up hardware crc32 implementations

This commit is contained in:
King_DuckZ 2022-06-03 15:54:45 +02:00
parent 9566553856
commit 730f4f45ef
10 changed files with 217 additions and 239 deletions

View File

@ -69,7 +69,7 @@ namespace wren {
constexpr inline auto g_polynomial_table = PolynomialTable<Polynomial>::table;
template <std::uint32_t Polynomial, std::uint32_t XorIn=0xffffffff, std::uint32_t XorOut=XorIn>
constexpr std::uint32_t crc32 (const char* data, std::size_t len, std::uint32_t crc) {
constexpr std::uint32_t crc32 (const char* data, std::size_t len, std::uint32_t crc) noexcept {
//static_assert(g_polynomial_table<Polynomial>[0b10000000] == Polynomial);
crc ^= XorIn;
for (std::size_t z = 0; z < len; ++z) {

View File

@ -48,7 +48,6 @@ endif
conf.set('POINTER_SIZE', ptr_size)
conf.set('FUNC_POINTER_SIZE', func_ptr_size)
conf.set('WRENPP_NAME', meson.project_name())
conf.set('WRENPP_WITH_SSE42', get_option('wrenpp_with_sse42'))
subdir('include')
subdir('src')

View File

@ -3,4 +3,3 @@ option('build_examples', type: 'boolean', value: false)
option('wren_with_rand', type: 'boolean', value: false)
option('wren_with_meta', type: 'boolean', value: false)
option('wrenpp_with_name_guessing', type: 'boolean', value: true)
option('wrenpp_with_sse42', type: 'boolean', value: true)

View File

@ -16,103 +16,44 @@
*/
#include "wrenpp/detail/crc32.hpp"
#include "pvt_config.h"
#if defined(WRENPP_WITH_SSE42)
# if defined(__aarch64__)
extern "C" {
# include "sse2neon.h"
# include <sys/auxv.h>
# include <asm/hwcap.h>
} // extern C
# else
# include <smmintrin.h>
# include <immintrin.h>
# endif
#endif
#include <cassert>
#include "crc32_config.h"
#if defined(WRENPP_WITH_SSE42)
// Byte-boundary alignment issues
# define CALC_CRC(op, crc, type, buf, len) \
do { \
for (; (len) >= sizeof (type); (len) -= sizeof(type), buf += sizeof (type)) { \
(crc) = op((crc), *(type *) (buf)); \
} \
} while(0)
namespace wren::detail {
#if HAVE_NEON
[[gnu::const]] bool has_crc32_neon() noexcept;
[[gnu::pure]] std::uint32_t crc32c_neon(const char*, std::size_t, std::uint32_t) noexcept;
#endif
#if HAVE_SSE42
[[gnu::const]] bool has_crc32_sse42() noexcept;
[[gnu::pure]] std::uint32_t crc32c_sse42(const char*, std::size_t, std::uint32_t) noexcept;
#endif
} //namespace wren::detail
namespace wren::detail {
//Some useful polynomials:
// zlib: 0x04C11DB7
// castagnoli (intel): 0x1EDC6F41
namespace wren {
namespace {
//Some useful polynomials:
// zlib: 0x04C11DB7
// castagnoli (intel): 0x1EDC6F41
/* Compute CRC-32C using the Intel hardware instruction. */
/* for better parallelization with bigger buffers see
http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
std::uint32_t crc32c_hw(const char* input, std::size_t len, std::uint32_t crc)
{
#if defined(WRENPP_WITH_SSE42)
//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
constexpr std::size_t align_size = alignof(std::uint64_t);
constexpr std::size_t align_mask = align_size - 1;
// XOR the initial CRC with INT_MAX
//crc ^= 0xFFFFFFFF;
crc = ~crc;
// Align the input to the word boundary
for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
crc = _mm_crc32_u8(crc, *input);
}
// Blast off the CRC32 calculation
#if defined(__x86_64__) || defined(__aarch64__)
CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len);
#endif
CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len);
CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len);
CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len);
// Post-process the crc
return ~crc;
#else
static_cast<void>(input);
static_cast<void>(crc);
static_cast<void>(len);
assert(false); //not available in this build, this code should be unreachable
#endif
}
[[gnu::const]]
bool has_hw_crc32() {
#if defined(WRENPP_WITH_SSE42)
# if defined(__amd64__) || defined(_M_AMD64)
//if (_may_i_use_cpu_feature(_FEATURE_SSE4_2)) {
if (__builtin_cpu_supports("sse4.2"))
return true;
else
# elif defined(__aarch64__)
//see
//https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu
if (getauxval(AT_HWCAP) & HWCAP_CRC32)
return true;
else
# endif
auto best_crc32_function() {
#if HAVE_NEON
if (has_crc32_neon())
return &crc32c_neon;
#endif
return false;
#if HAVE_SSE42
if (has_crc32_sse42())
return &crc32c_sse42;
#endif
return &crc32<detail::g_castagnoli_polynomial>;
}
} //unnamed namespace
namespace detail {
[[gnu::const]]
std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) {
static const auto crc32c_implem = (has_hw_crc32() ?
&crc32c_hw
:
&detail::crc32<detail::g_castagnoli_polynomial>
);
return (*crc32c_implem)(data, size, crc);
}
} //namespace detail
} //namespace wren
[[gnu::const]]
std::uint32_t runtime_crc32c (const char* data, std::size_t size, std::uint32_t crc) {
static const auto crc32c_implem = best_crc32_function();
return (*crc32c_implem)(data, size, crc);
}
} //namespace wren::detail

100
src/crc32/crc32_neon.cpp Normal file
View File

@ -0,0 +1,100 @@
/* Copyright 2020-2022, Michele Santullo
* This file is part of wrenpp.
*
* Wrenpp is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Wrenpp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with wrenpp. If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <cstdint>
// Byte-boundary alignment issues
#define CALC_CRC(op, crc, type, buf, len) \
do { \
for (; (len) >= sizeof (type); (len) -= sizeof(type), buf += sizeof (type)) { \
(crc) = op((crc), *(type *) (buf)); \
} \
} while(0)
namespace wren::detail {
//King_DuckZ - adapted from
//https://github.com/rurban/smhasher/blob/master/sse2neon.h
namespace {
[[gnu::always_inline]]
std::uint32_t neon_crc32cb(std::uint32_t crc, std::uint8_t v) {
__asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
: [c] "+r"(crc)
: [v] "r"(v));
}
[[gnu::always_inline]]
std::uint32_t neon_crc32ch (std::uint32_t crc, std::uint16_t v) {
__asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
: [c] "+r"(crc)
: [v] "r"(v));
}
[[gnu::always_inline]]
std::uint32_t neon_crc32cw(std::uint32_t crc, std::uint32_t v) {
__asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
: [c] "+r"(crc)
: [v] "r"(v));
}
[[gnu::always_inline]]
std::uint64_t neon_crc32cx (std::uint64_t crc, std::uint64_t v) {
__asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
: [c] "+r"(crc)
: [v] "r"(v));
}
} //unnamed namespace
[[gnu::const]]
bool has_crc32_neon() noexcept {
//see
//https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu
const bool has_hw_crc32 = (getauxval(AT_HWCAP) & HWCAP_CRC32 ? true : false);
return has_hw_crc32;
}
/* Compute CRC-32C using the Intel hardware instruction. */
/* for better parallelization with bigger buffers see
http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
[[gnu::pure]]
std::uint32_t crc32c_neon(const char* input, std::size_t len, std::uint32_t crc) noexcept {
//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
constexpr std::size_t align_size = alignof(std::uint64_t);
constexpr std::size_t align_mask = align_size - 1;
// XOR the initial CRC with INT_MAX
//crc ^= 0xFFFFFFFF;
crc = ~crc;
// Align the input to the word boundary
for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
crc = neon_crc32cb(crc, *input);
}
// Blast off the CRC32 calculation
#if defined(__x86_64__) || defined(__aarch64__)
CALC_CRC(neon_crc32cx, crc, std::uint64_t, input, len);
#endif
CALC_CRC(neon_crc32cw, crc, std::uint32_t, input, len);
CALC_CRC(neon_crc32ch, crc, std::uint16_t, input, len);
CALC_CRC(neon_crc32cb, crc, std::uint8_t, input, len);
// Post-process the crc
return ~crc;
}
} //namespace wren::detail

66
src/crc32/crc32_sse42.cpp Normal file
View File

@ -0,0 +1,66 @@
/* Copyright 2020-2022, Michele Santullo
* This file is part of wrenpp.
*
* Wrenpp is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Wrenpp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with wrenpp. If not, see <http://www.gnu.org/licenses/>.
*/
#include <smmintrin.h>
#include <immintrin.h>
#include <cstdint>
// Byte-boundary alignment issues
#define CALC_CRC(op, crc, type, buf, len) \
do { \
for (; (len) >= sizeof (type); (len) -= sizeof(type), buf += sizeof (type)) { \
(crc) = op((crc), *(type *) (buf)); \
} \
} while(0)
namespace wren::detail {
[[gnu::const]]
bool has_crc32_sse42() noexcept {
const bool has_hw_crc32 = (__builtin_cpu_supports("sse4.2") ? true : false);
return has_hw_crc32;
}
/* Compute CRC-32C using the Intel hardware instruction. */
/* for better parallelization with bigger buffers see
http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */
[[gnu::pure]]
std::uint32_t crc32c_sse42(const char* input, std::size_t len, std::uint32_t crc) noexcept {
//see https://github.com/rurban/smhasher/blob/master/crc32_hw.c
constexpr std::size_t align_size = alignof(std::uint64_t);
constexpr std::size_t align_mask = align_size - 1;
// XOR the initial CRC with INT_MAX
//crc ^= 0xFFFFFFFF;
crc = ~crc;
// Align the input to the word boundary
for (; (len > 0) && (reinterpret_cast<std::uintptr_t>(input) & align_mask); len--, input++) {
crc = _mm_crc32_u8(crc, *input);
}
// Blast off the CRC32 calculation
#if defined(__x86_64__) || defined(__aarch64__)
CALC_CRC(_mm_crc32_u64, crc, std::uint64_t, input, len);
#endif
CALC_CRC(_mm_crc32_u32, crc, std::uint32_t, input, len);
CALC_CRC(_mm_crc32_u16, crc, std::uint16_t, input, len);
CALC_CRC(_mm_crc32_u8, crc, std::uint8_t, input, len);
// Post-process the crc
return ~crc;
}
} //namespace wren::detail

View File

@ -1,5 +1,21 @@
simd = import('unstable-simd')
compiler_opts = []
cpp = meson.get_compiler('cpp')
crc32_simd = simd.check('crc32_hw',
sse42: 'crc32_sse42.cpp',
neon: 'crc32_neon.cpp',
compiler: cpp,
)
crc32_objs = crc32_simd[0]
crc32_config = crc32_simd[1]
project_config_file = configure_file(
output: 'crc32_config.h',
configuration: crc32_config
)
if get_option('wrenpp_with_sse42')
if arch == 'amd64'
compiler_opts += ['-msse4.2']
@ -12,9 +28,10 @@ endif
crc32 = static_library('crc32',
'crc32.cpp',
include_directories: [public_incl, src_incl],
include_directories: [public_incl],
install: false,
cpp_args: compiler_opts + global_compiler_opts,
link_with: crc32_objs,
)
crc32_dep = declare_dependency(

View File

@ -1,141 +0,0 @@
#ifndef SSE2NEON_H
#define SSE2NEON_H
// This header file provides a simple API translation layer
// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
//
// This header file does not yet translate all of the SSE intrinsics.
//
// Contributors to this work are:
// John W. Ratcliff <jratcliffscarab@gmail.com>
// Brandon Rowlett <browlett@nvidia.com>
// Ken Fast <kfast@gdeb.com>
// Eric van Beurden <evanbeurden@nvidia.com>
// Alexander Potylitsin <apotylitsin@nvidia.com>
// Hasindu Gamaarachchi <hasindu2008@gmail.com>
// Jim Huang <jserv@biilabs.io>
// Mark Cheng <marktwtn@biilabs.io>
// Malcolm James MacLeod <malcolm@gulden.com>
// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
// Sebastian Pop <spop@amazon.com>
// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
// Danila Kutenin <danilak@google.com>
// François Turban (JishinMaster) <francois.turban@gmail.com>
// Pei-Hsuan Hung <afcidk@gmail.com>
// Yang-Hao Yuan <yanghau@biilabs.io>
// Syoyo Fujita <syoyo@lighttransport.com>
// Brecht Van Lommel <brecht@blender.org>
/*
* sse2neon is freely redistributable under the MIT License.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
//King_DuckZ
//trimmed down version of
//https://github.com/rurban/smhasher/blob/master/sse2neon.h
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#ifndef FORCE_INLINE
#define FORCE_INLINE static inline __attribute__((always_inline))
#endif
#else
#error "Macro name collisions may happen with unsupported compiler."
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif
#include <stdint.h>
#include <stdlib.h>
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 8-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
__asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
: [c] "+r"(crc)
: [v] "r"(v));
#else
crc ^= v;
for (int bit = 0; bit < 8; bit++) {
if (crc & 1)
crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
else
crc = (crc >> 1);
}
#endif
return crc;
}
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 16-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
__asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
: [c] "+r"(crc)
: [v] "r"(v));
#else
crc = _mm_crc32_u8(crc, v & 0xff);
crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
#endif
return crc;
}
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 32-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
__asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
: [c] "+r"(crc)
: [v] "r"(v));
#else
crc = _mm_crc32_u16(crc, v & 0xffff);
crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
#endif
return crc;
}
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 64-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
__asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
: [c] "+r"(crc)
: [v] "r"(v));
#else
crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
#endif
return crc;
}
#endif

View File

@ -1,4 +1,3 @@
src_incl = include_directories('.')
subdir('crc32')
project_config_file = configure_file(

View File

@ -27,5 +27,3 @@ static_assert(sizeof(void*) == ASM_PTR_SIZE, "Build system reports an unexpected
static_assert(sizeof(void(*)(int)) == ASM_FUNC_PTR_SIZE, "Build system reports an unexpected function pointer size, please ensure assembly code is correct");
#endif
#mesondefine WRENPP_WITH_SSE42