diff options
-rw-r--r-- | cmake/msvc.cmake | 1 | ||||
-rw-r--r-- | hash/hash-impl.hpp | 16 | ||||
-rw-r--r-- | hash/hash.hpp | 10 | ||||
-rw-r--r-- | hash/meow_hash-LICENSE.txt | 19 | ||||
-rw-r--r-- | hash/meow_hash.cpp | 48 | ||||
-rw-r--r-- | hash/meow_hash_x64_aesni.h | 750 | ||||
-rw-r--r-- | hash/xxhash.cpp | 19 | ||||
-rw-r--r-- | test/hash.cpp | 108 | ||||
-rw-r--r-- | userconfig-sthalik@Windows-Clang.cmake | 7 | ||||
-rw-r--r-- | userconfig-sthalik@Windows-GNU.cmake | 6 |
10 files changed, 918 insertions, 66 deletions
diff --git a/cmake/msvc.cmake b/cmake/msvc.cmake index 5638a3d8..e21e17d7 100644 --- a/cmake/msvc.cmake +++ b/cmake/msvc.cmake @@ -100,6 +100,7 @@ set(_CXXFLAGS "${_CFLAGS} -Zc:throwingNew -Zc:lambda") set(_CFLAGS_RELEASE "-O2 -Oit -Oy -Ox -Ob3 -fp:fast -GS- -GF -GL -Gw -Gy") if(NOT floormat-simd STREQUAL "") set(_CFLAGS_RELEASE "${_CFLAGS_RELEASE} -arch:${floormat-simd}") + set(_CFLAGS_DEBUG "${_CFLAGS_DEBUG} -arch:${floormat-simd}") endif() set(_CFLAGS_DEBUG "-guard:cf -MDd -RTCsu") set(_CXXFLAGS_RELEASE "${_CFLAGS_RELEASE}") diff --git a/hash/hash-impl.hpp b/hash/hash-impl.hpp index 3e33b38b..f752a889 100644 --- a/hash/hash-impl.hpp +++ b/hash/hash-impl.hpp @@ -1,4 +1,20 @@ #pragma once +#include "compat/defs.hpp" + +#if (defined __x86_64__ || defined __i386__ || defined _M_AMD64 || defined _M_IX86) && \ + (defined __AES__ || defined _MSC_VER && defined __AVX__) +#define FM_HASH_HAVE_MEOWHASH 1 +#else +#define FM_HASH_HAVE_MEOWHASH 0 +#endif + +#if FM_HASH_HAVE_MEOWHASH +namespace floormat::meow_hash { +size_t hash_buf(const void* __restrict buf, size_t size) noexcept; +size_t hash_int(uint32_t x) noexcept; +size_t hash_int(uint64_t x) noexcept; +} // namespace floormat::meow_hash +#endif namespace floormat::xxHash { size_t hash_buf(const void* __restrict buf, size_t size) noexcept; diff --git a/hash/hash.hpp b/hash/hash.hpp index e3df6e90..b5916e8f 100644 --- a/hash/hash.hpp +++ b/hash/hash.hpp @@ -3,12 +3,14 @@ #include <cr/StringView.h> namespace floormat { -namespace Hash = xxHash; -using Hash::hash_buf; -using Hash::hash_int; +namespace Hash_int = xxHash; +namespace Hash_str = xxHash; + +using Hash_int::hash_int; +using Hash_str::hash_buf; struct hash_string_view { [[nodiscard]] CORRADE_ALWAYS_INLINE size_t operator()(StringView str) const noexcept; }; -size_t hash_string_view::operator()(StringView str) const noexcept { return hash_buf(str.data(), str.size()); } +size_t hash_string_view::operator()(StringView str) const noexcept { return Hash_str::hash_buf(str.data(), str.size()); } } // namespace floormat diff --git a/hash/meow_hash-LICENSE.txt b/hash/meow_hash-LICENSE.txt new file mode 100644 index 00000000..c2558359 --- /dev/null +++ b/hash/meow_hash-LICENSE.txt @@ -0,0 +1,19 @@ +zlib License + +(C) Copyright 2018 Molly Rocket, Inc. + +This software is provided 'as-is', without any express or implied +warranty. In no event will the authors be held liable for any damages +arising from the use of this software. + +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it +freely, subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. diff --git a/hash/meow_hash.cpp b/hash/meow_hash.cpp new file mode 100644 index 00000000..02c4dc56 --- /dev/null +++ b/hash/meow_hash.cpp @@ -0,0 +1,48 @@ +#include "hash-impl.hpp" +#if FM_HASH_HAVE_MEOWHASH +#include "meow_hash_x64_aesni.h" +#include "compat/assert.hpp" + +#ifdef __GNUG__ +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wunused-macros" +#endif + +#define fm_MeowU64From(A, I) (*(meow_u64 *)&(A) + (I)) + +namespace floormat::meow_hash { +namespace { + +CORRADE_ALWAYS_INLINE size_t hash(void* __restrict buf, size_t size) +{ +#ifndef fm_NO_DEBUG + if (size % 16 != 0) [[unlikely]] + fm_abort("size %zu %% 16 == %zu", size, size % 16); +#endif + meow_u128 hash = MeowHash(MeowDefaultSeed, size, buf); + if constexpr(sizeof nullptr > 4) + return fm_MeowU64From(hash, 0); + else + return MeowU32From(hash, 0); +} + +} // namespace + +size_t hash_buf(const void* __restrict buf, size_t size) noexcept +{ + return hash(const_cast<void*>(buf), size); +} + +size_t hash_int(uint32_t x) noexcept +{ + return hash(&x, sizeof x); +} + +size_t hash_int(uint64_t x) noexcept +{ + return hash(&x, sizeof x); +} + +} // namespace floormat::meow_hash + +#endif diff --git a/hash/meow_hash_x64_aesni.h b/hash/meow_hash_x64_aesni.h new file mode 100644 index 00000000..72f85edf --- /dev/null +++ b/hash/meow_hash_x64_aesni.h @@ -0,0 +1,750 @@ +/* ======================================================================== + + Meow - A Fast Non-cryptographic Hash + (C) Copyright 2018-2019 by Molly Rocket, Inc. (https://mollyrocket.com) + + See https://mollyrocket.com/meowhash for details. + + ======================================================================== + + zlib License + + (C) Copyright 2018-2019 Molly Rocket, Inc. + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + ======================================================================== + + FAQ + + Q: What is it? + + A: Meow is a 128-bit Level 3 hash taking 128 bytes of seed. It operates + at very high speeds on x64 processors, and potentially other processors + that provide accelerated AES instructions. + + Q: What is it GOOD for? + + A: Quickly hashing any amount of data for comparison purposes such as + block deduplication or change detection. It is fast on all buffer + sizes, and can generally be used anywhere you need fast Level 3 + hashing without worrying about how big or small the inputs tend to be. + + However, substantial speed improvements could be made over Meow + if you either a) know you are always hashing an exact, small number of bytes, + or b) can always supply a small number of bytes in a buffer padded to some + fixed multiple of 16. + + Q: What is it BAD for? + + A: Anything requiring Level 4 or Level 5 security guarantees (see + http://nohatcoder.dk/2019-05-19-1.html#level3). Also, note that + Meow is a new hash and has not had the extensive community + cryptanalysis necessary to ensure that it is not breakable down to + a lower level of hash, so you must do your due diligence in + deciding when and where to use Meow instead of a slower but + more extensively studied existing hash. We have tried to design + it to provide Level 3 security, but the possibility of the hash + being broken in the future always exists. + + Q: Why is it called the "Meow hash"? + + A: It is named after a character in Meow the Infinite + (https://meowtheinfinite.com) + + Q: Who wrote it? + + A: The final Meow Hash was created as a collaboration between + JACOB CHRISTIAN MUNCH-ANDERSEN (https://twitter.com/nohatcoder) and + CASEY MURATORI (https://caseymuratori.com). Casey wrote the original + implementation for use in processing large-footprint assets for the + game 1935 (https://molly1935.com). Jacob was the first to analyze + that implementation and determine the adversarial bit strength, which + was weaker than they would have liked. + + Following that, the two collaborated to figure out how the hash + could be strengthened without reducing Meow's 16 bytes/cycle + maximum theoretical throughput. Jacob created the hash candidates + and Casey did the performance validation. After a long and + exhaustive effort, Jacob found the unaligned aes/add/xor formulation + that forms the current Meow hash core. + + A number of valuable additions to Meow Hash were also contributed + by other great folks along the way: + + JEFF ROBERTS (https://radgametools.com) provided a super slick + way to handle the residual end-of-buffer bytes that dramatically + improved Meow's small hash performance. + + MARTINS MOZEIKO (https://matrins.ninja) ported Meow to ARM and + ANSI-C, and added the proper preprocessor dressing for clean + compilation on a variety of compiler configurations. + + FABIAN GIESEN (https://fgiesen.wordpress.com) analyzed many + performance oddities that came up during development, and + helped get the benchmarking working properly across a number + of platforms. + + ARAS PRANCKEVICIUS (https://aras-p.info) provided the allocation + shim for compilation on Mac OS X. + + ======================================================================== */ + +// +// IMPORTANT(casey): We are currently evaluating this hash construction as +// the final one for Meow Hash. If you find a way to produce collisions +// that should not be possible with a Level 3 hash, find significant performance +// problems, or see any bugs in this version, please be sure to report them +// to the Meow Hash GitHub as soon as possible. We would like to know as +// much as we can about the robustness and performance before committing to +// it as the final construction. +// + +#ifdef _MSC_VER +#pragma system_header +#else +#pragma GCC system_header +#endif + +#if !defined(MEOW_HASH_X64_AESNI_H) + +#define MEOW_HASH_VERSION 5 +#define MEOW_HASH_VERSION_NAME "0.5/calico" + +#if !defined(meow_u8) + +#if _MSC_VER +#if !defined(__clang__) +#define INSTRUCTION_REORDER_BARRIER _ReadWriteBarrier() +#else +#endif +#include <intrin.h> +#else +#include <x86intrin.h> +#endif + +#define meow_u8 char unsigned +#define meow_u64 long long unsigned +#define meow_u128 __m128i + +#if __x86_64__ || _M_AMD64 +#define meow_umm long long unsigned +#define MeowU64From(A, I) (_mm_extract_epi64((A), (I))) +#elif __i386__ || _M_IX86 +#define meow_umm int unsigned +#define MeowU64From(A, I) (*(meow_u64 *)&(A)) +#else +#error Cannot determine architecture to use! +#endif + +#define MeowU32From(A, I) (_mm_extract_epi32((A), (I))) +#define MeowHashesAreEqual(A, B) (_mm_movemask_epi8(_mm_cmpeq_epi8((A), (B))) == 0xFFFF) + +#if !defined INSTRUCTION_REORDER_BARRIER +#define INSTRUCTION_REORDER_BARRIER +#endif + +#if !defined MEOW_PAGESIZE +#define MEOW_PAGESIZE 4096 +#endif + +#if !defined MEOW_PREFETCH +#define MEOW_PREFETCH 4096 +#endif + +#if !defined MEOW_PREFETCH_LIMIT +#define MEOW_PREFETCH_LIMIT 0x3ff +#endif + +#endif + +#define prefetcht0(A) _mm_prefetch((char *)(A), _MM_HINT_T0) +#define movdqu(A, B) A = _mm_loadu_si128((__m128i *)(B)) +#define movdqu_mem(A, B) _mm_storeu_si128((__m128i *)(A), B) +#define movq(A, B) A = _mm_set_epi64x(0, B); +#define aesdec(A, B) A = _mm_aesdec_si128(A, B) +#define pshufb(A, B) A = _mm_shuffle_epi8(A, B) +#define pxor(A, B) A = _mm_xor_si128(A, B) +#define paddq(A, B) A = _mm_add_epi64(A, B) +#define pand(A, B) A = _mm_and_si128(A, B) +#define palignr(A, B, i) A = _mm_alignr_epi8(A, B, i) +#define pxor_clear(A, B) A = _mm_setzero_si128(); // NOTE(casey): pxor_clear is a nonsense thing that is only here because compilers don't detect xor(a, a) is clearing a :( + +#define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) \ +aesdec(r1, r2); \ +INSTRUCTION_REORDER_BARRIER; \ +paddq(r3, i1); \ +pxor(r2, i2); \ +aesdec(r2, r4); \ +INSTRUCTION_REORDER_BARRIER; \ +paddq(r5, i3); \ +pxor(r4, i4); + +#define MEOW_MIX(r1, r2, r3, r4, r5, ptr) \ +MEOW_MIX_REG(r1, r2, r3, r4, r5, _mm_loadu_si128( (__m128i *) ((ptr) + 15) ), _mm_loadu_si128( (__m128i *) ((ptr) + 0) ), _mm_loadu_si128( (__m128i *) ((ptr) + 1) ), _mm_loadu_si128( (__m128i *) ((ptr) + 16) )) + +#define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) \ +aesdec(r1, r4); \ +paddq(r2, r5); \ +pxor(r4, r6); \ +aesdec(r4, r2); \ +paddq(r5, r6); \ +pxor(r2, r3) + +#if MEOW_DUMP +struct meow_dump +{ + meow_u128 xmm[8]; + void *Ptr; + char const *Title; +}; +extern "C" meow_dump *MeowDumpTo; +meow_dump *MeowDumpTo; +#define MEOW_DUMP_STATE(T, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, ptr) \ +if(MeowDumpTo) \ +{ \ + MeowDumpTo->xmm[0] = xmm0; \ + MeowDumpTo->xmm[1] = xmm1; \ + MeowDumpTo->xmm[2] = xmm2; \ + MeowDumpTo->xmm[3] = xmm3; \ + MeowDumpTo->xmm[4] = xmm4; \ + MeowDumpTo->xmm[5] = xmm5; \ + MeowDumpTo->xmm[6] = xmm6; \ + MeowDumpTo->xmm[7] = xmm7; \ + MeowDumpTo->Ptr = ptr; \ + MeowDumpTo->Title = T; \ + ++MeowDumpTo; \ +} +#else +#define MEOW_DUMP_STATE(...) +#endif + +static meow_u8 MeowShiftAdjust[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; +static meow_u8 MeowMaskLen[32] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + +// NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure. You may verify that it is just an encoding of Pi. +static meow_u8 MeowDefaultSeed[128] = +{ + 0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D, + 0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34, + 0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D, + 0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8, + 0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37, + 0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6, + 0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D, + 0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91, + 0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1, + 0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A, + 0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB, + 0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9, + 0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9, + 0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF, + 0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1, + 0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6 +}; + +// +// NOTE(casey): Single block version +// + +static meow_u128 +MeowHash(void *Seed128Init, meow_umm Len, void *SourceInit) +{ + meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes + meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length) + + meow_u8 *rax = (meow_u8 *)SourceInit; + meow_u8 *rcx = (meow_u8 *)Seed128Init; + + // + // NOTE(casey): Seed the eight hash registers + // + + movdqu(xmm0, rcx + 0x00); + movdqu(xmm1, rcx + 0x10); + movdqu(xmm2, rcx + 0x20); + movdqu(xmm3, rcx + 0x30); + + movdqu(xmm4, rcx + 0x40); + movdqu(xmm5, rcx + 0x50); + movdqu(xmm6, rcx + 0x60); + movdqu(xmm7, rcx + 0x70); + + MEOW_DUMP_STATE("Seed", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + // + // NOTE(casey): Hash all full 256-byte blocks + // + + meow_umm BlockCount = (Len >> 8); + if(BlockCount > MEOW_PREFETCH_LIMIT) + { + // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this loop + while(BlockCount--) + { + prefetcht0(rax + MEOW_PREFETCH + 0x00); + prefetcht0(rax + MEOW_PREFETCH + 0x40); + prefetcht0(rax + MEOW_PREFETCH + 0x80); + prefetcht0(rax + MEOW_PREFETCH + 0xc0); + + MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); + MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); + MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); + MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); + MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); + MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); + MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); + MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + + rax += 0x100; + } + } + else + { + // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop. + while(BlockCount--) + { + MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); + MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); + MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); + MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); + MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); + MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); + MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); + MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + + rax += 0x100; + } + } + + MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + // + // NOTE(casey): Load any less-than-32-byte residual + // + + pxor_clear(xmm9, xmm9); + pxor_clear(xmm11, xmm11); + + // + // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here, + // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the + // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due + // to the & 0xf on the align computation. + // + + // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned + meow_u8 *Last = (meow_u8 *)SourceInit + (Len & ~0xf); + int unsigned Len8 = (Len & 0xf); + if(Len8) + { + // NOTE(casey): Load the mask early + movdqu(xmm8, &MeowMaskLen[0x10 - Len8]); + + meow_u8 *LastOk = (meow_u8*)((((meow_umm)(((meow_u8 *)SourceInit)+Len - 1)) | (MEOW_PAGESIZE - 1)) - 16); + int Align = (Last > LastOk) ? ((int)(meow_umm)Last) & 0xf : 0; + movdqu(xmm10, &MeowShiftAdjust[Align]); + movdqu(xmm9, Last - Align); + pshufb(xmm9, xmm10); + + // NOTE(jeffr): and off the extra bytes + pand(xmm9, xmm8); + } + + // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned + if(Len & 0x10) + { + xmm11 = xmm9; + movdqu(xmm9, Last - 0x10); + } + + // + // NOTE(casey): Construct the residual and length injests + // + + xmm8 = xmm9; + xmm10 = xmm9; + palignr(xmm8, xmm11, 15); + palignr(xmm10, xmm11, 1); + + // NOTE(casey): We have room for a 128-bit nonce and a 64-bit none here, but + // the decision was made to leave them zero'd so as not to confuse people + // about hwo to use them or what security implications they had. + pxor_clear(xmm12, xmm12); + pxor_clear(xmm13, xmm13); + pxor_clear(xmm14, xmm14); + movq(xmm15, Len); + palignr(xmm12, xmm15, 15); + palignr(xmm14, xmm15, 1); + + MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 0); + + // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty + MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); + + // NOTE(casey): Append the length, to avoid problems with our 32-byte padding + MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); + + MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + // + // NOTE(casey): Hash all full 32-byte blocks + // + int unsigned LaneCount = (Len >> 5) & 0x7; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; + + // + // NOTE(casey): Mix the eight lanes down to one 128-bit hash + // + + MixDown: + + MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); + MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); + MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); + MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); + MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2); + MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3); + MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4); + MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5); + MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); + MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); + MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); + MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); + + MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + paddq(xmm0, xmm2); + paddq(xmm1, xmm3); + paddq(xmm4, xmm6); + paddq(xmm5, xmm7); + pxor(xmm0, xmm1); + pxor(xmm4, xmm5); + paddq(xmm0, xmm4); + + MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + return(xmm0); +} + +// +// NOTE(casey): Streaming construction +// + +typedef struct meow_state +{ + meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + meow_u64 TotalLengthInBytes; + + int unsigned BufferLen; + + meow_u8 Buffer[256]; + meow_u128 Pad[2]; // NOTE(casey): So we know we can over-read Buffer as necessary +} meow_state; + +static void +MeowBegin(meow_state *State, void *Seed128) +{ + meow_u8 *rcx = (meow_u8 *)Seed128; + + movdqu(State->xmm0, rcx + 0x00); + movdqu(State->xmm1, rcx + 0x10); + movdqu(State->xmm2, rcx + 0x20); + movdqu(State->xmm3, rcx + 0x30); + movdqu(State->xmm4, rcx + 0x40); + movdqu(State->xmm5, rcx + 0x50); + movdqu(State->xmm6, rcx + 0x60); + movdqu(State->xmm7, rcx + 0x70); + + MEOW_DUMP_STATE("Seed", State->xmm0, State->xmm1, State->xmm2, State->xmm3, State->xmm4, State->xmm5, State->xmm6, State->xmm7, 0); + + State->BufferLen = 0; + State->TotalLengthInBytes = 0; +} + +static void +MeowAbsorbBlocks(meow_state *State, meow_umm BlockCount, meow_u8 *rax) +{ + meow_u128 xmm0 = State->xmm0; + meow_u128 xmm1 = State->xmm1; + meow_u128 xmm2 = State->xmm2; + meow_u128 xmm3 = State->xmm3; + meow_u128 xmm4 = State->xmm4; + meow_u128 xmm5 = State->xmm5; + meow_u128 xmm6 = State->xmm6; + meow_u128 xmm7 = State->xmm7; + + if(BlockCount > MEOW_PREFETCH_LIMIT) + { + while(BlockCount--) + { + prefetcht0(rax + MEOW_PREFETCH + 0x00); + prefetcht0(rax + MEOW_PREFETCH + 0x40); + prefetcht0(rax + MEOW_PREFETCH + 0x80); + prefetcht0(rax + MEOW_PREFETCH + 0xc0); + + MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); + MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); + MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); + MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); + MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); + MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); + MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); + MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + + rax += 0x100; + } + } + else + { + while(BlockCount--) + { + MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); + MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); + MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); + MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); + MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); + MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); + MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); + MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + + rax += 0x100; + } + } + + State->xmm0 = xmm0; + State->xmm1 = xmm1; + State->xmm2 = xmm2; + State->xmm3 = xmm3; + State->xmm4 = xmm4; + State->xmm5 = xmm5; + State->xmm6 = xmm6; + State->xmm7 = xmm7; +} + +static void +MeowAbsorb(meow_state *State, meow_umm Len, void *SourceInit) +{ + State->TotalLengthInBytes += Len; + meow_u8 *Source = (meow_u8 *)SourceInit; + + // NOTE(casey): Handle any buffered residual + if(State->BufferLen) + { + int unsigned Fill = (sizeof(State->Buffer) - State->BufferLen); + if(Fill > Len) + { + Fill = (int unsigned)Len; + } + + Len -= Fill; + while(Fill--) + { + State->Buffer[State->BufferLen++] = *Source++; + } + + if(State->BufferLen == sizeof(State->Buffer)) + { + MeowAbsorbBlocks(State, 1, State->Buffer); + State->BufferLen = 0; + } + } + + // NOTE(casey): Handle any full blocks + meow_u64 BlockCount = (Len >> 8); + meow_u64 Advance = (BlockCount << 8); + MeowAbsorbBlocks(State, BlockCount, Source); + + Len -= Advance; + Source += Advance; + + // NOTE(casey): Store residual + while(Len--) + { + State->Buffer[State->BufferLen++] = *Source++; + } +} + +static meow_u128 +MeowEnd(meow_state *State, meow_u8 *Store128) +{ + meow_umm Len = State->TotalLengthInBytes; + + meow_u128 xmm0 = State->xmm0; + meow_u128 xmm1 = State->xmm1; + meow_u128 xmm2 = State->xmm2; + meow_u128 xmm3 = State->xmm3; + meow_u128 xmm4 = State->xmm4; + meow_u128 xmm5 = State->xmm5; + meow_u128 xmm6 = State->xmm6; + meow_u128 xmm7 = State->xmm7; + + meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + meow_u8 *rax = State->Buffer; + + pxor_clear(xmm9, xmm9); + pxor_clear(xmm11, xmm11); + + meow_u8 *Last = (meow_u8 *)rax + (Len & 0xf0); + int unsigned Len8 = (Len & 0xf); + if(Len8) + { + movdqu(xmm8, &MeowMaskLen[0x10 - Len8]); + movdqu(xmm9, Last); + pand(xmm9, xmm8); + } + + if(Len & 0x10) + { + xmm11 = xmm9; + movdqu(xmm9, Last - 0x10); + } + + xmm8 = xmm9; + xmm10 = xmm9; + palignr(xmm8, xmm11, 15); + palignr(xmm10, xmm11, 1); + + pxor_clear(xmm12, xmm12); + pxor_clear(xmm13, xmm13); + pxor_clear(xmm14, xmm14); + movq(xmm15, Len); + palignr(xmm12, xmm15, 15); + palignr(xmm14, xmm15, 1); + + MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 0); + + // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty + MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); + + // NOTE(casey): Append the length, to avoid problems with our 32-byte padding + MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); + + MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + // + // NOTE(casey): Hash all full 32-byte blocks + // + int unsigned LaneCount = (Len >> 5) & 0x7; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; + if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; + + // + // NOTE(casey): Mix the eight lanes down to one 128-bit hash + // + + MixDown: + + MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); + MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); + MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); + MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); + MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2); + MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3); + MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4); + MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5); + MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); + MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); + MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); + MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); + + MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + if(Store128) + { + movdqu_mem(Store128 + 0x00, xmm0); + movdqu_mem(Store128 + 0x10, xmm1); + movdqu_mem(Store128 + 0x20, xmm2); + movdqu_mem(Store128 + 0x30, xmm3); + movdqu_mem(Store128 + 0x40, xmm4); + movdqu_mem(Store128 + 0x50, xmm5); + movdqu_mem(Store128 + 0x60, xmm6); + movdqu_mem(Store128 + 0x70, xmm7); + } + + paddq(xmm0, xmm2); + paddq(xmm1, xmm3); + paddq(xmm4, xmm6); + paddq(xmm5, xmm7); + pxor(xmm0, xmm1); + pxor(xmm4, xmm5); + paddq(xmm0, xmm4); + + MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); + + return(xmm0); +} + +#undef INSTRUCTION_REORDER_BARRIER +#undef prefetcht0 +#undef movdqu +#undef movdqu_mem +#undef movq +#undef aesdec +#undef pshufb +#undef pxor +#undef paddq +#undef pand +#undef palignr +#undef pxor_clear +#undef MEOW_MIX +#undef MEOW_MIX_REG +#undef MEOW_SHUFFLE +#undef MEOW_DUMP_STATE + +// +// NOTE(casey): If you need to create your own seed from non-random data, you can use MeowExpandSeed +// to create a seed which you then store for repeated use. It is _expensive_ to generate the seed, +// so you do not want to do this every time you hash. You _only_ want to do it when you actually +// need to create a new seed. +// + +static void +MeowExpandSeed(meow_umm InputLen, void *Input, meow_u8 *SeedResult) +{ + meow_state State; + meow_u64 LengthTab = (meow_u64)InputLen; // NOTE(casey): We need to always injest 8-byte lengths exactly, even on 32-bit builds, to ensure identical results + meow_umm InjestCount = (256 / InputLen) + 2; + + MeowBegin(&State, MeowDefaultSeed); + MeowAbsorb(&State, sizeof(LengthTab), &LengthTab); + while(InjestCount--) + { + MeowAbsorb(&State, InputLen, Input); + } + MeowEnd(&State, SeedResult); +} + +#define MEOW_HASH_X64_AESNI_H +#endif diff --git a/hash/xxhash.cpp b/hash/xxhash.cpp index c5ece41b..157232b9 100644 --- a/hash/xxhash.cpp +++ b/hash/xxhash.cpp @@ -1,31 +1,30 @@ #include "hash-impl.hpp" -#include "compat/map.hpp" #include "compat/iota.hpp" +#include <cstdio> #include <bit> -#ifdef __GNUG__ -//#pragma GCC diagnostic ignored "-Wdisabled-macro-expansion" -//#pragma GCC diagnostic ignored "-Wused-but-marked-unused" -#include "xxhash.inl" +#ifdef __clang__ +#pragma GCC diagnostic ignored "-Wdisabled-macro-expansion" +#pragma GCC diagnostic ignored "-Wused-but-marked-unused" #endif +#include "xxhash.inl" namespace floormat::xxHash { namespace { -constexpr inline auto seed = std::bit_cast<uint64_t>(iota_array<uint8_t, 8>); - CORRADE_ALWAYS_INLINE size_t do_xxhash(const void* __restrict buf, size_t size) noexcept { #ifdef __AVX2__ - return XXH3_64bits(buf, size); + return (size_t)XXH3_64bits(buf, size); #elif __SSE2__ return (size_t)XXH3_64bits(buf, size); #else + constexpr auto seed = std::bit_cast<size_t>(iota_array<uint8_t, sizeof nullptr>); if constexpr(sizeof nullptr > 4) - return XXH364(buf, size); + return XXH64(buf, size, seed); else - return XXH32(buf, size); + return XXH32(buf, size, seed); #endif } diff --git a/test/hash.cpp b/test/hash.cpp index 7d8e97d1..2c6a7516 100644 --- a/test/hash.cpp +++ b/test/hash.cpp @@ -8,10 +8,19 @@ #include <memory> #include <mg/Functions.h> +#ifdef __GNUG__ +#pragma GCC diagnostic ignored "-Wunused-macros" +#endif + +//#define FM_TEST_DEBUG_PRINT +//#define FM_TEST_DEBUG_VERBOSE + namespace floormat { namespace { +bool first = true; + void test_simple() { constexpr StringView list[] = { @@ -38,7 +47,7 @@ template<int16_t CoordMax, int8_t ZMax, uint32_t MaxCollisions> [[nodiscard]] bool test_collisions(const char* name) { constexpr size_t Iters = (CoordMax*2 + 1)*(CoordMax*2 + 1)*(1+ZMax); // NOLINT(*-implicit-widening-of-multiplication-result) - constexpr size_t BucketCount = Math::max(world::initial_capacity, size_t(Math::ceil(Iters / world::max_load_factor))); + constexpr size_t BucketCount = Math::max(world::initial_capacity, size_t(Math::ceil(Iters / 0.2f))); uint32_t num_collisions = 0, iters = 0; auto bitset_ = std::make_unique<std::bitset<BucketCount>>(false); auto& bitset = *bitset_; @@ -61,32 +70,50 @@ template<int16_t CoordMax, int8_t ZMax, uint32_t MaxCollisions> fm_assert_equal(Iters, iters); -//#define FM_TEST_DEBUG_PRINT + constexpr auto print = [](Debug::Color c, StringView status, StringView name, StringView op, + uint32_t num_collisions, uint32_t iters) + { + if (first) + { + DBG_nospace << ""; + } + first = false; + DBG_nospace << Debug::color(c) << status << Debug::resetColor + << ": " << name << "\t" + << num_collisions << "\t" << op << "\t" << MaxCollisions + << "\t\t(iters:" << iters << " buckets:" << BucketCount << ")"; + }; + +#define FM_FAIL fm_abort("test/hash: too many collisions") +#ifdef FM_TEST_DEBUG_PRINT +#define FM_MAYBE_FAIL void() +#else +#define FM_MAYBE_FAIL FM_MAYBE_FAIL_ +#endif +#define FM_MAYBE_FAIL_ do { if (!success) { DBG_nospace << ""; FM_FAIL; } } while (false) if (num_collisions > MaxCollisions) { - DBG_nospace << Debug::color(Debug::Color::Magenta) - << "fail" << Debug::resetColor << ": test/hash(\"" << name << "\")" - << " collisions " << num_collisions << " > " << MaxCollisions - << " (iters:" << iters << " buckets:" << BucketCount << ")"; + print(Debug::Color::Magenta, "fail", name, "!", num_collisions, iters); #ifdef FM_TEST_DEBUG_PRINT return false; #else - fm_abort("test/hash: too many collisions"); + FM_FAIL; #endif } - else if (num_collisions != MaxCollisions) + else if (num_collisions < MaxCollisions) { -#ifdef FM_TEST_DEBUG_PRINT - DBG_nospace << Debug::color(Debug::Color::Cyan) << "pass" << Debug::resetColor - << ": test/hash(\"" << name << "\") collisions " - << num_collisions << " <= " << MaxCollisions - << " (iters:" << iters << " buckets:" << BucketCount << ")"; +#if defined FM_TEST_DEBUG_PRINT && defined FM_TEST_DEBUG_VERBOSE + print(Debug::Color::Cyan, "pass", name, " ", num_collisions, iters); #endif } return true; } +#if !defined FM_TEST_DEBUG_PRINT && defined FM_TEST_DEBUG_VERBOSE +#error assertion failed: !defined FM_TEST_DEBUG_PRINT && defined FM_TEST_DEBUG_VERBOSE +#endif + } // namespace void Test::test_hash() @@ -94,46 +121,37 @@ void Test::test_hash() test_simple(); bool success = true; - -#ifdef FM_TEST_DEBUG_PRINT - DBG_nospace << ""; -#endif - -#ifdef FM_TEST_DEBUG_PRINT -#define FM_MAYBE_FAIL void() -#else -#define FM_MAYBE_FAIL FM_MAYBE_FAIL_ -#endif -#define FM_MAYBE_FAIL_ do { if (!success) { DBG_nospace << ""; fm_abort("test/hash: too many collisions"); } } while (false) - - success &= test_collisions< 2, 1, 1 >("small1"); FM_MAYBE_FAIL; - success &= test_collisions< 3, 0, 1 >("small2"); FM_MAYBE_FAIL; - success &= test_collisions< 4, 0, 1 >("small3"); FM_MAYBE_FAIL; - success &= test_collisions< 5, 0, 1 >("small4"); FM_MAYBE_FAIL; - success &= test_collisions< 6, 0, 4 >("mid1"); FM_MAYBE_FAIL; - success &= test_collisions< 7, 0, 7 >("mid2"); FM_MAYBE_FAIL; - success &= test_collisions< 8, 0, 10 >("mid3"); FM_MAYBE_FAIL; - success &= test_collisions< 9, 0, 15 >("mid4"); FM_MAYBE_FAIL; - success &= test_collisions< 10, 0, 22 >("large1"); FM_MAYBE_FAIL; - success &= test_collisions< 12, 0, 51 >("large2"); FM_MAYBE_FAIL; - success &= test_collisions< 14, 0, 94 >("large3"); FM_MAYBE_FAIL; - success &= test_collisions< 15, 0, 118 >("large4"); FM_MAYBE_FAIL; - success &= test_collisions< 16, 0, 107 >("large5"); FM_MAYBE_FAIL; + first = true; + + // best result is xxHash: + success &= test_collisions< 2, 1, 1 >("small1 "); FM_MAYBE_FAIL; + success &= test_collisions< 3, 0, 2 >("small2 "); FM_MAYBE_FAIL; + success &= test_collisions< 4, 0, 2 >("small3 "); FM_MAYBE_FAIL; + success &= test_collisions< 5, 0, 1 >("small4 "); FM_MAYBE_FAIL; + success &= test_collisions< 6, 0, 4 >("mid1 "); FM_MAYBE_FAIL; + success &= test_collisions< 7, 0, 7 >("mid2 "); FM_MAYBE_FAIL; + success &= test_collisions< 8, 0, 10 >("mid3 "); FM_MAYBE_FAIL; + success &= test_collisions< 9, 0, 15 >("mid4 "); FM_MAYBE_FAIL; + success &= test_collisions< 10, 0, 22 >("large1 "); FM_MAYBE_FAIL; + success &= test_collisions< 12, 0, 51 >("large2 "); FM_MAYBE_FAIL; + success &= test_collisions< 14, 0, 94 >("large3 "); FM_MAYBE_FAIL; + success &= test_collisions< 15, 0, 118 >("large4 "); FM_MAYBE_FAIL; + success &= test_collisions< 16, 0, 110 >("large5 "); FM_MAYBE_FAIL; success &= test_collisions< 3, 1, 2 >("zsmall1"); FM_MAYBE_FAIL; success &= test_collisions< 4, 1, 3 >("zsmall2"); FM_MAYBE_FAIL; - success &= test_collisions< 2, 3, 1 >("zmid1"); FM_MAYBE_FAIL; - success &= test_collisions< 3, 3, 7 >("zmid1"); FM_MAYBE_FAIL; - success &= test_collisions< 2, 6, 4 >("zmid2"); FM_MAYBE_FAIL; - success &= test_collisions< 2, 10, 14 >("zmid3"); FM_MAYBE_FAIL; - success &= test_collisions< 6, 2, 35 >("zmid4"); FM_MAYBE_FAIL; - success &= test_collisions< 7, 1, 28 >("zmid5"); FM_MAYBE_FAIL; + success &= test_collisions< 2, 3, 1 >("zmid1" ); FM_MAYBE_FAIL; + success &= test_collisions< 3, 3, 7 >("zmid1" ); FM_MAYBE_FAIL; + success &= test_collisions< 2, 6, 4 >("zmid2" ); FM_MAYBE_FAIL; + success &= test_collisions< 2, 10, 14 >("zmid3" ); FM_MAYBE_FAIL; + success &= test_collisions< 6, 2, 35 >("zmid4" ); FM_MAYBE_FAIL; + success &= test_collisions< 7, 1, 28 >("zmid5" ); FM_MAYBE_FAIL; success &= test_collisions< 10, 1, 89 >("zlarge1"); FM_MAYBE_FAIL; success &= test_collisions< 10, 4, 274 >("zlarge2"); FM_MAYBE_FAIL; success &= test_collisions< 16, 8, 1126 >("zlarge3"); FM_MAYBE_FAIL; success &= test_collisions< 32, 8, 4397 >("zlarge4"); FM_MAYBE_FAIL; (void)success; - //FM_MAYBE_FAIL_; + FM_MAYBE_FAIL_; } } // namespace floormat diff --git a/userconfig-sthalik@Windows-Clang.cmake b/userconfig-sthalik@Windows-Clang.cmake index b5928c10..ae71157f 100644 --- a/userconfig-sthalik@Windows-Clang.cmake +++ b/userconfig-sthalik@Windows-Clang.cmake @@ -1,5 +1,4 @@ if(CMAKE_BUILD_TYPE STREQUAL "DEBUG") - add_compile_options(-mavx2) if(FLOORMAT_WITH-COVERAGE) set(CMAKE_BUILD_TYPE DEBUG CACHE STRING "" FORCE) add_definitions( @@ -31,9 +30,9 @@ set(OpenCV_DIR "f:/dev/opentrack-depends/opencv/build-clang-amd64/install" CACHE set(CMAKE_INSTALL_MESSAGE NEVER) sets(STRING - CMAKE_C_FLAGS "-ggdb3 -gsplit-dwarf -gcolumn-info -gdwarf-aranges -gz=zlib" - CMAKE_C_FLAGS_DEBUG "-O0 -fstack-protector-all" - CMAKE_C_FLAGS_RELEASE "-O3 -ffast-math -march=nehalem -mtune=native -mpopcnt -mavx -fomit-frame-pointer -fno-stack-protector" + CMAKE_C_FLAGS "-march=x86-64-v2 -mtune=native -mavx2 -maes -ggdb3 -gsplit-dwarf -gcolumn-info -gdwarf-aranges -gz=zlib -fstack-protector-all" + CMAKE_C_FLAGS_DEBUG "-O0" + CMAKE_C_FLAGS_RELEASE "-Ofast -ffast-math -mpopcnt -fomit-frame-pointer -fno-stack-protector -static" CMAKE_EXE_LINKER_FLAGS_DEBUG "" CMAKE_SHARED_LINKER_FLAGS_DEBUG "" ) diff --git a/userconfig-sthalik@Windows-GNU.cmake b/userconfig-sthalik@Windows-GNU.cmake index 214ec5fd..a347ea8e 100644 --- a/userconfig-sthalik@Windows-GNU.cmake +++ b/userconfig-sthalik@Windows-GNU.cmake @@ -1,7 +1,7 @@ sets(STRING - CMAKE_C_FLAGS "" - CMAKE_C_FLAGS_DEBUG "-O0 -g -ggdb -fstack-protector-all -fstack-reuse=none -mavx2" - CMAKE_C_FLAGS_RELEASE "-O3 -ffast-math -ftree-vectorize -funsafe-loop-optimizations -march=haswell -mtune=native -mavx2 -flto -fipa-pta -fmerge-all-constants -static" + CMAKE_C_FLAGS "-march=x86-64-v2 -mtune=native -mavx2 -maes" + CMAKE_C_FLAGS_DEBUG "-O0 -g -ggdb -fstack-protector-all -fstack-reuse=none" + CMAKE_C_FLAGS_RELEASE "-Ofast -ftree-vectorize -funsafe-loop-optimizations -flto -fipa-pta -fmerge-all-constants -fno-stack-protector -static" ) sets(STRING |