return hash_native_result(hash, carry, length);
}
-/*
- * Inline-assembly-optimized SSE version, used when SSE is detected at
- * runtime via CPUID or when the host compiler defines __SSE__. It is
- * about 16 cycles faster than the native version at -O2 for GCC and 11
- * cycles faster at -O3. A plain-C sketch of the per-lane computation
- * follows the #endif below.
- *
- * Tested with -m32 on a Phenom II X4 with:
- * gcc version 4.8.1 20130725 (prerelease) (GCC)
- */
-#if defined(__GNUC__) && defined(__i386__)
-static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) {
- uint32_t ret;
- __asm__ __volatile__ (
- " mov %%eax, %%ebx\n"
- " mov %2, %%eax\n"
- " movd %%eax, %%xmm7\n"
- " shufps $0, %%xmm7, %%xmm7\n"
- " mov %3, %%eax\n"
- " movd %%eax, %%xmm6\n"
- " shufps $0, %%xmm6, %%xmm6\n"
- " lea (%%esi, %%ecx, 1), %%edi\n"
- " jmp 2f\n"
- "1:\n"
- " movaps (%%esi), %%xmm0\n"
- " pmulld %%xmm7, %%xmm0\n"
- " movaps %%xmm0, %%xmm2\n"
- " pslld $15, %%xmm0\n"
- " psrld $17, %%xmm2\n"
- " orps %%xmm2, %%xmm0\n"
- " pmulld %%xmm6, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " add $16, %%esi\n"
- "2:\n"
- " cmp %%esi, %%edi\n"
- " jne 1b\n"
- " xor %%ecx, %%ebx\n"
- " mov %%ebx, %%eax\n"
- " shr $16, %%ebx\n"
- " xor %%ebx, %%eax\n"
- " imul $0x85EBCA6b, %%eax\n"
- " mov %%eax, %%ebx\n"
- " shr $13, %%ebx\n"
- " xor %%ebx, %%eax\n"
- " imul $0xC2B2AE35, %%eax\n"
- " mov %%eax, %%ebx\n"
- " shr $16, %%ebx\n"
- " xor %%ebx, %%eax\n"
- : "=a" (ret)
-
- : "a" (HASH_SEED),
- "i" (HASH_MASK1),
- "i" (HASH_MASK2),
- "S" (key),
- "c" (length)
-
- : "%ebx",
- "%edi"
- );
- return ret;
-}
-#endif
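
/*
 * Editor's sketch (not part of the original file): a plain-C rendering of
 * what the SSE loop above computes per 32-bit lane, with %ebx carrying the
 * running hash seeded from HASH_SEED. The rotate/multiply/add structure
 * matches MurmurHash3_x86_32. HASH_MASK1/HASH_MASK2 are passed as
 * immediates and their values are not visible in this diff, so the
 * constants below are assumed stand-ins; the finalizer constants are the
 * ones visible in the assembly.
 */
#include <stdint.h>
#include <stddef.h>

#define SKETCH_MASK1 0xCC9E2D51u /* assumed stand-in for HASH_MASK1 */
#define SKETCH_MASK2 0x1B873593u /* assumed stand-in for HASH_MASK2 */

static uint32_t sketch_rotl32(uint32_t x, int r) {
    return (x << r) | (x >> (32 - r));
}

/*
 * One lane of the loop body: the pmulld/pslld/psrld/orps/pmulld k-mixing,
 * followed by the serial movd/xor/rol/imul/add fold into the hash.
 */
static uint32_t sketch_round(uint32_t h, uint32_t k) {
    k *= SKETCH_MASK1;
    k = sketch_rotl32(k, 15);
    k *= SKETCH_MASK2;
    h ^= k;
    h = sketch_rotl32(h, 13);
    return h * 5 + 0xE6546B64u;
}

/* The tail after label 2: xor in the length, then avalanche. */
static uint32_t sketch_finalize(uint32_t h, size_t length) {
    h ^= (uint32_t)length;
    h ^= h >> 16;
    h *= 0x85EBCA6Bu;
    h ^= h >> 13;
    h *= 0xC2B2AE35u;
    return h ^ (h >> 16);
}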
-
-#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
-/*
- * Emulate the MSVC __cpuid intrinsic for GCC/MinGW/Clang; this is used
- * to determine whether we should take the SSE route. A sketch using the
- * <cpuid.h> helper instead follows the #endif below.
- */
-static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) {
- __asm__ __volatile__ (
- "cpuid"
- : "=a"(lanes[0]),
- "=b"(lanes[1]),
- "=c"(lanes[2]),
- "=d"(lanes[3])
-
- : "a" (entry)
- );
-}
-
-#endif /* defined(__GNUC__) && defined(__i386__) && !defined(__SSE__) */
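
/*
 * Editor's sketch (not part of the original file): the same detection via
 * GCC/Clang's <cpuid.h> helper instead of hand-rolled inline assembly;
 * __get_cpuid also preserves %ebx, which matters for PIC builds on i386.
 * This folds in the one-shot memoization that hash_entry performs below.
 * SSE support is reported in CPUID leaf 1, EDX bit 25.
 */
#include <cpuid.h>
#include <stdbool.h>

static bool sketch_has_sse(void) {
    static bool memoized = false;
    static bool sse = false;

    if (!memoized) {
        unsigned int eax, ebx, ecx, edx;
        /* __get_cpuid returns 0 when the requested leaf is unsupported. */
        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            sse = (edx & (1u << 25)) != 0;
        memoized = true;
    }
    return sse;
}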
-
static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) {
-/*
- * No host SSE instruction set is assumed, so do a runtime test instead.
- * This is mostly for MinGW32, which doesn't define __SSE__.
- */
-#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
- static bool memoize = false;
- static bool sse = false;
-
- if (GMQCC_UNLIKELY(!memoize)) {
- /*
- * Only test for SSE support once, so it's unlikely that this branch
- * is taken more than once.
- */
- static int lanes[4];
- hash_cpuid(lanes, 0);
- /*
- * Leaf 0 reports the highest supported CPUID leaf in lanes[0]; on
- * any modern x86 it will be at least 1.
- */
- if (GMQCC_LIKELY(*lanes >= 1))
- sse = (lanes[3] & ((int)1 << 25)) != 0;
- memoize = true;
- }
-
- return (GMQCC_LIKELY(sse))
- ? hash_sse(key, length)
- : hash_native(key, length);
-/*
- * Same as above, but here the host compiler already defines __SSE__.
- * This handles MinGW32 builds targeting i686 and newer.
- */
-#elif defined (__GNUC__) && defined(__i386__) && defined(__SSE__)
- return hash_sse(key, length);
-#else
- /*
- * Go the native route, which is itself highly optimized for
- * unaligned load/store on little-endian targets (see the sketch
- * after this function).
- */
return hash_native(key, length);
-#endif
}
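
/*
 * Editor's sketch (not part of the original file): the "optimized for
 * unaligned load/store" remark above presumably refers to the standard
 * memcpy idiom, shown here under that assumption. Compilers lower it to a
 * single 32-bit mov on x86, and on a little-endian host no byte swap is
 * needed afterwards.
 */
#include <stdint.h>
#include <string.h>

static uint32_t sketch_load_le32(const unsigned char *p) {
    uint32_t v;
    memcpy(&v, p, sizeof v); /* well-defined even when p is unaligned */
    return v;
}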
#define HASH_LEN_ALIGN (sizeof(size_t))