return hash_native_result(hash, carry, length);
}
-/*
- * Inline-assembly-optimized SSE version, used when SSE is detected at
- * runtime via CPUID or when the host compiler defines __SSE__. It is
- * about 16 cycles faster than the native version at -O2 for GCC and 11
- * cycles faster at -O3. A plain-C sketch of the per-lane computation
- * follows the #endif below.
- *
- * Tested with -m32 on a Phenom II X4 with:
- * gcc version 4.8.1 20130725 (prerelease) (GCC)
- */
-#if defined(__GNUC__) && defined(__i386__)
-static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) {
- uint32_t ret;
- __asm__ __volatile__ (
- " mov %%eax, %%ebx\n"
- " mov %2, %%eax\n"
- " movd %%eax, %%xmm7\n"
- " shufps $0, %%xmm7, %%xmm7\n"
- " mov %3, %%eax\n"
- " movd %%eax, %%xmm6\n"
- " shufps $0, %%xmm6, %%xmm6\n"
- " lea (%%esi, %%ecx, 1), %%edi\n"
- " jmp 2f\n"
- "1:\n"
- " movaps (%%esi), %%xmm0\n"
- " pmulld %%xmm7, %%xmm0\n"
- " movaps %%xmm0, %%xmm2\n"
- " pslld $15, %%xmm0\n"
- " psrld $17, %%xmm2\n"
- " orps %%xmm2, %%xmm0\n"
- " pmulld %%xmm6, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " add $16, %%esi\n"
- "2:\n"
- " cmp %%esi, %%edi\n"
- " jne 1b\n"
- " xor %%ecx, %%ebx\n"
- " mov %%ebx, %%eax\n"
- " shr $16, %%ebx\n"
- " xor %%ebx, %%eax\n"
- " imul $0x85EBCA6b, %%eax\n"
- " mov %%eax, %%ebx\n"
- " shr $13, %%ebx\n"
- " xor %%ebx, %%eax\n"
- " imul $0xC2B2AE35, %%eax\n"
- " mov %%eax, %%ebx\n"
- " shr $16, %%ebx\n"
- " xor %%ebx, %%eax\n"
- : "=a" (ret)
-
- : "a" (HASH_SEED),
- "i" (HASH_MASK1),
- "i" (HASH_MASK2),
- "S" (key),
- "c" (length)
-
- : "%ebx",
- "%edi"
- );
- return ret;
-}
-#endif
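
/*
 * Editor's sketch (not part of the original file): a plain-C rendering of
 * what the SSE loop above computes per 32-bit lane, with %ebx carrying the
 * running hash seeded from HASH_SEED. The rotate/multiply/add structure
 * matches MurmurHash3_x86_32. HASH_MASK1/HASH_MASK2 are passed as
 * immediates and their values are not visible in this diff, so the
 * constants below are assumed stand-ins; the finalizer constants are the
 * ones visible in the assembly.
 */
#include <stdint.h>
#include <stddef.h>

#define SKETCH_MASK1 0xCC9E2D51u /* assumed stand-in for HASH_MASK1 */
#define SKETCH_MASK2 0x1B873593u /* assumed stand-in for HASH_MASK2 */

static uint32_t sketch_rotl32(uint32_t x, int r) {
    return (x << r) | (x >> (32 - r));
}

/*
 * One lane of the loop body: the pmulld/pslld/psrld/orps/pmulld k-mixing,
 * followed by the serial movd/xor/rol/imul/add fold into the hash.
 */
static uint32_t sketch_round(uint32_t h, uint32_t k) {
    k *= SKETCH_MASK1;
    k = sketch_rotl32(k, 15);
    k *= SKETCH_MASK2;
    h ^= k;
    h = sketch_rotl32(h, 13);
    return h * 5 + 0xE6546B64u;
}

/* The tail after label 2: xor in the length, then avalanche. */
static uint32_t sketch_finalize(uint32_t h, size_t length) {
    h ^= (uint32_t)length;
    h ^= h >> 16;
    h *= 0x85EBCA6Bu;
    h ^= h >> 13;
    h *= 0xC2B2AE35u;
    return h ^ (h >> 16);
}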
-
-#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
-/*
- * Emulate the MSVC __cpuid intrinsic for GCC/MinGW/Clang; this is used
- * to determine whether we should take the SSE route. A sketch using the
- * <cpuid.h> helper instead follows the #endif below.
- */
-static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) {
- __asm__ __volatile__ (
- "cpuid"
- : "=a"(lanes[0]),
- "=b"(lanes[1]),
- "=c"(lanes[2]),
- "=d"(lanes[3])
-
- : "a" (entry)
- );
-}
-
-#endif /* defined(__GNUC__) && defined(__i386__) && !defined(__SSE__) */
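
/*
 * Editor's sketch (not part of the original file): the same detection via
 * GCC/Clang's <cpuid.h> helper instead of hand-rolled inline assembly;
 * __get_cpuid also preserves %ebx, which matters for PIC builds on i386.
 * This folds in the one-shot memoization that hash_entry performs below.
 * SSE support is reported in CPUID leaf 1, EDX bit 25.
 */
#include <cpuid.h>
#include <stdbool.h>

static bool sketch_has_sse(void) {
    static bool memoized = false;
    static bool sse = false;

    if (!memoized) {
        unsigned int eax, ebx, ecx, edx;
        /* __get_cpuid returns 0 when the requested leaf is unsupported. */
        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            sse = (edx & (1u << 25)) != 0;
        memoized = true;
    }
    return sse;
}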
-
static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) {
-/*
- * No host SSE instruction set is assumed, so do a runtime test instead.
- * This is mostly for MinGW32, which doesn't define __SSE__.
- */
-#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
- static bool memoize = false;
- static bool sse = false;
-
- if (GMQCC_UNLIKELY(!memoize)) {
- /*
- * Only test for SSE support once, so it's unlikely that this branch
- * is taken more than once.
- */
- static int lanes[4];
- hash_cpuid(lanes, 0);
- /*
- * Leaf 0 reports the highest supported CPUID leaf in lanes[0]; on
- * any modern x86 it will be at least 1.
- */
- if (GMQCC_LIKELY(*lanes >= 1))
- sse = (lanes[3] & ((int)1 << 25)) != 0;
- memoize = true;
- }
-
- return (GMQCC_LIKELY(sse))
- ? hash_sse(key, length)
- : hash_native(key, length);
-/*
- * Same as above, but here the host compiler already defines __SSE__.
- * This handles MinGW32 builds targeting i686 and newer.
- */
-#elif defined (__GNUC__) && defined(__i386__) && defined(__SSE__)
- return hash_sse(key, length);
-#else
- /*
- * Go the native route, which is itself highly optimized for
- * unaligned load/store on little-endian targets (see the sketch
- * after this function).
- */
return hash_native(key, length);
-#endif
}
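
/*
 * Editor's sketch (not part of the original file): the "optimized for
 * unaligned load/store" remark above presumably refers to the standard
 * memcpy idiom, shown here under that assumption. Compilers lower it to a
 * single 32-bit mov on x86, and on a little-endian host no byte swap is
 * needed afterwards.
 */
#include <stdint.h>
#include <string.h>

static uint32_t sketch_load_le32(const unsigned char *p) {
    uint32_t v;
    memcpy(&v, p, sizeof v); /* well-defined even when p is unaligned */
    return v;
}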
#define HASH_LEN_ALIGN (sizeof(size_t))