[prev in list] [next in list] [prev in thread] [next in thread] 

List:       zlib-devel
Subject:    [Zlib-devel] [PATCH 07/11] Adds SSE2 optimized hash shifting to fill_window.
From:       james.t.kukunas () linux ! intel ! com (Jim Kukunas)
Date:       2014-03-18 19:15:39
Message-ID: 1395170143-1745-8-git-send-email-james.t.kukunas () linux ! intel ! com
[Download RAW message or body]

Uses SSE2 subtraction with saturation to shift the hash in
16B chunks. Renames the old fill_window implementation to
fill_window_c(), and adds a new fill_window_sse() implementation
in fill_window_sse.c.

Moves UPDATE_HASH into deflate.h and changes the scope of
read_buf from local to ZLIB_INTERNAL for sharing between
the two implementations.

Updates the configure script to check for SSE2 intrinsics and enables
this optimization by default on x86. The runtime check for SSE2 support
only occurs on 32-bit, as x86_64 requires SSE2. Adds an explicit
rule in Makefile.in to build fill_window_sse.c with the -msse2 compiler
flag, which is required for SSE2 intrinsics.
---
 Makefile.in       |   15 ++++-
 configure         |   45 ++++++++++++++
 deflate.c         |   59 +++++++++++++------
 deflate.h         |    8 +++
 fill_window_sse.c |  172 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 279 insertions(+), 20 deletions(-)
 create mode 100644 fill_window_sse.c

diff --git a/Makefile.in b/Makefile.in
index c61aa30..4774810 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -45,6 +45,9 @@ TAR=tar
 SHELL=/bin/sh
 EXE=
 
+FILL_WINDOW_SSE_o= 
+FILL_WINDOW_SSE_lo=
+
 prefix = /usr/local
 exec_prefix = ${prefix}
 libdir = ${exec_prefix}/lib
@@ -54,11 +57,11 @@ mandir = ${prefix}/share/man
 man3dir = ${mandir}/man3
 pkgconfigdir = ${libdir}/pkgconfig
 
-OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o \
zutil.o +OBJZ = adler32.o crc32.o ${FILL_WINDOW_SSE_o} deflate.o infback.o inffast.o \
inflate.o inftrees.o trees.o zutil.o  OBJG = compress.o uncompr.o gzclose.o gzlib.o \
gzread.o gzwrite.o  OBJC = $(OBJZ) $(OBJG)
 
-PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo \
inftrees.lo trees.lo zutil.lo +PIC_OBJZ = adler32.lo crc32.lo ${FILL_WINDOW_SSE_lo} \
deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo  PIC_OBJG = \
compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo  PIC_OBJC = \
$(PIC_OBJZ) $(PIC_OBJG)  
@@ -113,6 +116,14 @@ test64: all64
 	fi; \
 	rm -f $$TMP64
 
+fill_window_sse.lo: fill_window_sse.c
+	- at mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) -msse2 -DPIC -c -o objs/$*.o $<
+	- at mv objs/$*.o $@
+
+fill_window_sse.o: fill_window_sse.c
+	${CC} ${CFLAGS} -msse2 -I. -c -o $@ fill_window_sse.c
+
 infcover.o: test/infcover.c zlib.h zconf.h
 	$(CC) $(CFLAGS) -I. -c -o $@ test/infcover.c
 
diff --git a/configure b/configure
index ff66ab3..9755cbe 100755
--- a/configure
+++ b/configure
@@ -760,6 +760,23 @@ EOF
   fi
 fi
 
+# Check for SSE2 intrinsics
+cat > $test.c << EOF
+#include <immintrin.h>
+int main(void)
+{
+    __m128i zero = _mm_setzero_si128();
+    return 0;
+}
+EOF
+if try ${CC} ${CFLAGS} -msse2 $test.c; then
+    echo "Checking for SSE2 intrinsics ... Yes." | tee -a configure.log
+    HAVE_SSE2_INTRIN=1
+else
+    echo "Checking for SSE2 intrinsics ... No." | tee -a configure.log
+    HAVE_SSE2_INTRIN=0
+fi
+
 # Set ARCH specific FLAGS
 case "${ARCH}" in
     x86_64)
@@ -774,6 +791,18 @@ case "${ARCH}" in
 
         CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
         SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
+
+        if test ${HAVE_SSE2_INTRIN} -eq 1; then
+            CFLAGS="${CFLAGS} -UCHECK_SSE2 -DHAVE_SSE2"
+            SFLAGS="${SFLAGS} -UCHECK_SSE2 -DHAVE_SSE2"
+            FILL_WINDOW_SSE_o="fill_window_sse.o"
+            FILL_WINDOW_SSE_lo="fill_window_sse.lo"
+            OBJS="${OBJS} ${FILL_WINDOW_SSE_o}"
+            PIC_OBJS="${PIC_OBJS} ${FILL_WINDOW_SSE_lo}"
+        else
+            FILL_WINDOW_SSE_o=""
+            FILL_WINDOW_SSE_lo=""
+        fi
     ;;
     i386 | i486 | i586 | i686)
         OBJC="${OBJC} x86.o"
@@ -787,6 +816,18 @@ case "${ARCH}" in
 
         CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
         SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
+
+        if test ${HAVE_SSE2_INTRIN} -eq 1; then
+            CFLAGS="${CFLAGS} -DCHECK_SSE2 -DHAVE_SSE2"
+            SFLAGS="${SFLAGS} -DCHECK_SSE2 -DHAVE_SSE2"
+            FILL_WINDOW_SSE_o="fill_window_sse.o"
+            FILL_WINDOW_SSE_lo="fill_window_sse.lo"
+            OBJS="${OBJS} ${FILL_WINDOW_SSE_o}"
+            PIC_OBJS="${PIC_OBJS} ${FILL_WINDOW_SSE_lo}"
+        else
+            FILL_WINDOW_SSE_o=""
+            FILL_WINDOW_SSE_lo=""
+        fi
     ;;
 esac
 
@@ -821,6 +862,8 @@ echo mandir = $mandir >> configure.log
 echo prefix = $prefix >> configure.log
 echo sharedlibdir = $sharedlibdir >> configure.log
 echo uname = $uname >> configure.log
+echo FILL_WINDOW_SSE_o = ${FILL_WINDOW_SSE_o} >> configure.log
+echo FILL_WINDOW_SSE_lo= ${FILL_WINDOW_SSE_lo} >> configure.log
 
 # udpate Makefile with the configure results
 sed < Makefile.in "
@@ -850,6 +893,8 @@ sed < Makefile.in "
 /^PIC_OBJC *=/s#=.*#= $PIC_OBJC#
 /^all: */s#:.*#: $ALL#
 /^test: */s#:.*#: $TEST#
+/^FILL_WINDOW_SSE_o *=/s#=.*#=$FILL_WINDOW_SSE_o#
+/^FILL_WINDOW_SSE_lo *=/s#=.*#=$FILL_WINDOW_SSE_lo#
 " > Makefile
 
 # create zlib.pc with the configure results
diff --git a/deflate.c b/deflate.c
index 96f555b..32df211 100644
--- a/deflate.c
+++ b/deflate.c
@@ -84,7 +84,7 @@ local block_state deflate_huff   OF((deflate_state *s, int flush));
 local void lm_init        OF((deflate_state *s));
 local void putShortMSB    OF((deflate_state *s, uInt b));
 local void flush_pending  OF((z_streamp strm));
-local int read_buf        OF((z_streamp strm, Bytef *buf, unsigned size));
+ZLIB_INTERNAL int read_buf        OF((z_streamp strm, Bytef *buf, unsigned size));
 #ifdef ASMV
       void match_init OF((void)); /* asm code initialization */
       uInt longest_match  OF((deflate_state *s, IPos cur_match));
@@ -158,14 +158,6 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers \
*/  /* rank Z_BLOCK between Z_NO_FLUSH and Z_PARTIAL_FLUSH */
 #define RANK(f) (((f) << 1) - ((f) > 4 ? 9 : 0))
 
-/* ===========================================================================
- * Update a hash value with the given input byte
- * IN  assertion: all calls to to UPDATE_HASH are made with consecutive
- *    input characters, so that a running hash key can be computed from the
- *    previous key instead of complete recalculation each time.
- */
-#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
-
 
 /* ===========================================================================
  * Insert string str in the dictionary and set match_head to the previous head
@@ -179,12 +171,12 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers \
                */
  */
 #ifdef FASTEST
 #define INSERT_STRING(s, str, match_head) \
-   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+   (UPDATE_HASH(s, s->ins_h, (str)), \
     match_head = s->head[s->ins_h], \
     s->head[s->ins_h] = (Pos)(str))
 #else
 #define INSERT_STRING(s, str, match_head) \
-   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+   (UPDATE_HASH(s, s->ins_h, (str)), \
     match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
     s->head[s->ins_h] = (Pos)(str))
 #endif
@@ -197,6 +189,10 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers \
*/  s->head[s->hash_size-1] = NIL; \
     zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head));
 
+#ifdef CHECK_SSE2
+#include "x86.h"
+#endif
+
 /* ========================================================================= */
 int ZEXPORT deflateInit_(strm, level, version, stream_size)
     z_streamp strm;
@@ -230,6 +226,10 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, \
                memLevel, strategy,
      * output size for (length,distance) codes is <= 24 bits.
      */
 
+#ifdef CHECK_SSE2
+    x86_check_features();
+#endif
+
     if (version == Z_NULL || version[0] != my_version[0] ||
         stream_size != sizeof(z_stream)) {
         return Z_VERSION_ERROR;
@@ -365,7 +365,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
         str = s->strstart;
         n = s->lookahead - (MIN_MATCH-1);
         do {
-            UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+            UPDATE_HASH(s, s->ins_h, str);
 #ifndef FASTEST
             s->prev[str & s->w_mask] = s->head[s->ins_h];
 #endif
@@ -1073,7 +1073,7 @@ int ZEXPORT deflateCopy (dest, source)
  * allocating a large strm->next_in buffer and copying from it.
  * (See also flush_pending()).
  */
-local int read_buf(strm, buf, size)
+ZLIB_INTERNAL int read_buf(strm, buf, size)
     z_streamp strm;
     Bytef *buf;
     unsigned size;
@@ -1171,10 +1171,31 @@ local void check_match(s, start, match, length)
  *    performed for at least two bytes (required for the zip translate_eol
  *    option -- not supported here).
  */
-local void fill_window(s)
+#ifdef HAVE_SSE2
+extern void fill_window_sse(deflate_state *s);
+#endif
+local void fill_window_c(deflate_state *s);
+
+local void fill_window(deflate_state *s)
+{
+#ifdef HAVE_SSE2
+#ifdef CHECK_SSE2
+    if (x86_cpu_has_sse2) {
+#endif
+        fill_window_sse(s);
+        return;
+#ifdef CHECK_SSE2
+    }
+#endif
+#endif
+    
+    fill_window_c(s);
+}
+
+local void fill_window_c(s)
     deflate_state *s;
 {
-    register unsigned n, m;
+    register unsigned n;
     register Posf *p;
     unsigned more;    /* Amount of free space at the end of the window. */
     uInt wsize = s->w_size;
@@ -1216,6 +1237,7 @@ local void fill_window(s)
             n = s->hash_size;
             p = &s->head[n];
             do {
+                unsigned m;
                 m = *--p;
                 *p = (Pos)(m >= wsize ? m-wsize : NIL);
             } while (--n);
@@ -1224,6 +1246,7 @@ local void fill_window(s)
 #ifndef FASTEST
             p = &s->prev[n];
             do {
+                unsigned m;
                 m = *--p;
                 *p = (Pos)(m >= wsize ? m-wsize : NIL);
                 /* If n is not on any hash chain, prev[n] is garbage but
@@ -1255,12 +1278,12 @@ local void fill_window(s)
         if (s->lookahead + s->insert >= MIN_MATCH) {
             uInt str = s->strstart - s->insert;
             s->ins_h = s->window[str];
-            UPDATE_HASH(s, s->ins_h, s->window[str + 1]);
+            UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
 #if MIN_MATCH != 3
             Call UPDATE_HASH() MIN_MATCH-3 more times
 #endif
             while (s->insert) {
-                UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+                UPDATE_HASH(s, s->ins_h, str);
 #ifndef FASTEST
                 s->prev[str & s->w_mask] = s->head[s->ins_h];
 #endif
@@ -1478,7 +1501,7 @@ local block_state deflate_fast(s, flush)
                 s->strstart += s->match_length;
                 s->match_length = 0;
                 s->ins_h = s->window[s->strstart];
-                UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+                UPDATE_HASH(s, s->ins_h, s->strstart+1 - (MIN_MATCH-1));
 #if MIN_MATCH != 3
                 Call UPDATE_HASH() MIN_MATCH-3 more times
 #endif
diff --git a/deflate.h b/deflate.h
index ce0299e..f1c1ed9 100644
--- a/deflate.h
+++ b/deflate.h
@@ -343,4 +343,12 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf \
*buf,  flush = _tr_tally(s, distance, length)
 #endif
 
+/* ===========================================================================
+ * Update a hash value with the given input byte
+ * IN  assertion: all calls to to UPDATE_HASH are made with consecutive
+ *    input characters, so that a running hash key can be computed from the
+ *    previous key instead of complete recalculation each time.
+ */
+#define UPDATE_HASH(s,h,i) (h = (((h)<<s->hash_shift) ^ (s->window[i + \
(MIN_MATCH-1)])) & s->hash_mask) +
 #endif /* DEFLATE_H */
diff --git a/fill_window_sse.c b/fill_window_sse.c
new file mode 100644
index 0000000..bbc089c
--- /dev/null
+++ b/fill_window_sse.c
@@ -0,0 +1,172 @@
+/*
+ * Fill Window with SSE2-optimized hash shifting
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Authors:
+ *  Arjan van de Ven    <arjan at linux.intel.com>
+ *  Jim Kukunas         <james.t.kukunas at linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef HAVE_SSE2
+
+#include <immintrin.h>
+#include "deflate.h"
+
+void fill_window_sse(deflate_state *s)
+{
+    z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
+
+    register unsigned n;
+    register Posf *p;
+    unsigned more;    /* Amount of free space at the end of the window. */
+    uInt wsize = s->w_size;
+
+    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
+    do {
+        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
+
+        /* Deal with !@#$% 64K limit: */
+        if (sizeof(int) <= 2) {
+            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
+                more = wsize;
+
+            } else if (more == (unsigned)(-1)) {
+                /* Very unlikely, but possible on 16 bit machine if
+                 * strstart == 0 && lookahead == 1 (input done a byte at time)
+                 */
+                more--;
+            }
+        }
+
+        /* If the window is almost full and there is insufficient lookahead,
+         * move the upper half to the lower one to make room in the upper half.
+         */
+        if (s->strstart >= wsize+MAX_DIST(s)) {
+
+            zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
+            s->match_start -= wsize;
+            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
+            s->block_start -= (long) wsize;
+
+            /* Slide the hash table (could be avoided with 32 bit values
+               at the expense of memory usage). We slide even when level == 0
+               to keep the hash table consistent if we switch back to level > 0
+               later. (Using level 0 permanently is not an optimal usage of
+               zlib, so we don't care about this pathological case.)
+             */
+            n = s->hash_size;
+            p = &s->head[n];
+            p -= 8;
+            do {
+                __m128i value, result;
+
+                value = _mm_loadu_si128((__m128i *)p);
+                result = _mm_subs_epu16(value, xmm_wsize);
+                _mm_storeu_si128((__m128i *)p, result);
+
+                p -= 8;
+                n -= 8;
+            } while (n > 0);
+
+            n = wsize;
+#ifndef FASTEST
+            p = &s->prev[n];
+            p -= 8;
+            do {
+                __m128i value, result;
+
+                value = _mm_loadu_si128((__m128i *)p);
+                result = _mm_subs_epu16(value, xmm_wsize);
+                _mm_storeu_si128((__m128i *)p, result);
+                
+                p -= 8;
+                n -= 8;
+            } while (n > 0);
+#endif
+            more += wsize;
+        }
+        if (s->strm->avail_in == 0) break;
+
+        /* If there was no sliding:
+         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+         *    more == window_size - lookahead - strstart
+         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+         * => more >= window_size - 2*WSIZE + 2
+         * In the BIG_MEM or MMAP case (not yet supported),
+         *   window_size == input_size + MIN_LOOKAHEAD  &&
+         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+         * Otherwise, window_size == 2*WSIZE so more >= 2.
+         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+         */
+        Assert(more >= 2, "more < 2");
+
+        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+        s->lookahead += n;
+
+        /* Initialize the hash value now that we have some input: */
+        if (s->lookahead + s->insert >= MIN_MATCH) {
+            uInt str = s->strstart - s->insert;
+            s->ins_h = s->window[str];
+            if (str >= 1)
+                UPDATE_HASH(s, s->ins_h, str + 2 - (MIN_MATCH-1));
+#if MIN_MATCH != 3
+            Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+            while (s->insert) {
+                UPDATE_HASH(s, s->ins_h, str);
+#ifndef FASTEST
+                s->prev[str & s->w_mask] = s->head[s->ins_h];
+#endif
+                s->head[s->ins_h] = (Pos)str;
+                str++;
+                s->insert--;
+                if (s->lookahead + s->insert < MIN_MATCH)
+                    break;
+            }
+        }
+        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+         * but this is not important since only literal bytes will be emitted.
+         */
+
+    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+    /* If the WIN_INIT bytes after the end of the current data have never been
+     * written, then zero those bytes in order to avoid memory check reports of
+     * the use of uninitialized (or uninitialised as Julian writes) bytes by
+     * the longest match routines.  Update the high water mark for the next
+     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
+     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
+     */
+    if (s->high_water < s->window_size) {
+        ulg curr = s->strstart + (ulg)(s->lookahead);
+        ulg init;
+
+        if (s->high_water < curr) {
+            /* Previous high water mark below current data -- zero WIN_INIT
+             * bytes or up to end of window, whichever is less.
+             */
+            init = s->window_size - curr;
+            if (init > WIN_INIT)
+                init = WIN_INIT;
+            zmemzero(s->window + curr, (unsigned)init);
+            s->high_water = curr + init;
+        }
+        else if (s->high_water < (ulg)curr + WIN_INIT) {
+            /* High water mark at or above current data, but below current data
+             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+             * to end of window, whichever is less.
+             */
+            init = (ulg)curr + WIN_INIT - s->high_water;
+            if (init > s->window_size - s->high_water)
+                init = s->window_size - s->high_water;
+            zmemzero(s->window + s->high_water, (unsigned)init);
+            s->high_water += init;
+        }
+    }
+
+    Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+           "not enough room for search");
+}
+#endif
-- 
1.7.1


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic