'[Zlib-devel] [PATCH 05/11] Tune longest_match implementation'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       zlib-devel
Subject:    [Zlib-devel] [PATCH 05/11] Tune longest_match implementation
From:       james.t.kukunas () linux ! intel ! com (Jim Kukunas)
Date:       2014-03-18 19:15:37
Message-ID: 1395170143-1745-6-git-send-email-james.t.kukunas () linux ! intel ! com
[Download RAW message or body]

Separates the byte-by-byte and short-by-short longest_match
implementations into two separately tweakable versions and
splits all of the longest match functions into a separate file.

Split the end-chain and early-chain scans and provide likely/unlikely
hints to improve branh prediction.

Add an early termination condition for levels 5 and under to stop
iterating the hash chain when the match length for the current
entry is less than the current best match.

Also adjust variable types and scopes to provide better optimization
hints to the compiler.
---
 deflate.c |  218 +-----------------------------------------
 match.c   |  318 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 zutil.h   |    8 ++
 3 files changed, 327 insertions(+), 217 deletions(-)
 create mode 100644 match.c

diff --git a/deflate.c b/deflate.c
index 6969577..96f555b 100644
--- a/deflate.c
+++ b/deflate.c
@@ -1131,223 +1131,7 @@ local void lm_init (s)
 #endif
 }
 
-#ifndef FASTEST
-/* ===========================================================================
- * Set match_start to the longest match starting at the given string and
- * return its length. Matches shorter or equal to prev_length are discarded,
- * in which case the result is equal to prev_length and match_start is
- * garbage.
- * IN assertions: cur_match is the head of the hash chain for the current
- *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
- * OUT assertion: the match length is not greater than s->lookahead.
- */
-#ifndef ASMV
-/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
- * match.S. The code will be functionally equivalent.
- */
-local uInt longest_match(s, cur_match)
-    deflate_state *s;
-    IPos cur_match;                             /* current match */
-{
-    unsigned chain_length = s->max_chain_length;/* max hash chain length */
-    register Bytef *scan = s->window + s->strstart; /* current string */
-    register Bytef *match;                       /* matched string */
-    register int len;                           /* length of current match */
-    int best_len = s->prev_length;              /* best match length so far */
-    int nice_match = s->nice_match;             /* stop if match long enough */
-    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
-        s->strstart - (IPos)MAX_DIST(s) : NIL;
-    /* Stop when cur_match becomes <= limit. To simplify the code,
-     * we prevent matches with the string of window index 0.
-     */
-    Posf *prev = s->prev;
-    uInt wmask = s->w_mask;
-
-#ifdef UNALIGNED_OK
-    /* Compare two bytes at a time. Note: this is not always beneficial.
-     * Try with and without -DUNALIGNED_OK to check.
-     */
-    register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
-    register ush scan_start = *(ushf*)scan;
-    register ush scan_end   = *(ushf*)(scan+best_len-1);
-#else
-    register Bytef *strend = s->window + s->strstart + MAX_MATCH;
-    register Byte scan_end1  = scan[best_len-1];
-    register Byte scan_end   = scan[best_len];
-#endif
-
-    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
-     * It is easy to get rid of this optimization if necessary.
-     */
-    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
-
-    /* Do not waste too much time if we already have a good match: */
-    if (s->prev_length >= s->good_match) {
-        chain_length >>= 2;
-    }
-    /* Do not look for matches beyond the end of the input. This is necessary
-     * to make deflate deterministic.
-     */
-    if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
-
-    Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
-
-    do {
-        Assert(cur_match < s->strstart, "no future");
-        match = s->window + cur_match;
-
-        /* Skip to next match if the match length cannot increase
-         * or if the match length is less than 2.  Note that the checks below
-         * for insufficient lookahead only occur occasionally for performance
-         * reasons.  Therefore uninitialized memory will be accessed, and
-         * conditional jumps will be made that depend on those values.
-         * However the length of the match is limited to the lookahead, so
-         * the output of deflate is not affected by the uninitialized values.
-         */
-#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
-        /* This code assumes sizeof(unsigned short) == 2. Do not use
-         * UNALIGNED_OK if your compiler uses a different size.
-         */
-        if (*(ushf*)(match+best_len-1) != scan_end ||
-            *(ushf*)match != scan_start) continue;
-
-        /* It is not necessary to compare scan[2] and match[2] since they are
-         * always equal when the other bytes match, given that the hash keys
-         * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
-         * strstart+3, +5, ... up to strstart+257. We check for insufficient
-         * lookahead only every 4th comparison; the 128th check will be made
-         * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
-         * necessary to put more guard bytes at the end of the window, or
-         * to check more often for insufficient lookahead.
-         */
-        Assert(scan[2] == match[2], "scan[2]?");
-        scan++, match++;
-        do {
-        } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
-                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
-                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
-                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
-                 scan < strend);
-        /* The funny "do {}" generates better code on most compilers */
-
-        /* Here, scan <= window+strstart+257 */
-        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
-        if (*scan == *match) scan++;
-
-        len = (MAX_MATCH - 1) - (int)(strend-scan);
-        scan = strend - (MAX_MATCH-1);
-
-#else /* UNALIGNED_OK */
-
-        if (match[best_len]   != scan_end  ||
-            match[best_len-1] != scan_end1 ||
-            *match            != *scan     ||
-            *++match          != scan[1])      continue;
-
-        /* The check at best_len-1 can be removed because it will be made
-         * again later. (This heuristic is not always a win.)
-         * It is not necessary to compare scan[2] and match[2] since they
-         * are always equal when the other bytes match, given that
-         * the hash keys are equal and that HASH_BITS >= 8.
-         */
-        scan += 2, match++;
-        Assert(*scan == *match, "match[2]?");
-
-        /* We check for insufficient lookahead only every 8th comparison;
-         * the 256th check will be made at strstart+258.
-         */
-        do {
-        } while (*++scan == *++match && *++scan == *++match &&
-                 *++scan == *++match && *++scan == *++match &&
-                 *++scan == *++match && *++scan == *++match &&
-                 *++scan == *++match && *++scan == *++match &&
-                 scan < strend);
-
-        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
-
-        len = MAX_MATCH - (int)(strend - scan);
-        scan = strend - MAX_MATCH;
-
-#endif /* UNALIGNED_OK */
-
-        if (len > best_len) {
-            s->match_start = cur_match;
-            best_len = len;
-            if (len >= nice_match) break;
-#ifdef UNALIGNED_OK
-            scan_end = *(ushf*)(scan+best_len-1);
-#else
-            scan_end1  = scan[best_len-1];
-            scan_end   = scan[best_len];
-#endif
-        }
-    } while ((cur_match = prev[cur_match & wmask]) > limit
-             && --chain_length != 0);
-
-    if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
-    return s->lookahead;
-}
-#endif /* ASMV */
-
-#else /* FASTEST */
-
-/* ---------------------------------------------------------------------------
- * Optimized version for FASTEST only
- */
-local uInt longest_match(s, cur_match)
-    deflate_state *s;
-    IPos cur_match;                             /* current match */
-{
-    register Bytef *scan = s->window + s->strstart; /* current string */
-    register Bytef *match;                       /* matched string */
-    register int len;                           /* length of current match */
-    register Bytef *strend = s->window + s->strstart + MAX_MATCH;
-
-    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
-     * It is easy to get rid of this optimization if necessary.
-     */
-    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
-
-    Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
-
-    Assert(cur_match < s->strstart, "no future");
-
-    match = s->window + cur_match;
-
-    /* Return failure if the match length is less than 2:
-     */
-    if (match[0] != scan[0] || match[1] != scan[1]) return MIN_MATCH-1;
-
-    /* The check at best_len-1 can be removed because it will be made
-     * again later. (This heuristic is not always a win.)
-     * It is not necessary to compare scan[2] and match[2] since they
-     * are always equal when the other bytes match, given that
-     * the hash keys are equal and that HASH_BITS >= 8.
-     */
-    scan += 2, match += 2;
-    Assert(*scan == *match, "match[2]?");
-
-    /* We check for insufficient lookahead only every 8th comparison;
-     * the 256th check will be made at strstart+258.
-     */
-    do {
-    } while (*++scan == *++match && *++scan == *++match &&
-             *++scan == *++match && *++scan == *++match &&
-             *++scan == *++match && *++scan == *++match &&
-             *++scan == *++match && *++scan == *++match &&
-             scan < strend);
-
-    Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
-
-    len = MAX_MATCH - (int)(strend - scan);
-
-    if (len < MIN_MATCH) return MIN_MATCH - 1;
-
-    s->match_start = cur_match;
-    return (uInt)len <= s->lookahead ? (uInt)len : s->lookahead;
-}
-
-#endif /* FASTEST */
+#include "match.c"
 
 #ifdef DEBUG
 /* ===========================================================================
diff --git a/match.c b/match.c
new file mode 100644
index 0000000..6d68b8c
--- /dev/null
+++ b/match.c
@@ -0,0 +1,318 @@
+/*
+ * Set match_start to the longest match starting at the given string and
+ * return its length. Matches shorter or equal to prev_length are discarded,
+ * in which case the result is equal to prev_length and match_start is garbage.
+ *
+ * IN assertions: cur_match is the head of the hash chain for the current
+ * 	string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
+ * OUT assertion: the match length is not greater than s->lookahead
+ */
+
+#include <stdint.h>
+
+#include "deflate.h"
+
+#ifdef FASTEST
+#define longest_match fastest_longest_match
+#elif (defined(UNALIGNED_OK) && MAX_MATCH == 258)
+#define longest_match std2_longest_match
+#else
+#define longest_match std1_longest_match
+#endif
+
+/*
+ * Standard longest_match
+ *
+ */
+local unsigned std1_longest_match(deflate_state *z_const s, z_const IPos cur_match)
+{
+	z_const unsigned wmask = s->w_mask;
+
+	unsigned chain_length;
+	IPos limit;
+	int len, best_len;
+	int nice_match = s->nice_match;
+	unsigned char *scan, *match, *strend, scan_end, scan_end1;
+	Posf *prev;
+	
+	/*
+	 * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
+	 * of 16. It is easy to get rid of this optimization if necessary.
+	 */
+	Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+	/*
+	 * Do not waste too much time if we already have a good match
+	 */
+	best_len = s->prev_length;
+	chain_length = s->max_chain_length;
+	if (best_len >= s->good_match)
+		chain_length >>= 2;
+
+	/*
+	 * Do not looks for matches beyond the end of the input. This is
+	 * necessary to make deflate deterministic
+	 */
+	if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
+
+	/*
+	 * Stop when cur_match becomes <= limit. To simplify the code,
+	 * we prevent matches with the string of window index 0
+	 */
+	limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;
+
+	scan = s->window + s->strstart;
+	prev = s->prev;
+	strend = s->window + s->strstart + MAX_MATCH;
+	scan_end1 = scan[best_len-1];
+	scan_end = scan[best_len];
+
+	Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+		"need lookahead");
+	do {
+		Assert(cur_match < s->strstart, "no future");
+		match = s->window + cur_match;
+
+		/*
+		 * Skip to next match if the match length cannot increase
+		 * or if the match length is less than 2. Note that the checks
+		 * below for insufficient lookahead only occur occasionally
+		 * for performance reasons. Therefore uninitialized memory
+		 * will be accessed and conditional jumps will be made that
+		 * depend on those values. However the length of the match
+		 * is limited to the lookahead, so the output of deflate is not
+		 * affected by the uninitialized values.
+		 */
+		if (match[best_len] != scan_end ||
+		match[best_len-1] != scan_end1 ||
+		*match != *scan ||
+		*++match != scan[1])
+			continue;
+
+		/*
+		 * The check at best_len-1 can be removed because it will
+		 * be made again later. (This heuristic is not always a win.)
+		 * It is not necessary to compare scan[2] and match[2] since
+		 * they are always equal when the other bytes match, given
+		 * that the hash keys are equal and that HASH_BITS >= 8.
+		 */
+		scan += 2;
+		match++;
+		Assert(*scan == *match, "match[2]?");
+
+		/*
+		 * We check for insufficient lookahead only every 8th
+		 * comparision; the 256th check will be made at strstart + 258.
+		 */
+		do {
+		} while (*++scan == *++match && *++scan == *++match &&
+			 *++scan == *++match && *++scan == *++match &&
+			 *++scan == *++match && *++scan == *++match &&
+			 *++scan == *++match && *++scan == *++match &&
+			 scan < strend);
+
+		Assert(scan <= s->window+(unsigned int)(s->window_size-1),
+			"wild scan");
+
+		len = MAX_MATCH - (int)(strend - scan);
+		scan = strend - MAX_MATCH;
+
+		if (len > best_len) {
+			s->match_start = cur_match;
+			best_len = len;
+			if (len >= nice_match)
+				break;
+			scan_end1 = scan[best_len-1];
+			scan_end = scan[best_len];
+		} else {
+			/*
+			 * The probability of finding a match later if we here
+			 * is pretty low, so for performance it's best to
+			 * outright stop here for the lower compression levels
+			 */
+			if (s->level < 6)
+				break;
+		}
+	} while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length);
+
+	if ((unsigned int)best_len <= s->lookahead)
+		return best_len;
+	return s->lookahead;
+}
+
+/*
+ * UNALIGNED_OK longest_match
+ *
+ */
+local unsigned std2_longest_match(deflate_state *z_const s, z_const IPos cur_match)
+{
+	z_const unsigned wmask = s->w_mask;
+
+	unsigned short scan_start, scan_end;
+	unsigned chain_length;
+	IPos limit;
+	int len, best_len;
+	int nice_match = s->nice_match;
+	unsigned char *scan, *strend;
+	Posf *prev;
+	
+	/*
+	 * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
+	 * of 16. It is easy to get rid of this optimization if necessary.
+	 */
+	Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+	/*
+	 * Do not waste too much time if we already have a good match
+	 */
+	best_len = s->prev_length;
+	chain_length = s->max_chain_length;
+	if (best_len >= s->good_match)
+		chain_length >>= 2;
+
+	/*
+	 * Do not looks for matches beyond the end of the input. This is
+	 * necessary to make deflate deterministic
+	 */
+	if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
+
+	/*
+	 * Stop when cur_match becomes <= limit. To simplify the code,
+	 * we prevent matches with the string of window index 0
+	 */
+	limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;
+
+	scan = s->window + s->strstart;
+	prev = s->prev;
+	strend = s->window + s->strstart + MAX_MATCH - 1;
+	scan_start = *(unsigned short *)scan;
+	scan_end = *(unsigned short *)(scan + best_len-1);
+
+	Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+		"need lookahead");
+	do {
+        unsigned char *match;
+		Assert(cur_match < s->strstart, "no future");
+		match = s->window + cur_match;
+		
+		/*
+		 * Skip to next match if the match length cannot increase
+		 * or if the match length is less than 2. Note that the checks
+		 * below for insufficient lookahead only occur occasionally
+		 * for performance reasons. Therefore uninitialized memory
+		 * will be accessed and conditional jumps will be made that
+		 * depend on those values. However the length of the match
+		 * is limited to the lookahead, so the output of deflate is not
+		 * affected by the uninitialized values.
+		 */
+		if (zlikely((*(unsigned short *)(match + best_len - 1) != scan_end)))
+			continue;
+		if (*(unsigned short *)match != scan_start)
+			continue;
+
+		/* It is not necessary to compare scan[2] and match[2] since
+		 * they are always equal when the other bytes match, given that
+		 * the hash keys are equal and that HASH_BITS >= 8. Compare 2
+		 * bytes at a time at strstart+3, +5, ... up to strstart+257.
+		 * We check for insufficient lookahead only every 4th
+		 * comparison; the 128th check will be made at strstart+257.
+		 * If MAX_MATCH-2 is not a multiple of 8, it is necessary to
+		 * put more guard bytes at the end of the window, or to check
+		 * more often for insufficient lookahead.
+		 */
+		Assert(scan[2] == match[2], "scan[2]?");
+		scan++;
+		match++;
+
+		do {
+		} while (*(unsigned short *)(scan += 2)== *(unsigned short *)(match += 2)&&
+			 *(unsigned short *)(scan += 2)== *(unsigned short *)(match += 2)&&
+			 *(unsigned short *)(scan += 2)== *(unsigned short *)(match += 2)&&
+			 *(unsigned short *)(scan += 2)== *(unsigned short *)(match += 2)&&
+			 scan < strend);
+
+		/*
+		 * Here, scan <= window + strstart + 257
+		 */
+		Assert(scan <= s->window+(unsigned)(s->window_size-1),
+			"wild scan");
+		if (*scan == *match)
+			scan++;
+
+		len = (MAX_MATCH -1) - (int)(strend-scan);
+		scan = strend - (MAX_MATCH-1);
+
+		if (len > best_len) {
+			s->match_start = cur_match;
+			best_len = len;
+			if (len >= nice_match)
+				break;
+			scan_end = *(unsigned short *)(scan + best_len - 1);
+		} else {
+			/*
+			 * The probability of finding a match later if we here
+			 * is pretty low, so for performance it's best to
+			 * outright stop here for the lower compression levels
+			 */
+			if (s->level < 6)
+				break;
+		}
+	} while (--chain_length && (cur_match = prev[cur_match & wmask]) > limit);
+
+	if ((unsigned)best_len <= s->lookahead)
+		return best_len;
+	return s->lookahead;
+}
+
+/*
+ * FASTEST-only longest_match
+ *
+ */
+local unsigned fastest_longest_match(deflate_state *z_const s, z_const IPos cur_match)
+{
+	unsigned char *scan, *match, *strend;
+	int len;
+
+	/*
+	 * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple 
+	 * of 16. It is easy to get rid of this optimization if necessary
+	 */
+	Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+	Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+		"need lookahead");
+
+	Assert(cur_match < s->strstart, "no future");
+
+	match = s->window + cur_match;
+	scan = s->window + s->strstart;
+	strend = s->window + s->strstart + MAX_MATCH;
+
+	if (*match++ != *scan++ || *match++ != *scan++)
+		return MIN_MATCH-1;
+
+	/*
+	 * The check at best_len-1 can be removed because it will be made
+	 * again later. (This heuristic is not always a win.) It is not
+	 * necessary to compare scan[2] and match[2] since they are always
+	 * equal when the other bytes match, given that the hash keys are equal
+	 * and that HASH_BITS >= 8.
+	 */
+	Assert(*scan == *match, "match[2]?");
+
+	do {
+	} while (*++scan == *++match && *++scan == *++match &&
+		 *++scan == *++match && *++scan == *++match &&
+		 *++scan == *++match && *++scan == *++match &&
+		 *++scan == *++match && *++scan == *++match &&
+		 scan < strend);
+	
+	Assert(scan <= s->window+(unsigned int)(s->window_size-1), "wild scan");
+
+	len = MAX_MATCH - (long)(strend - scan);
+	if (len < MIN_MATCH)
+		return MIN_MATCH-1;
+	
+	s->match_start = cur_match;
+	return len <= s->lookahead ? len : s->lookahead;
+}
diff --git a/zutil.h b/zutil.h
index 24ab06b..662e586 100644
--- a/zutil.h
+++ b/zutil.h
@@ -167,6 +167,14 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
   #pragma warn -8066
 #endif
 
+#if defined(__GNUC__)
+#   define zlikely(x)   __builtin_expect(!!(x), 1)
+#   define zunlikely(x) __builtin_expect(!!(x), 0)
+#else
+#   define zlikely(x)   x
+#   define zunlikely(x) x
+#endif
+
 /* provide prototypes for these when building zlib without LFS */
 #if !defined(_WIN32) && \
     (!defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0)
-- 
1.7.1




[prev in list] [next in list] [prev in thread] [next in thread]
Configure | About | News | Add a list | Sponsored by KoreLogic