[prev in list] [next in list] [prev in thread] [next in thread] 

List:       pgsql-bugs
Subject:    Re: BUG #15518: intarray index crashes hard
From:       Andrew Gierth <andrew () tao11 ! riddles ! org ! uk>
Date:       2018-11-23 21:49:22
Message-ID: 878t1jsczr.fsf () news-spur ! riddles ! org ! uk
[Download RAW message or body]

>>>>> "PG" == PG Bug reporting form <noreply@postgresql.org> writes:

 PG> Obviously it's not intended that gist__int_ops should actually work
 PG> with data of this kind - that's what gist__intbig_ops is for. But
 PG> it's not reasonable for it to crash rather than returning an error.

 PG> I'm working on a patch.

And here it is.

This version doesn't restrict the sparseness of keys any more than it
needs to for safety. This necessitates improving the O(N^2) compression
algorithm to O(N), because otherwise it's possible to create datasets
that cause a single compression call to run for months (with no check
for interrupts, either).

There are essentially 4 specific issues addressed:

1. Integer overflow in internal_size could result in memory corruption
   in decompression since a zero-length array would be allocated and
   then written to.

2. Integer overflow in g_int_compress could cause pessimal merge
   choices, resulting in unnecessarily large ranges (which would in turn
   trigger issue 1 above)

3. Even without overflow, array sizes could become large enough to cause
   unexplained memory allocation errors, so cap the sizes and report
   actual errors pointing at gist__intbig_ops as needed.

4. Large inputs to the compression function always consist of large runs
   of consecutive integers, and the compression loop was processing
   these one at a time in an O(N^2) manner with a lot of overhead. The
   expected runtime of this function could easily exceed 6 months as a
   result. Performing a linear-time first pass reduces the worst case
   to something on the order of seconds.

-- 
Andrew (irc:RhodiumToad)


["iafix.patch" (text/x-patch)]

diff --git a/contrib/intarray/_int_gist.c b/contrib/intarray/_int_gist.c
index 911d18023b..fe10cb56b6 100644
--- a/contrib/intarray/_int_gist.c
+++ b/contrib/intarray/_int_gist.c
@@ -13,6 +13,17 @@
 #define GETENTRY(vec,pos) ((ArrayType *) DatumGetPointer((vec)->vector[(pos)].key))
 
 /*
+ * Control the maximum sparseness of compressed keys.
+ *
+ * The upper safe bound for this limit is half the maximum allocatable array
+ * size. A lower bound would give more guarantees that pathological data
+ * wouldn't eat excessive CPU and memory, but at the expense of breaking
+ * possibly working (after a fashion) indexes.
+ */
+#define MAXNUMELTS (Min((MaxAllocSize / sizeof(Datum)),((MaxAllocSize - \
ARR_OVERHEAD_NONULLS(1)) / sizeof(int)))/2) +/* or: #define MAXNUMELTS 1000000 */
+
+/*
 ** GiST support methods
 */
 PG_FUNCTION_INFO_V1(g_int_consistent);
@@ -141,11 +152,13 @@ g_int_compress(PG_FUNCTION_ARGS)
 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
 	GISTENTRY  *retval;
 	ArrayType  *r;
-	int			len;
+	int			len,
+				lenr;
 	int		   *dr;
 	int			i,
-				min,
+				j,
 				cand;
+	int64		min;
 
 	if (entry->leafkey)
 	{
@@ -186,23 +199,62 @@ g_int_compress(PG_FUNCTION_ARGS)
 
 		dr = ARRPTR(r);
 
-		for (i = len - 1; i >= 0; i--)
-			dr[2 * i] = dr[2 * i + 1] = dr[i];
+		/*
+		 * "len" at this point is the number of ranges we will construct.
+		 * "lenr" is the number of ranges we must eventually remove by
+		 * merging, we must be careful to remove no more than this number.
+		 */
+		lenr = len - MAXNUMRANGE;
+
+		/*
+		 * Initially assume we can merge consecutive ints into a range. but we
+		 * must count every value removed and stop when lenr runs out
+		 */
+		for (j = i = len - 1; i > 0 && lenr > 0; i--, j--)
+		{
+			int		r_end = dr[i];
+			int		r_start = r_end;
+			while (i > 0 && lenr > 0 && dr[i-1] == r_start - 1)
+				--r_start, --i, --lenr;
+			dr[2*j] = r_start;
+			dr[2*j+1] = r_end;
+		}
+		/* just copy the rest, if any, as trivial ranges */
+		for (; i >= 0; i--, j--)
+			dr[2*j] = dr[2*j + 1] = dr[i];
 
-		len *= 2;
+		if (++j)
+		{
+			/*
+			 * shunt everything down to start at the right place
+			 */
+			memmove((void *) &dr[0], (void *) &dr[2*j], 2*(len - j) * sizeof(int32));
+		}
+		/*
+		 * make "len" be number of array elements, not ranges
+		 */
+		len = 2*(len - j);
 		cand = 1;
 		while (len > MAXNUMRANGE * 2)
 		{
-			min = INT_MAX;
+			min = PG_INT64_MAX;
 			for (i = 2; i < len; i += 2)
-				if (min > (dr[i] - dr[i - 1]))
+				if (min > ((int64)dr[i] - (int64)dr[i - 1]))
 				{
-					min = (dr[i] - dr[i - 1]);
+					min = ((int64)dr[i] - (int64)dr[i - 1]);
 					cand = i;
 				}
 			memmove((void *) &dr[cand - 1], (void *) &dr[cand + 1], (len - cand - 1) * \
sizeof(int32));  len -= 2;
 		}
+		/*
+		 * check sparseness of result
+		 */
+		lenr = internal_size(dr, len);
+		if (lenr < 0 || lenr > MAXNUMELTS)
+			ereport(ERROR,
+					(errmsg("data is too sparse, recreate index using gist__intbig_ops opclass \
instead"))); +
 		r = resize_intArrayType(r, len);
 		retval = palloc(sizeof(GISTENTRY));
 		gistentryinit(*retval, PointerGetDatum(r),
@@ -260,6 +312,9 @@ g_int_decompress(PG_FUNCTION_ARGS)
 
 	din = ARRPTR(in);
 	lenr = internal_size(din, lenin);
+	if (lenr < 0 || lenr > MAXNUMELTS)
+		ereport(ERROR,
+				(errmsg("compressed array is too big, recreate index using gist__intbig_ops \
opclass instead")));  
 	r = new_intArrayType(lenr);
 	dr = ARRPTR(r);
diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c
index d86485dfa5..7288b35b02 100644
--- a/contrib/intarray/_int_tool.c
+++ b/contrib/intarray/_int_tool.c
@@ -225,6 +225,7 @@ new_intArrayType(int num)
 	/* if no elements, return a zero-dimensional array */
 	if (num <= 0)
 	{
+		Assert(num == 0);
 		r = construct_empty_array(INT4OID);
 		return r;
 	}
@@ -252,6 +253,7 @@ resize_intArrayType(ArrayType *a, int num)
 	/* if no elements, return a zero-dimensional array */
 	if (num <= 0)
 	{
+		Assert(num == 0);
 		ARR_NDIM(a) = 0;
 		return a;
 	}
@@ -289,14 +291,18 @@ int
 internal_size(int *a, int len)
 {
 	int			i,
-				size = 0;
+				size;
+	int64		tsize = 0;
 
 	for (i = 0; i < len; i += 2)
 	{
 		if (!i || a[i] != a[i - 1]) /* do not count repeated range */
-			size += a[i + 1] - a[i] + 1;
+			tsize += (int64)(a[i + 1]) - (int64)(a[i]) + 1;
 	}
 
+	size = (int) tsize;
+	if ((int64)size != tsize)
+		return -1;				/* overflow */
 	return size;
 }
 



[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic