[prev in list] [next in list] [prev in thread] [next in thread] 

List:       postgresql-hackers
Subject:    Re: Implement missing join selectivity estimation for range types
From:       Alena Rybakina <lena.ribackina () yandex ! ru>
Date:       2023-11-30 14:50:36
Message-ID: 5fc8d7f6-9cd5-4da3-8a94-1d06831441fe () yandex ! ru
[Download RAW message or body]

Hi!

Thank you for your work on the subject, I think it's a really useful 
feature and it allows optimizer to estimate more correctly clauses with 
such type of operator.

I rewieved your patch and noticed that some comments are repeated into 
multirangejoinsel functions, I suggest combining them.

The proposed changes are in the attached patch.


If this topic is about calculating selectivity, have you thought about 
adding cardinality calculation test for queries with this type of operator?

For example, you could form queries similar to those that you use in 
src/test/regress/sql/multirangetypes.sql and 
src/test/regress/sql/rangetypes.sql.

I added a few in the attached patch.

-- 
Regards,
Alena Rybakina

["changes.diff.txt" (text/plain)]

diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c \
b/src/backend/utils/adt/multirangetypes_selfuncs.c index c670d225a0c..7708768b89f \
                100644
--- a/src/backend/utils/adt/multirangetypes_selfuncs.c
+++ b/src/backend/utils/adt/multirangetypes_selfuncs.c
@@ -1620,14 +1620,15 @@ multirangejoinsel(PG_FUNCTION_ARGS)
 													hist1_lower, nhist1);
 				break;
 
+		/*
+		 * Start by comparing lower bounds and if they are equal
+		 * compare upper bounds for comparison operators
+		 */
 			case OID_MULTIRANGE_LESS_EQUAL_OP:
 
 				/*
 				 * A <= B
 				 *
-				 * Start by comparing lower bounds and if they are equal
-				 * compare upper bounds
-				 *
 				 * Negation of OID_RANGE_GREATER_OP.
 				 *
 				 * Overestimate by comparing only the lower bounds. Higher
@@ -1644,9 +1645,6 @@ multirangejoinsel(PG_FUNCTION_ARGS)
 				/*
 				 * A < B
 				 *
-				 * Start by comparing lower bounds and if they are equal
-				 * compare upper bounds
-				 *
 				 * Underestimate by comparing only the lower bounds. Higher
 				 * accuracy would require us to add P(lower1 = lower2) *
 				 * P(upper1 < upper2)
@@ -1661,9 +1659,6 @@ multirangejoinsel(PG_FUNCTION_ARGS)
 				/*
 				 * A >= B
 				 *
-				 * Start by comparing lower bounds and if they are equal
-				 * compare upper bounds
-				 *
 				 * Negation of OID_RANGE_LESS_OP.
 				 *
 				 * Overestimate by comparing only the lower bounds. Higher
@@ -1680,9 +1675,6 @@ multirangejoinsel(PG_FUNCTION_ARGS)
 				/*
 				 * A > B == B < A
 				 *
-				 * Start by comparing lower bounds and if they are equal
-				 * compare upper bounds
-				 *
 				 * Underestimate by comparing only the lower bounds. Higher
 				 * accuracy would require us to add P(lower1 = lower2) *
 				 * P(upper1 > upper2)
@@ -1773,18 +1765,16 @@ multirangejoinsel(PG_FUNCTION_ARGS)
 			case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
 			case OID_MULTIRANGE_ADJACENT_RANGE_OP:
 			case OID_RANGE_ADJACENT_MULTIRANGE_OP:
-
-				/*
-				 * just punt for now, estimation would require equality
-				 * selectivity for bounds
-				 */
 			case OID_MULTIRANGE_CONTAINS_ELEM_OP:
 			case OID_MULTIRANGE_ELEM_CONTAINED_OP:
 
-				/*
-				 * just punt for now, estimation would require extraction of
-				 * histograms for the anyelement
-				 */
+			   /*
+				* just punt for now:
+				* if it is a type of adjucent operation estimation
+				* it will require equality selectivity for bounds;
+				* if it is one of type of contain operation
+				* it will extraction of histograms for the any element.
+				*/
 			default:
 				break;
 		}
diff --git a/src/test/regress/expected/multirangetypes.out \
b/src/test/regress/expected/multirangetypes.out index 21d63d9bdac..72f15cf48e1 100644
--- a/src/test/regress/expected/multirangetypes.out
+++ b/src/test/regress/expected/multirangetypes.out
@@ -3364,6 +3364,25 @@ DETAIL:  A result of type anymultirange requires at least one \
                input of type anyr
 --
 -- test selectivity of multirange join operators
 --
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain analyze %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+            return query select tmp[1]::int, tmp[2]::int;
+        end if;
+    end loop;
+end;
+$$;
 create table test_multirange_join_1 (imr1 int4multirange);
 create table test_multirange_join_2 (imr2 int4multirange);
 create table test_multirange_join_3 (imr3 int4multirange);
@@ -3421,6 +3440,40 @@ explain (costs off) select count(*) from \
                test_multirange_join_1, test_multirange
          ->  Seq Scan on test_multirange_join_1
 (9 rows)
 
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_multirange_join_1,
+        test_multirange_join_2
+   where imr1 && imr2
+');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_multirange_join_1,
+        test_multirange_join_2
+   where imr1 << imr2
+');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_multirange_join_1,
+        test_multirange_join_2
+   where imr1 >> imr2
+');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
 drop table test_multirange_join_1;
 drop table test_multirange_join_2;
 drop table test_multirange_join_3;
+drop function check_estimated_rows;
diff --git a/src/test/regress/expected/rangetypes.out \
b/src/test/regress/expected/rangetypes.out index 357bb3154b2..3168c12b2dc 100644
--- a/src/test/regress/expected/rangetypes.out
+++ b/src/test/regress/expected/rangetypes.out
@@ -1837,6 +1837,25 @@ DETAIL:  A result of type anyrange requires at least one input \
                of type anyrange
 --
 -- test selectivity of range join operators
 --
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain analyze %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+            return query select tmp[1]::int, tmp[2]::int;
+        end if;
+    end loop;
+end;
+$$;
 create table test_range_join_1 (ir1 int4range);
 create table test_range_join_2 (ir2 int4range);
 create table test_range_join_3 (ir3 int4range);
@@ -1894,6 +1913,40 @@ explain (costs off) select count(*) from test_range_join_1, \
                test_range_join_2, t
          ->  Seq Scan on test_range_join_1
 (9 rows)
 
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_range_join_1,
+        test_range_join_2
+   where ir1 && ir2
+');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_range_join_1,
+        test_range_join_2
+   where ir1 << ir2
+');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_range_join_1,
+        test_range_join_2
+   where ir1 >> ir2
+');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
 drop table test_range_join_1;
 drop table test_range_join_2;
 drop table test_range_join_3;
+drop function check_estimated_rows;
diff --git a/src/test/regress/sql/multirangetypes.sql \
b/src/test/regress/sql/multirangetypes.sql index 4c62c31166a..cd828fc42c1 100644
--- a/src/test/regress/sql/multirangetypes.sql
+++ b/src/test/regress/sql/multirangetypes.sql
@@ -865,6 +865,27 @@ create function mr_table_fail(i anyelement) returns table(i \
                anyelement, r anymul
 --
 -- test selectivity of multirange join operators
 --
+
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain analyze %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+            return query select tmp[1]::int, tmp[2]::int;
+        end if;
+    end loop;
+end;
+$$;
+
 create table test_multirange_join_1 (imr1 int4multirange);
 create table test_multirange_join_2 (imr2 int4multirange);
 create table test_multirange_join_3 (imr3 int4multirange);
@@ -885,6 +906,28 @@ explain (costs off) select count(*) from test_multirange_join_1, \
test_multirange  explain (costs off) select count(*) from test_multirange_join_1, \
test_multirange_join_2, test_multirange_join_3 where imr1 << imr2 and imr2 << imr3;  \
explain (costs off) select count(*) from test_multirange_join_1, \
test_multirange_join_2, test_multirange_join_3 where imr1 >> imr2 and imr2 >> imr3;  
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_multirange_join_1,
+        test_multirange_join_2
+   where imr1 && imr2
+');
+
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_multirange_join_1,
+        test_multirange_join_2
+   where imr1 << imr2
+');
+
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_multirange_join_1,
+        test_multirange_join_2
+   where imr1 >> imr2
+');
+
 drop table test_multirange_join_1;
 drop table test_multirange_join_2;
 drop table test_multirange_join_3;
+drop function check_estimated_rows;
diff --git a/src/test/regress/sql/rangetypes.sql \
b/src/test/regress/sql/rangetypes.sql index 1018a234a59..50ab6c4552b 100644
--- a/src/test/regress/sql/rangetypes.sql
+++ b/src/test/regress/sql/rangetypes.sql
@@ -633,6 +633,27 @@ create function table_fail(i anyelement) returns table(i \
                anyelement, r anyrange)
 --
 -- test selectivity of range join operators
 --
+
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain analyze %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+            return query select tmp[1]::int, tmp[2]::int;
+        end if;
+    end loop;
+end;
+$$;
+
 create table test_range_join_1 (ir1 int4range);
 create table test_range_join_2 (ir2 int4range);
 create table test_range_join_3 (ir3 int4range);
@@ -653,6 +674,28 @@ explain (costs off) select count(*) from test_range_join_1, \
test_range_join_2, t  explain (costs off) select count(*) from test_range_join_1, \
test_range_join_2, test_range_join_3 where ir1 << ir2 and ir2 << ir3;  explain (costs \
off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 \
where ir1 >> ir2 and ir2 >> ir3;  
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_range_join_1,
+        test_range_join_2
+   where ir1 && ir2
+');
+
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_range_join_1,
+        test_range_join_2
+   where ir1 << ir2
+');
+
+SELECT * FROM check_estimated_rows('
+   select count(*)
+   from test_range_join_1,
+        test_range_join_2
+   where ir1 >> ir2
+');
+
 drop table test_range_join_1;
 drop table test_range_join_2;
 drop table test_range_join_3;
+drop function check_estimated_rows;



[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic