PostgreSQL对or exists产生的filter优化二

Posted robinson1988

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了PostgreSQL对or exists产生的filter优化二相关的知识,希望对你有一定的参考价值。


PostgreSQL会对or exists产生的filter进行优化,上一篇文章没有测试exists中有大表的情况,今天来测试一下exists中有大表的情况
注意:测试期间没有对表添加索引

create table a as select * from dba_objects;
create table b as select * from a;
create table c as select * from a;
create table d as select * from a;
insert into c select * from c;
.....反复执行,直到c有600MB.....
insert into d select * from d;
.....反复执行,直到d有600MB.....

orcl=> \d+
                   List of relations
 Schema | Name | Type  | Owner |   Size   | Description
--------+------+-------+-------+----------+-------------
 public | a    | table | scott | 10192 kB |
 public | b    | table | scott | 10192 kB |
 public | c    | table | scott | 635 MB   |
 public | d    | table | scott | 635 MB   |

orcl=> select * from version();
                                                 version
---------------------------------------------------------------------------------------------------------
PostgreSQL 12.7 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-44), 64-bit

orcl=> show work_mem;
 work_mem
----------
 64MB

orcl=> set enable_mergejoin=false;
SET
orcl=> set max_parallel_workers_per_gather=0;
SET

orcl=> explain select count(*)
orcl->   from a
orcl->  where owner = 'SCOTT'
orcl->     or exists (select null
orcl(>           from b, c, d
orcl(>          where b.object_name = c.object_name
orcl(>            and c.data_object_id = d.data_object_id
orcl(>            and a.object_id = b.data_object_id
orcl(>            and a.object_name = c.object_name
orcl(>            and a.object_type = d.object_type);
                                                       QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=1844958352.45..1844958352.46 rows=1 width=8)
   ->  Seq Scan on a  (cost=0.00..1844958261.71 rows=36296 width=0)
         Filter: (((owner)::text = 'SCOTT'::text) OR (SubPlan 1))
         SubPlan 1
           ->  Nested Loop  (cost=0.00..482939.53 rows=19 width=0)
                 ->  Nested Loop  (cost=0.00..480580.51 rows=19 width=24)
                       Join Filter: (c.data_object_id = d.data_object_id)
                       ->  Seq Scan on d  (cost=0.00..139341.00 rows=119114 width=6)
                             Filter: ((a.object_type)::text = (object_type)::text)
                       ->  Materialize  (cost=0.00..139341.57 rows=113 width=30)
                             ->  Seq Scan on c  (cost=0.00..139341.00 rows=113 width=30)
                                   Filter: ((object_name)::text = (a.object_name)::text)
                 ->  Materialize  (cost=0.00..2358.78 rows=1 width=24)
                       ->  Seq Scan on b  (cost=0.00..2358.77 rows=1 width=24)
                             Filter: (((object_name)::text = (a.object_name)::text) AND (a.object_id = data_object_id))

执行计划中有 alternatives: SubPlan 1 or hashed SubPlan 2 才表示自动优化了filter
上面执行计划没有alternatives: SubPlan 1 or hashed SubPlan 2,说明没有对filter进行优化      
因为上面SQL要执行很久,所以就没有用explain analyze,用的explain
...期间多次加大work_mem,直到work_mem设置为6GB才能自动优化filter

orcl=> set work_mem='6GB';
SET

orcl=> explain analyze select count(*)
orcl->   from a
orcl->  where owner = 'SCOTT'
orcl->     or exists (select null
orcl(>           from b, c, d
orcl(>          where b.object_name = c.object_name
orcl(>            and c.data_object_id = d.data_object_id
orcl(>            and a.object_id = b.data_object_id
orcl(>            and a.object_name = c.object_name
orcl(>            and a.object_type = d.object_type);
                                                                    QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=1844958352.45..1844958352.46 rows=1 width=8) (actual time=24431.513..24431.516 rows=1 loops=1)
   ->  Seq Scan on a  (cost=0.00..1844958261.71 rows=36296 width=0) (actual time=24409.380..24431.290 rows=7141 loops=1)
         Filter: (((owner)::text = 'SCOTT'::text) OR (alternatives: SubPlan 1 or hashed SubPlan 2))
         Rows Removed by Filter: 65444
         SubPlan 1
           ->  Nested Loop  (cost=0.00..482939.53 rows=19 width=0) (never executed)
                 ->  Nested Loop  (cost=0.00..480580.51 rows=19 width=24) (never executed)
                       Join Filter: (c.data_object_id = d.data_object_id)
                       ->  Seq Scan on d  (cost=0.00..139341.00 rows=119114 width=6) (never executed)
                             Filter: ((a.object_type)::text = (object_type)::text)
                       ->  Materialize  (cost=0.00..139341.57 rows=113 width=30) (never executed)
                             ->  Seq Scan on c  (cost=0.00..139341.00 rows=113 width=30) (never executed)
                                   Filter: ((object_name)::text = (a.object_name)::text)
                 ->  Materialize  (cost=0.00..2358.78 rows=1 width=24) (never executed)
                       ->  Seq Scan on b  (cost=0.00..2358.77 rows=1 width=24) (never executed)
                             Filter: (((object_name)::text = (a.object_name)::text) AND (a.object_id = data_object_id))
         SubPlan 2
           ->  Hash Join  (cost=188698.56..6271022.23 rows=56022966 width=70) (actual time=543.025..11197.955 rows=67854336 loops=1)
                 Hash Cond: ((c_1.object_name)::text = (b_1.object_name)::text)
                 ->  Hash Join  (cost=185795.40..4344470.40 rows=30298200 width=32) (actual time=528.571..4526.991 rows=33587200 loops=1)
                       Hash Cond: (d_1.data_object_id = c_1.data_object_id)
                       ->  Seq Scan on d d_1  (cost=0.00..127727.40 rows=4645440 width=14) (actual time=0.005..304.969 rows=4645440 loops=1)
                       ->  Hash  (cost=127727.40..127727.40 rows=4645440 width=30) (actual time=522.689..522.689 rows=498688 loops=1)
                             Buckets: 8388608  Batches: 1  Memory Usage: 94476kB
                             ->  Seq Scan on c c_1  (cost=0.00..127727.40 rows=4645440 width=30) (actual time=0.013..366.180 rows=4645440 loops=1)
                 ->  Hash  (cost=1995.85..1995.85 rows=72585 width=30) (actual time=14.205..14.206 rows=72585 loops=1)
                       Buckets: 131072  Batches: 1  Memory Usage: 5073kB
                       ->  Seq Scan on b b_1  (cost=0.00..1995.85 rows=72585 width=30) (actual time=0.015..5.790 rows=72585 loops=1)
 Planning Time: 0.183 ms
 Execution Time: 24451.751 ms

现在对SQL进行改写,看跑多久

orcl=> show work_mem;
 work_mem
----------
 64MB

orcl=> explain analyze select count(*)
orcl->   from a
orcl->   left join (select b.data_object_id, c.object_name, d.object_type
orcl(>                from b,
orcl(>                     (select object_name, data_object_id
orcl(>                        from c
orcl(>                       group by object_name, data_object_id) c,   ---因为c是反复insert into的所以先去重
orcl(>                     (select object_type, data_object_id
orcl(>                        from d
orcl(>                       group by object_type, data_object_id) d    ---因为d是反复insert into的所以先去重
orcl(>               where b.object_name = c.object_name
orcl(>                 and c.data_object_id = d.data_object_id
orcl(>               group by b.data_object_id, c.object_name, d.object_type) b
orcl->     on a.object_id = b.data_object_id
orcl->    and a.object_name = b.object_name
orcl->    and a.object_type = b.object_type
orcl->  where a.owner = 'SCOTT'
orcl->     or b.data_object_id is not null;
                                                                          QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=327985240.14..327985240.15 rows=1 width=8) (actual time=1907.838..1907.844 rows=1 loops=1)
   ->  Hash Right Join  (cost=308668390.36..327985059.59 rows=72222 width=0) (actual time=1894.899..1907.616 rows=7141 loops=1)
         Hash Cond: ((b.data_object_id = a.object_id) AND ((c.object_name)::text = (a.object_name)::text) AND ((d.object_type)::text = (a.object_type)::text))
         Filter: (((a.owner)::text = 'SCOTT'::text) OR (b.data_object_id IS NOT NULL))
         Rows Removed by Filter: 65444
         ->  Group  (cost=308665124.27..321460593.04 rows=306880000 width=38) (actual time=1876.570..1879.191 rows=8119 loops=1)
               Group Key: b.data_object_id, c.object_name, d.object_type
               ->  Sort  (cost=308665124.27..311863991.46 rows=1279546877 width=38) (actual time=1876.567..1877.267 rows=16213 loops=1)
                     Sort Key: b.data_object_id, c.object_name, d.object_type
                     Sort Method: quicksort  Memory: 1658kB
                     ->  Hash Join  (cost=314975.18..45139336.41 rows=1279546877 width=38) (actual time=1842.973..1872.061 rows=16213 loops=1)
                           Hash Cond: (c.data_object_id = d.data_object_id)
                           ->  Hash Join  (cost=153857.76..192236.93 rows=818381 width=36) (actual time=962.781..986.699 rows=81598 loops=1)
                                 Hash Cond: ((c.object_name)::text = (b.object_name)::text)
                                 ->  HashAggregate  (cost=150954.60..155600.04 rows=464544 width=30) (actual time=949.993..957.684 rows=44913 loops=1)
                                       Group Key: c.object_name, c.data_object_id
                                       ->  Seq Scan on c  (cost=0.00..127727.40 rows=4645440 width=30) (actual time=0.005..278.648 rows=4645440 loops=1)
                                 ->  Hash  (cost=1995.85..1995.85 rows=72585 width=30) (actual time=12.725..12.725 rows=72585 loops=1)
                                       Buckets: 131072  Batches: 1  Memory Usage: 5073kB
                                       ->  Seq Scan on b  (cost=0.00..1995.85 rows=72585 width=30) (actual time=0.005..5.813 rows=72585 loops=1)
                           ->  Hash  (cost=157208.64..157208.64 rows=312702 width=14) (actual time=879.921..879.923 rows=7762 loops=1)
                                 Buckets: 524288  Batches: 1  Memory Usage: 4437kB
                                 ->  HashAggregate  (cost=150954.60..154081.62 rows=312702 width=14) (actual time=876.821..878.336 rows=7801 loops=1)
                                       Group Key: d.object_type, d.data_object_id
                                       ->  Seq Scan on d  (cost=0.00..127727.40 rows=4645440 width=14) (actual time=0.008..291.834 rows=4645440 loops=1)
         ->  Hash  (cost=1995.85..1995.85 rows=72585 width=43) (actual time=18.252..18.253 rows=72585 loops=1)
               Buckets: 131072  Batches: 1  Memory Usage: 6526kB
               ->  Seq Scan on a  (cost=0.00..1995.85 rows=72585 width=43) (actual time=0.011..6.330 rows=72585 loops=1)
 Planning Time: 0.150 ms
 Execution Time: 1909.591 ms

改写之后的SQL跑1.9秒,SQL改写这个地方有个坑,要先对c,d group by 去重,因为c,d join列有很多重复值
如果不先对c,d group by去重,就跑得慢了

orcl=> explain analyze select count(*)
orcl->   from a
orcl->   left join (select b.data_object_id, c.object_name, d.object_type
orcl(>                from b,c,d
orcl(>               where b.object_name = c.object_name
orcl(>                 and c.data_object_id = d.data_object_id
orcl(>               group by b.data_object_id, c.object_name, d.object_type) b
orcl->     on a.object_id = b.data_object_id
orcl->    and a.object_name = b.object_name
orcl->    and a.object_type = b.object_type
orcl->  where a.owner = 'SCOTT'
orcl->     or b.data_object_id is not null;
                                                                          QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=16875964.53..16875964.54 rows=1 width=8) (actual time=52812.548..52812.553 rows=1 loops=1)
   ->  Hash Right Join  (cost=15125066.20..16875783.97 rows=72222 width=0) (actual time=32338.372..52811.606 rows=7141 loops=1)
         Hash Cond: ((b.data_object_id = a.object_id) AND ((c.object_name)::text = (a.object_name)::text) AND ((d.object_type)::text = (a.object_type)::text))
         Filter: (((a.owner)::text = 'SCOTT'::text) OR (b.data_object_id IS NOT NULL))
         Rows Removed by Filter: 65444
         ->  Group  (cost=15121800.11..15682029.77 rows=56022966 width=38) (actual time=32237.800..52778.669 rows=8119 loops=1)
               Group Key: b.data_object_id, c.object_name, d.object_type
               ->  Sort  (cost=15121800.11..15261857.52 rows=56022966 width=38) (actual time=32237.795..45948.800 rows=67854336 loops=1)
                     Sort Key: b.data_object_id, c.object_name, d.object_type
                     Sort Method: external merge  Disk: 2977752kB
                     ->  Hash Join  (cost=220454.56..6379900.23 rows=56022966 width=38) (actual time=539.514..11181.495 rows=67854336 loops=1)
                           Hash Cond: ((c.object_name)::text = (b.object_name)::text)
                           ->  Hash Join  (cost=217551.40..4453348.40 rows=30298200 width=32) (actual time=526.866..4686.370 rows=33587200 loops=1)
                                 Hash Cond: (d.data_object_id = c.data_object_id)
                                 ->  Seq Scan on d  (cost=0.00..127727.40 rows=4645440 width=14) (actual time=0.006..381.212 rows=4645440 loops=1)
                                 ->  Hash  (cost=127727.40..127727.40 rows=4645440 width=30) (actual time=526.148..526.149 rows=498688 loops=1)
                                       Buckets: 1048576  Batches: 8  Memory Usage: 11678kB
                                       ->  Seq Scan on c  (cost=0.00..127727.40 rows=4645440 width=30) (actual time=0.006..366.592 rows=4645440 loops=1)
                           ->  Hash  (cost=1995.85..1995.85 rows=72585 width=30) (actual time=12.585..12.586 rows=72585 loops=1)
                                 Buckets: 131072  Batches: 1  Memory Usage: 5073kB
                                 ->  Seq Scan on b  (cost=0.00..1995.85 rows=72585 width=30) (actual time=0.004..5.661 rows=72585 loops=1)
         ->  Hash  (cost=1995.85..1995.85 rows=72585 width=43) (actual time=18.753..18.754 rows=72585 loops=1)
               Buckets: 131072  Batches: 1  Memory Usage: 6526kB
               ->  Seq Scan on a  (cost=0.00..1995.85 rows=72585 width=43) (actual time=0.010..6.298 rows=72585 loops=1)
 Planning Time: 0.126 ms
 Execution Time: 53032.011 ms

如果不先对c,d group by去重,b,c,d关联之后有67854336条数据,再group by就比较慢了
sort  (cost=15121800.11..15261857.52 rows=56022966 width=38) (actual time=32237.795..45948.800 rows=67854336 loops=1)
这里group by没有使用hash group by,而是用的sort group by有点奇怪,PG12是有hash group by的,sort group by 花了30多秒

在Oracle19c中测试一下不对c,d group by去重要跑多久,提前对c,d group by去重的就不用测试了,肯定秒杀

SQL> show parameter sga_target
 
NAME                                 TYPE        VALUE
------------------------------------ ----------- ------------------------------
sga_target                           big integer 596M

SQL> show parameter pga_aggregate_target
 
NAME                                 TYPE        VALUE
------------------------------------ ----------- ------------------------------
pga_aggregate_target                 big integer 199M
 
SQL> show parameter optimizer_feature
 
NAME                                 TYPE        VALUE
------------------------------------ ----------- ------------------------------
optimizer_features_enable            string      19.1.0
 
SQL> select count(*)
  from a
  left join (select b.data_object_id, c.object_name, d.object_type
               from b,c,d
              where b.object_name = c.object_name
                and c.data_object_id = d.data_object_id
              group by b.data_object_id, c.object_name, d.object_type) b
    on a.object_id = b.data_object_id
   and a.object_name = b.object_name
   and a.object_type = b.object_type
 where a.owner = 'SCOTT'
    or b.data_object_id is not null;  2    3    4    5    6    7    8    9   10   11   12

Elapsed: 00:00:01.14

Execution Plan
----------------------------------------------------------
Plan hash value: 2013066039

------------------------------------------------------------------------------------------------
| Id  | Operation                  | Name      | Rows  | Bytes |TempSpc| Cost (%CPU)| Time     |
------------------------------------------------------------------------------------------------
|   0 | SELECT STATEMENT           |           |     1 |   135 |       |   104K  (1)| 00:00:05 |
|   1 |  SORT AGGREGATE            |           |     1 |   135 |       |            |          |
|*  2 |   FILTER                   |           |       |       |       |            |          |
|*  3 |    HASH JOIN OUTER         |           |   297K|    38M|  4048K|   104K  (1)| 00:00:05 |
|   4 |     JOIN FILTER CREATE     | :BF0000   | 72585 |  3189K|       |   308   (1)| 00:00:01 |
|   5 |      TABLE ACCESS FULL     | A         | 72585 |  3189K|       |   308   (1)| 00:00:01 |
|   6 |     VIEW                   |           |  2763K|   237M|       | 90928   (1)| 00:00:04 |
|   7 |      HASH GROUP BY         |           |  2763K|   171M|   201M| 90928   (1)| 00:00:04 |
|   8 |       JOIN FILTER USE      | :BF0000   |  2763K|   171M|       | 48066   (1)| 00:00:02 |
|*  9 |        HASH JOIN           |           |  2763K|   171M|  5416K| 48066   (1)| 00:00:02 |
|  10 |         VIEW               | VW_GBF_18 |   241K|  2589K|       | 25871   (1)| 00:00:02 |
|  11 |          HASH GROUP BY     |           |   241K|  2589K|    88M| 25871   (1)| 00:00:02 |
|  12 |           TABLE ACCESS FULL| D         |  4645K|    48M|       | 17974   (1)| 00:00:01 |
|* 13 |         HASH JOIN          |           |   827K|    42M|  2768K| 19340   (1)| 00:00:01 |
|  14 |          TABLE ACCESS FULL | B         | 72585 |  1913K|       |   308   (1)| 00:00:01 |
|* 15 |          TABLE ACCESS FULL | C         |   498K|    12M|       | 17974   (1)| 00:00:01 |
------------------------------------------------------------------------------------------------

Predicate Information (identified by operation id):
---------------------------------------------------

   2 - filter("A"."OWNER"='SCOTT' OR "B"."DATA_OBJECT_ID" IS NOT NULL)
   3 - access("A"."OBJECT_ID"="B"."DATA_OBJECT_ID"(+) AND
              "A"."OBJECT_NAME"="B"."OBJECT_NAME"(+) AND "A"."OBJECT_TYPE"="B"."OBJECT_TYPE"(+))
   9 - access("C"."DATA_OBJECT_ID"="ITEM_1")
  13 - access("B"."OBJECT_NAME"="C"."OBJECT_NAME")
  15 - filter("C"."DATA_OBJECT_ID" IS NOT NULL)

Oracle19c真牛逼,直接秒杀了,而PG12.7要跑53秒,Oracle19c使用了JOIN FILTER,这里不做过多的解释,找我报个班吧哈哈
测试一下Oracle11g

SQL> alter session set optimizer_features_enable='11.2.0.1';
Session altered.

SQL> select count(*)
  from a
  left join (select b.data_object_id, c.object_name, d.object_type
               from b,c,d
              where b.object_name = c.object_name
                and c.data_object_id = d.data_object_id
              group by b.data_object_id, c.object_name, d.object_type) b
    on a.object_id = b.data_object_id
   and a.object_name = b.object_name
   and a.object_type = b.object_type
 where a.owner = 'SCOTT'
    or b.data_object_id is not null;  2    3    4    5    6    7    8    9   10   11   12

Elapsed: 00:00:06.05

Execution Plan
----------------------------------------------------------
Plan hash value: 3005719873

-----------------------------------------------------------------------------------------
| Id  | Operation                | Name | Rows  | Bytes |TempSpc| Cost (%CPU)| Time     |
-----------------------------------------------------------------------------------------
|   0 | SELECT STATEMENT         |      |     1 |   135 |       |  1124K  (1)| 03:44:53 |
|   1 |  SORT AGGREGATE          |      |     1 |   135 |       |            |          |
|*  2 |   FILTER                 |      |       |       |       |            |          |
|*  3 |    HASH JOIN OUTER       |      |  5724K|   736M|  4048K|  1124K  (1)| 03:44:53 |
|   4 |     TABLE ACCESS FULL    | A    | 72585 |  3189K|       |   308   (1)| 00:00:04 |
|   5 |     VIEW                 |      |    53M|  4570M|       |   866K  (1)| 02:53:22 |
|   6 |      HASH GROUP BY       |      |    53M|  3301M|  3888M|   866K  (1)| 02:53:22 |
|*  7 |       HASH JOIN          |      |    53M|  3301M|    10M| 40587   (1)| 00:08:08 |
|*  8 |        TABLE ACCESS FULL | D    |   498K|  5357K|       | 17974   (1)| 00:03:36 |
|*  9 |        HASH JOIN         |      |   827K|    42M|  2768K| 19340   (1)| 00:03:53 |
|  10 |         TABLE ACCESS FULL| B    | 72585 |  1913K|       |   308   (1)| 00:00:04 |
|* 11 |         TABLE ACCESS FULL| C    |   498K|    12M|       | 17974   (1)| 00:03:36 |
-----------------------------------------------------------------------------------------

Predicate Information (identified by operation id):
---------------------------------------------------

   2 - filter("A"."OWNER"='SCOTT' OR "B"."DATA_OBJECT_ID" IS NOT NULL)
   3 - access("A"."OBJECT_TYPE"="B"."OBJECT_TYPE"(+) AND
              "A"."OBJECT_NAME"="B"."OBJECT_NAME"(+) AND
              "A"."OBJECT_ID"="B"."DATA_OBJECT_ID"(+))
   7 - access("C"."DATA_OBJECT_ID"="D"."DATA_OBJECT_ID")
   8 - filter("D"."DATA_OBJECT_ID" IS NOT NULL)
   9 - access("B"."OBJECT_NAME"="C"."OBJECT_NAME")
  11 - filter("C"."DATA_OBJECT_ID" IS NOT NULL)

Oracle11g 只需要6秒 

现在回过头来看PG12.7,将work_mem设置为6GB的时候,没有改写的SQL能在24秒跑完
对c,d去重的改写能在2秒内跑完,没有对c,d去重的改写要53秒,53秒里面有30多秒是在做SORT GROUP BY
PG12.7是有HASH GROUP BY的啊,咋没走呢?猜测是因为work_mem设置小了
经过反复测试,将work_mem设置为6GB,PG12.7终于将SORT GROUP BY换成了HASH GROUP BY了

orcl=> set work_mem='6GB';
SET
orcl=> explain analyze select count(*)
orcl->   from a
orcl->   left join (select b.data_object_id, c.object_name, d.object_type
orcl(>                from b,c,d
orcl(>               where b.object_name = c.object_name
orcl(>                 and c.data_object_id = d.data_object_id
orcl(>               group by b.data_object_id, c.object_name, d.object_type) b
orcl->     on a.object_id = b.data_object_id
orcl->    and a.object_name = b.object_name
orcl->    and a.object_type = b.object_type
orcl->  where a.owner = 'SCOTT'
orcl->     or b.data_object_id is not null;
                                                                          QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=8445358.89..8445358.90 rows=1 width=8) (actual time=23585.225..23585.230 rows=1 loops=1)
   ->  Hash Right Join  (cost=6694460.56..8445178.34 rows=72222 width=0) (actual time=23457.348..23584.892 rows=7141 loops=1)
         Hash Cond: ((b.data_object_id = a.object_id) AND ((c.object_name)::text = (a.object_name)::text) AND ((d.object_type)::text = (a.object_type)::text))
         Filter: (((a.owner)::text = 'SCOTT'::text) OR (b.data_object_id IS NOT NULL))
         Rows Removed by Filter: 65444
         ->  HashAggregate  (cost=6691194.48..7251424.14 rows=56022966 width=38) (actual time=23439.232..23552.655 rows=8119 loops=1)
               Group Key: b.data_object_id, c.object_name, d.object_type
               ->  Hash Join  (cost=188698.56..6271022.23 rows=56022966 width=38) (actual time=569.434..11237.613 rows=67854336 loops=1)
                     Hash Cond: ((c.object_name)::text = (b.object_name)::text)
                     ->  Hash Join  (cost=185795.40..4344470.40 rows=30298200 width=32) (actual time=556.956..4813.320 rows=33587200 loops=1)
                           Hash Cond: (d.data_object_id = c.data_object_id)
                           ->  Seq Scan on d  (cost=0.00..127727.40 rows=4645440 width=14) (actual time=0.032..337.453 rows=4645440 loops=1)
                           ->  Hash  (cost=127727.40..127727.40 rows=4645440 width=30) (actual time=550.510..550.512 rows=498688 loops=1)
                                 Buckets: 8388608  Batches: 1  Memory Usage: 94476kB
                                 ->  Seq Scan on c  (cost=0.00..127727.40 rows=4645440 width=30) (actual time=0.024..399.423 rows=4645440 loops=1)
                     ->  Hash  (cost=1995.85..1995.85 rows=72585 width=30) (actual time=12.411..12.411 rows=72585 loops=1)
                           Buckets: 131072  Batches: 1  Memory Usage: 5073kB
                           ->  Seq Scan on b  (cost=0.00..1995.85 rows=72585 width=30) (actual time=0.006..5.377 rows=72585 loops=1)
         ->  Hash  (cost=1995.85..1995.85 rows=72585 width=43) (actual time=18.014..18.014 rows=72585 loops=1)
               Buckets: 131072  Batches: 1  Memory Usage: 6526kB
               ->  Seq Scan on a  (cost=0.00..1995.85 rows=72585 width=43) (actual time=0.016..6.201 rows=72585 loops=1)
 Planning Time: 0.133 ms
 Execution Time: 23731.938 ms

虽然调大了work_mem消灭了SORT GROUP BY,改成了HASH GROUP BY,但是还是花了23秒啊,而Oracle11g只需6秒,19c秒杀
那么PG花的这23秒绝大部分消耗在哪了呢?
HashAggregate  (cost=6691194.48..7251424.14 rows=56022966 width=38) (actual time=23439.232..23552.655 rows=8119 loops=1)
从PG的执行计划中看到,两个Hash Join产生67854336行数据用了约11秒(actual time到11237ms),之后的约12秒都消耗在HASH GROUP BY上面了
看来PG的HASH GROUP BY算法与Oracle的HASH GROUP BY算法差得很远啊(特别是GROUP BY里面有大量重复值的时候)

现在来测试一下PG13

orcl=> select * from version();
                                                 version
---------------------------------------------------------------------------------------------------------
PostgreSQL 13.3 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-44), 64-bit

orcl=> show work_mem;
 work_mem
----------
 64MB

orcl=> set enable_mergejoin=false;
SET
orcl=> set max_parallel_workers_per_gather=0;
SET
orcl=> explain analyze select count(*)
orcl->   from a
orcl->   left join (select b.data_object_id, c.object_name, d.object_type
orcl(>                from b,c,d
orcl(>               where b.object_name = c.object_name
orcl(>                 and c.data_object_id = d.data_object_id
orcl(>               group by b.data_object_id, c.object_name, d.object_type) b
orcl->     on a.object_id = b.data_object_id
orcl->    and a.object_name = b.object_name
orcl->    and a.object_type = b.object_type
orcl->  where a.owner = 'SCOTT'
orcl->     or b.data_object_id is not null;
                                                                          QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=14578487.94..14578487.95 rows=1 width=8) (actual time=25718.944..25718.954 rows=1 loops=1)
   ->  Hash Right Join  (cost=11740734.28..14578307.38 rows=72222 width=0) (actual time=25706.382..25718.709 rows=7141 loops=1)
         Hash Cond: ((b.data_object_id = a.object_id) AND ((c.object_name)::text = (a.object_name)::text) AND ((d.object_type)::text = (a.object_type)::text))
         Filter: (((a.owner)::text = 'SCOTT'::text) OR (b.data_object_id IS NOT NULL))
         Rows Removed by Filter: 65444
         ->  HashAggregate  (cost=11737468.19..13288674.78 rows=60534891 width=38) (actual time=25624.865..25626.712 rows=8119 loops=1)
               Group Key: b.data_object_id, c.object_name, d.object_type
               Planned Partitions: 128  Batches: 1  Memory Usage: 13329kB
               ->  Hash Join  (cost=211358.66..6289328.00 rows=60534891 width=38) (actual time=2124.223..13706.422 rows=67854336 loops=1)
                     Hash Cond: ((c.object_name)::text = (b.object_name)::text)
                     ->  Hash Join  (cost=208455.50..4206160.86 rows=32775891 width=32) (actual time=2057.597..7272.708 rows=33587200 loops=1)
                           Hash Cond: (c.data_object_id = d.data_object_id)
                           ->  Seq Scan on c  (cost=0.00..127740.19 rows=4646719 width=30) (actual time=0.728..758.716 rows=4645440 loops=1)
                           ->  Hash  (cost=127719.00..127719.00 rows=4644600 width=14) (actual time=2010.512..2010.513 rows=498688 loops=1)
                                 Buckets: 2097152  Batches: 8  Memory Usage: 19180kB
                                 ->  Seq Scan on d  (cost=0.00..127719.00 rows=4644600 width=14) (actual time=0.792..1116.084 rows=4645440 loops=1)
                     ->  Hash  (cost=1995.85..1995.85 rows=72585 width=30) (actual time=66.418..66.419 rows=72585 loops=1)
                           Buckets: 131072  Batches: 1  Memory Usage: 5073kB
                           ->  Seq Scan on b  (cost=0.00..1995.85 rows=72585 width=30) (actual time=0.766..52.450 rows=72585 loops=1)
         ->  Hash  (cost=1995.85..1995.85 rows=72585 width=43) (actual time=81.249..81.249 rows=72585 loops=1)
               Buckets: 131072  Batches: 1  Memory Usage: 6526kB
               ->  Seq Scan on a  (cost=0.00..1995.85 rows=72585 width=43) (actual time=0.388..48.467 rows=72585 loops=1)
 Planning Time: 4.279 ms
 Execution Time: 25727.918 ms

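在PG13中,不需要设置work_mem为6GB,就可以自动使用HASH GROUP BY(PG13开始HashAggregate在哈希表超过work_mem时可以溢出到磁盘,所以优化器不再因为预估内存不够而放弃HashAggregate,执行计划中的Planned Partitions: 128就是为溢出预先规划的分区数),说明在PG13上HASH GROUP BY才基本成熟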

总结:   PG的GROUP BY算法目前相比Oracle还是有很大差距的,在PG中GROUP BY要特别注意SQL写法,尽量提前GROUP BY
           PG的filter自动优化对付exists中的小表还行,对付大表就不行了,一定要做好SQL审核,别乱写SQL

       
           

以上是关于PostgreSQL对or exists产生的filter优化二的主要内容,如果未能解决你的问题,请参考以下文章

PostgreSQL对or exists产生的filter优化二

PostgreSQL对or exists产生的filter优化二

PostgreSQL对or exists产生的filter优化一

PostgreSQL对or exists产生的filter优化一

PostgreSQL对or exists的优化

postgresql----IN&&EXISTS