优化前,成本500+多
--存在的问题:多次扫描大表不同的区间,存在子查询。
SELECT
count(a.did) as new
,'one' as `stage`
from
(
SELECT
did
from
bili_dwd.dwd_demo_log_dot_yyyymmdd --埋点日志表,数据量大
where
dt >= '20230819'
and
dt <= '20230825'
and
dot_code in ('click_gift_a', 'click_gift_b', 'click_gift_c')
and
did not in (
SELECT
did
from
bili_dwd.dwd_demo_log_dot_yyyymmdd --埋点日志表
where
dt >= '20230718'
and
dt < '20230818'
and
dot_code in ('click_gift_a', 'click_gift_b', 'click_gift_c')
)
and
event_id = 1
GROUP by
did
) a
union all
SELECT
count(a.did) as new
,'two' as `stage`
from
(
SELECT
did
from
bili_dwd.dwd_demo_log_dot_yyyymmdd
where
dt >= '20230826'
and
dt <= '20230901'
and
dot_code in ('click_gift_a', 'click_gift_b', 'click_gift_c')
and
did not in (
SELECT
did
from
bili_dwd.dwd_demo_log_dot_yyyymmdd
where
dt >= '20230725'
and
dt < '20230825'
and
dot_code in ('click_gift_a', 'click_gift_b', 'click_gift_c')
)
and
event_id = 1
GROUP by
did
) a;
优化后,成本100+左右
--第一步创建临时表扫描相关分区数据
CREATE TABLE IF NOT EXISTS bili_tmp.tmp_demo_20230909 AS
SELECT
did
,event_id
,dt
FROM
bili_dwd.dwd_demo_log_dot_yyyymmdd -- 埋点日志表
WHERE
dt >= '20190718'
AND dt <= '20190901'
AND dot_code in ('click_gift_a', 'click_gift_b', 'click_gift_c')
GROUP BY
did
,event_id
,dt;
-- 第二步:针对数据进行相关逻辑处理,避免子查询,避免多次扫描大表
SELECT
count(a.did) as new
,'one' as `stage`
from
(
SELECT
did
from
bili_tmp.tmp_demo_20230909
where
dt >= '20230718'
and
dt <= '20230825'
GROUP by
did
HAVING
max(if(dt >= '20190819' AND dt <= '20190825' AND event_id = 1, 'Y', 'N')) = 'Y'
AND max(if(dt >= '20190718' AND dt <= '20190818', 'Y', 'N')) = 'N'
) a
union all
SELECT
count(a.did) as new
,'two' as `stage`
from
(
SELECT
did
from
bili_tmp.tmp_demo_20230909
where
dt >= '20190725'
and
dt <= '20190901'
GROUP by
did
HAVING
max(if(dt >= '20190826' AND dt <= '20190901' AND event_id = 1, 'Y', 'N')) = 'Y'
AND max(if(dt >= '20190725' AND dt <= '20190825', 'Y', 'N')) = 'N'
) a;