计算1~180天留存率数据仓库设计方案。在实际业务场景中可能是计算 1、3、7、14、30、60、90、180日留存这种方式,也可能是1~180连续每天都要计算,当然极端情况下也会有1~360日留存的计算。主要讲述建设思路。文章以1~7日数据进行计算了展示,计算180日留存时需要调整数据周期。
把生成的数据作为ods层数据,进行基础的数据清洗和加工。
新增用户表: 加工为用户维表或者是用户快照表。在实际业务场景中,通常建设为用户快照表,每天一份全量快照。
用户登录表: 在实际应用场景中,原始数据为用户的所有登录日志(或者是所有操作日志记录)。在dwd层,按照用户登录日期分区,包含用户当天的全量操作记录。在dws层按天进行分区,按照用户进行去重处理,每个活跃用户每天一条记录。(实际应用中一定要处理,减少计算留存数据的复杂度和计算量。给出的样例数据没有重复记录,不需要dwd层明细数据存储了,所以直接给出dws层的按日分区的表。
--新增用户表
create table t_dim_user_create
(
user_id string,
create_date string
) COMMENT '新增用户表'
partitioned by (dt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- 用户登录表
create table t_dws_user_login
(
user_id string
) COMMENT '用户登录表'
partitioned by (login_date string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
我们给出的数据无需太多处理,主要将原始数据进行分区。
--新增用户表数据插入(这里创建6月28日分区数据,6月28日是我们创建的用户的最后一天)
insert overwrite table t_dim_user_create partition(dt = '2024-06-28')
select user_id,create_date
from t_user_create
where create_date <= '2024-06-28';
--用户登录表
insert overwrite table t_dws_user_login partition(login_date)
select user_id,login_date
from t_user_login;
样例数据-用户登录表的分区数据
show partitions t_dws_user_login;
+------------------------+
| partition |
+------------------------+
| login_date=2024-01-01 |
| login_date=2024-01-02 |
| login_date=2024-01-03 |
| login_date=2024-01-04 |
| login_date=2024-01-05 |
| login_date=2024-01-06 |
| login_date=2024-01-07 |
| login_date=2024-01-08 |
| login_date=2024-01-09 |
| login_date=2024-01-10 |
……
以上就是基础数据的处理,着重将数据进行了分区处理,使其更加符合实际场景中的分区。
直接进行计算,产出对应的留存率数据
select create_date,
sum(r1) / count(distinct user_id) as r1_rate,
sum(r2) / count(distinct user_id) as r2_rate,
sum(r3) / count(distinct user_id) as r3_rate,
sum(r4) / count(distinct user_id) as r4_rate,
sum(r5) / count(distinct user_id) as r5_rate,
sum(r6) / count(distinct user_id) as r6_rate,
sum(r7) / count(distinct user_id) as r7_rate
from (select t_create.user_id,
t_create.create_date,
t_login.login_date,
case when datediff(to_date(t_login.login_date), to_date(t_create.create_date)) = 1 then 1 else 0 end as r1,
case when datediff(to_date(t_login.login_date), to_date(t_create.create_date)) = 2 then 1 else 0 end as r2,
case when datediff(to_date(t_login.login_date), to_date(t_create.create_date)) = 3 then 1 else 0 end as r3,
case when datediff(to_date(t_login.login_date), to_date(t_create.create_date)) = 4 then 1 else 0 end as r4,
case when datediff(to_date(t_login.login_date), to_date(t_create.create_date)) = 5 then 1 else 0 end as r5,
case when datediff(to_date(t_login.login_date), to_date(t_create.create_date)) = 6 then 1 else 0 end as r6,
case when datediff(to_date(t_login.login_date), to_date(t_create.create_date)) = 7 then 1 else 0 end as r7
from (select user_id,
create_date
from t_dim_user_create
where dt = '2024-06-28'
and create_date < '2024-06-28'
and create_date >= date_add('2024-06-28', -7)) t_create
left join
(select user_id,
login_date
from t_dws_user_login) t_login
on t_create.user_id = t_login.user_id
where to_date(t_login.login_date) > to_date(t_create.create_date)
and t_login.login_date > date_add('2024-06-28', -7)
and t_login.login_date <= '2024-06-28') tt
group by create_date
order by create_date asc
执行结果
+--------------+----------+----------+----------+----------+----------+----------+----------+
| create_date | r1_rate | r2_rate | r3_rate | r4_rate | r5_rate | r6_rate | r7_rate |
+--------------+----------+----------+----------+----------+----------+----------+----------+
| 2024-06-21 | 1.0 | 1.0 | 0.8 | 0.8 | 1.0 | 0.8 | 0.6 |
| 2024-06-22 | 0.8 | 1.0 | 0.8 | 0.9 | 0.9 | 0.9 | 0.0 |
| 2024-06-23 | 1.0 | 0.9 | 0.9 | 0.7 | 1.0 | 0.0 | 0.0 |
| 2024-06-24 | 1.0 | 1.0 | 0.9 | 0.7 | 0.0 | 0.0 | 0.0 |
| 2024-06-25 | 1.0 | 0.9 | 0.9 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2024-06-26 | 0.9 | 0.7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2024-06-27 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
+--------------+----------+----------+----------+----------+----------+----------+----------+
先以竖表的方式建设DM层,其中DM层以create_date作为分区字段 建表语句
create table t_dm_user_retention1
(
retention_days bigint COMMENT '留存天数',
retention_rate double COMMENT '留存率'
) COMMENT '用户留存率1'
partitioned by (create_date string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
数据插入语句
insert overwrite table t_dm_user_retention1 partition (create_date)
select retention_days,
login_cnt / create_date_cnt as retention_reate,
create_date
from (select t_create.create_date,
create_date_cnt,
datediff(to_date(t_login.login_date), to_date(t_create.create_date)) as retention_days,
count(1) as login_cnt
from (select user_id,
create_date,
count(user_id) over (partition by create_date) as create_date_cnt
from t_dim_user_create
where dt= '2024-06-28' --限定数据分区
and create_date >= date_add('2024-06-28',-7)
and create_date <= '2024-06-28' --进行数据行裁剪
) t_create
left join
(select user_id,
login_date
from t_dws_user_login
where login_date > date_add('2024-06-28',-7)
and login_date <= '2024-06-28' --重点关注该部分使用的数据分区数量
) t_login
on t_create.user_id = t_login.user_id
where to_date(t_login.login_date) > to_date(t_create.create_date)
group by t_create.create_date, create_date_cnt, retention_days) t
样例数据
select * from t_dm_user_retention1 where create_date = '2024-06-21' order by retention_days asc;
+-----------------+-----------------+--------------+
| retention_days | retention_rate | create_date |
+-----------------+-----------------+--------------+
| 1 | 1.0 | 2024-06-21 |
| 2 | 1.0 | 2024-06-21 |
| 3 | 0.8 | 2024-06-21 |
| 4 | 0.8 | 2024-06-21 |
| 5 | 1.0 | 2024-06-21 |
| 6 | 0.8 | 2024-06-21 |
| 7 | 0.6 | 2024-06-21 |
+-----------------+-----------------+--------------+
查询最后留存数据
select *
from t_dm_user_retention1 pivot (
sum(retention_rate) as rate
for retention_days in(1 as 1_day,2 as 2_day,3 as 3_day,4 as 4_day,5 as 5_day,6 as 6_day,7 as 7_day)
)
where create_date >= '2024-06-21'
order by 1 asc
执行结果
+--------------+--------+--------+--------+--------+--------+--------+--------+
| create_date | 1_day | 2_day | 3_day | 4_day | 5_day | 6_day | 7_day |
+--------------+--------+--------+--------+--------+--------+--------+--------+
| 2024-06-21 | 1.0 | 1.0 | 0.8 | 0.8 | 1.0 | 0.8 | 0.6 |
| 2024-06-22 | 0.8 | 1.0 | 0.8 | 0.9 | 0.9 | 0.9 | NULL |
| 2024-06-23 | 1.0 | 0.9 | 0.9 | 0.7 | 1.0 | NULL | NULL |
| 2024-06-24 | 1.0 | 1.0 | 0.9 | 0.7 | NULL | NULL | NULL |
| 2024-06-25 | 1.0 | 0.9 | 0.9 | NULL | NULL | NULL | NULL |
| 2024-06-26 | 0.9 | 0.7 | NULL | NULL | NULL | NULL | NULL |
| 2024-06-27 | 1.0 | NULL | NULL | NULL | NULL | NULL | NULL |
+--------------+--------+--------+--------+--------+--------+--------+--------+
根据登陆日期作为分区,字段包含新增日期、留存天数、留存率和用户登陆日期。 建表语句
create table t_dm_user_retention2
(
create_date string COMMENT '新增日期',
retention_days bigint COMMENT '留存天数',
retention_rate double COMMENT '留存率'
) COMMENT '用户留存率2'
partitioned by (login_date string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
数据插入语句
--2024-06-28日数据处理
insert overwrite table t_dm_user_retention2 partition (login_date)
select create_date,
retention_days,
login_cnt / create_date_cnt as retention_reate,
login_date
from (select t_create.create_date,
t_login.login_date,
create_date_cnt,
datediff(to_date(t_login.login_date), to_date(t_create.create_date)) as retention_days,
count(1) as login_cnt
from (select user_id,
create_date,
count(user_id) over (partition by create_date) as create_date_cnt
from t_dim_user_create
where dt = '2024-06-28' --限定数据分区
and create_date >= date_add('2024-06-28', -7)
and create_date <= '2024-06-28' --进行数据行裁剪
) t_create
left join
(select user_id,
login_date
from t_dws_user_login
where login_date = '2024-06-28' --重点关注该部分使用的数据分区数量
) t_login
on t_create.user_id = t_login.user_id
where to_date(t_login.login_date) > to_date(t_create.create_date)
group by t_create.create_date, t_login.login_date, create_date_cnt, retention_days) t
为了能有2024-06-21日之后的所有留存数据,需要对前面7天的数据进行刷新。
--2024-06-27日数据处理
insert overwrite table t_dm_user_retention2 partition (login_date)
select create_date,
retention_days,
login_cnt / create_date_cnt as retention_reate,
login_date
from (select t_create.create_date,
t_login.login_date,
create_date_cnt,
datediff(to_date(t_login.login_date), to_date(t_create.create_date)) as retention_days,
count(1) as login_cnt
from (select user_id,
create_date,
count(user_id) over (partition by create_date) as create_date_cnt
from t_dim_user_create
where dt = '2024-06-27' --限定数据分区
and create_date >= date_add('2024-06-27', -7)
and create_date <= '2024-06-27' --进行数据行裁剪
) t_create
left join
(select user_id,
login_date
from t_dws_user_login
where login_date = '2024-06-27' --重点关注该部分使用的数据分区数量
) t_login
on t_create.user_id = t_login.user_id
where to_date(t_login.login_date) > to_date(t_create.create_date)
group by t_create.create_date, t_login.login_date, create_date_cnt, retention_days) t
样例数据
+--------------+-----------------+-----------------+-------------+
| create_date | retention_days | retention_rate | login_date |
+--------------+-----------------+-----------------+-------------+
| 2024-06-14 | 7 | 0.7 | 2024-06-21 |
| 2024-06-15 | 6 | 0.8 | 2024-06-21 |
| 2024-06-16 | 5 | 0.5 | 2024-06-21 |
| 2024-06-17 | 4 | 0.9 | 2024-06-21 |
| 2024-06-18 | 3 | 0.8 | 2024-06-21 |
| 2024-06-19 | 2 | 0.9 | 2024-06-21 |
| 2024-06-20 | 1 | 0.8 | 2024-06-21 |
| 2024-06-15 | 7 | 0.8 | 2024-06-22 |
| 2024-06-16 | 6 | 0.6 | 2024-06-22 |
| 2024-06-17 | 5 | 1.0 | 2024-06-22 |
| 2024-06-18 | 4 | 0.8 | 2024-06-22 |
| 2024-06-19 | 3 | 1.0 | 2024-06-22 |
| 2024-06-20 | 2 | 1.0 | 2024-06-22 |
| 2024-06-21 | 1 | 1.0 | 2024-06-22 |
| 2024-06-16 | 7 | 0.8 | 2024-06-23 |
| 2024-06-17 | 6 | 0.8 | 2024-06-23 |
| 2024-06-18 | 5 | 0.9 | 2024-06-23 |
| 2024-06-19 | 4 | 1.0 | 2024-06-23 |
| 2024-06-20 | 3 | 0.7 | 2024-06-23 |
| 2024-06-21 | 2 | 1.0 | 2024-06-23 |
| 2024-06-22 | 1 | 0.8 | 2024-06-23 |
| 2024-06-17 | 7 | 0.8 | 2024-06-24 |
| 2024-06-18 | 6 | 0.9 | 2024-06-24 |
| 2024-06-19 | 5 | 1.0 | 2024-06-24 |
| 2024-06-20 | 4 | 0.7 | 2024-06-24 |
| 2024-06-21 | 3 | 0.8 | 2024-06-24 |
| 2024-06-22 | 2 | 1.0 | 2024-06-24 |
| 2024-06-23 | 1 | 1.0 | 2024-06-24 |
| 2024-06-18 | 7 | 0.7 | 2024-06-25 |
| 2024-06-19 | 6 | 0.7 | 2024-06-25 |
| 2024-06-20 | 5 | 0.5 | 2024-06-25 |
| 2024-06-21 | 4 | 0.8 | 2024-06-25 |
| 2024-06-22 | 3 | 0.8 | 2024-06-25 |
| 2024-06-23 | 2 | 0.9 | 2024-06-25 |
| 2024-06-24 | 1 | 1.0 | 2024-06-25 |
| 2024-06-19 | 7 | 0.9 | 2024-06-26 |
| 2024-06-20 | 6 | 0.6 | 2024-06-26 |
| 2024-06-21 | 5 | 1.0 | 2024-06-26 |
| 2024-06-22 | 4 | 0.9 | 2024-06-26 |
| 2024-06-23 | 3 | 0.9 | 2024-06-26 |
| 2024-06-24 | 2 | 1.0 | 2024-06-26 |
| 2024-06-25 | 1 | 1.0 | 2024-06-26 |
| 2024-06-20 | 7 | 0.8 | 2024-06-27 |
| 2024-06-21 | 6 | 0.8 | 2024-06-27 |
| 2024-06-22 | 5 | 0.9 | 2024-06-27 |
| 2024-06-23 | 4 | 0.7 | 2024-06-27 |
| 2024-06-24 | 3 | 0.9 | 2024-06-27 |
| 2024-06-25 | 2 | 0.9 | 2024-06-27 |
| 2024-06-26 | 1 | 0.9 | 2024-06-27 |
| 2024-06-21 | 7 | 0.6 | 2024-06-28 |
| 2024-06-22 | 6 | 0.9 | 2024-06-28 |
| 2024-06-23 | 5 | 1.0 | 2024-06-28 |
| 2024-06-24 | 4 | 0.7 | 2024-06-28 |
| 2024-06-25 | 3 | 0.9 | 2024-06-28 |
| 2024-06-26 | 2 | 0.7 | 2024-06-28 |
| 2024-06-27 | 1 | 1.0 | 2024-06-28 |
+--------------+-----------------+-----------------+-------------+
查询结果
select *
from (select create_date, retention_days, retention_rate
from t_dm_user_retention2
where login_date > '2024-06-21'
and create_date >= '2024-06-21') t pivot (
sum(retention_rate) as rate
for retention_days in(1 as 1_day,2 as 2_day,3 as 3_day,4 as 4_day,5 as 5_day,6 as 6_day,7 as 7_day)
)
order by 1 asc
+--------------+--------+--------+--------+--------+--------+--------+--------+
| create_date | 1_day | 2_day | 3_day | 4_day | 5_day | 6_day | 7_day |
+--------------+--------+--------+--------+--------+--------+--------+--------+
| 2024-06-21 | 1.0 | 1.0 | 0.8 | 0.8 | 1.0 | 0.8 | 0.6 |
| 2024-06-22 | 0.8 | 1.0 | 0.8 | 0.9 | 0.9 | 0.9 | NULL |
| 2024-06-23 | 1.0 | 0.9 | 0.9 | 0.7 | 1.0 | NULL | NULL |
| 2024-06-24 | 1.0 | 1.0 | 0.9 | 0.7 | NULL | NULL | NULL |
| 2024-06-25 | 1.0 | 0.9 | 0.9 | NULL | NULL | NULL | NULL |
| 2024-06-26 | 0.9 | 0.7 | NULL | NULL | NULL | NULL | NULL |
| 2024-06-27 | 1.0 | NULL | NULL | NULL | NULL | NULL | NULL |
+--------------+--------+--------+--------+--------+--------+--------+--------+
我们需要思考和对比集中方式在日常每天调度中的计算量对应程序的复杂度。
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有