为完成留存数据,我们需要先准备相关数据,包含三份数据:日期维表、新增用户表、用户登录表。生成数据涉及不少知识点,之前基本都写过了,这里不再赘述。涉及函数如下。 space() split() posexplode() random() datediff() pow()
--日期维表
create table t_dim_date
(
c_date string
) COMMENT '日期维表'
row format DELIMITED fields terminated by '\t';
--新增用户表
create table t_user_create
(
user_id string,
create_date string
) COMMENT '新增用户表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- 用户登录表
create table t_user_login
(
user_id string,
login_date string
) COMMENT '用户登录表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
创建日期维表,从2024-01-01至2024-12-31日的数据
生成语句
--日期维表
insert into table t_dim_date(c_date)
select date_add(to_date('2024-01-01'), idx) as c_date
from (select posexplode(split(space(365), ' ')) as (idx, value)) t;
样例数据
+-------------+
| c_date |
+-------------+
| 2024-01-01 |
| 2024-01-02 |
| 2024-01-03 |
| 2024-01-04 |
| 2024-01-05 |
| 2024-01-06 |
| 2024-01-07 |
| 2024-01-08 |
| 2024-01-09 |
| 2024-01-10 |
+-------------+
从2024-01-01 开始每天新增10个人,连续180天,用户ID从1001开始。
insert into table t_user_create(user_id, create_date)
select cast(1001 + idx as string) as user_id,
date_add(to_date('2024-01-01'), cast(floor(idx / 10) as int)) as create_date
from (select posexplode(split(space(1800 - 1), ' ')) as (idx, value)) t;
样例数据
+----------+--------------+
| user_id | create_date |
+----------+--------------+
| 1001 | 2024-01-01 |
| 1002 | 2024-01-01 |
| 1003 | 2024-01-01 |
| 1004 | 2024-01-01 |
| 1005 | 2024-01-01 |
| 1006 | 2024-01-01 |
| 1007 | 2024-01-01 |
| 1008 | 2024-01-01 |
| 1009 | 2024-01-01 |
| 1010 | 2024-01-01 |
| 1011 | 2024-01-02 |
| 1012 | 2024-01-02 |
| 1013 | 2024-01-02 |
| 1014 | 2024-01-02 |
| 1015 | 2024-01-02 |
| 1016 | 2024-01-02 |
| 1017 | 2024-01-02 |
| 1018 | 2024-01-02 |
| 1019 | 2024-01-02 |
| 1020 | 2024-01-02 |
+----------+--------------+
为了让用户登录相对符合逐步流失的结果,用户登录的概率每天减少5%。
insert into t_user_login(user_id, login_date)
select user_id, c_date
from (select u.user_id,
d.c_date,
case
when d.c_date >= u.create_date and random() <= pow(0.95, datediff(d.c_date, u.create_date)) then 1
else 0 end as is_login
from t_user_create u
join (select c_date from t_dim_date) d
where d.c_date >= u.create_date) t
where t.is_login = 1;
样例数据
+----------+-------------+
| user_id | login_date |
+----------+-------------+
| 1001 | 2024-01-01 |
| 1001 | 2024-01-02 |
| 1001 | 2024-01-03 |
| 1001 | 2024-01-04 |
| 1001 | 2024-01-05 |
| 1001 | 2024-01-06 |
| 1001 | 2024-01-07 |
| 1001 | 2024-01-08 |
| 1001 | 2024-01-09 |
| 1001 | 2024-01-12 |
| 1001 | 2024-01-13 |
| 1001 | 2024-01-15 |
| 1001 | 2024-01-16 |
| 1001 | 2024-01-18 |
| 1001 | 2024-01-19 |
| 1001 | 2024-01-20 |
| 1001 | 2024-02-16 |
| 1001 | 2024-02-29 |
| 1001 | 2024-03-05 |
| 1002 | 2024-01-01 |
+----------+-------------+