
set hive.exec.mode.local.auto=true;
 开启hive的local模式
create table tmp.test(
   deviceid   string,
   dt         string
)row format delimited 
fields terminated by ','INSERT INTO tmp.test VALUES
	('deviceid1','2022-11-01'),
	('deviceid1','2022-11-01'),
	('deviceid1','2022-11-02'),
	('deviceid1','2022-11-02'),
	('deviceid1','2022-11-03'),
	('deviceid1','2022-11-05'),
	('deviceid1','2022-11-05'),
	('deviceid1','2022-11-05'),
	('deviceid1','2022-11-06'),
	('deviceid1','2022-11-07'),
	('deviceid1','2022-11-07'),
	('deviceid1','2022-11-08'),
	('deviceid1','2022-11-09'),
	('deviceid1','2022-11-10'),
	('deviceid2','2022-11-01'),
	('deviceid2','2022-11-01'),
	('deviceid2','2022-11-02'),
	('deviceid2','2022-11-02'),
	('deviceid2','2022-11-03'),
	('deviceid2','2022-11-05'),
	('deviceid2','2022-11-05'),
	('deviceid2','2022-11-05'),
	('deviceid2','2022-11-06'),
	('deviceid2','2022-11-07'),
	('deviceid2','2022-11-07'),
	('deviceid2','2022-11-08'),
	('deviceid2','2022-11-09'),
	('deviceid2','2022-11-10');
SELECT * FROM tmp.testSELECT deviceid,count(*)
from (
	SELECT deviceid, dt_sub,count(*) count_n
	from (
		SELECT deviceid , dt
				,date_sub(dt,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt)) as dt_sub
				,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt) as rn
		from (
					select deviceid,dt
					from tmp.test 
					where dt >= date_sub('2022-11-15',30)
					group by deviceid,dt
		)t1
	)t2
	group by deviceid,dt_sub
	having count(*) >= 2
)t3	
group by deviceid本小节利用row_number()开窗计算出每个设备deviceid的行号,再利用日期dt减去行号,求出相同的dt_sub即为连续登录的日期。
SELECT deviceid , dt
		,date_sub(dt,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt)) as dt_sub
		,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt) as rn
from (
			select deviceid,dt
			from tmp.test 
			where dt >= date_sub('2022-11-15',30)
			group by deviceid,dt
)t1Step 1:子查询从表 tmp.test 中选择 dt 大于等于 '2022-11-15' 减去 30天的数据,并按 deviceid 和 dt 进行分组。返回的结果将包含 deviceid 和 dt 列的值,这里是为了筛选出近一个月30天的数据。
Step 2:在外层查询中,使用窗口函数ROW_NUMBER()将每个 deviceid 分组内的数据按照 dt 进行排序,并为每行分配一个行号,即 rn 列。Step 2查询结果如下:
deviceid	dt		dt_sub		rn
deviceid1	2022-11-01	2022-10-31	1
deviceid1	2022-11-02	2022-10-31	2
deviceid1	2022-11-03	2022-10-31	3
deviceid1	2022-11-05	2022-11-01	4
deviceid1	2022-11-06	2022-11-01	5
deviceid1	2022-11-07	2022-11-01	6
deviceid1	2022-11-08	2022-11-01	7
deviceid1	2022-11-09	2022-11-01	8
deviceid1	2022-11-10	2022-11-01	9
deviceid2	2022-11-01	2022-10-31	1
deviceid2	2022-11-02	2022-10-31	2
deviceid2	2022-11-03	2022-10-31	3
deviceid2	2022-11-05	2022-11-01	4
deviceid2	2022-11-06	2022-11-01	5
deviceid2	2022-11-07	2022-11-01	6
deviceid2	2022-11-08	2022-11-01	7
deviceid2	2022-11-09	2022-11-01	8
deviceid2	2022-11-10	2022-11-01	9本小节求出连续登录大于2天的数量,并进行汇总计算。
SELECT deviceid, dt_sub,count(*) count_n
	from (
		SELECT deviceid , dt
				,date_sub(dt,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt)) as dt_sub
				,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt) as rn
		from (
					select deviceid,dt
					from tmp.test 
					where dt >= date_sub('2022-11-15',30)
					group by deviceid,dt
		)t1
	)t2
	group by deviceid,dt_sub
	having count(*) >= 2Step 3:对结果进行分组,按照 deviceid 和 dt_sub 进行分组,并计算了每个分组中的行数,即 count_n 列。
deviceid	dt_sub	count_n
deviceid1	2022-10-31	3
deviceid1	2022-11-01	6
deviceid2	2022-10-31	3
deviceid2	2022-11-01	6Step 4:对满足条件的分组再次进行了 GROUP BY deviceid,统计每个 deviceid 对应的分组数,即为需求用户数。
SELECT deviceid,count(*)
from (
	SELECT deviceid, dt_sub,count(*) count_n
	from (
		SELECT deviceid , dt
				,date_sub(dt,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt)) as dt_sub
				,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt) as rn
		from (
					select deviceid,dt
					from tmp.test 
					where dt >= date_sub('2022-11-15',30)
					group by deviceid,dt
		)t1
	)t2
	group by deviceid,dt_sub
	having count(*) >= 2
)t3	
group by deviceiddeviceid	count(*)
deviceid1	2
deviceid2	2select deviceid,dt
from tmp.test 
where dt >= date_sub('2022-11-15',30)这行语句可以理解为统计最近 11 月 15 日前的 30 天内的用户数据。它使用了 date_sub() 函数来计算日期,并使用 >= 运算符筛选出符合条件的数据,即日期大于等于最近的 11 月 15 日前的 30 天。这样就可以获取最近一个月内的用户数据。
deviceid	dt
deviceid1	2022-11-01
deviceid1	2022-11-01
deviceid1	2022-11-02
deviceid1	2022-11-02
deviceid1	2022-11-03
deviceid1	2022-11-05
deviceid1	2022-11-05
deviceid1	2022-11-05
deviceid1	2022-11-06
deviceid1	2022-11-07
deviceid1	2022-11-07
deviceid1	2022-11-08
deviceid1	2022-11-09
deviceid1	2022-11-10
deviceid2	2022-11-01
deviceid2	2022-11-01
deviceid2	2022-11-02
deviceid2	2022-11-02
deviceid2	2022-11-03
deviceid2	2022-11-05
deviceid2	2022-11-05
deviceid2	2022-11-05
deviceid2	2022-11-06
deviceid2	2022-11-07
deviceid2	2022-11-07
deviceid2	2022-11-08
deviceid2	2022-11-09
deviceid2	2022-11-10SELECT deviceid, COUNT(DISTINCT dt) AS active_days
	from (
		select deviceid,dt
		from tmp.test 
		where dt >= date_sub('2022-11-15',30)
	)t1
GROUP BY deviceid;按照 deviceid 进行分组,并使用 COUNT(DISTINCT dt) 函数计算每个设备的唯一日期数,即活跃天数。结果将返回每个设备和其对应的活跃天数。
deviceid	active_days
deviceid1	9
deviceid2	9select deviceid,max(count_n) max_days
from (
	SELECT deviceid, dt_sub,count(*) count_n
	from (
		SELECT deviceid , dt
				,date_sub(dt,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt)) as dt_sub
				,ROW_NUMBER() over(PARTITION  by deviceid ORDER by dt) as rn
		from (
					select deviceid,dt
					from tmp.test 
					where dt >= date_sub('2022-11-15',30)
					group by deviceid,dt
		)t1
	)t2
	group by deviceid,dt_sub
)t3
group by deviceid这个需求比较简单,在需求一的基础上,增加对deviceid的分组和对count_n的求最大值即可
deviceid	max_days
deviceid1	6
deviceid2	6SELECT x1,count(x1)
from (
	SELECT deviceid,dt_sub,active_days,
			case when  active_days >= 1 and active_days <= 3 then '[1-3]'
	          	 when active_days >= 4 and active_days <= 6 then '[4-6]'
	          	 when active_days >= 7 then '[7+]'  end  x1
	from (
		SELECT deviceid,dt_sub,count(*) as active_days
		FROM (
		    SELECT deviceid, dt,
		   			ROW_NUMBER() OVER (PARTITION BY deviceid ORDER BY dt) as rn,
		           date_sub(dt, ROW_NUMBER() OVER (PARTITION BY deviceid ORDER BY dt))  AS dt_sub
		    FROM (
		        SELECT deviceid, dt
		        FROM tmp.test 
		        WHERE dt >= date_sub('2022-11-15', 30)
		        GROUP BY deviceid, dt
		    ) t1
		) t2
		group by deviceid,dt_sub
	)t3
)t4
group by x1仍然是先计算出连续活跃的天数,在内层查询:
deviceid	dt_sub	active_days
deviceid1	2022-10-31	3
deviceid1	2022-11-01	6
deviceid2	2022-10-31	3
deviceid2	2022-11-01	6使用case when 进行如下结果样例操作,可以进行行转列:
/**
 * 区间         人数
 * [1-3]    2
 * [4-6]    3
 * [7+      4
 */
/**
 *  deviceid1	[1-3]
	deviceid1	[4-6]
	deviceid2	[1-3]
	deviceid2	[4-6]
 */更换满足本需求案例的新的数据和表。
CREATE TABLE tmp.testdt (
  deviceid VARCHAR(255),
  dt DATE
);
INSERT INTO tmp.testdt (deviceid, dt) VALUES
('deviceid1', '2023-06-01'),
('deviceid1', '2023-06-02'),
('deviceid1', '2023-06-03'),
('deviceid1', '2023-06-29'),
('deviceid1', '2023-06-30');select	count(DISTINCT if(dt_diff > 3,deviceid,null)) as `超过3天`,
		count(DISTINCT if(dt_diff > 5,deviceid,null)) as `超过5天`
from (
	select deviceid ,dt,pre_date,datediff(date_sub(dt,1),pre_date) as dt_diff
	from (
		SELECT deviceid,dt, LAG(dt,1,dt) OVER (partition by deviceid ORDER BY dt) AS pre_date
		from (
			select deviceid,dt
			from (
				select deviceid,dt
				from tmp.testdt where dt >= date_sub('2023-06-30',30)
				group by deviceid,dt
			)t1
		)t2
	)t3
)t4	SELECT deviceid,dt, LAG(dt,1,dt) OVER (partition by deviceid ORDER BY dt) AS pre_date
		from (
			select deviceid,dt
			from (
				select deviceid,dt
				from tmp.testdt where dt >= date_sub('2023-06-30',30)
				group by deviceid,dt
			)t1
		)t2Step 1:
以上查询用lag开窗,求出了给定date类型列中每个日期的前一个出现的日期,仅在本列中求出。查询结果如下:
deviceid	dt	pre_date
deviceid1	2023-06-01	2023-06-01
deviceid1	2023-06-02	2023-06-01
deviceid1	2023-06-03	2023-06-02
deviceid1	2023-06-29	2023-06-03
deviceid1	2023-06-30	2023-06-29Step 2:
使用datediff计算出dt和pre_date的间隔日期,即为沉默的天数。
deviceid	dt	pre_date dt_diff
deviceid1	2023-06-01	2023-06-01	-1
deviceid1	2023-06-02	2023-06-01	0
deviceid1	2023-06-03	2023-06-02	0
deviceid1	2023-06-29	2023-06-03	25
deviceid1	2023-06-30	2023-06-29	0最后通过count(DISTINCT if(dt_diff > 3,deviceid,null)) as 超过3天``给定查找的格式字段,即可求出。
超过3天	超过5天
1	1