在我们工作ETL或者在进行大数据面试的时候,总是常常会被手撕SQL,而常见的SQL有连续登录问题。一般的问法是“统计连续登录N天XX的XX”。
小编今天在这里给大家介绍两种方案轻松让你拿捏这类SQL问题。
mysql8.x和hive很多函数已经基本差不多都能满足了,为了执行效率和方便,这里以mysql为例,其他SQL类似,如果有不清楚的可以评论区留言。
在mysql中执行以下代码,生成相应的数据表
-- ---------------------------- -- Table structure for user_activity -- ---------------------------- DROP TABLE IF EXISTS `user_activity`; CREATE TABLE `user_activity` ( `user_id` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL, `activity_date` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic; -- ---------------------------- -- Records of user_activity -- ---------------------------- INSERT INTO `user_activity` VALUES ('user1', '2023-03-01'); INSERT INTO `user_activity` VALUES ('user2', '2023-03-02'); INSERT INTO `user_activity` VALUES ('user3', '2023-03-03'); INSERT INTO `user_activity` VALUES ('user4', '2023-03-04'); INSERT INTO `user_activity` VALUES ('user1', '2023-03-08'); INSERT INTO `user_activity` VALUES ('user2', '2023-03-08'); INSERT INTO `user_activity` VALUES ('user5', '2023-03-08'); INSERT INTO `user_activity` VALUES ('user6', '2023-03-08'); INSERT INTO `user_activity` VALUES ('user3', '2023-03-09'); INSERT INTO `user_activity` VALUES ('user5', '2023-03-09'); INSERT INTO `user_activity` VALUES ('user6', '2023-03-09'); INSERT INTO `user_activity` VALUES ('user7', '2023-03-09'); INSERT INTO `user_activity` VALUES ('user3', '2023-03-10'); INSERT INTO `user_activity` VALUES ('user5', '2023-03-10'); INSERT INTO `user_activity` VALUES ('user6', '2023-03-10'); INSERT INTO `user_activity` VALUES ('user7', '2023-03-10'); INSERT INTO `user_activity` VALUES ('user5', '2023-03-11'); INSERT INTO `user_activity` VALUES ('user6', '2023-03-11'); INSERT INTO `user_activity` VALUES ('user7', '2023-03-11'); INSERT INTO `user_activity` VALUES ('user6', '2023-03-12'); INSERT INTO `user_activity` VALUES ('user7', '2023-03-12'); INSERT INTO `user_activity` VALUES ('user7', '2023-03-13'); INSERT INTO `user_activity` VALUES ('user8', '2023-03-13'); INSERT INTO `user_activity` VALUES ('user7', '2023-03-14'); INSERT INTO `user_activity` VALUES ('user8', '2023-03-14'); INSERT INTO `user_activity` VALUES ('user7', '2023-03-15'); INSERT INTO `user_activity` VALUES ('user8', '2023-03-15'); INSERT INTO `user_activity` VALUES ('user8', '2023-03-16');SELECT * FROM `user_activity`结果如下:
user1 2023-03-01 user2 2023-03-02 user3 2023-03-03 user4 2023-03-04 user1 2023-03-08 user2 2023-03-08 user5 2023-03-08 user6 2023-03-08 user3 2023-03-09 user5 2023-03-09 user6 2023-03-09 user7 2023-03-09 user3 2023-03-10 user5 2023-03-10 user6 2023-03-10 user7 2023-03-10 user5 2023-03-11 user6 2023-03-11 user7 2023-03-11 user6 2023-03-12 user7 2023-03-12 user7 2023-03-13 user8 2023-03-13 user7 2023-03-14 user8 2023-03-14 user7 2023-03-15 user8 2023-03-15 user8 2023-03-16
select user_id, activity_date, ROW_NUMBER() over(partition by user_id order by activity_date) as rn from user_activity
SELECT user_id, activity_date, DATE_SUB(activity_date,INTERVAL rn DAY) as sub_date from( select user_id, activity_date, ROW_NUMBER() over(partition by user_id order by activity_date) as rn from user_activity )t1
SELECT user_id, min(activity_date) as min_date, max(activity_date) as max_date, count(1) as login_times from( SELECT user_id, activity_date, DATE_SUB(activity_date,INTERVAL rn DAY) as sub_date from( select user_id, activity_date, ROW_NUMBER() over(partition by user_id order by activity_date) as rn from user_activity )t1 )t2 group by user_id,sub_date having login_times>=3;
select user_id, LAG(activity_date,1,activity_date) over(partition by user_id order by activity_date) as lag_login_date, activity_date as current_login_date, LEAD(activity_date,1,activity_date) over(partition by user_id order by activity_date) as lead_login_date from user_activity
SELECT user_id, lag_login_date, current_login_date, lead_login_date from( select user_id, LAG(activity_date,1,activity_date) over(partition by user_id order by activity_date) as lag_login_date, activity_date as current_login_date, LEAD(activity_date,1,activity_date) over(partition by user_id order by activity_date) as lead_login_date from user_activity )t1 where datediff(current_login_date,lag_login_date)=1 and datediff(lead_login_date,current_login_date)=1;
SELECT user_id, min(activity_date) as min_date, max(activity_date) as max_date, count(1) as login_times from( SELECT user_id, activity_date, DATE_SUB(activity_date,INTERVAL rn DAY) as sub_date from( select user_id, activity_date, ROW_NUMBER() over(partition by user_id order by activity_date) as rn from user_activity )t1 )t2 group by user_id,sub_date having login_times>=3;
方案1,想法很简单,更容易实现,简单了解开窗排序函数和基础SQL能力即可完成。难度中
,
方案2,想法简单,实现难度更大一些,需要对开窗函数有一定的掌握和熟练。难度高