滴滴集团作为生活服务领域的头部企业,正在全面测试和上线StarRocks。其中橙心优选经过一年多的数据体系建设,我们逐渐将一部分需要实时交互查询、即席查询的多维数据分析需求由ClickHouse迁移到了StarRocks中,StarRocks在稳定性、实时性方面也给了我们良好的体验,接下来以StarRocks实现的漏斗分析为例介绍StarRocks在橙心优选运营数据分析应用中的实践。
作者:王鹏,滴滴橙心优选数据架构部资深数据架构开发工程师,负责橙心优选大数据基础服务和数据应用的开发与建设。

需求介绍
-- Staging table holding the deduplicated user_ids extracted from the raw log table.
-- (Original snippet used curly braces and single-quoted identifiers, which is not
-- valid Hive DDL; identifiers use backticks, column lists use parentheses.)
CREATE TABLE IF NOT EXISTS `temp_table` (
    `user_id` string COMMENT '原始表去重后的用户ID (deduplicated user ID from the source table)'
);
-- Step 1: refresh the staging table with the distinct user_ids from the fact log.
-- SELECT DISTINCT produces the same set as the original GROUP BY user_id dedup.
INSERT OVERWRITE TABLE temp_table
SELECT DISTINCT user_id
FROM fact_log_user_hive_table;
-- Global dictionary table: maps each raw (string) user_id to a dense integer
-- surrogate id, so downstream engines can use bitmap-based exact count-distinct.
-- (Original snippet used curly braces and single-quoted identifiers, which is not
-- valid Hive DDL.)
CREATE TABLE IF NOT EXISTS `global_dict_by_userid_hive_table` (
    `user_id`     string COMMENT '原始用户ID (raw user ID)',
    `new_user_id` int    COMMENT '对原始用户ID编码后的整型用户ID (integer surrogate ID)'
);
-- 4. Overwrite the Hive dictionary with the merged result.
--    Fixes vs. the original snippet:
--      * Hive requires the TABLE keyword: INSERT OVERWRITE TABLE ...
--      * `t2.newuser_id` was a typo for `new_user_id`
--      * the LEFT JOIN produced NULL max_id for new users, making every
--        generated new_user_id NULL; the max is now computed independently
--        and CROSS JOINed in, with COALESCE bootstrapping an empty dictionary.
INSERT OVERWRITE TABLE global_dict_by_userid_hive_table
-- 3. Keep every existing mapping unchanged and union in the new ones.
SELECT user_id, new_user_id
FROM global_dict_by_userid_hive_table
UNION ALL
-- 2. Assign each new user (current max id + its row number) as the surrogate id.
SELECT
    t1.user_id,
    (ROW_NUMBER() OVER (ORDER BY t1.user_id) + t2.max_id) AS new_user_id
FROM
(
    -- 1. Distinct user_ids from the staging table not yet in the dictionary.
    SELECT a.user_id
    FROM temp_table a
    LEFT JOIN global_dict_by_userid_hive_table b
        ON a.user_id = b.user_id
    WHERE a.user_id IS NOT NULL
      AND b.new_user_id IS NULL
) t1
CROSS JOIN
(
    -- Current maximum surrogate id; 0 when the dictionary is still empty.
    SELECT COALESCE(MAX(new_user_id), 0) AS max_id
    FROM global_dict_by_userid_hive_table
) t2;
-- Backfill the fact table with the integer surrogate id from the dictionary.
-- Fix: Hive requires the TABLE keyword after INSERT OVERWRITE.
-- LEFT JOIN keeps every fact row; new_user_id is NULL only if the dictionary
-- update above has not yet run for that user.
INSERT OVERWRITE TABLE fact_log_user_hive_table
SELECT
    a.user_id,
    b.new_user_id
FROM fact_log_user_hive_table a
LEFT JOIN global_dict_by_userid_hive_table b
    ON a.user_id = b.user_id;
-- StarRocks duplicate-key table backing the funnel / count-distinct analysis.
-- Fix: the original DUPLICATE KEY and DISTRIBUTED BY clauses referenced
-- `index_id`, a column that does not exist; the intended column is
-- `new_user_id` (the first column of the table).
-- NOTE(review): StarRocks requires DUPLICATE KEY columns to be a leading
-- prefix of the column list in declaration order — verify the remaining key
-- columns (`biz_channel_name`, `pro_id`, `city_id`) against the target
-- StarRocks version, or reorder the columns accordingly.
CREATE TABLE `fact_log_user_doris_table` (
  `new_user_id` bigint(20) NULL COMMENT "整型用户id",
  `user_id` varchar(65533) NULL COMMENT "用户id",
  `event_source` varchar(65533) NULL COMMENT "端(1:商城小程序 2:团长小程序 3:独立APP 4:主端)",
  `is_new` varchar(65533) NULL COMMENT "是否新用户",
  `identity` varchar(65533) NULL COMMENT "用户身份(团长或者普通用户)",
  `biz_channel_name` varchar(65533) NULL COMMENT "当天首次落地页渠道名称",
  `pro_id` varchar(65533) NULL COMMENT "省ID",
  `pro_name` varchar(65533) NULL COMMENT "省名称",
  `city_id` varchar(65533) NULL COMMENT "城市ID",
  `city_name` varchar(65533) NULL COMMENT "城市名称",
  `dt` date NULL COMMENT "分区",
  `period_type` varchar(65533) NULL DEFAULT "daily" COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`new_user_id`, `user_id`, `biz_channel_name`, `pro_id`, `city_id`)
-- Daily range partitions; dynamic partitioning below rolls them forward.
PARTITION BY RANGE(`dt`)(
  PARTITION p20210731 VALUES [('2021-07-31'), ('2021-08-01')),
  PARTITION p20210801 VALUES [('2021-08-01'), ('2021-08-02')),
  PARTITION p20210802 VALUES [('2021-08-02'), ('2021-08-03')),
  PARTITION p20210803 VALUES [('2021-08-03'), ('2021-08-04')),
  PARTITION p20210804 VALUES [('2021-08-04'), ('2021-08-05')),
  PARTITION p20210805 VALUES [('2021-08-05'), ('2021-08-06')),
  PARTITION p20210806 VALUES [('2021-08-06'), ('2021-08-07')),
  PARTITION p20210807 VALUES [('2021-08-07'), ('2021-08-08')),
  PARTITION p20210808 VALUES [('2021-08-08'), ('2021-08-09')))
DISTRIBUTED BY HASH(`new_user_id`, `user_id`) BUCKETS 10
PROPERTIES (
  "replication_num" = "3",
  "dynamic_partition.enable" = "true",
  "dynamic_partition.time_unit" = "DAY",
  "dynamic_partition.time_zone" = "Asia/Shanghai",
  -- -2147483648 = never drop historical partitions (the default lower bound)
  "dynamic_partition.start" = "-2147483648",
  "dynamic_partition.end" = "1",
  "dynamic_partition.prefix" = "p",
  -- -1 = inherit the table-level replication_num
  "dynamic_partition.replication_num" = "-1",
  "dynamic_partition.buckets" = "3",
  "in_memory" = "false",
  "storage_format" = "DEFAULT"
);
-- Exact distinct-user count per city over one week of partitions.
-- Fix: `city_id` is declared varchar, so the IN-list values must be string
-- literals; comparing a varchar column to integer literals relies on implicit
-- type coercion, which is engine-dependent and defeats predicate pushdown.
SELECT
    city_id,
    COUNT(DISTINCT new_user_id) AS countDistinctByID
FROM fact_log_user_doris_table
WHERE `dt` >= '2021-08-01'
  AND `dt` <= '2021-08-07'
  AND `city_id` IN ('11', '12', '13')
GROUP BY city_id;
-- Materialized view pre-aggregating a per-city bitmap of integer user ids,
-- so COUNT(DISTINCT new_user_id) queries are rewritten to a cheap
-- BITMAP_UNION_COUNT over the rollup instead of scanning the base table.
CREATE MATERIALIZED VIEW city_user_count AS
SELECT
    city_id,
    bitmap_union(to_bitmap(new_user_id))
FROM fact_log_user_doris_table
GROUP BY city_id;