2.6 数据清洗转换操作

​ 目的: 主要是用于从ODS以及DIM层 将数据灌入到DWM层操作

  • 生成学生出勤状态信息表

基本SQL的实现: 相关的出勤状态未完成

select 
    ctudd.class_date,
    ctudd.class_id,
    student_id,
    '' as morning_att,   --未实现
    '' as afternoon_att, --未实现
    '' as  evening_att,  --未实现
    substr(ctudd.class_date,1,4) as yearinfo,
    substr(ctudd.class_date,6,2) as monthinfo,
    substr(ctudd.class_date,9,2) as dayinfo
from (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content !='开班典礼') ctudd
     left  join (select * from itcast_ods.student_signin_ods where share_state = 1) sso on sso.class_id = ctudd.class_id
     left join itcast_dimen.class_time_dimen ctd on ctd.id = sso.time_table_id
group  by ctudd.class_date , ctudd.class_id,sso.student_id;

探讨如何判断学生出勤状态呢?

接着 编写一个伪代码. 实现这个上述的逻辑:

select 
    ctudd.class_date,
    ctudd.class_id,
    student_id,
    if(
        sum(
            if( 
                sso.signin_time 
                    between ctd.morning_begin_time - 40   and  ctd.morning_end_time 
            ,1,0)

        ) > 0,  -- 如果大于0 认为当天的打卡记录中, 一定是有出勤的打卡记录, 如果小于等于0 认为没有出勤
       if(
           sum(
              if(
                 sso.signin_time 
                    between ctd.morning_begin_time - 40   and  ctd.morning_begin_time + 10
                , 1 , 0)
           ) >0, -- 如果大于0, 认为当天打卡记录中, 一定是有正常出勤的记录, 否则认为迟到出勤
       0,1),
    2)as morning_att,   --伪代码实现
    '' as afternoon_att, --未实现
    '' as  evening_att,  --未实现
    substr(ctudd.class_date,1,4) as yearinfo,
    substr(ctudd.class_date,6,2) as monthinfo,
    substr(ctudd.class_date,9,2) as dayinfo
from (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content !='开班典礼') ctudd
     left  join (select * from itcast_ods.student_signin_ods where share_state = 1) sso on sso.class_id = ctudd.class_id
     left join itcast_dimen.class_time_dimen ctd on ctd.id = sso.time_table_id
group  by ctudd.class_date , ctudd.class_id,sso.student_id;

探讨. 如何实现日期的 相加 和 相减 (对分钟处理)

select unix_timestamp('2021-10-08 15:40:30','yyyy-MM-dd HH:mm:ss') - 40*60;

发现: 作息时间表的 上课时间内容, 只有 时 分 秒 没有 年 月 日

实现上午的学生出勤统计SQL:

select 
    ctudd.class_date,
    ctudd.class_id,
    student_id,
    if(
        sum(
            if( 
                unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss')
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60   
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_end_time),'yyyy-MM-dd HH:mm:ss')
            ,1,0)

        ) > 0,  -- 如果大于0 认为当天的打卡记录中, 一定是有出勤的打卡记录, 如果小于等于0 认为没有出勤
       if(
           sum(
              if(
                 unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss') 
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60  
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_begin_time),'yyyy-MM-dd HH:mm:ss') + 10*60
                , 1 , 0)
           ) >0, -- 如果大于0, 认为当天打卡记录中, 一定是有正常出勤的记录, 否则认为迟到出勤
       0,1),
    2)as morning_att, 
    '' as afternoon_att, --未实现
    '' as  evening_att,  --未实现
    substr(ctudd.class_date,1,4) as yearinfo,
    substr(ctudd.class_date,6,2) as monthinfo,
    substr(ctudd.class_date,9,2) as dayinfo
from (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content !='开班典礼') ctudd
     left  join (select * from itcast_ods.student_signin_ods where share_state = 1) sso on sso.class_id = ctudd.class_id
     left join itcast_dimen.class_time_dimen ctd on ctd.id = sso.time_table_id
group  by ctudd.class_date , ctudd.class_id,sso.student_id;


select unix_timestamp('2021-10-08 15:40:30','yyyy-MM-dd HH:mm:ss') - 40*60; -- 1633676430
  • 最终实现:
select 
    ctudd.class_date,
    ctudd.class_id,
    student_id,
    if(
        sum(
            if( 
                unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss')
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60   
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_end_time),'yyyy-MM-dd HH:mm:ss')
            ,1,0)

        ) > 0,  -- 如果大于0 认为当天的打卡记录中, 一定是有出勤的打卡记录, 如果小于等于0 认为没有出勤
       if(
           sum(
              if(
                 unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss') 
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60  
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_begin_time),'yyyy-MM-dd HH:mm:ss') + 10*60
                , 1 , 0)
           ) >0, -- 如果大于0, 认为当天打卡记录中, 一定是有正常出勤的记录, 否则认为迟到出勤
       0,1),
    2)as morning_att, 
    if(
        sum(
            if( 
                unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss')
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.afternoon_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60   
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.afternoon_end_time),'yyyy-MM-dd HH:mm:ss')
            ,1,0)

        ) > 0,  -- 如果大于0 认为当天的打卡记录中, 一定是有出勤的打卡记录, 如果小于等于0 认为没有出勤
       if(
           sum(
              if(
                 unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss') 
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.afternoon_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60  
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.afternoon_begin_time),'yyyy-MM-dd HH:mm:ss') + 10*60
                , 1 , 0)
           ) >0, -- 如果大于0, 认为当天打卡记录中, 一定是有正常出勤的记录, 否则认为迟到出勤
       0,1),
    2) as afternoon_att, 
    if(
        sum(
            if( 
                unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss')
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.evening_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60   
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.evening_end_time),'yyyy-MM-dd HH:mm:ss')
            ,1,0)

        ) > 0,  -- 如果大于0 认为当天的打卡记录中, 一定是有出勤的打卡记录, 如果小于等于0 认为没有出勤
       if(
           sum(
              if(
                 unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss') 
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.evening_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60  
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.evening_begin_time),'yyyy-MM-dd HH:mm:ss') + 10*60
                , 1 , 0)
           ) >0, -- 如果大于0, 认为当天打卡记录中, 一定是有正常出勤的记录, 否则认为迟到出勤
       0,1),
    2) as  evening_att,  --未实现
    substr(ctudd.class_date,1,4) as yearinfo,
    substr(ctudd.class_date,6,2) as monthinfo,
    substr(ctudd.class_date,9,2) as dayinfo
from (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content !='开班典礼') ctudd
     left  join (select * from itcast_ods.student_signin_ods where share_state = 1) sso on sso.class_id = ctudd.class_id
     left join itcast_dimen.class_time_dimen ctd on ctd.id = sso.time_table_id
group  by ctudd.class_date , ctudd.class_id,sso.student_id;

将SQL的结果灌入到DWM层

set hive.auto.convert.join=false;

--分区
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=10000;
set hive.exec.max.dynamic.partitions=100000;
set hive.exec.max.created.files=150000;
--hive压缩
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
--写入时压缩生效
set hive.exec.orc.compression.strategy=COMPRESSION;
--分桶
--set hive.enforce.bucketing=true;
--set hive.enforce.sorting=true;
--set hive.optimize.bucketmapjoin = true;
--set hive.auto.convert.sortmerge.join=true;
--set hive.auto.convert.sortmerge.join.noconditionaltask=true;
--并行执行
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;
--小文件合并
-- set mapred.max.split.size=2147483648;
-- set mapred.min.split.size.per.node=1000000000;
-- set mapred.min.split.size.per.rack=1000000000;
--矢量化查询
set hive.vectorized.execution.enabled=true;
--关联优化器
set hive.optimize.correlation=true;
--读取零拷贝
set hive.exec.orc.zerocopy=true;
--join数据倾斜
set hive.optimize.skewjoin=true;
-- set hive.skewjoin.key=100000;
set hive.optimize.skewjoin.compiletime=true;
set hive.optimize.union.remove=true;
-- group倾斜
set hive.groupby.skewindata=true;


insert into table itcast_dwm.student_attendance_dwm partition(yearinfo,monthinfo,dayinfo)
select 
    ctudd.class_date,
    ctudd.class_id,
    student_id,
    if(
        sum(
            if( 
                unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss')
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60   
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_end_time),'yyyy-MM-dd HH:mm:ss')
            ,1,0)

        ) > 0,  -- 如果大于0 认为当天的打卡记录中, 一定是有出勤的打卡记录, 如果小于等于0 认为没有出勤
       if(
           sum(
              if(
                 unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss') 
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60  
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.morning_begin_time),'yyyy-MM-dd HH:mm:ss') + 10*60
                , 1 , 0)
           ) >0, -- 如果大于0, 认为当天打卡记录中, 一定是有正常出勤的记录, 否则认为迟到出勤
       0,1),
    2)as morning_att, 
    if(
        sum(
            if( 
                unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss')
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.afternoon_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60   
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.afternoon_end_time),'yyyy-MM-dd HH:mm:ss')
            ,1,0)

        ) > 0,  -- 如果大于0 认为当天的打卡记录中, 一定是有出勤的打卡记录, 如果小于等于0 认为没有出勤
       if(
           sum(
              if(
                 unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss') 
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.afternoon_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60  
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.afternoon_begin_time),'yyyy-MM-dd HH:mm:ss') + 10*60
                , 1 , 0)
           ) >0, -- 如果大于0, 认为当天打卡记录中, 一定是有正常出勤的记录, 否则认为迟到出勤
       0,1),
    2) as afternoon_att, 
    if(
        sum(
            if( 
                unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss')
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.evening_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60   
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.evening_end_time),'yyyy-MM-dd HH:mm:ss')
            ,1,0)

        ) > 0,  -- 如果大于0 认为当天的打卡记录中, 一定是有出勤的打卡记录, 如果小于等于0 认为没有出勤
       if(
           sum(
              if(
                 unix_timestamp(sso.signin_time,'yyyy-MM-dd HH:mm:ss') 
                    between unix_timestamp(concat(ctudd.class_date,' ',ctd.evening_begin_time),'yyyy-MM-dd HH:mm:ss') - 40*60  
                        and  unix_timestamp(concat(ctudd.class_date,' ',ctd.evening_begin_time),'yyyy-MM-dd HH:mm:ss') + 10*60
                , 1 , 0)
           ) >0, -- 如果大于0, 认为当天打卡记录中, 一定是有正常出勤的记录, 否则认为迟到出勤
       0,1),
    2) as  evening_att,  --未实现
    substr(ctudd.class_date,1,4) as yearinfo,
    substr(ctudd.class_date,6,2) as monthinfo,
    substr(ctudd.class_date,9,2) as dayinfo
from (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content !='开班典礼') ctudd
     left  join (select * from itcast_ods.student_signin_ods where share_state = 1) sso on sso.class_id = ctudd.class_id
     left join itcast_dimen.class_time_dimen ctd on ctd.id = sso.time_table_id
where ctudd.class_date in ('2019-09-03','2019-09-04','2019-09-05')
group  by ctudd.class_date , ctudd.class_id,sso.student_id;
  • 班级出勤人数表

    SQL的实现

select 
    dateinfo,
    class_id,
    count(
        case 
            when morning_att in ('0','1') then student_id
            else null end
    ) as morning_att_count,

    count(
        case 
            when afternoon_att in ('0','1') then student_id
            else null end
    ) as afternoon_att_count,

    count(
        case 
            when evening_att in ('0','1') then student_id
            else null end
    ) as evening_att_count,

    sum(
        case 
            when morning_att ='1' then 1
            else 0 end
    ) as morning_late_count,

    sum(
        case 
            when afternoon_att ='1' then 1
            else 0 end
    ) as afternoon_late_count,

    sum(
        case 
            when evening_att ='1' then 1
            else 0 end
    ) as evening_late_count,

    yearinfo,
    monthinfo,
    dayinfo
from itcast_dwm.student_attendance_dwm
group by  dateinfo,yearinfo,monthinfo,dayinfo,class_id;

将查询的结果灌入到班级出勤信息表中

insert into  table itcast_dwm.class_attendance_dwm partition(yearinfo,monthinfo,dayinfo)
select 
    dateinfo,
    class_id,
    count(
        case 
            when morning_att in ('0','1') then student_id
            else null end
    ) as morning_att_count,


    count(
        case 
            when afternoon_att in ('0','1') then student_id
            else null end
    ) as afternoon_att_count,

    count(
        case 
            when evening_att in ('0','1') then student_id
            else null end
    ) as evening_att_count,

    sum(
        case 
            when morning_att ='1' then 1
            else 0 end
    ) as morning_late_count,

    sum(
        case 
            when afternoon_att ='1' then 1
            else 0 end
    ) as afternoon_late_count,

    sum(
        case 
            when evening_att ='1' then 1
            else 0 end
    ) as evening_late_count,

    yearinfo,
    monthinfo,
    dayinfo
from itcast_dwm.student_attendance_dwm
group by  dateinfo,yearinfo,monthinfo,dayinfo,class_id;
  • 班级请假人数表的统计生成

由于库表没有一条符合条件的请假数据, 所以在这里模拟一条请假数据

INSERT INTO itcast_ods.student_leave_apply_ods partition (dt)
values (125, 5032, 119142, 1, 3491, '2019-09-08 16:42:29', '狂犬疫苗最后一针', 1, 2, '2019-09-03 08:00:00', 2, '2019-09-03 23:30:00', 2, 1, 0, null, null, '医院打针,狂犬疫苗最后一针了', 1, '2019-09-02 08:56:54','2020-07-07');

计算上午的每天各个班级请假人数

select
    ctudd.class_date,
    ctudd.class_id,
    count(distinct sla.student_id) as morning_leave_count
from (select * from itcast_ods.student_leave_apply_ods where audit_state = 1 and cancel_state = 0 and valid_state = 1) sla
    join itcast_dimen.class_time_dimen ctd on sla.class_id = ctd.class_id
    join (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd on ctudd.class_id = sla.class_id
where ctudd.class_date between ctd.use_begin_date  and  ctd.use_end_date 
    and concat(ctudd.class_date,' ',ctd.morning_begin_time) >= sla.begin_time
    and concat(ctudd.class_date,' ',ctd.morning_begin_time) <= sla.end_time
group by ctudd.class_date,ctudd.class_id;

计算下午的每天各个班级的请假人数

select
    ctudd.class_date,
    ctudd.class_id,
    count(distinct sla.student_id) as afternoon_leave_count
from (select * from itcast_ods.student_leave_apply_ods where audit_state = 1 and cancel_state = 0 and valid_state = 1) sla
    join itcast_dimen.class_time_dimen ctd on sla.class_id = ctd.class_id
    join (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd on ctudd.class_id = sla.class_id
where ctudd.class_date between ctd.use_begin_date  and  ctd.use_end_date 
    and concat(ctudd.class_date,' ',ctd.afternoon_begin_time) >= sla.begin_time
    and concat(ctudd.class_date,' ',ctd.afternoon_begin_time) <= sla.end_time
group by ctudd.class_date,ctudd.class_id;

计算晚自习的每天各个班级的请假人数

select
    ctudd.class_date,
    ctudd.class_id,
    count(distinct sla.student_id) as evening_leave_count
from (select * from itcast_ods.student_leave_apply_ods where audit_state = 1 and cancel_state = 0 and valid_state = 1) sla
    join itcast_dimen.class_time_dimen ctd on sla.class_id = ctd.class_id
    join (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd on ctudd.class_id = sla.class_id
where ctudd.class_date between ctd.use_begin_date  and  ctd.use_end_date 
    and concat(ctudd.class_date,' ',ctd.evening_begin_time) >= sla.begin_time
    and concat(ctudd.class_date,' ',ctd.evening_begin_time) <= sla.end_time
group by ctudd.class_date,ctudd.class_id;

目前需要将三个结果的数据合并在一起, 如何实现呢?

黑马在线教育数仓实战9-LMLPHP

黑马在线教育数仓实战9-LMLPHP

实现最终的查询的SQL:

with A as (select
    ctudd.class_date,
    ctudd.class_id,
    count(distinct sla.student_id) as morning_leave_count
from (select * from itcast_ods.student_leave_apply_ods where audit_state = 1 and cancel_state = 0 and valid_state = 1) sla
    join itcast_dimen.class_time_dimen ctd on sla.class_id = ctd.class_id
    join (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd on ctudd.class_id = sla.class_id
where ctudd.class_date between ctd.use_begin_date  and  ctd.use_end_date 
    and concat(ctudd.class_date,' ',ctd.morning_begin_time) >= sla.begin_time
    and concat(ctudd.class_date,' ',ctd.morning_begin_time) <= sla.end_time
group by ctudd.class_date,ctudd.class_id),
    B AS (select
    ctudd.class_date,
    ctudd.class_id,
    count(distinct sla.student_id) as afternoon_leave_count
from (select * from itcast_ods.student_leave_apply_ods where audit_state = 1 and cancel_state = 0 and valid_state = 1) sla
    join itcast_dimen.class_time_dimen ctd on sla.class_id = ctd.class_id
    join (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd on ctudd.class_id = sla.class_id
where ctudd.class_date between ctd.use_begin_date  and  ctd.use_end_date 
    and concat(ctudd.class_date,' ',ctd.afternoon_begin_time) >= sla.begin_time
    and concat(ctudd.class_date,' ',ctd.afternoon_begin_time) <= sla.end_time
group by ctudd.class_date,ctudd.class_id),
    C AS (select
    ctudd.class_date,
    ctudd.class_id,
    count(distinct sla.student_id) as evening_leave_count
from (select * from itcast_ods.student_leave_apply_ods where audit_state = 1 and cancel_state = 0 and valid_state = 1) sla
    join itcast_dimen.class_time_dimen ctd on sla.class_id = ctd.class_id
    join (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd on ctudd.class_id = sla.class_id
where ctudd.class_date between ctd.use_begin_date  and  ctd.use_end_date 
    and concat(ctudd.class_date,' ',ctd.evening_begin_time) >= sla.begin_time
    and concat(ctudd.class_date,' ',ctd.evening_begin_time) <= sla.end_time
group by ctudd.class_date,ctudd.class_id),
temp as (select  
    coalesce(A.class_date,B.class_date,C.class_date) AS class_date,
    coalesce(A.class_id,B.class_id,C.class_id) AS class_id,
    nvl(A.morning_leave_count,0) as morning_leave_count,
    nvl(B.afternoon_leave_count,0) as afternoon_leave_count,
    nvl(C.evening_leave_count,0) as evening_leave_count
from A 
    full join B on A.class_date = B.class_date and A.class_id = B.class_id
    full join C on A.class_date = C.class_date and A.class_id = C.class_id)
select 
    class_date,
    class_id,
    sum(morning_leave_count) as morning_leave_count,
    sum(afternoon_leave_count) as afternoon_leave_count,
    sum(evening_leave_count) as evening_leave_count,
    substr(class_date,1,4) as yearinfo,
    substr(class_date,6,2) as monthinfo,
    substr(class_date,9,2) as dayinfo
from temp group by class_date,class_id;

将结果灌入到DWM层即可

set hive.auto.convert.join=false;

--分区
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=10000;
set hive.exec.max.dynamic.partitions=100000;
set hive.exec.max.created.files=150000;
--hive压缩
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
--写入时压缩生效
set hive.exec.orc.compression.strategy=COMPRESSION;
--分桶
--set hive.enforce.bucketing=true;
--set hive.enforce.sorting=true;
--set hive.optimize.bucketmapjoin = true;
--set hive.auto.convert.sortmerge.join=true;
--set hive.auto.convert.sortmerge.join.noconditionaltask=true;
--并行执行
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;
--小文件合并
-- set mapred.max.split.size=2147483648;
-- set mapred.min.split.size.per.node=1000000000;
-- set mapred.min.split.size.per.rack=1000000000;
--矢量化查询
set hive.vectorized.execution.enabled=true;
--关联优化器
set hive.optimize.correlation=true;
--读取零拷贝
set hive.exec.orc.zerocopy=true;
--join数据倾斜
set hive.optimize.skewjoin=false;
-- set hive.skewjoin.key=100000;
set hive.optimize.skewjoin.compiletime=false;
set hive.optimize.union.remove=false;
-- group倾斜
set hive.groupby.skewindata=false;


with A as (select
    ctudd.class_date,
    ctudd.class_id,
    count(distinct sla.student_id) as morning_leave_count
from (select * from itcast_ods.student_leave_apply_ods where audit_state = 1 and cancel_state = 0 and valid_state = 1) sla
    join itcast_dimen.class_time_dimen ctd on sla.class_id = ctd.class_id
    join (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd on ctudd.class_id = sla.class_id
where ctudd.class_date between ctd.use_begin_date  and  ctd.use_end_date 
    and concat(ctudd.class_date,' ',ctd.morning_begin_time) >= sla.begin_time
    and concat(ctudd.class_date,' ',ctd.morning_begin_time) <= sla.end_time
group by ctudd.class_date,ctudd.class_id),
    B AS (select
    ctudd.class_date,
    ctudd.class_id,
    count(distinct sla.student_id) as afternoon_leave_count
from (select * from itcast_ods.student_leave_apply_ods where audit_state = 1 and cancel_state = 0 and valid_state = 1) sla
    join itcast_dimen.class_time_dimen ctd on sla.class_id = ctd.class_id
    join (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd on ctudd.class_id = sla.class_id
where ctudd.class_date between ctd.use_begin_date  and  ctd.use_end_date 
    and concat(ctudd.class_date,' ',ctd.afternoon_begin_time) >= sla.begin_time
    and concat(ctudd.class_date,' ',ctd.afternoon_begin_time) <= sla.end_time
group by ctudd.class_date,ctudd.class_id),
    C AS (select
    ctudd.class_date,
    ctudd.class_id,
    count(distinct sla.student_id) as evening_leave_count
from (select * from itcast_ods.student_leave_apply_ods where audit_state = 1 and cancel_state = 0 and valid_state = 1) sla
    join itcast_dimen.class_time_dimen ctd on sla.class_id = ctd.class_id
    join (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd on ctudd.class_id = sla.class_id
where ctudd.class_date between ctd.use_begin_date  and  ctd.use_end_date 
    and concat(ctudd.class_date,' ',ctd.evening_begin_time) >= sla.begin_time
    and concat(ctudd.class_date,' ',ctd.evening_begin_time) <= sla.end_time
group by ctudd.class_date,ctudd.class_id),
temp as (select  
    coalesce(A.class_date,B.class_date,C.class_date) AS class_date,
    coalesce(A.class_id,B.class_id,C.class_id) AS class_id,
    nvl(A.morning_leave_count,0) as morning_leave_count,
    nvl(B.afternoon_leave_count,0) as afternoon_leave_count,
    nvl(C.evening_leave_count,0) as evening_leave_count
from A 
    full join B on A.class_date = B.class_date and A.class_id = B.class_id
    full join C on A.class_date = C.class_date and A.class_id = C.class_id)
insert into table itcast_dwm.class_leave_dwm partition(yearinfo,monthinfo,dayinfo)
select 
    class_date,
    class_id,
    sum(morning_leave_count) as morning_leave_count,
    sum(afternoon_leave_count) as afternoon_leave_count,
    sum(evening_leave_count) as evening_leave_count,
    substr(class_date,1,4) as yearinfo,
    substr(class_date,6,2) as monthinfo,
    substr(class_date,9,2) as dayinfo
from temp group by class_date,class_id;
  • 旷课人数表统计

SQL最终实现

insert into table itcast_dwm.class_truant_dwm partition(yearinfo,monthinfo,dayinfo)
select 
    ctudd.class_date as dateinfo,
    ctudd.class_id,
    cssc.studying_student_count - nvl(cad.morning_att_count,0) - nvl(cld.morning_leave_count,0) as morning_truant_count,
    cssc.studying_student_count - nvl(cad.afternoon_att_count,0) - nvl(cld.afternoon_leave_count,0) as afternoon_truant_count,
    cssc.studying_student_count - nvl(cad.evening_att_count,0) - nvl(cld.evening_leave_count,0) as evening_truant_count,
    substr(ctudd.class_date,1,4) as yearinfo,
    substr(ctudd.class_date,6,2) as monthinfo,
    substr(ctudd.class_date,9,2) as dayinfo
from (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd 
    left join itcast_dimen.class_studying_student_count_dimen cssc on  ctudd.class_date = cssc.studying_date and ctudd.class_id = cssc.class_id
    left join itcast_dwm.class_attendance_dwm cad on ctudd.class_id = cad.class_id and ctudd.class_date = cad.dateinfo
    left join itcast_dwm.class_leave_dwm cld on ctudd.class_id = cld.class_id and ctudd.class_date = cld.dateinfo
where ctudd.class_date in('2019-09-03','2019-09-04','2019-09-05')
  • 汇总表
insert into table itcast_dwm.class_all_dwm partition(yearinfo,monthinfo,dayinfo)
select
    ctudd.class_date as dateinfo,
    ctudd.class_id,
    cssc.studying_student_count,

    cad.morning_att_count,
    concat(round(nvl(cad.morning_att_count,0) /  cssc.studying_student_count * 100,2),'%') as morning_att_ratio,
    cad.afternoon_att_count,
    concat(round(nvl(cad.afternoon_att_count,0) /  cssc.studying_student_count * 100,2),'%') as afternoon_att_ratio,
    cad.evening_att_count,
    concat(round(nvl(cad.evening_att_count,0) /  cssc.studying_student_count * 100,2),'%') as evening_att_ratio,

    cad.morning_late_count,
    concat(round(nvl(cad.morning_late_count,0) /  cssc.studying_student_count * 100,2),'%') as morning_late_ratio,
    cad.afternoon_late_count,
    concat(round(nvl(cad.afternoon_late_count,0) /  cssc.studying_student_count * 100,2),'%') as afternoon_late_ratio,
    cad.evening_late_count,
    concat(round(nvl(cad.evening_late_count,0) /  cssc.studying_student_count * 100,2),'%') as evening_late_ratio,

    cld.morning_leave_count,
    concat(round(nvl(cld.morning_leave_count,0) /  cssc.studying_student_count * 100,2),'%') as morning_leave_ratio,
    cld.afternoon_leave_count,
    concat(round(nvl(cld.afternoon_leave_count,0) /  cssc.studying_student_count * 100,2),'%') as afternoon_leave_ratio,
    cld.evening_leave_count,
    concat(round(nvl(cld.evening_leave_count,0) /  cssc.studying_student_count * 100,2),'%') as evening_leave_ratio,

    ctd.morning_truant_count,
    concat(round(nvl(ctd.morning_truant_count,0) /  cssc.studying_student_count * 100,2),'%') as morning_truant_ratio,
    ctd.afternoon_truant_count,
    concat(round(nvl(ctd.afternoon_truant_count,0) /  cssc.studying_student_count * 100,2),'%') as afternoon_truant_ratio,
    ctd.evening_truant_count,
    concat(round(nvl(ctd.evening_truant_count,0) /  cssc.studying_student_count * 100,2),'%') as evening_truant_ratio,

    substr(ctudd.class_date,1,4) as yearinfo,
    substr(ctudd.class_date,6,2) as monthinfo,
    substr(ctudd.class_date,9,2) as dayinfo


from (select * from itcast_dimen.course_table_upload_detail_dimen where nvl(content,'')!='' and content != '开班典礼') ctudd
    left join itcast_dimen.class_studying_student_count_dimen cssc on cssc.class_id = ctudd.class_id and cssc.studying_date = ctudd.class_date
    left join itcast_dwm.class_attendance_dwm cad on ctudd.class_id = cad.class_id and ctudd.class_date = cad.dateinfo
    left join itcast_dwm.class_leave_dwm cld on ctudd.class_id = cld.class_id and ctudd.class_date = cld.dateinfo
    left join itcast_dwm.class_truant_dwm ctd on ctudd.class_id = ctd.class_id and ctudd.class_date = ctd.dateinfo
where ctudd.class_date in ('2019-09-03','2019-09-04','2019-09-05');

2.7 数据的统计分析(DWS)

DWS: 细化维度统计操作

SQL实现

insert into table itcast_dws.class_attendance_dws partition(yearinfo,monthinfo,dayinfo)
select 
    concat(yearinfo,'-',monthinfo) as dateinfo,
    class_id,
    sum(studying_student_count) as studying_student_count,

    sum(morning_att_count) as  morning_att_count,
    concat(round(sum(morning_att_count) / sum(studying_student_count) *100,2),'%') as morning_att_ratio,
    sum(afternoon_att_count) as  afternoon_att_count,
    concat(round(sum(afternoon_att_count) / sum(studying_student_count) *100,2),'%') as afternoon_att_ratio,
    sum(evening_att_count) as evening_att_count,
    concat(round(sum(evening_att_count) / sum(studying_student_count) *100,2),'%') as evening_att_ratio,

    sum(morning_late_count) as  morning_late_count,
    concat(round(sum(morning_late_count) / sum(studying_student_count) *100,2),'%') as morning_late_ratio,
    sum(afternoon_late_count) as  afternoon_late_count,
    concat(round(sum(afternoon_late_count) / sum(studying_student_count) *100,2),'%') as afternoon_late_ratio,
    sum(evening_late_count) as evening_late_count,
    concat(round(sum(evening_late_count) / sum(studying_student_count) *100,2),'%') as evening_late_ratio,

    sum(morning_leave_count) as  morning_leave_count,
    concat(round(sum(morning_leave_count) / sum(studying_student_count) *100,2),'%') as morning_leave_ratio,
    sum(afternoon_leave_count) as  afternoon_leave_count,
    concat(round(sum(afternoon_leave_count) / sum(studying_student_count) *100,2),'%') as afternoon_leave_ratio,
    sum(evening_leave_count) as evening_leave_count,
    concat(round(sum(evening_leave_count) / sum(studying_student_count) *100,2),'%') as evening_leave_ratio,

    sum(morning_truant_count) as  morning_truant_count,
    concat(round(sum(morning_truant_count) / sum(studying_student_count) *100,2),'%') as morning_truant_ratio,
    sum(afternoon_truant_count) as  afternoon_truant_count,
    concat(round(sum(afternoon_truant_count) / sum(studying_student_count) *100,2),'%') as afternoon_truant_ratio,
    sum(evening_truant_count) as evening_truant_count,
    concat(round(sum(evening_truant_count) / sum(studying_student_count) *100,2),'%') as evening_truant_ratio,

    '4' as  time_type,
    yearinfo,
    monthinfo,
    '-1' as dayinfo

from  itcast_dwm.class_all_dwm
group by yearinfo,monthinfo,class_id;
05-05 14:42