Hive:使用平均函数和最频函数分组
Hive: group by using average function and most frequency function
我有一个这样的table结构
|---------------------|----------|-----------|
| col_1 | col_2 | col_3 |
|---------------------|----------|-----------|
| 2018-01-15 17:56 | A | 3 |
|---------------------|----------|-----------|
| 2018-01-15 17:56 | A | 2 |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | B | True |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | B | False |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | A | 3 |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | B | True |
|---------------------|----------|-----------|
我想按 col_1 分组:当 col_2 为 A 时,取 col_3 的平均值;当 col_2 为 B 时,取 col_3 中出现次数最多的值。期望的结果是
|---------------------|----------|-----------|
| col_1 | A | B |
|---------------------|----------|-----------|
| 2018-01-15 17:56 | 2.5 | Null |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | 3 | True |
|---------------------|----------|-----------|
如果不考虑 col_2 为 B 时的最频繁值,我知道可以这样写:
-- Average of col_3 over the 'A' rows of each col_1 group. Rows with any other
-- col_2 value produce NULL from the CASE, which avg() ignores.
select
    col_1,
    avg(case col_2 when 'A' then col_3 end) as A
from my_table
group by col_1
当 col_2 为 B 时,如何加上取最频繁值的逻辑?
使用分析函数(窗口函数),见代码注释:
-- Per col_1: average of col_3 over the 'A' rows (column A) and the most
-- frequent col_3 value among the 'B' rows (column B), via window functions.
with my_table as (
    -- inline sample data: six (col_1, col_2, col_3) rows, every value a string literal
    select stack(6,
        '2018-01-15 17:56', 'A', '3',
        '2018-01-15 17:56', 'A', '2',
        '2018-10-23 23:43', 'B', 'True',
        '2018-10-23 23:43', 'B', 'False',
        '2018-10-23 23:43', 'A', '3',
        '2018-10-23 23:43', 'B', 'True') as (col_1, col_2, col_3)
),
counted as (
    -- step 1: for 'B' rows, how often each col_3 value occurs within its
    -- (col_1, col_2) group; NULL for every other row
    select
        col_1,
        col_2,
        col_3,
        case when col_2 = 'B'
             then count(*) over (partition by col_1, col_2, col_3)
        end as value_cnt
    from my_table
),
scored as (
    -- step 2: attach the group average to 'A' rows and the highest-count
    -- col_3 value to 'B' rows
    select
        col_1,
        case when col_2 = 'A'
             then avg(col_3) over (partition by col_1, col_2)
        end as avg_a,
        case when col_2 = 'B'
             then first_value(col_3) over (partition by col_1, col_2 order by value_cnt desc)
        end as top_b
    from counted
)
-- step 3: collapse to one row per col_1; max() picks up the single non-NULL
-- value computed for each of the two columns
select
    col_1,
    max(avg_a) as A,
    max(top_b) as B
from scored
group by col_1
;
结果:
col_1 a b
2018-01-15 17:56 2.5 NULL
2018-10-23 23:43 3.0 True
您可以使用两个聚合级别:
-- Two aggregation levels.
-- Inner query: one row per (col1, col2, col3) with its occurrence count, ranked
-- from most to least frequent inside each (col1, col2) group.
-- Outer query: per col1, the count-weighted average of col3 over the 'A' rows
-- (column A) and the top-ranked col3 value among the 'B' rows (column B).
select col1,
       (sum(case when col2 = 'A' then col3 * cnt end) /
        sum(case when col2 = 'A' then cnt end)
       ) as A,
       max(case when col2 = 'B' and seqnum = 1 then col3 end) as B
from (select col1, col2, col3, count(*) as cnt,
             -- Rank within (col1, col2), not just col1: otherwise an 'A' value with
             -- an equal or higher count can take seqnum = 1 and B would come back NULL
             -- even though 'B' rows exist. col3 is a tie-breaker so equal counts
             -- resolve deterministically.
             row_number() over (partition by col1, col2 order by count(*) desc, col3) as seqnum
      from t
      group by col1, col2, col3
     ) t
group by col1;
我有一个这样的table结构
|---------------------|----------|-----------|
| col_1 | col_2 | col_3 |
|---------------------|----------|-----------|
| 2018-01-15 17:56 | A | 3 |
|---------------------|----------|-----------|
| 2018-01-15 17:56 | A | 2 |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | B | True |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | B | False |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | A | 3 |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | B | True |
|---------------------|----------|-----------|
我想按 col_1 分组:当 col_2 为 A 时,取 col_3 的平均值;当 col_2 为 B 时,取 col_3 中出现次数最多的值。期望的结果是
|---------------------|----------|-----------|
| col_1 | A | B |
|---------------------|----------|-----------|
| 2018-01-15 17:56 | 2.5 | Null |
|---------------------|----------|-----------|
| 2018-10-23 23:43 | 3 | True |
|---------------------|----------|-----------|
如果不考虑 col_2 为 B 时的最频繁值,我知道可以这样写:
-- Average of col_3 over the 'A' rows of each col_1 group. Rows with any other
-- col_2 value produce NULL from the CASE, which avg() ignores.
select
    col_1,
    avg(case col_2 when 'A' then col_3 end) as A
from my_table
group by col_1
当 col_2 为 B 时,如何加上取最频繁值的逻辑?
使用分析函数(窗口函数),见代码注释:
-- Per col_1: average of col_3 over the 'A' rows (column A) and the most
-- frequent col_3 value among the 'B' rows (column B), via window functions.
with my_table as (
    -- inline sample data: six (col_1, col_2, col_3) rows, every value a string literal
    select stack(6,
        '2018-01-15 17:56', 'A', '3',
        '2018-01-15 17:56', 'A', '2',
        '2018-10-23 23:43', 'B', 'True',
        '2018-10-23 23:43', 'B', 'False',
        '2018-10-23 23:43', 'A', '3',
        '2018-10-23 23:43', 'B', 'True') as (col_1, col_2, col_3)
),
counted as (
    -- step 1: for 'B' rows, how often each col_3 value occurs within its
    -- (col_1, col_2) group; NULL for every other row
    select
        col_1,
        col_2,
        col_3,
        case when col_2 = 'B'
             then count(*) over (partition by col_1, col_2, col_3)
        end as value_cnt
    from my_table
),
scored as (
    -- step 2: attach the group average to 'A' rows and the highest-count
    -- col_3 value to 'B' rows
    select
        col_1,
        case when col_2 = 'A'
             then avg(col_3) over (partition by col_1, col_2)
        end as avg_a,
        case when col_2 = 'B'
             then first_value(col_3) over (partition by col_1, col_2 order by value_cnt desc)
        end as top_b
    from counted
)
-- step 3: collapse to one row per col_1; max() picks up the single non-NULL
-- value computed for each of the two columns
select
    col_1,
    max(avg_a) as A,
    max(top_b) as B
from scored
group by col_1
;
结果:
col_1 a b
2018-01-15 17:56 2.5 NULL
2018-10-23 23:43 3.0 True
您可以使用两个聚合级别:
-- Two aggregation levels.
-- Inner query: one row per (col1, col2, col3) with its occurrence count, ranked
-- from most to least frequent inside each (col1, col2) group.
-- Outer query: per col1, the count-weighted average of col3 over the 'A' rows
-- (column A) and the top-ranked col3 value among the 'B' rows (column B).
select col1,
       (sum(case when col2 = 'A' then col3 * cnt end) /
        sum(case when col2 = 'A' then cnt end)
       ) as A,
       max(case when col2 = 'B' and seqnum = 1 then col3 end) as B
from (select col1, col2, col3, count(*) as cnt,
             -- Rank within (col1, col2), not just col1: otherwise an 'A' value with
             -- an equal or higher count can take seqnum = 1 and B would come back NULL
             -- even though 'B' rows exist. col3 is a tie-breaker so equal counts
             -- resolve deterministically.
             row_number() over (partition by col1, col2 order by count(*) desc, col3) as seqnum
      from t
      group by col1, col2, col3
     ) t
group by col1;