基于 10 天间隔创建组
Create groups based on 10 day interval
我有以下数据表:
Animal Immunization_Date
Cat 1/18/2017
Cat 1/27/2017
Cat 5/7/2017
Cat 5/12/2017
Dog 1/1/2017
Dog 1/5/2017
Dog 1/7/2017
Dog 3/25/2017
Dog 4/18/2017
我正在尝试按动物以 10 天为间隔创建分组排名,期望得到如下结果。(规则:找到每只动物的第一个日期,将该日期之后 10 天内的所有日期分配到第 1 组;然后取该动物尚未分配到第 1 组的下一个日期,将其分配到第 2 组,并把该日期之后 10 天内的所有日期也分配到第 2 组,依此类推……)
Animal Immunization_Date 10_Day_Group_Rank
Cat 1/18/2017 1
Cat 1/27/2017 1
Cat 5/7/2017 2
Cat 5/12/2017 2
Dog 1/1/2017 1
Dog 1/5/2017 1
Dog 1/7/2017 1
Dog 3/25/2017 2
Dog 4/18/2017 3
我一直在尝试以下代码,但始终无法让 10 天分组正常工作。
-- Asker's non-working attempt, kept verbatim; the review notes explain why it cannot produce the groups.
-- Outer query: running SUM of the inner 0/1 flag per animal, intended to serve as the group number.
Select
dt.Animal,
dt.Immunization_Date,
-- NOTE(review): "partition dt.Animal" is missing the BY keyword (the inner query writes "partition by").
-- NOTE(review): the name 10_day_Group starts with a digit; presumably it must be double-quoted to parse -- confirm for the target engine.
sum(dt.10_day_Group) over(partition dt.Animal order by dt.Immunization_Date rows unbounded preceding) as 10_day_Group -- running total of the flag, meant to double as the group number
from
(
-- Inner query: one row per immunization with a 0/1 flag that is supposed to mark 10-day intervals.
Select
Animal,
Immunization_Date,
-- NOTE(review): min(Immunization_Date) over (...) yields a date, yet it is compared with the integer 10;
-- no difference between two dates is computed here, so the flag does not measure a 10-day gap.
case when min(Immunization_Date) over (partition by Animal order by Immunization_Date) <=10 then 1 else 0 end as 10_Day_Group -- intended: flag 10-day intervals
from Table_A
) as dt
我不太确定如何将 10 天分组。
case when min(Immunization_Date) over (partition by Animal order by Immunization_Date) <=10 then 1 else 0 end as 10_Day_Group
我可以在 Excel 中用下面的方法做到这一点。我知道 Excel 和 SQL 并不相同,但希望展示 Excel 中的做法能有助于判断 SQL 中是否有类似的实现方式。
Excel 数据表如下所示(表从单元格 A1 开始)。(注意:必须先按 Animal 排序,再按 Immunization_Date 排序,Excel 公式才能正常工作)
Animal Immunization_Date Dummy_1 10_Day_Group
Cat 1/18/2017 1/18/2017 1
Cat 1/27/2017 1/18/2017 1
Cat 5/7/2017 5/7/2017 2
Cat 5/12/2017 5/7/2017 2
Dog 1/1/2017 1/1/2017 1
Dog 1/5/2017 1/1/2017 1
Dog 1/7/2017 1/1/2017 1
Dog 3/25/2017 3/25/2017 2
Dog 4/18/2017 4/18/2017 3
Dummy_1 的公式如下:
IFERROR(IF(AND(A2=A1,B2-C1<=10),C1,B2),B2)
10_Day_Group 的公式如下:
IFERROR(IF(AND(C2=C1,A2=A1),D1,IF(AND(A2=A1,C2<>C1),D1+1,1)),1)
我们可以利用 Teradata 的 PERIOD 数据类型及其相关函数来帮助解决这个问题,而不会变得太复杂。
这很接近。不准确,但接近:
-- Approximate 10-day grouping using Teradata PERIOD values.
-- Each immunization becomes a 10-day period ending on its date; periods of the
-- same animal are combined with NORMALIZE, and every immunization row is then
-- ranked by the combined period it intersects.
-- Fix applied: the cal_buckets CTE originally read "calendar_dateFROM"
-- (column name fused with the FROM keyword), which cannot parse.
WITH ta_period AS
(
    -- One row per immunization: its 10-day look-back period and a per-animal
    -- sequence number (used below to keep a row from matching itself).
    SELECT
        PERIOD(immunization_date - INTERVAL '10' DAY, immunization_date) AS periodbucket,
        ROW_NUMBER() OVER (PARTITION BY animal ORDER BY immunization_date) AS animal_row,
        table_a.animal,
        table_a.immunization_date
    FROM table_a
)
,cal_buckets AS
(
    -- Calendar dates between the earliest and latest immunization date.
    SELECT calendar_date
    FROM Sys_Calendar."CALENDAR" cal
    WHERE calendar_date >= (SELECT MIN(immunization_date) FROM table_a)
    AND calendar_date <= (SELECT MAX(immunization_date) FROM table_a)
)
SELECT
    TA.animal,
    TA.immunization_date,
    -- NOTE(review): cal_buckets exposes only calendar_date; no "bucket" column
    -- is defined anywhere in this statement. Its intended definition is not
    -- shown, so it is left as written -- supply it in cal_buckets before running.
    cal_buckets.bucket,
    DENSE_RANK() OVER (PARTITION BY ta.animal ORDER BY ta_normal.periodbucket, cal_buckets.bucket ) AS ten_day_bucket
FROM
(
    -- Per-animal periods after NORMALIZE.
    -- NOTE(review): the P_INTERSECT below intersects ta_period.periodbucket
    -- with itself; ta_period2 is only used in the join condition.
    SELECT NORMALIZE
        ta_period.animal,
        ta_period.periodbucket P_INTERSECT ta_period.periodbucket AS periodbucket
    FROM ta_period LEFT OUTER JOIN ta_period ta_period2
        ON ta_period.periodbucket CONTAINS ta_period2.immunization_date
        AND ta_period.animal = ta_period2.animal
        AND ta_period.animal_row <> ta_period2.animal_row
) ta_normal
INNER JOIN ta_period ta ON
    ta_normal.animal = ta.animal
    -- keep only the combined period that overlaps this row's own period
    AND ta_normal.periodbucket P_INTERSECT ta.periodbucket IS NOT NULL
INNER JOIN cal_buckets ON
    ta.immunization_date = cal_buckets.calendar_date;
近似值...
-- Approximate grouping in three steps, written as a CTE pipeline:
--   1. lagged_dates: attach each animal's previous immunization date.
--   2. base_dated:   carry forward the latest date that did NOT fall within
--                    10 days of its predecessor (the first row per animal
--                    qualifies because its predecessor is NULL).
--   3. final SELECT: rank rows per animal by base date, then by how many whole
--                    10-day steps they lie past that base date.
WITH lagged_dates AS
(
    SELECT
        animal,
        immunization_date,
        LAG(immunization_date) OVER (
            PARTITION BY animal
            ORDER BY immunization_date
        ) AS lagged_immunization_date
    FROM yourData
),
base_dated AS
(
    SELECT
        animal,
        immunization_date,
        -- running maximum of the candidate base dates seen so far
        MAX(
            CASE
                WHEN immunization_date < lagged_immunization_date + 10 THEN NULL
                ELSE immunization_date
            END
        ) OVER (
            PARTITION BY animal
            ORDER BY immunization_date
            ROWS UNBOUNDED PRECEDING
        ) AS base_date
    FROM lagged_dates
)
SELECT
    animal,
    immunization_date,
    DENSE_RANK() OVER (
        PARTITION BY animal
        ORDER BY
            base_date,
            CAST(immunization_date - base_date AS INT) / 10
    ) AS group_id
FROM base_dated
SQLFiddle 不支持 Teradata,但上面的代码应该在 Teradata 和 SQL Server 中都能运行... http://sqlfiddle.com/#!18/68260/1
@MatBailie 的递归答案很好,但是当每只动物的行数增加时性能会变差。
如果能把第一个 CTE 物化到 Volatile Table 中,资源消耗会降低(因为 Teradata 的优化器不会自动物化这个中间结果,真可惜):
-- Materializes, for every immunization row, the earliest immunization date of
-- the same animal that lies at least 10 days later (NULL when there is none).
CREATE VOLATILE TABLE boundaries AS
(
    SELECT
        i.*,  -- the star has to carry the table alias here
        (
            -- correlated lookup of the next date outside this row's 10-day range
            SELECT MIN(nxt.immunization_date)
            FROM immunizations nxt
            WHERE nxt.animal = i.animal
              AND nxt.immunization_date >= i.immunization_date + 10
        ) AS next_boundary_date
    FROM immunizations i
)
WITH DATA
UNIQUE PRIMARY INDEX (animal, immunization_date)
ON COMMIT PRESERVE ROWS;
但是当你可以使用临时表时,你也可以使用简单的递归:
-- Staging table for the recursive query: every immunization row plus its
-- per-animal sequence number in date order.
CREATE VOLATILE TABLE vt AS
(
    SELECT
        src.animal,
        src.immunization_date,
        -- sequence number lets the recursion step to "the next row" via rn + 1
        ROW_NUMBER() OVER (
            PARTITION BY src.animal
            ORDER BY src.immunization_date
        ) AS rn
    FROM immunizations AS src
)
WITH DATA
UNIQUE PRIMARY INDEX (animal, rn)
ON COMMIT PRESERVE ROWS;
-- Walks each animal's rows in date order (via vt.rn) and assigns group numbers:
-- a row stays in the current group while its date is before the group's
-- end_date (group start + 10); otherwise it opens the next group.
-- NOTE(review): the comparison is strict (<), so a date exactly 10 days after
-- the group start opens a new group, whereas the Excel formula in the question
-- uses B2-C1<=10 -- confirm which boundary is wanted.
WITH RECURSIVE cte AS
(
    -- Seed: the oldest row of every animal starts group 1.
    SELECT
        animal,
        immunization_date,
        rn,
        immunization_date + 10 AS end_date,  -- end of the current 10-day range
        -- Explicit INTEGER: an untyped literal 1 takes the narrowest integer
        -- type, which would cap the recursion at 127 groups per animal.
        CAST(1 AS INTEGER) AS grp
    FROM vt
    WHERE rn = 1
    UNION ALL
    -- Step: take the next row (rn + 1) of the same animal.
    SELECT
        vt.animal,
        vt.immunization_date,
        vt.rn,
        -- still inside the current range: keep its end; otherwise start a new range
        CASE
            WHEN vt.immunization_date < cte.end_date THEN cte.end_date
            ELSE vt.immunization_date + 10
        END,
        -- still inside the current range: keep the group; otherwise move to the next one
        CASE
            WHEN vt.immunization_date < cte.end_date THEN cte.grp
            ELSE cte.grp + 1
        END
    FROM cte
    INNER JOIN vt
        ON vt.animal = cte.animal
        AND vt.rn = cte.rn + 1
)
SELECT
    animal,
    immunization_date,
    rn,
    end_date,
    grp
FROM cte
ORDER BY animal, immunization_date;
我有以下数据表:
Animal Immunization_Date
Cat 1/18/2017
Cat 1/27/2017
Cat 5/7/2017
Cat 5/12/2017
Dog 1/1/2017
Dog 1/5/2017
Dog 1/7/2017
Dog 3/25/2017
Dog 4/18/2017
我正在尝试按动物以 10 天为间隔创建分组排名,期望得到如下结果。(规则:找到每只动物的第一个日期,将该日期之后 10 天内的所有日期分配到第 1 组;然后取该动物尚未分配到第 1 组的下一个日期,将其分配到第 2 组,并把该日期之后 10 天内的所有日期也分配到第 2 组,依此类推……)
Animal Immunization_Date 10_Day_Group_Rank
Cat 1/18/2017 1
Cat 1/27/2017 1
Cat 5/7/2017 2
Cat 5/12/2017 2
Dog 1/1/2017 1
Dog 1/5/2017 1
Dog 1/7/2017 1
Dog 3/25/2017 2
Dog 4/18/2017 3
我一直在尝试以下代码,但始终无法让 10 天分组正常工作。
-- Asker's non-working attempt, kept verbatim; the review notes explain why it cannot produce the groups.
-- Outer query: running SUM of the inner 0/1 flag per animal, intended to serve as the group number.
Select
dt.Animal,
dt.Immunization_Date,
-- NOTE(review): "partition dt.Animal" is missing the BY keyword (the inner query writes "partition by").
-- NOTE(review): the name 10_day_Group starts with a digit; presumably it must be double-quoted to parse -- confirm for the target engine.
sum(dt.10_day_Group) over(partition dt.Animal order by dt.Immunization_Date rows unbounded preceding) as 10_day_Group -- running total of the flag, meant to double as the group number
from
(
-- Inner query: one row per immunization with a 0/1 flag that is supposed to mark 10-day intervals.
Select
Animal,
Immunization_Date,
-- NOTE(review): min(Immunization_Date) over (...) yields a date, yet it is compared with the integer 10;
-- no difference between two dates is computed here, so the flag does not measure a 10-day gap.
case when min(Immunization_Date) over (partition by Animal order by Immunization_Date) <=10 then 1 else 0 end as 10_Day_Group -- intended: flag 10-day intervals
from Table_A
) as dt
我不太确定如何将 10 天分组。
case when min(Immunization_Date) over (partition by Animal order by Immunization_Date) <=10 then 1 else 0 end as 10_Day_Group
我可以在 Excel 中用下面的方法做到这一点。我知道 Excel 和 SQL 并不相同,但希望展示 Excel 中的做法能有助于判断 SQL 中是否有类似的实现方式。
Excel 数据表如下所示(表从单元格 A1 开始)。(注意:必须先按 Animal 排序,再按 Immunization_Date 排序,Excel 公式才能正常工作)
Animal Immunization_Date Dummy_1 10_Day_Group
Cat 1/18/2017 1/18/2017 1
Cat 1/27/2017 1/18/2017 1
Cat 5/7/2017 5/7/2017 2
Cat 5/12/2017 5/7/2017 2
Dog 1/1/2017 1/1/2017 1
Dog 1/5/2017 1/1/2017 1
Dog 1/7/2017 1/1/2017 1
Dog 3/25/2017 3/25/2017 2
Dog 4/18/2017 4/18/2017 3
Dummy_1 的公式如下:
IFERROR(IF(AND(A2=A1,B2-C1<=10),C1,B2),B2)
10_Day_Group 的公式如下:
IFERROR(IF(AND(C2=C1,A2=A1),D1,IF(AND(A2=A1,C2<>C1),D1+1,1)),1)
我们可以利用 Teradata 的 PERIOD 数据类型及其相关函数来帮助解决这个问题,而不会变得太复杂。
这很接近。不准确,但接近:
-- Approximate 10-day grouping using Teradata PERIOD values.
-- Each immunization becomes a 10-day period ending on its date; periods of the
-- same animal are combined with NORMALIZE, and every immunization row is then
-- ranked by the combined period it intersects.
-- Fix applied: the cal_buckets CTE originally read "calendar_dateFROM"
-- (column name fused with the FROM keyword), which cannot parse.
WITH ta_period AS
(
    -- One row per immunization: its 10-day look-back period and a per-animal
    -- sequence number (used below to keep a row from matching itself).
    SELECT
        PERIOD(immunization_date - INTERVAL '10' DAY, immunization_date) AS periodbucket,
        ROW_NUMBER() OVER (PARTITION BY animal ORDER BY immunization_date) AS animal_row,
        table_a.animal,
        table_a.immunization_date
    FROM table_a
)
,cal_buckets AS
(
    -- Calendar dates between the earliest and latest immunization date.
    SELECT calendar_date
    FROM Sys_Calendar."CALENDAR" cal
    WHERE calendar_date >= (SELECT MIN(immunization_date) FROM table_a)
    AND calendar_date <= (SELECT MAX(immunization_date) FROM table_a)
)
SELECT
    TA.animal,
    TA.immunization_date,
    -- NOTE(review): cal_buckets exposes only calendar_date; no "bucket" column
    -- is defined anywhere in this statement. Its intended definition is not
    -- shown, so it is left as written -- supply it in cal_buckets before running.
    cal_buckets.bucket,
    DENSE_RANK() OVER (PARTITION BY ta.animal ORDER BY ta_normal.periodbucket, cal_buckets.bucket ) AS ten_day_bucket
FROM
(
    -- Per-animal periods after NORMALIZE.
    -- NOTE(review): the P_INTERSECT below intersects ta_period.periodbucket
    -- with itself; ta_period2 is only used in the join condition.
    SELECT NORMALIZE
        ta_period.animal,
        ta_period.periodbucket P_INTERSECT ta_period.periodbucket AS periodbucket
    FROM ta_period LEFT OUTER JOIN ta_period ta_period2
        ON ta_period.periodbucket CONTAINS ta_period2.immunization_date
        AND ta_period.animal = ta_period2.animal
        AND ta_period.animal_row <> ta_period2.animal_row
) ta_normal
INNER JOIN ta_period ta ON
    ta_normal.animal = ta.animal
    -- keep only the combined period that overlaps this row's own period
    AND ta_normal.periodbucket P_INTERSECT ta.periodbucket IS NOT NULL
INNER JOIN cal_buckets ON
    ta.immunization_date = cal_buckets.calendar_date;
近似值...
-- Approximate grouping in three steps, written as a CTE pipeline:
--   1. lagged_dates: attach each animal's previous immunization date.
--   2. base_dated:   carry forward the latest date that did NOT fall within
--                    10 days of its predecessor (the first row per animal
--                    qualifies because its predecessor is NULL).
--   3. final SELECT: rank rows per animal by base date, then by how many whole
--                    10-day steps they lie past that base date.
WITH lagged_dates AS
(
    SELECT
        animal,
        immunization_date,
        LAG(immunization_date) OVER (
            PARTITION BY animal
            ORDER BY immunization_date
        ) AS lagged_immunization_date
    FROM yourData
),
base_dated AS
(
    SELECT
        animal,
        immunization_date,
        -- running maximum of the candidate base dates seen so far
        MAX(
            CASE
                WHEN immunization_date < lagged_immunization_date + 10 THEN NULL
                ELSE immunization_date
            END
        ) OVER (
            PARTITION BY animal
            ORDER BY immunization_date
            ROWS UNBOUNDED PRECEDING
        ) AS base_date
    FROM lagged_dates
)
SELECT
    animal,
    immunization_date,
    DENSE_RANK() OVER (
        PARTITION BY animal
        ORDER BY
            base_date,
            CAST(immunization_date - base_date AS INT) / 10
    ) AS group_id
FROM base_dated
SQLFiddle 不支持 Teradata,但上面的代码应该在 Teradata 和 SQL Server 中都能运行... http://sqlfiddle.com/#!18/68260/1
@MatBailie 的递归答案很好,但是当每只动物的行数增加时性能会变差。
如果能把第一个 CTE 物化到 Volatile Table 中,资源消耗会降低(因为 Teradata 的优化器不会自动物化这个中间结果,真可惜):
-- Materializes, for every immunization row, the earliest immunization date of
-- the same animal that lies at least 10 days later (NULL when there is none).
CREATE VOLATILE TABLE boundaries AS
(
    SELECT
        i.*,  -- the star has to carry the table alias here
        (
            -- correlated lookup of the next date outside this row's 10-day range
            SELECT MIN(nxt.immunization_date)
            FROM immunizations nxt
            WHERE nxt.animal = i.animal
              AND nxt.immunization_date >= i.immunization_date + 10
        ) AS next_boundary_date
    FROM immunizations i
)
WITH DATA
UNIQUE PRIMARY INDEX (animal, immunization_date)
ON COMMIT PRESERVE ROWS;
但是当你可以使用临时表时,你也可以使用简单的递归:
-- Staging table for the recursive query: every immunization row plus its
-- per-animal sequence number in date order.
CREATE VOLATILE TABLE vt AS
(
    SELECT
        src.animal,
        src.immunization_date,
        -- sequence number lets the recursion step to "the next row" via rn + 1
        ROW_NUMBER() OVER (
            PARTITION BY src.animal
            ORDER BY src.immunization_date
        ) AS rn
    FROM immunizations AS src
)
WITH DATA
UNIQUE PRIMARY INDEX (animal, rn)
ON COMMIT PRESERVE ROWS;
-- Walks each animal's rows in date order (via vt.rn) and assigns group numbers:
-- a row stays in the current group while its date is before the group's
-- end_date (group start + 10); otherwise it opens the next group.
-- NOTE(review): the comparison is strict (<), so a date exactly 10 days after
-- the group start opens a new group, whereas the Excel formula in the question
-- uses B2-C1<=10 -- confirm which boundary is wanted.
WITH RECURSIVE cte AS
(
    -- Seed: the oldest row of every animal starts group 1.
    SELECT
        animal,
        immunization_date,
        rn,
        immunization_date + 10 AS end_date,  -- end of the current 10-day range
        -- Explicit INTEGER: an untyped literal 1 takes the narrowest integer
        -- type, which would cap the recursion at 127 groups per animal.
        CAST(1 AS INTEGER) AS grp
    FROM vt
    WHERE rn = 1
    UNION ALL
    -- Step: take the next row (rn + 1) of the same animal.
    SELECT
        vt.animal,
        vt.immunization_date,
        vt.rn,
        -- still inside the current range: keep its end; otherwise start a new range
        CASE
            WHEN vt.immunization_date < cte.end_date THEN cte.end_date
            ELSE vt.immunization_date + 10
        END,
        -- still inside the current range: keep the group; otherwise move to the next one
        CASE
            WHEN vt.immunization_date < cte.end_date THEN cte.grp
            ELSE cte.grp + 1
        END
    FROM cte
    INNER JOIN vt
        ON vt.animal = cte.animal
        AND vt.rn = cte.rn + 1
)
SELECT
    animal,
    immunization_date,
    rn,
    end_date,
    grp
FROM cte
ORDER BY animal, immunization_date;