基于 10 天间隔创建组

Question

我有如下数据表：

Animal  Immunization_Date
Cat     1/18/2017
Cat     1/27/2017
Cat     5/7/2017
Cat     5/12/2017
Dog     1/1/2017
Dog     1/5/2017
Dog     1/7/2017
Dog     3/25/2017
Dog     4/18/2017

我正在尝试按动物、以 10 天为间隔创建分组排名，期望得到如下结果。（先找到该动物的第一个日期，把该日期之后 10 天内的所有日期都归入第 1 组；然后在该动物尚未分组的日期中取最早的一个，归入第 2 组，并把其后 10 天内的所有日期也归入第 2 组，依此类推……）

Animal  Immunization_Date   10_Day_Group_Rank
Cat     1/18/2017           1
Cat     1/27/2017           1
Cat     5/7/2017            2
Cat     5/12/2017           2
Dog     1/1/2017            1
Dog     1/5/2017            1
Dog     1/7/2017            1
Dog     3/25/2017           2
Dog     4/18/2017           3

我一直在尝试以下代码，但我似乎无法让 10 天小组工作。

-- Asker's attempt (stated in the question not to work): flag rows in a derived
-- table, then take a per-animal running sum of that flag as the group number.
-- NOTE(review): the outer window reads "partition dt.Animal" without BY, and the
-- inner CASE compares the windowed MIN(Immunization_Date) itself to the literal 10
-- instead of a day difference -- presumably why the grouping fails; confirm.
Select
dt.Animal,
dt.Immunization_Date,
sum(dt.10_day_Group) over(partition dt.Animal order by dt.Immunization_Date rows unbounded preceding) as 10_day_Group --creates a running total that is also the group
from
(
Select
Animal,
Immunization_Date,
case when min(Immunization_Date) over (partition by Animal order by Immunization_Date) <=10 then 1 else 0 end as 10_Day_Group --Create intervals of 10 days
from Table_A
) as dt

我不太确定如何将 10 天分组。

-- Expression fragment repeated from the derived table of the query above: the 0/1 flag column.
case when min(Immunization_Date) over (partition by Animal order by Immunization_Date) <=10 then 1 else 0 end as 10_Day_Group

我可以在 Excel 中用下面的方法做到这一点。我知道 Excel 和 SQL 并不相同，但希望通过展示 Excel 中的做法，能让大家看出在 SQL 中是否有类似的实现方式。

Excel 数据表如下所示（表从单元格 A1 开始）。（注意：必须先按 Animal 排序，再按 Immunization_Date 排序，Excel 公式才能正常工作）

Animal  Immunization_Date   Dummy_1 10_Day_Group
Cat     1/18/2017       1/18/2017       1
Cat     1/27/2017       1/18/2017       1
Cat     5/7/2017        5/7/2017        2
Cat     5/12/2017       5/7/2017        2
Dog     1/1/2017        1/1/2017        1
Dog     1/5/2017        1/1/2017        1
Dog     1/7/2017        1/1/2017        1
Dog     3/25/2017       3/25/2017       2
Dog     4/18/2017       4/18/2017       3

Dummy_1的公式如下 IFERROR(IF(AND(A2=A1,B2-C1<=10),C1,B2),B2)

10_Day_Group的公式如下 IFERROR(IF(AND(C2=C1,A2=A1),D1,IF(AND(A2=A1,C2<>C1),D1+1,1)),1)

Answer 1

我们可以利用 Teradata 的 PERIOD 数据类型及其相关函数来帮助解决这个问题，而不会变得太复杂。

这很接近。不准确，但接近：

-- Approximate 10-day grouping built on Teradata PERIOD values.
-- ta_period   : one row per immunization with a PERIOD running from 10 days
--               before the immunization date to the immunization date.
-- cal_buckets : calendar dates between the earliest and latest immunization.
-- The final SELECT dense-ranks each animal's rows by the normalized period
-- they fall into (and by cal_buckets.bucket).
WITH ta_period AS
(
        SELECT
            PERIOD(immunization_date - INTERVAL '10' DAY, immunization_date) AS periodbucket,
            ROW_NUMBER() OVER (PARTITION BY animal ORDER BY immunization_date) AS animal_row,
            table_a.animal,
            table_a.immunization_date
        FROM table_a
)
,cal_buckets AS
(
    -- NOTE(review): the final SELECT references cal_buckets.bucket, but this
    -- CTE produces only calendar_date. The intended definition of bucket is
    -- not visible here and must be supplied before the query can run.
    SELECT calendar_date
    FROM Sys_Calendar."CALENDAR" cal
    WHERE calendar_date >= (SELECT MIN(immunization_date) FROM table_a)
        AND calendar_date <= (SELECT MAX(immunization_date) FROM table_a)
)
SELECT
    ta.animal,
    ta.immunization_date,
    cal_buckets.bucket,
    DENSE_RANK() OVER (PARTITION BY ta.animal ORDER BY ta_normal.periodbucket, cal_buckets.bucket ) AS ten_day_bucket
FROM
    (
        -- NORMALIZE merges the per-animal periods produced by the self-join
        -- into combined periods.
        SELECT NORMALIZE
            ta_period.animal,
            ta_period.periodbucket P_INTERSECT ta_period.periodbucket AS periodbucket
        FROM ta_period LEFT OUTER JOIN ta_period ta_period2
            ON ta_period.periodbucket CONTAINS ta_period2.immunization_date
                AND ta_period.animal = ta_period2.animal
                AND ta_period.animal_row <> ta_period2.animal_row
    ) ta_normal
    -- Attach every immunization row to the normalized period it overlaps.
    INNER JOIN ta_period ta ON
        ta_normal.animal = ta.animal
        AND ta_normal.periodbucket P_INTERSECT ta.periodbucket IS NOT NULL
    INNER JOIN cal_buckets ON
        ta.immunization_date = cal_buckets.calendar_date;

Answer 2

近似值...

-- Approximate 10-day grouping, written as a chain of CTEs.
-- with_previous : each row plus the same animal's previous immunization date.
-- with_base     : base_date = the latest date so far (per animal) that is not
--                 less than 10 days after its predecessor; the first row of an
--                 animal always qualifies because it has no predecessor.
-- Final SELECT  : dense-rank by base_date, then by 10-day slices measured
--                 from that base_date.
WITH with_previous AS
(
  SELECT
    animal,
    immunization_date,
    LAG(immunization_date) OVER (PARTITION BY animal ORDER BY immunization_date) AS previous_date
  FROM yourData
),
with_base AS
(
  SELECT
    animal,
    immunization_date,
    MAX(
      CASE
        WHEN previous_date IS NULL
          OR immunization_date >= previous_date + 10
        THEN immunization_date
      END
    ) OVER (PARTITION BY animal ORDER BY immunization_date ROWS UNBOUNDED PRECEDING) AS base_date
  FROM with_previous
)
SELECT
  animal,
  immunization_date,
  DENSE_RANK() OVER (
    PARTITION BY animal
    ORDER BY base_date, CAST(immunization_date - base_date AS INT) / 10
  ) AS group_id
FROM with_base

SQLFiddle 不支持 Teradata，但上面的代码应该在 Teradata 和 SQL Server 中都能运行…… http://sqlfiddle.com/#!18/68260/1

Answer 3

@MatBailie 的递归答案很好，但是当每只动物的行数增加时性能会变差。

如果把第一个 CTE 物化到 Volatile Table（易失表）中，就能降低资源消耗（因为 Teradata 的优化器不会自动物化这个结果，真可惜）：

-- Volatile table holding every immunizations row plus next_boundary_date:
-- the earliest date for the same animal that is at least 10 days after the
-- row's own immunization_date (NULL when no such date exists).
CREATE VOLATILE TABLE boundaries AS
(
  SELECT
    i.*, -- need to add the alias
    (
      SELECT MIN(later.immunization_date)
      FROM immunizations AS later
      WHERE later.animal = i.animal
        AND later.immunization_date >= i.immunization_date + 10
    ) AS next_boundary_date
  FROM immunizations AS i
)
WITH DATA
UNIQUE PRIMARY INDEX (animal, immunization_date)
ON COMMIT PRESERVE ROWS;

但是当你可以使用临时表时，你也可以使用简单的递归：

-- Volatile copy of immunizations with a per-animal sequence number (rn),
-- ordered by immunization_date, for later recursive processing to step from
-- one row to the next with rn + 1.
CREATE VOLATILE TABLE vt AS
(
  SELECT
    animal,
    immunization_date,
    ROW_NUMBER() OVER (PARTITION BY animal ORDER BY immunization_date) AS rn
  FROM immunizations
)
WITH DATA
UNIQUE PRIMARY INDEX (animal, rn)
ON COMMIT PRESERVE ROWS;

-- Walks each animal's rows in date order (via vt.rn) and assigns a group number.
-- A group starts at some immunization_date and ends at end_date = start + 10.
-- The range is half-open: a row whose date is exactly start + 10 begins a new group.
WITH RECURSIVE cte AS
 (
   -- Seed: the oldest row of every animal opens group 1.
   SELECT
      animal,
      immunization_date,
      rn,
      immunization_date + 10 AS end_date, -- end of the current 10-day range
      -- A bare literal 1 is typed BYTEINT, which would cap grp at 127 groups;
      -- the explicit INTEGER cast removes that limit.
      CAST(1 AS INTEGER) AS grp
   FROM vt
   WHERE rn = 1

   UNION ALL

   -- Step: take the next row of the same animal (rn + 1).
   SELECT
      vt.animal,
      vt.immunization_date,
      vt.rn,
      -- Inside the current range: keep its end. Otherwise start a new range
      -- at this row's date.
      CASE WHEN vt.immunization_date < cte.end_date THEN cte.end_date ELSE vt.immunization_date + 10 END,
      -- Inside the current range: keep the group. Otherwise move to the next group.
      CASE WHEN vt.immunization_date < cte.end_date THEN cte.grp ELSE cte.grp + 1 END
   FROM cte
   INNER JOIN vt
     ON vt.animal = cte.animal
    AND vt.rn = cte.rn + 1
 )
SELECT
   animal,
   immunization_date,
   rn,
   end_date,
   grp
FROM cte
ORDER BY animal, immunization_date;

基于 10 天间隔创建组

Create groups based on 10 day interval

sql

teradata

rank