使用时间戳创建滑动 Window 数据帧

Create Sliding Window DataFrames with Timestamps

我需要使用滑动窗口,从覆盖 2021-01-01 00:00:00 至 2021-12-31 23:56:00 的数据中创建多个 24 小时的数据帧。相邻数据帧的起始时间间隔为 6 小时(因此 start/end 的小时为 00、06、12、18)。手动执行此操作无法扩展,任何建议都将不胜感激。

例如,df1、df3、df5 和 df7 的 Timestamp 列将各自包含 24 小时的数据。

# Manually built 24-hour windows, each starting 6 hours after the previous
# one. `between` is inclusive on both ends, so a row whose Timestamp equals
# `start` or `end` is kept.

# Window starting at 00:00 on 2021-01-01.
start, end = '2021-01-01 00:00:00', '2021-01-01 23:56:00'
df1 = main_df.loc[main_df.Timestamp.between(start, end)]

# Window starting at 06:00; it runs into the next calendar day.
start, end = '2021-01-01 06:00:00', '2021-01-02 05:56:00'
df3 = main_df.loc[main_df.Timestamp.between(start, end)]

# Window starting at 12:00.
start, end = '2021-01-01 12:00:00', '2021-01-02 11:56:00'
df5 = main_df.loc[main_df.Timestamp.between(start, end)]

# Window starting at 18:00.
start, end = '2021-01-01 18:00:00', '2021-01-02 17:56:00'
df7 = main_df.loc[main_df.Timestamp.between(start, end)]

最后一个数据帧 dfx 应包含 24 小时的数据,其最后一个 Timestamp 为 '2021-12-31 23:56:00'。

# Final window: the last full day of the year. `between` is inclusive on
# both ends, so the 23:56:00 row itself is kept.
start, end = '2021-12-31 00:00:00', '2021-12-31 23:56:00'
dfx = main_df.loc[main_df.Timestamp.between(start, end)]

一种方法是将数据复制 4 份,为每一份中的每一行分配一个表示所属窗口开始时间戳的标签,然后按该标签分组:

# Hours between the starts of two consecutive windows.
freq = 6

# Number of `freq`-hour buckets that make up one 24-hour window.
periods = 24 // freq
# Offsets 0h, 6h, 12h, 18h: how far back a window containing a given bucket
# may start. Lowercase 'h' is used because the uppercase 'H' alias is
# deprecated in recent pandas releases.
shifted = pd.to_timedelta(np.arange(0, 24, freq), unit='h')
# Start of the `freq`-hour bucket each row falls into.
group = df.Timestamp.dt.floor(f'{freq}h')


# Replicate the data once per offset. In each copy a row is labelled with the
# start of one of the windows that contains it, so grouping on `start`
# gathers every row of a window together.
groups = pd.concat([
    df.assign(start=group-shift) for shift in shifted
]).groupby('start')

l = len(groups)

for i,(k,d) in enumerate(groups) :
    # Only select the full sliding windows. Groups are iterated in sorted
    # `start` order: the first `periods - 1` groups start before the data
    # begins and the last `periods - 1` run past its end, so both are partial.
    # The last full window therefore sits at index `l - periods`, and the
    # upper bound must be inclusive to keep it.
    # NOTE(review): assumes df has rows in every `freq`-hour bucket of its range.
    if periods - 1 <= i <= l - periods:
        display(d)

你会得到类似这样的结果:

              Timestamp      start
0   2021-01-01 00:00:00 2021-01-01
1   2021-01-01 00:04:00 2021-01-01
2   2021-01-01 00:08:00 2021-01-01
3   2021-01-01 00:12:00 2021-01-01
4   2021-01-01 00:16:00 2021-01-01
..                  ...        ...
355 2021-01-01 23:40:00 2021-01-01
356 2021-01-01 23:44:00 2021-01-01
357 2021-01-01 23:48:00 2021-01-01
358 2021-01-01 23:52:00 2021-01-01
359 2021-01-01 23:56:00 2021-01-01

下面的函数可以创建我需要的两个列表:开始时间列表和结束时间列表。对于其中的每一对开始/结束时间,我会把 2021 年的数据帧过滤成一个 24 小时的时间段。create_df_windows 函数按 2021 日历年生成这些时间。

def create_df_windows():
    """Return the start and end timestamps of every sliding window in 2021.

    Windows start every 6 hours (00:00, 06:00, 12:00, 18:00), beginning at
    2021-01-01 00:00:00. Each window spans 23 hours 56 minutes, so the
    inclusive end timestamp is the last 4-minute sample before the following
    24-hour mark. The final window is the one that ends exactly at
    2021-12-31 23:56:00; later starts are omitted because they would run
    past the end of the year.

    Returns:
        A tuple ``(start_list, end_list)`` of two equally long lists of
        ``pd.Timestamp``; ``start_list[i]`` and ``end_list[i]`` are the
        inclusive bounds of window ``i``.
    """
    step = pd.Timedelta(hours=6)
    span = pd.Timedelta(hours=23, minutes=56)
    first_start = pd.Timestamp('2021-01-01 00:00:00')
    last_end = pd.Timestamp('2021-12-31 23:56:00')

    # Generate the starts from the calendar instead of hand-written month and
    # day lists, so month lengths are always right. Stopping at
    # `last_end - span` keeps every window fully inside the year.
    starts = pd.date_range(start=first_start, end=last_end - span, freq=step)

    # Every window has the same length, so each end is derived from its own
    # start rather than by shifting a same-day end time by a day.
    start_list = list(starts)
    end_list = list(starts + span)
    return start_list, end_list