使用时间戳创建滑动 Window 数据帧
Create Sliding Window DataFrames with Timestamps
我需要使用滑动窗口,把覆盖 2021-01-01 00:00:00
到 2021-12-31 23:56:00
的数据切分成多个 24 小时的数据帧。相邻数据帧的起点相隔 6 小时(因此开始/结束的小时为 00、06、12、18)。手动逐个创建无法扩展,任何建议都将不胜感激。
例如,df1、df3、df5 和 df7 的 Timestamp
列各包含 24 小时的数据。
# Four consecutive 24-hour windows whose start times are 6 hours apart.
# Both bounds are inclusive, so each window ends on its last sample
# (23 h 56 min after the start).
start, end = '2021-01-01 00:00:00', '2021-01-01 23:56:00'
df1 = main_df.loc[main_df.Timestamp.between(start, end)]

start, end = '2021-01-01 06:00:00', '2021-01-02 05:56:00'
df3 = main_df.loc[main_df.Timestamp.between(start, end)]

start, end = '2021-01-01 12:00:00', '2021-01-02 11:56:00'
df5 = main_df.loc[main_df.Timestamp.between(start, end)]

start, end = '2021-01-01 18:00:00', '2021-01-02 17:56:00'
df7 = main_df.loc[main_df.Timestamp.between(start, end)]
最后一个数据帧 dfx
应包含 24 小时的数据,其最后一个 Timestamp 为 '2021-12-31 23:56:00'
# Final window of the year: the last full 24 hours, bounds inclusive.
start, end = '2021-12-31 00:00:00', '2021-12-31 23:56:00'
dfx = main_df.loc[main_df.Timestamp.between(start, end)]
一种方法是将数据重复 4 次,为每个分配一个标签作为开始时间戳,然后分组:
freq = 6  # hours between the starts of consecutive windows
periods = 24 // freq  # number of freq-hour buckets that make up one 24-hour window

# One offset per window that contains a given bucket: 0h, 6h, 12h, 18h.
shifted = pd.to_timedelta(np.arange(0,24,freq), unit='H')

# Start of the freq-hour bucket each row falls into.
group = df.Timestamp.dt.floor(f'{freq}H')

# Stack one copy of the data per offset, labelling every row with the start
# of a window that contains it; grouping on that label then yields one
# group per window, in ascending order of window start.
groups = pd.concat([
    df.assign(start=group-shift) for shift in shifted
]).groupby('start')

l = len(groups)
for i,(k,d) in enumerate(groups) :
    # Only select the full sliding windows. Assuming df has data in every
    # bucket, the first `periods - 1` groups start before the data does and
    # the last `periods - 1` groups run past its end, so the complete
    # windows are indices periods - 1 .. l - periods *inclusive*. A strict
    # `<` on the upper bound would drop the final full window.
    if periods - 1 <= i <= l - periods:
        display(d)
你会得到这样的东西:
Timestamp start
0 2021-01-01 00:00:00 2021-01-01
1 2021-01-01 00:04:00 2021-01-01
2 2021-01-01 00:08:00 2021-01-01
3 2021-01-01 00:12:00 2021-01-01
4 2021-01-01 00:16:00 2021-01-01
.. ... ...
355 2021-01-01 23:40:00 2021-01-01
356 2021-01-01 23:44:00 2021-01-01
357 2021-01-01 23:48:00 2021-01-01
358 2021-01-01 23:52:00 2021-01-01
359 2021-01-01 23:56:00 2021-01-01
下面的函数可以生成我需要的两个列表:开始时间和结束时间。对于其中的每一对(开始,结束),我会把 2021 年的数据帧过滤成一个 24 小时的时间段。 create_df_windows
函数按 2021 日历年生成这些窗口。
def create_df_windows(first_start='2021-01-01 00:00:00',
                      last_start='2021-12-31 00:00:00',
                      step_hours=6,
                      window_minutes=24 * 60 - 4):
    """Return the start and end timestamps of every sliding window.

    Window starts run from ``first_start`` to ``last_start`` (inclusive)
    in steps of ``step_hours``. Each window ends ``window_minutes`` after
    its start. With the defaults this yields the 2021 calendar-year
    windows: starts at 00/06/12/18 h, each spanning 23 h 56 min, the last
    one being 2021-12-31 00:00:00 to 2021-12-31 23:56:00.

    Args:
        first_start: Start of the first window (anything ``pd.date_range``
            accepts as a datetime).
        last_start: Latest allowed window start.
        step_hours: Hours between consecutive window starts.
        window_minutes: Minutes from a window's start to its end. Both
            bounds are meant to be used inclusively, hence 23 h 56 min
            rather than a full 24 h.

    Returns:
        A tuple ``(start_list, end_list)`` of equal-length lists of
        ``pd.Timestamp``; ``end_list[i]`` closes the window opened by
        ``start_list[i]``.

    Raises:
        ValueError: If ``step_hours`` or ``window_minutes`` is not positive.
    """
    if step_hours <= 0:
        raise ValueError("step_hours must be positive")
    if window_minutes <= 0:
        raise ValueError("window_minutes must be positive")

    # Let pandas do the calendar arithmetic (month lengths, day rollover)
    # instead of enumerating months and days by hand.
    starts = pd.date_range(start=first_start, end=last_start,
                           freq=pd.Timedelta(hours=step_hours))
    # Every window has the same length, so the ends are a constant shift.
    ends = starts + pd.Timedelta(minutes=window_minutes)
    return list(starts), list(ends)
我需要使用滑动窗口,把覆盖 2021-01-01 00:00:00
到 2021-12-31 23:56:00
的数据切分成多个 24 小时的数据帧。相邻数据帧的起点相隔 6 小时(因此开始/结束的小时为 00、06、12、18)。手动逐个创建无法扩展,任何建议都将不胜感激。
例如,df1、df3、df5 和 df7 的 Timestamp
列各包含 24 小时的数据。
# Four consecutive 24-hour windows whose start times are 6 hours apart.
# Both bounds are inclusive, so each window ends on its last sample
# (23 h 56 min after the start).
start, end = '2021-01-01 00:00:00', '2021-01-01 23:56:00'
df1 = main_df.loc[main_df.Timestamp.between(start, end)]

start, end = '2021-01-01 06:00:00', '2021-01-02 05:56:00'
df3 = main_df.loc[main_df.Timestamp.between(start, end)]

start, end = '2021-01-01 12:00:00', '2021-01-02 11:56:00'
df5 = main_df.loc[main_df.Timestamp.between(start, end)]

start, end = '2021-01-01 18:00:00', '2021-01-02 17:56:00'
df7 = main_df.loc[main_df.Timestamp.between(start, end)]
最后一个数据帧 dfx
应包含 24 小时的数据,其最后一个 Timestamp 为 '2021-12-31 23:56:00'
# Final window of the year: the last full 24 hours, bounds inclusive.
start, end = '2021-12-31 00:00:00', '2021-12-31 23:56:00'
dfx = main_df.loc[main_df.Timestamp.between(start, end)]
一种方法是将数据重复 4 次,为每个分配一个标签作为开始时间戳,然后分组:
freq = 6  # hours between the starts of consecutive windows
periods = 24 // freq  # number of freq-hour buckets that make up one 24-hour window

# One offset per window that contains a given bucket: 0h, 6h, 12h, 18h.
shifted = pd.to_timedelta(np.arange(0,24,freq), unit='H')

# Start of the freq-hour bucket each row falls into.
group = df.Timestamp.dt.floor(f'{freq}H')

# Stack one copy of the data per offset, labelling every row with the start
# of a window that contains it; grouping on that label then yields one
# group per window, in ascending order of window start.
groups = pd.concat([
    df.assign(start=group-shift) for shift in shifted
]).groupby('start')

l = len(groups)
for i,(k,d) in enumerate(groups) :
    # Only select the full sliding windows. Assuming df has data in every
    # bucket, the first `periods - 1` groups start before the data does and
    # the last `periods - 1` groups run past its end, so the complete
    # windows are indices periods - 1 .. l - periods *inclusive*. A strict
    # `<` on the upper bound would drop the final full window.
    if periods - 1 <= i <= l - periods:
        display(d)
你会得到这样的东西:
Timestamp start
0 2021-01-01 00:00:00 2021-01-01
1 2021-01-01 00:04:00 2021-01-01
2 2021-01-01 00:08:00 2021-01-01
3 2021-01-01 00:12:00 2021-01-01
4 2021-01-01 00:16:00 2021-01-01
.. ... ...
355 2021-01-01 23:40:00 2021-01-01
356 2021-01-01 23:44:00 2021-01-01
357 2021-01-01 23:48:00 2021-01-01
358 2021-01-01 23:52:00 2021-01-01
359 2021-01-01 23:56:00 2021-01-01
下面的函数可以生成我需要的两个列表:开始时间和结束时间。对于其中的每一对(开始,结束),我会把 2021 年的数据帧过滤成一个 24 小时的时间段。 create_df_windows
函数按 2021 日历年生成这些窗口。
def create_df_windows(first_start='2021-01-01 00:00:00',
                      last_start='2021-12-31 00:00:00',
                      step_hours=6,
                      window_minutes=24 * 60 - 4):
    """Return the start and end timestamps of every sliding window.

    Window starts run from ``first_start`` to ``last_start`` (inclusive)
    in steps of ``step_hours``. Each window ends ``window_minutes`` after
    its start. With the defaults this yields the 2021 calendar-year
    windows: starts at 00/06/12/18 h, each spanning 23 h 56 min, the last
    one being 2021-12-31 00:00:00 to 2021-12-31 23:56:00.

    Args:
        first_start: Start of the first window (anything ``pd.date_range``
            accepts as a datetime).
        last_start: Latest allowed window start.
        step_hours: Hours between consecutive window starts.
        window_minutes: Minutes from a window's start to its end. Both
            bounds are meant to be used inclusively, hence 23 h 56 min
            rather than a full 24 h.

    Returns:
        A tuple ``(start_list, end_list)`` of equal-length lists of
        ``pd.Timestamp``; ``end_list[i]`` closes the window opened by
        ``start_list[i]``.

    Raises:
        ValueError: If ``step_hours`` or ``window_minutes`` is not positive.
    """
    if step_hours <= 0:
        raise ValueError("step_hours must be positive")
    if window_minutes <= 0:
        raise ValueError("window_minutes must be positive")

    # Let pandas do the calendar arithmetic (month lengths, day rollover)
    # instead of enumerating months and days by hand.
    starts = pd.date_range(start=first_start, end=last_start,
                           freq=pd.Timedelta(hours=step_hours))
    # Every window has the same length, so the ends are a constant shift.
    ends = starts + pd.Timedelta(minutes=window_minutes)
    return list(starts), list(ends)