我如何自动计算 10 分钟时间步长中提供的值以 1 小时时间步长中的平均值?
How can i automatically calculate values provided in 10 min time steps to average values in 1 h time steps?
我的数据框包含 4 年内 10 分钟时间步长的数据。我的其他数据以 1 小时的时间步长提供。所以我需要将 10 分钟的数据转换为 1 小时的平均值,以便将它与我的其他数据结合起来。
这是我数据的前 25 行的输入:
structure(list(...1 = structure(c(1420070400, 1420071000, 1420071600,
1420072200, 1420072800, 1420073400, 1420074000, 1420074600, 1420075200,
1420075800, 1420076400, 1420077000, 1420077600, 1420078200, 1420078800,
1420079400, 1420080000, 1420080600, 1420081200, 1420081800, 1420082400,
1420083000, 1420083600, 1420084200, 1420084800), tzone = "UTC", class = c("POSIXct",
"POSIXt")), Dd1_1...2 = c(28.43869, 28.52743, 28.52743, 28.52743,
28.57417, 28.57417, 28.57417, 28.67358, 28.67358, 28.67358, 28.62449,
28.62449, 28.62449, 28.74658, 28.74658, 28.74658, 29.01779, 29.01779,
29.01779, 28.84344, 28.84344, 28.84344, 28.7406, 28.7406, 28.7406
), Dd1_3...3 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
Dd1_5...4 = c(30.58685, 31.82736, 31.82736, 31.82736, 29.77018,
29.77018, 29.77018, 29.91568, 29.91568, 29.91568, 29.69438,
29.69438, 29.69438, 30.45421, 30.45421, 30.45421, 30.06922,
30.06922, 30.06922, 30.15059, 30.15059, 30.15059, 30.41779,
30.41779, 30.41779), Dd1_1...5 = c(28.24055, 28.273, 28.273,
28.273, 28.26098, 28.26098, 28.26098, 28.29342, 28.29342,
28.29342, 28.24657, 28.24657, 28.24657, 28.34867, 28.34867,
28.34867, 28.30063, 28.30063, 28.30063, 28.17804, 28.17804,
28.17804, 28.30423, 28.30423, 28.30423), Dd1_3...6 = c(25.25317,
25.53266, 25.53266, 25.53266, 25.50203, 25.50203, 25.50203,
25.49713, 25.49713, 25.49713, 25.30346, 25.30346, 25.30346,
25.36477, 25.36477, 25.36477, 25.46037, 25.46037, 25.46037,
25.26299, 25.26299, 25.26299, 25.37948, 25.37948, 25.37948
), Dd1_5...7 = c(27.75881, 27.92162, 27.92162, 27.92162,
28.65922, 28.65922, 28.65922, 26.65354, 26.65354, 26.65354,
27.10426, 27.10426, 27.10426, 28.48427, 28.48427, 28.48427,
26.80799, 26.80799, 26.80799, 27.33572, 27.33572, 27.33572,
27.20004, 27.20004, 27.20004), Dd1_1...8 = c(28.23214, 28.81236,
28.81236, 28.81236, 28.81834, 28.81834, 28.81834, 28.29463,
28.29463, 28.29463, 28.35347, 28.35347, 28.35347, 28.19247,
28.19247, 28.19247, 28.30664, 28.30664, 28.30664, 28.24776,
28.24776, 28.24776, 28.2694, 28.2694, 28.2694), Dd1_3...9 = c(29.27281,
29.31683, 29.31683, 29.31683, 29.75479, 29.75479, 29.75479,
29.69912, 29.69912, 29.69912, 29.97595, 29.97595, 29.97595,
29.19542, 29.19542, 29.19542, 29.19661, 29.19661, 29.19661,
29.90859, 29.90859, 29.90859, 29.57938, 29.57938, 29.57938
), Dd1_5...10 = c(28.02402, 28.33666, 28.33666, 28.33666,
28.48547, 28.48547, 28.48547, 27.7383, 27.7383, 27.7383,
NA, NA, NA, 27.69604, 27.69604, 27.69604, 28.44109, 28.44109,
28.44109, 28.77409, 28.77409, 28.77409, 27.93246, 27.93246,
27.93246), Dd1_1...23 = c(-11.97827, -11.99765, -11.96991,
-12.04475, -12.07237, -12.12492, -12.1001, -12.15241, -12.1885,
-12.19432, -12.25232, -12.31324, -12.28841, -12.34945, -12.38554,
-12.36858, -12.40782, -12.44645, -12.46292, -12.50446, -12.54636,
-12.54369, -12.58269, -12.62713, -12.61042), Dd1_3...24 = c(-9.715732,
-9.693569, -9.621514, -9.632534, -9.57162, -9.563144, -9.48576,
-9.479946, -9.435623, -9.363811, -9.3913, -9.360904, -9.29454,
-9.277949, -9.261358, -9.183611, -9.18676, -9.150551, -9.08673,
-9.095086, -9.056576, -8.96793, -8.888002, -8.863055, -8.74692
), Dd1_5...25 = c(-9.247674, -9.076073, -8.876861, -8.791121,
-8.69424, -8.583553, -8.395483, -8.317978, -8.232359, -8.171687,
-8.105203, -8.091396, -7.887099, -7.862273, -7.765512, -7.594275,
-7.555886, -7.511563, -7.442535, -7.337661, -7.238479, -7.064819,
-6.924341, -6.858099, -6.615532)), row.names = c(NA, -25L
), class = c("tbl_df", "tbl", "data.frame"))
我希望有一个简单的解决方案来转换数据。
修改后的答案:
# pandas: mean over a sliding window of 6 consecutive rows (6 x 10 min = 1 h).
# NOTE(review): this produces one rolling value per row, not one row per hour — confirm that is what is wanted.
df["average"] = df["values"].rolling(6).mean()
现在,平均列将具有您想要的值,并且可以被挑选到新的数据框中。
旧答案:
如果第一行是第 0 小时、第 0 分钟和第 0 秒,那么第 6 条记录将是第 1 小时、第 0 分钟和第 0 秒。第 12 条记录将是第 2 小时 0 分 0 秒
所以我们正在考虑挑选第 0、6、12 条记录等等...
如果您认为这有帮助,那么我的代码可能是这样的
# Keep only the rows whose index is a multiple of 6 (every sixth record).
# NOTE(review): this samples one row per hour instead of averaging the six values — verify against the question.
dfHour = df[df.index % 6 == 0]
以上将为您提供那些索引是 6 的倍数的行。
tidyverse
library(dplyr)
# Bucket each 10-minute timestamp into the hour it falls in, then average
# every remaining column within that hour.
# trunc() on a date-time returns POSIXlt; convert it back to POSIXct so the
# grouping column stays a plain date-time vector inside the data frame.
dat %>%
  group_by(`...1` = as.POSIXct(trunc(`...1`, "hour"))) %>%
  # mean() propagates NA: an hour containing any missing value averages to NA.
  summarize(across(everything(), mean), .groups = "drop")
# # A tibble: 5 x 13
# ...1 Dd1_1...2 Dd1_3...3 Dd1_5...4 Dd1_1...5 Dd1_3...6 Dd1_5...7 Dd1_1...8 Dd1_3...9 Dd1_5...10 Dd1_1...23 Dd1_3...24 Dd1_5...25
# <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2015-01-01 00:00:00 28.5 NA 30.9 28.3 25.5 28.1 28.7 29.5 28.3 -12.0 -9.63 -8.88
# 2 2015-01-01 01:00:00 28.6 NA 29.8 28.3 25.4 27.1 28.4 29.8 NA -12.2 -9.42 -8.22
# 3 2015-01-01 02:00:00 28.8 NA 30.2 28.3 25.4 27.7 28.3 29.3 NA -12.4 -9.23 -7.70
# 4 2015-01-01 03:00:00 28.8 NA 30.2 28.2 25.3 27.2 28.3 29.7 28.4 -12.5 -8.99 -7.14
# 5 2015-01-01 04:00:00 28.7 NA 30.4 28.3 25.4 27.2 28.3 29.6 27.9 -12.6 -8.75 -6.62
如果您需要在求平均值时忽略 Dd1_5...10 列中的 NA 值(即先去掉 NA 再求平均),那么只需
# Hourly means that ignore missing values inside each hour.
# trunc() on a date-time returns POSIXlt; convert it back to POSIXct so the
# grouping column stays a plain date-time vector inside the data frame.
dat %>%
  group_by(`...1` = as.POSIXct(trunc(`...1`, "hour"))) %>%
  # The argument must be spelled `.groups` (with the leading dot); a bare
  # `groups = "drop"` is treated as a new summary column named `groups`.
  summarize(across(everything(), ~ mean(., na.rm = TRUE)), .groups = "drop")
# # A tibble: 5 x 13
# ...1 Dd1_1...2 Dd1_3...3 Dd1_5...4 Dd1_1...5 Dd1_3...6 Dd1_5...7 Dd1_1...8 Dd1_3...9 Dd1_5...10 Dd1_1...23 Dd1_3...24 Dd1_5...25
# <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2015-01-01 00:00:00 28.5 NaN 30.9 28.3 25.5 28.1 28.7 29.5 28.3 -12.0 -9.63 -8.88
# 2 2015-01-01 01:00:00 28.6 NaN 29.8 28.3 25.4 27.1 28.4 29.8 27.9 -12.2 -9.42 -8.22
# 3 2015-01-01 02:00:00 28.8 NaN 30.2 28.3 25.4 27.7 28.3 29.3 28.0 -12.4 -9.23 -7.70
# 4 2015-01-01 03:00:00 28.8 NaN 30.2 28.2 25.3 27.2 28.3 29.7 28.4 -12.5 -8.99 -7.14
# 5 2015-01-01 04:00:00 28.7 NaN 30.4 28.3 25.4 27.2 28.3 29.6 27.9 -12.6 -8.75 -6.62
不过现在 Dd1_3...3 列会出现 NaN:该列全部为 NA,去掉 NA 之后相当于对空向量求平均,结果就是 NaN :-)
data.table
library(data.table)
# Convert the input to a data.table so the `[i, j, by]` syntax is available.
DT <- as.data.table(dat)
# Hourly mean of every non-grouping column (.SD), grouped by the timestamp
# truncated to the hour; `.()` is data.table's alias for `list()`.
DT[
  ,
  lapply(.SD, mean),
  by = .(`...1` = as.POSIXct(trunc(`...1`, "hour")))
]
同样,对于 NA
s:
# Same hourly aggregation, forwarding `na.rm = TRUE` to mean() so missing
# values inside an hour are ignored; `.()` is data.table's alias for `list()`.
DT[
  ,
  lapply(.SD, mean, na.rm = TRUE),
  by = .(`...1` = as.POSIXct(trunc(`...1`, "hour")))
]
不清楚你的数据表叫什么,所以我把它称为 dat。
试试这个:
library(dplyr)
library(lubridate)
dat %>%
  # floor_date() rounds each timestamp down to the start of its hour.
  mutate(hour = floor_date(x = ...1, unit = "hour")) %>%
  group_by(hour) %>%
  # Drop the raw 10-minute timestamp so it is not averaged with the data.
  select(-...1) %>%
  # across() replaces the superseded summarise_all(); `.groups = "drop"`
  # returns an ungrouped tibble with one row per hour.
  summarise(across(everything(), mean), .groups = "drop")
mutate 使用 lubridate 包中的 floor_date 将每个时间戳向下取整到整点,并把结果存入名为 'hour' 的新列。然后我们按这个 hour 列分组,并用 select(带负号)去掉原始的日期时间列。最后,用 mean 对其余所有列按小时求平均值。
我的数据框包含 4 年内 10 分钟时间步长的数据。我的其他数据以 1 小时的时间步长提供。所以我需要将 10 分钟的数据转换为 1 小时的平均值,以便将它与我的其他数据结合起来。
这是我数据的前 25 行的输入:
structure(list(...1 = structure(c(1420070400, 1420071000, 1420071600,
1420072200, 1420072800, 1420073400, 1420074000, 1420074600, 1420075200,
1420075800, 1420076400, 1420077000, 1420077600, 1420078200, 1420078800,
1420079400, 1420080000, 1420080600, 1420081200, 1420081800, 1420082400,
1420083000, 1420083600, 1420084200, 1420084800), tzone = "UTC", class = c("POSIXct",
"POSIXt")), Dd1_1...2 = c(28.43869, 28.52743, 28.52743, 28.52743,
28.57417, 28.57417, 28.57417, 28.67358, 28.67358, 28.67358, 28.62449,
28.62449, 28.62449, 28.74658, 28.74658, 28.74658, 29.01779, 29.01779,
29.01779, 28.84344, 28.84344, 28.84344, 28.7406, 28.7406, 28.7406
), Dd1_3...3 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
Dd1_5...4 = c(30.58685, 31.82736, 31.82736, 31.82736, 29.77018,
29.77018, 29.77018, 29.91568, 29.91568, 29.91568, 29.69438,
29.69438, 29.69438, 30.45421, 30.45421, 30.45421, 30.06922,
30.06922, 30.06922, 30.15059, 30.15059, 30.15059, 30.41779,
30.41779, 30.41779), Dd1_1...5 = c(28.24055, 28.273, 28.273,
28.273, 28.26098, 28.26098, 28.26098, 28.29342, 28.29342,
28.29342, 28.24657, 28.24657, 28.24657, 28.34867, 28.34867,
28.34867, 28.30063, 28.30063, 28.30063, 28.17804, 28.17804,
28.17804, 28.30423, 28.30423, 28.30423), Dd1_3...6 = c(25.25317,
25.53266, 25.53266, 25.53266, 25.50203, 25.50203, 25.50203,
25.49713, 25.49713, 25.49713, 25.30346, 25.30346, 25.30346,
25.36477, 25.36477, 25.36477, 25.46037, 25.46037, 25.46037,
25.26299, 25.26299, 25.26299, 25.37948, 25.37948, 25.37948
), Dd1_5...7 = c(27.75881, 27.92162, 27.92162, 27.92162,
28.65922, 28.65922, 28.65922, 26.65354, 26.65354, 26.65354,
27.10426, 27.10426, 27.10426, 28.48427, 28.48427, 28.48427,
26.80799, 26.80799, 26.80799, 27.33572, 27.33572, 27.33572,
27.20004, 27.20004, 27.20004), Dd1_1...8 = c(28.23214, 28.81236,
28.81236, 28.81236, 28.81834, 28.81834, 28.81834, 28.29463,
28.29463, 28.29463, 28.35347, 28.35347, 28.35347, 28.19247,
28.19247, 28.19247, 28.30664, 28.30664, 28.30664, 28.24776,
28.24776, 28.24776, 28.2694, 28.2694, 28.2694), Dd1_3...9 = c(29.27281,
29.31683, 29.31683, 29.31683, 29.75479, 29.75479, 29.75479,
29.69912, 29.69912, 29.69912, 29.97595, 29.97595, 29.97595,
29.19542, 29.19542, 29.19542, 29.19661, 29.19661, 29.19661,
29.90859, 29.90859, 29.90859, 29.57938, 29.57938, 29.57938
), Dd1_5...10 = c(28.02402, 28.33666, 28.33666, 28.33666,
28.48547, 28.48547, 28.48547, 27.7383, 27.7383, 27.7383,
NA, NA, NA, 27.69604, 27.69604, 27.69604, 28.44109, 28.44109,
28.44109, 28.77409, 28.77409, 28.77409, 27.93246, 27.93246,
27.93246), Dd1_1...23 = c(-11.97827, -11.99765, -11.96991,
-12.04475, -12.07237, -12.12492, -12.1001, -12.15241, -12.1885,
-12.19432, -12.25232, -12.31324, -12.28841, -12.34945, -12.38554,
-12.36858, -12.40782, -12.44645, -12.46292, -12.50446, -12.54636,
-12.54369, -12.58269, -12.62713, -12.61042), Dd1_3...24 = c(-9.715732,
-9.693569, -9.621514, -9.632534, -9.57162, -9.563144, -9.48576,
-9.479946, -9.435623, -9.363811, -9.3913, -9.360904, -9.29454,
-9.277949, -9.261358, -9.183611, -9.18676, -9.150551, -9.08673,
-9.095086, -9.056576, -8.96793, -8.888002, -8.863055, -8.74692
), Dd1_5...25 = c(-9.247674, -9.076073, -8.876861, -8.791121,
-8.69424, -8.583553, -8.395483, -8.317978, -8.232359, -8.171687,
-8.105203, -8.091396, -7.887099, -7.862273, -7.765512, -7.594275,
-7.555886, -7.511563, -7.442535, -7.337661, -7.238479, -7.064819,
-6.924341, -6.858099, -6.615532)), row.names = c(NA, -25L
), class = c("tbl_df", "tbl", "data.frame"))
我希望有一个简单的解决方案来转换数据。
修改后的答案:
# pandas: mean over a sliding window of 6 consecutive rows (6 x 10 min = 1 h).
# NOTE(review): this produces one rolling value per row, not one row per hour — confirm that is what is wanted.
df["average"] = df["values"].rolling(6).mean()
现在,平均列将具有您想要的值,并且可以被挑选到新的数据框中。
旧答案: 如果第一行是第 0 小时、第 0 分钟和第 0 秒,那么第 6 条记录将是第 1 小时、第 0 分钟和第 0 秒。第 12 条记录将是第 2 小时 0 分 0 秒
所以我们正在考虑挑选第 0、6、12 条记录等等...
如果您认为这有帮助,那么我的代码可能是这样的
# Keep only the rows whose index is a multiple of 6 (every sixth record).
# NOTE(review): this samples one row per hour instead of averaging the six values — verify against the question.
dfHour = df[df.index % 6 == 0]
以上将为您提供那些索引是 6 的倍数的行。
tidyverse
library(dplyr)
# Bucket each 10-minute timestamp into the hour it falls in, then average
# every remaining column within that hour.
# trunc() on a date-time returns POSIXlt; convert it back to POSIXct so the
# grouping column stays a plain date-time vector inside the data frame.
dat %>%
  group_by(`...1` = as.POSIXct(trunc(`...1`, "hour"))) %>%
  # mean() propagates NA: an hour containing any missing value averages to NA.
  summarize(across(everything(), mean), .groups = "drop")
# # A tibble: 5 x 13
# ...1 Dd1_1...2 Dd1_3...3 Dd1_5...4 Dd1_1...5 Dd1_3...6 Dd1_5...7 Dd1_1...8 Dd1_3...9 Dd1_5...10 Dd1_1...23 Dd1_3...24 Dd1_5...25
# <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2015-01-01 00:00:00 28.5 NA 30.9 28.3 25.5 28.1 28.7 29.5 28.3 -12.0 -9.63 -8.88
# 2 2015-01-01 01:00:00 28.6 NA 29.8 28.3 25.4 27.1 28.4 29.8 NA -12.2 -9.42 -8.22
# 3 2015-01-01 02:00:00 28.8 NA 30.2 28.3 25.4 27.7 28.3 29.3 NA -12.4 -9.23 -7.70
# 4 2015-01-01 03:00:00 28.8 NA 30.2 28.2 25.3 27.2 28.3 29.7 28.4 -12.5 -8.99 -7.14
# 5 2015-01-01 04:00:00 28.7 NA 30.4 28.3 25.4 27.2 28.3 29.6 27.9 -12.6 -8.75 -6.62
如果您需要在求平均值时忽略 Dd1_5...10 列中的 NA 值(即先去掉 NA 再求平均),那么只需
# Hourly means that ignore missing values inside each hour.
# trunc() on a date-time returns POSIXlt; convert it back to POSIXct so the
# grouping column stays a plain date-time vector inside the data frame.
dat %>%
  group_by(`...1` = as.POSIXct(trunc(`...1`, "hour"))) %>%
  # The argument must be spelled `.groups` (with the leading dot); a bare
  # `groups = "drop"` is treated as a new summary column named `groups`.
  summarize(across(everything(), ~ mean(., na.rm = TRUE)), .groups = "drop")
# # A tibble: 5 x 13
# ...1 Dd1_1...2 Dd1_3...3 Dd1_5...4 Dd1_1...5 Dd1_3...6 Dd1_5...7 Dd1_1...8 Dd1_3...9 Dd1_5...10 Dd1_1...23 Dd1_3...24 Dd1_5...25
# <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2015-01-01 00:00:00 28.5 NaN 30.9 28.3 25.5 28.1 28.7 29.5 28.3 -12.0 -9.63 -8.88
# 2 2015-01-01 01:00:00 28.6 NaN 29.8 28.3 25.4 27.1 28.4 29.8 27.9 -12.2 -9.42 -8.22
# 3 2015-01-01 02:00:00 28.8 NaN 30.2 28.3 25.4 27.7 28.3 29.3 28.0 -12.4 -9.23 -7.70
# 4 2015-01-01 03:00:00 28.8 NaN 30.2 28.2 25.3 27.2 28.3 29.7 28.4 -12.5 -8.99 -7.14
# 5 2015-01-01 04:00:00 28.7 NaN 30.4 28.3 25.4 27.2 28.3 29.6 27.9 -12.6 -8.75 -6.62
不过现在 Dd1_3...3 列会出现 NaN:该列全部为 NA,去掉 NA 之后相当于对空向量求平均,结果就是 NaN :-)
data.table
library(data.table)
# Convert the input to a data.table so the `[i, j, by]` syntax is available.
DT <- as.data.table(dat)
# Hourly mean of every non-grouping column (.SD), grouped by the timestamp
# truncated to the hour; `.()` is data.table's alias for `list()`.
DT[
  ,
  lapply(.SD, mean),
  by = .(`...1` = as.POSIXct(trunc(`...1`, "hour")))
]
同样,对于 NA
s:
# Same hourly aggregation, forwarding `na.rm = TRUE` to mean() so missing
# values inside an hour are ignored; `.()` is data.table's alias for `list()`.
DT[
  ,
  lapply(.SD, mean, na.rm = TRUE),
  by = .(`...1` = as.POSIXct(trunc(`...1`, "hour")))
]
不清楚你的数据表叫什么,所以我把它称为 dat。
试试这个:
library(dplyr)
library(lubridate)
dat %>%
  # floor_date() rounds each timestamp down to the start of its hour.
  mutate(hour = floor_date(x = ...1, unit = "hour")) %>%
  group_by(hour) %>%
  # Drop the raw 10-minute timestamp so it is not averaged with the data.
  select(-...1) %>%
  # across() replaces the superseded summarise_all(); `.groups = "drop"`
  # returns an ungrouped tibble with one row per hour.
  summarise(across(everything(), mean), .groups = "drop")
Mutate 使用 lubridate 包中的 floor_date
提取日期的小时截断日期并将其放入名为 'hour' 的列中。然后我们按那个小时列分组,select 出(带负号)日期字段。最后,我们通过 mean