从数据框中选择元素
Selecting elements from a dataframe
我有一个非常大的电表数据集。它看起来像这样(每 5 分钟有一行):
df
Time Type Energy
1 2019-08-31 23:55:00 a -0.3558
2 2019-08-30 14:55:00 b -3.1189
3 2019-08-29 15:15:00 c 27.3856
4 2019-08-28 19:20:00 b -155.7758
5 2019-08-27 18:30:00 a -149.3617
还有一个电价表(tariff table),看起来像这样:
tf
Hour 0 1 2 3 4 5 6
1 0 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495
2 1 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495
3 2 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495
4 3 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495
...............................
以此类推,直到第 23 小时;列标题是星期几(0 到 6)。
我需要计算数据集中每 5 分钟间隔的能源成本。似乎可以使用:
# lubridate is loaded here, but the two statements below only use base R.
require(lubridate)
# Convert the timestamps to POSIXlt so the hour and wday fields can be read.
xlt <- as.POSIXlt(df$Time)
# NOTE: indexing with two separate vectors selects every listed row crossed
# with every listed column, so this yields a 5 x 5 data frame instead of one
# tariff per reading (see the output below). hour and wday are also 0-based,
# so used directly as positions they are shifted relative to tf's rows and
# columns (wday 6:2 picks the columns named 4..0).
cost <- tf[xlt$hour,xlt$wday]* df$Energy
但这行不通:每个小时对应的行都会与所有选中的星期列交叉选取,而我需要的是电价表(tariff table)中按小时和星期索引的单个元素。这是结果:
cost
4 3 2 1 0
23 -0.0531921 -0.0531921 -0.0531921 -0.0531921 -0.0531921
14 -0.7157876 -0.7157876 -0.7157876 -0.7157876 -0.7157876
15 6.2849952 6.2849952 6.2849952 6.2849952 6.2849952
19 -83.9631562 -83.9631562 -83.9631562 -83.9631562 -35.7505461
18 -80.5059563 -80.5059563 -80.5059563 -80.5059563 -34.2785102
dput(cost)
structure(list(`4` = c(-0.0531921, -0.71578755, 6.2849952, -83.9631562,
-80.5059563), `3` = c(-0.0531921, -0.71578755, 6.2849952, -83.9631562,
-80.5059563), `2` = c(-0.0531921, -0.71578755, 6.2849952, -83.9631562,
-80.5059563), `1` = c(-0.0531921, -0.71578755, 6.2849952, -83.9631562,
-80.5059563), `0` = c(-0.0531921, -0.71578755, 6.2849952, -35.7505461,
-34.27851015)), class = "data.frame", row.names = c(23L, 14L,
15L, 19L, 18L))
这是数据:
dput(head(tf))
structure(list(Hour = 0:5, `0` = c(0.1495, 0.1495, 0.1495, 0.1495,
0.1495, 0.1495), `1` = c(0.1495, 0.1495, 0.1495, 0.1495, 0.1495,
0.1495), `2` = c(0.1495, 0.1495, 0.1495, 0.1495, 0.1495, 0.1495
), `3` = c(0.1495, 0.1495, 0.1495, 0.1495, 0.1495, 0.1495), `4` = c(0.1495,
0.1495, 0.1495, 0.1495, 0.1495, 0.1495), `5` = c(0.1495, 0.1495,
0.1495, 0.1495, 0.1495, 0.1495), `6` = c(0.1495, 0.1495, 0.1495,
0.1495, 0.1495, 0.1495)), row.names = c(NA, 6L), class = "data.frame")
dput(df)
structure(list(Time = structure(list(sec = c(0, 0, 0, 0, 0),
min = c(55L, 55L, 15L, 20L, 30L), hour = c(23L, 14L, 15L,
19L, 18L), mday = 31:27, mon = c(7L, 7L, 7L, 7L, 7L), year = c(119L,
119L, 119L, 119L, 119L), wday = 6:2, yday = 242:238, isdst = c(0L,
0L, 0L, 0L, 0L), zone = c("AEST", "AEST", "AEST", "AEST",
"AEST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), class = c("POSIXlt", "POSIXt"
)), Type = structure(c(1L, 2L, 3L, 2L, 1L), .Label = c("a", "b",
"c"), class = "factor"), Energy = c(-0.3558, -3.1189, 27.3856,
-155.7758, -149.3617)), row.names = c(NA, -5L), class = "data.frame")
性能也是需要考虑的问题。如果这个问题太过基础,我深表歉意,但我已经搜索了好几天,仍没有找到类似的例子。
尝试用 `cbind` 把行索引和列索引组合成一个矩阵,然后用它从 `tf` 中提取电价(tariff),再乘以 `Energy`。
# Look up one tariff per reading with a two-column (row, column) index matrix.
# Row offset is +1: hours run 0-23 while row numbers start at 1.
# Column offset is +2: wday runs 0-6 and column 1 of tf is Hour, so the
# day-0 tariff sits in column 2.
df$cost <- df$Energy * tf[cbind(df$Time$hour + 1, df$Time$wday + 2)]
行索引要加 1,因为小时从 0 开始,而行号从 1 开始;列索引也要加上偏移量,因为 `tf` 的第一列是 `Hour`,电价(tariff)从第二列才开始,并且 `wday` 同样从 0 开始计数,所以 `wday` 为 0 时对应的是第 2 列。
我有一个非常大的电表数据集。它看起来像这样(每 5 分钟有一行):
df
Time Type Energy
1 2019-08-31 23:55:00 a -0.3558
2 2019-08-30 14:55:00 b -3.1189
3 2019-08-29 15:15:00 c 27.3856
4 2019-08-28 19:20:00 b -155.7758
5 2019-08-27 18:30:00 a -149.3617
还有一个电价表(tariff table),看起来像这样:
tf
Hour 0 1 2 3 4 5 6
1 0 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495
2 1 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495
3 2 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495
4 3 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495 0.1495
...............................
以此类推,直到第 23 小时;列标题是星期几(0 到 6)。 我需要计算数据集中每个 5 分钟间隔的能源成本。似乎可以使用:
# lubridate is loaded here, but the two statements below only use base R.
require(lubridate)
# Convert the timestamps to POSIXlt so the hour and wday fields can be read.
xlt <- as.POSIXlt(df$Time)
# NOTE: indexing with two separate vectors selects every listed row crossed
# with every listed column, so this yields a 5 x 5 data frame instead of one
# tariff per reading (see the output below). hour and wday are also 0-based,
# so used directly as positions they are shifted relative to tf's rows and
# columns (wday 6:2 picks the columns named 4..0).
cost <- tf[xlt$hour,xlt$wday]* df$Energy
但这行不通:每个小时对应的行都会与所有选中的星期列交叉选取,而我需要的是电价表(tariff table)中按小时和星期索引的单个元素。这是结果:
cost
4 3 2 1 0
23 -0.0531921 -0.0531921 -0.0531921 -0.0531921 -0.0531921
14 -0.7157876 -0.7157876 -0.7157876 -0.7157876 -0.7157876
15 6.2849952 6.2849952 6.2849952 6.2849952 6.2849952
19 -83.9631562 -83.9631562 -83.9631562 -83.9631562 -35.7505461
18 -80.5059563 -80.5059563 -80.5059563 -80.5059563 -34.2785102
dput(cost)
structure(list(`4` = c(-0.0531921, -0.71578755, 6.2849952, -83.9631562,
-80.5059563), `3` = c(-0.0531921, -0.71578755, 6.2849952, -83.9631562,
-80.5059563), `2` = c(-0.0531921, -0.71578755, 6.2849952, -83.9631562,
-80.5059563), `1` = c(-0.0531921, -0.71578755, 6.2849952, -83.9631562,
-80.5059563), `0` = c(-0.0531921, -0.71578755, 6.2849952, -35.7505461,
-34.27851015)), class = "data.frame", row.names = c(23L, 14L,
15L, 19L, 18L))
这是数据:
dput(head(tf))
structure(list(Hour = 0:5, `0` = c(0.1495, 0.1495, 0.1495, 0.1495,
0.1495, 0.1495), `1` = c(0.1495, 0.1495, 0.1495, 0.1495, 0.1495,
0.1495), `2` = c(0.1495, 0.1495, 0.1495, 0.1495, 0.1495, 0.1495
), `3` = c(0.1495, 0.1495, 0.1495, 0.1495, 0.1495, 0.1495), `4` = c(0.1495,
0.1495, 0.1495, 0.1495, 0.1495, 0.1495), `5` = c(0.1495, 0.1495,
0.1495, 0.1495, 0.1495, 0.1495), `6` = c(0.1495, 0.1495, 0.1495,
0.1495, 0.1495, 0.1495)), row.names = c(NA, 6L), class = "data.frame")
dput(df)
structure(list(Time = structure(list(sec = c(0, 0, 0, 0, 0),
min = c(55L, 55L, 15L, 20L, 30L), hour = c(23L, 14L, 15L,
19L, 18L), mday = 31:27, mon = c(7L, 7L, 7L, 7L, 7L), year = c(119L,
119L, 119L, 119L, 119L), wday = 6:2, yday = 242:238, isdst = c(0L,
0L, 0L, 0L, 0L), zone = c("AEST", "AEST", "AEST", "AEST",
"AEST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), class = c("POSIXlt", "POSIXt"
)), Type = structure(c(1L, 2L, 3L, 2L, 1L), .Label = c("a", "b",
"c"), class = "factor"), Energy = c(-0.3558, -3.1189, 27.3856,
-155.7758, -149.3617)), row.names = c(NA, -5L), class = "data.frame")
性能也是需要考虑的问题。如果这个问题太过基础,我深表歉意,但我已经搜索了好几天,仍没有找到类似的例子。
尝试用 `cbind` 把行索引和列索引组合成一个矩阵,然后用它从 `tf` 中提取电价(tariff),再乘以 `Energy`。
# Look up one tariff per reading with a two-column (row, column) index matrix.
# Row offset is +1: hours run 0-23 while row numbers start at 1.
# Column offset is +2: wday runs 0-6 and column 1 of tf is Hour, so the
# day-0 tariff sits in column 2.
df$cost <- df$Energy * tf[cbind(df$Time$hour + 1, df$Time$wday + 2)]
行索引要加 1,因为小时从 0 开始,而行号从 1 开始;列索引也要加上偏移量,因为 `tf` 的第一列是 `Hour`,电价(tariff)从第二列才开始,并且 `wday` 同样从 0 开始计数,所以 `wday` 为 0 时对应的是第 2 列。