根据日期计算不同列的平均值
Computing mean of different columns depending on date
我的数据集是关于森林火灾和 NDVI 值(范围从 0 到 1 的值,表示表面的绿色程度)。它有一个初始列,表示第一行的森林火灾发生的时间,随后的列表示火灾发生前后不同日期的 NDVI 值。与火灾后的值相比,火灾前的 NDVI 值要高得多。类似于:
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
"1986-01-01" = c(0.5, 0.589, 0.66),
"1986-06-03" = c(0.56, 0.447, 0.75),
"1986-10-19" = c(0.8, NA, 0.83),
"1987-01-19" = c(0.75, 0.65,0.75),
"1987-06-19" = c(0.1, 0.55,0.811),
"1987-10-19" = c(0.15, 0.12, 0.780),
"1988-01-19" = c(0.2, 0.22,0.32),
"1988-06-19" = c(0.18, 0.21,0.23),
"1988-10-19" = c(0.21, 0.24, 0.250),
stringsAsFactors = FALSE)
> data1989
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
我想在森林火灾之前的新列中计算 NDVI 值的平均值。在第一种情况下,它将是第 2、3、4 和 5 列的平均值。
我需要得到的是:
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19 meanPreFire
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.653
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.559
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764
谢谢!
编辑:解决方案
如何调整包含多列的代码以排除:
data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"),
"type" = c("oak", "pine", "oak"),
"meanRainfall" = c(600, 300, 450),
"1986.01.01" = c(0.5, 0.589, 0.66),
"1986.06.03" = c(0.56, 0.447, 0.75),
"1986.10.19" = c(0.8, NA, 0.83),
"1987.01.19" = c(0.75, 0.65,0.75),
"1987.06.19" = c(0.1, 0.55,0.811),
"1987.10.19" = c(0.15, 0.12, 0.780),
"1988.01.19" = c(0.2, 0.22,0.32),
"1988.06.19" = c(0.18, 0.21,0.23),
"1988.10.19" = c(0.21, 0.24, 0.250),
check.names = FALSE,
stringsAsFactors = FALSE)
使用:
j1 <- findInterval(as.Date(data1989$date_fire), as.Date(names(data1989)[-(1:3)],format="%Y.%m.%d"))
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
data1989$meanPreFire <- tapply(data1989[-(1:3)][m1], m1[,1], FUN = mean, na.rm = TRUE)
> data1989
date_fire type meanRainfall 1986.01.01 1986.06.03 1986.10.19 1987.01.19 1987.06.19 1987.10.19 1988.01.19 1988.06.19 1988.10.19 meanPreFire
1 1987-02-01 oak 600 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.6525
2 1987-07-03 pine 300 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.5590
3 1988-01-01 oak 450 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.7635
我们可以通过创建 row/column 索引来使用 base R
。列索引可以从 findInterval
与列名和 'date_fire'
j1 <- findInterval(as.Date(data1989$date_fire), as.Date(names(data1989)[-1]))
l1 <- lapply(j1+1, `:`, ncol(data1989)-1)
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
m2 <- cbind(rep(seq_len(nrow(data1989)), lengths(l1)), unlist(l1))
data1989$meanPreFire <- tapply(data1989[-1][m1], m1[,1], FUN = mean, na.rm = TRUE)
data1989$meanPostFire <- tapply(data1989[-1][m2], m2[,1], FUN = mean, na.rm = TRUE)
data1989
# date_fire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19 1988-10-19
#1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
#2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
#3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
# meanPreFire meanPostFire
#1 0.6200 0.2650000
#2 0.5590 0.1975000
#3 0.7635 0.2666667
或使用 data.table
中的 melt/dcast
library(data.table)
dcast(melt(setDT(data1989), id.var = 'date_fire')[,
.(value = mean(value, na.rm = TRUE)),
.(date_fire, grp = c('postFire', 'preFire')[1 + (as.IDate(variable) < as.IDate(date_fire))]) ], date_fire ~ grp)[data1989, on = .(date_fire)]
# date_fire postFire preFire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19
#1: 1987-01-01 0.2650000 0.6200 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18
#2: 1987-07-03 0.1975000 0.5590 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21
#3: 1988-01-01 0.2666667 0.7635 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23
# 1988-10-19
#1: 0.21
#2: 0.24
#3: 0.25
数据
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
"1986-01-01" = c(0.5, 0.589, 0.66),
"1986-06-03" = c(0.56, 0.447, 0.75),
"1986-10-19" = c(0.8, NA, 0.83),
"1987-01-19" = c(0.75, 0.65,0.75),
"1987-06-19" = c(0.1, 0.55,0.811),
"1987-10-19" = c(0.15, 0.12, 0.780),
"1988-01-19" = c(0.2, 0.22,0.32),
"1988-06-19" = c(0.18, 0.21,0.23),
"1988-10-19" = c(0.21, 0.24, 0.250), check.names = FALSE,
stringsAsFactors = FALSE)
将数据重塑为长格式并过滤森林火灾前的日期。
library(tidyverse)
data1989 %>%
pivot_longer(-date_fire, names_to = "date") %>%
mutate(date_fire = as.Date(date_fire),
date = as.Date(date, "X%Y.%m.%d")) %>%
filter(date < date_fire) %>%
group_by(date_fire) %>%
summarise(meanPreFire = mean(value, na.rm = T))
# # A tibble: 3 x 2
# date_fire meanPreFire
# <date> <dbl>
# 1 1987-01-01 0.62
# 2 1987-07-03 0.559
# 3 1988-01-01 0.764
如果我们将数据保持为长(呃)形式,解决方案会更加简洁...但这会重现所需的输出:
library(dplyr)
library(tidyr)
data1989 %>%
pivot_longer(-date_fire, names_to = "date_NDVI", values_to = "value", names_prefix = "^X") %>%
mutate(date_fire = as.Date(date_fire, "%Y-%m-%d"),
date_NDVI = as.Date(date_NDVI, "%Y.%m.%d")) %>%
group_by(date_fire) %>%
mutate(period = ifelse(date_NDVI < date_fire, "before_fire", "after_fire")) %>%
group_by(date_fire, period) %>%
mutate(average_NDVI = mean(value, na.rm = TRUE)) %>%
pivot_wider(names_from = date_NDVI, names_prefix = "X", values_from = value) %>%
pivot_wider(names_from = period, values_from = average_NDVI) %>%
group_by(date_fire) %>%
summarise_all(funs(sum(., na.rm=T)))
Returns:
# A tibble: 3 x 12
date_fire `X1986-01-01` `X1986-06-03` `X1986-10-19` `X1987-01-19` `X1987-06-19` `X1987-10-19` `X1988-01-19` `X1988-06-19` `X1988-10-19` before_fire after_fire
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1987-01-01 0.5 0.56 0.8 0.75 0.1 0.15 0.2 0.18 0.21 0.62 0.265
2 1987-07-03 0.589 0.447 0 0.65 0.55 0.12 0.22 0.21 0.24 0.559 0.198
3 1988-01-01 0.66 0.75 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764 0.267
编辑:
如果我们在计算平均值后立即停止表达式,我们可以使用此结构中的数据轻松计算方差或解释可变数量的观察值。我认为将 date_fire
保留为单独的一列是可以的,但我建议将其他日期保留为一列(因为它们对应于观察结果)。特别是如果我们想使用 ggplot2
和其他 tidyverse
函数对数据进行更多分析。
我的数据集是关于森林火灾和 NDVI 值(范围从 0 到 1 的值,表示表面的绿色程度)。它有一个初始列,表示第一行的森林火灾发生的时间,随后的列表示火灾发生前后不同日期的 NDVI 值。与火灾后的值相比,火灾前的 NDVI 值要高得多。类似于:
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
"1986-01-01" = c(0.5, 0.589, 0.66),
"1986-06-03" = c(0.56, 0.447, 0.75),
"1986-10-19" = c(0.8, NA, 0.83),
"1987-01-19" = c(0.75, 0.65,0.75),
"1987-06-19" = c(0.1, 0.55,0.811),
"1987-10-19" = c(0.15, 0.12, 0.780),
"1988-01-19" = c(0.2, 0.22,0.32),
"1988-06-19" = c(0.18, 0.21,0.23),
"1988-10-19" = c(0.21, 0.24, 0.250),
stringsAsFactors = FALSE)
> data1989
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
我想在森林火灾之前的新列中计算 NDVI 值的平均值。在第一种情况下,它将是第 2、3、4 和 5 列的平均值。
我需要得到的是:
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19 meanPreFire
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.653
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.559
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764
谢谢!
编辑:解决方案
如何调整包含多列的代码以排除:
data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"),
"type" = c("oak", "pine", "oak"),
"meanRainfall" = c(600, 300, 450),
"1986.01.01" = c(0.5, 0.589, 0.66),
"1986.06.03" = c(0.56, 0.447, 0.75),
"1986.10.19" = c(0.8, NA, 0.83),
"1987.01.19" = c(0.75, 0.65,0.75),
"1987.06.19" = c(0.1, 0.55,0.811),
"1987.10.19" = c(0.15, 0.12, 0.780),
"1988.01.19" = c(0.2, 0.22,0.32),
"1988.06.19" = c(0.18, 0.21,0.23),
"1988.10.19" = c(0.21, 0.24, 0.250),
check.names = FALSE,
stringsAsFactors = FALSE)
使用:
j1 <- findInterval(as.Date(data1989$date_fire), as.Date(names(data1989)[-(1:3)],format="%Y.%m.%d"))
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
data1989$meanPreFire <- tapply(data1989[-(1:3)][m1], m1[,1], FUN = mean, na.rm = TRUE)
> data1989
date_fire type meanRainfall 1986.01.01 1986.06.03 1986.10.19 1987.01.19 1987.06.19 1987.10.19 1988.01.19 1988.06.19 1988.10.19 meanPreFire
1 1987-02-01 oak 600 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.6525
2 1987-07-03 pine 300 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.5590
3 1988-01-01 oak 450 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.7635
我们可以通过创建 row/column 索引来使用 base R
。列索引可以从 findInterval
与列名和 'date_fire'
j1 <- findInterval(as.Date(data1989$date_fire), as.Date(names(data1989)[-1]))
l1 <- lapply(j1+1, `:`, ncol(data1989)-1)
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
m2 <- cbind(rep(seq_len(nrow(data1989)), lengths(l1)), unlist(l1))
data1989$meanPreFire <- tapply(data1989[-1][m1], m1[,1], FUN = mean, na.rm = TRUE)
data1989$meanPostFire <- tapply(data1989[-1][m2], m2[,1], FUN = mean, na.rm = TRUE)
data1989
# date_fire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19 1988-10-19
#1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
#2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
#3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
# meanPreFire meanPostFire
#1 0.6200 0.2650000
#2 0.5590 0.1975000
#3 0.7635 0.2666667
或使用 data.table
melt/dcast
library(data.table)
dcast(melt(setDT(data1989), id.var = 'date_fire')[,
.(value = mean(value, na.rm = TRUE)),
.(date_fire, grp = c('postFire', 'preFire')[1 + (as.IDate(variable) < as.IDate(date_fire))]) ], date_fire ~ grp)[data1989, on = .(date_fire)]
# date_fire postFire preFire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19
#1: 1987-01-01 0.2650000 0.6200 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18
#2: 1987-07-03 0.1975000 0.5590 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21
#3: 1988-01-01 0.2666667 0.7635 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23
# 1988-10-19
#1: 0.21
#2: 0.24
#3: 0.25
数据
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
"1986-01-01" = c(0.5, 0.589, 0.66),
"1986-06-03" = c(0.56, 0.447, 0.75),
"1986-10-19" = c(0.8, NA, 0.83),
"1987-01-19" = c(0.75, 0.65,0.75),
"1987-06-19" = c(0.1, 0.55,0.811),
"1987-10-19" = c(0.15, 0.12, 0.780),
"1988-01-19" = c(0.2, 0.22,0.32),
"1988-06-19" = c(0.18, 0.21,0.23),
"1988-10-19" = c(0.21, 0.24, 0.250), check.names = FALSE,
stringsAsFactors = FALSE)
将数据重塑为长格式并过滤森林火灾前的日期。
library(tidyverse)
data1989 %>%
pivot_longer(-date_fire, names_to = "date") %>%
mutate(date_fire = as.Date(date_fire),
date = as.Date(date, "X%Y.%m.%d")) %>%
filter(date < date_fire) %>%
group_by(date_fire) %>%
summarise(meanPreFire = mean(value, na.rm = T))
# # A tibble: 3 x 2
# date_fire meanPreFire
# <date> <dbl>
# 1 1987-01-01 0.62
# 2 1987-07-03 0.559
# 3 1988-01-01 0.764
如果我们将数据保持为长(呃)形式,解决方案会更加简洁...但这会重现所需的输出:
library(dplyr)
library(tidyr)
data1989 %>%
pivot_longer(-date_fire, names_to = "date_NDVI", values_to = "value", names_prefix = "^X") %>%
mutate(date_fire = as.Date(date_fire, "%Y-%m-%d"),
date_NDVI = as.Date(date_NDVI, "%Y.%m.%d")) %>%
group_by(date_fire) %>%
mutate(period = ifelse(date_NDVI < date_fire, "before_fire", "after_fire")) %>%
group_by(date_fire, period) %>%
mutate(average_NDVI = mean(value, na.rm = TRUE)) %>%
pivot_wider(names_from = date_NDVI, names_prefix = "X", values_from = value) %>%
pivot_wider(names_from = period, values_from = average_NDVI) %>%
group_by(date_fire) %>%
summarise_all(funs(sum(., na.rm=T)))
Returns:
# A tibble: 3 x 12
date_fire `X1986-01-01` `X1986-06-03` `X1986-10-19` `X1987-01-19` `X1987-06-19` `X1987-10-19` `X1988-01-19` `X1988-06-19` `X1988-10-19` before_fire after_fire
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1987-01-01 0.5 0.56 0.8 0.75 0.1 0.15 0.2 0.18 0.21 0.62 0.265
2 1987-07-03 0.589 0.447 0 0.65 0.55 0.12 0.22 0.21 0.24 0.559 0.198
3 1988-01-01 0.66 0.75 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764 0.267
编辑:
如果我们在计算平均值后立即停止表达式,我们可以使用此结构中的数据轻松计算方差或解释可变数量的观察值。我认为将 date_fire
保留为单独的一列是可以的,但我建议将其他日期保留为一列(因为它们对应于观察结果)。特别是如果我们想使用 ggplot2
和其他 tidyverse
函数对数据进行更多分析。