R - 考虑重叠日期查找多行之间的相关性
R - Find correlation between multiple rows considering overlapping dates
我有一个数据 table,其中包含多个雨量计的降雨量测量值。这是我的数据集的示例:
library(data.table)
dat <- fread("https://www.dropbox.com/s/yub3db3739d80h2/dat.csv?dl=1")
> dat
ID date value
1: 937 2000-01-01 14.2
2: 937 2000-01-02 68.3
3: 937 2000-01-03 28.4
4: 937 2000-01-04 30.2
5: 937 2000-01-05 12.8
---
33905: 1600 2017-06-12 0.1
33906: 1600 2017-06-13 36.1
33907: 1600 2017-06-14 0.3
33908: 1600 2017-06-15 0.0
33909: 1600 2017-06-16 0.0
我还有一个数据 table,其中包含每个仪表的 ID 以及最近的几个仪表的 ID,以及它们测量降雨量的共同日期:
neighbors <- fread("https://www.dropbox.com/s/phhskbhxsxmrxy1/neighbours.csv?dl=1")
> neighbors
ID ID_nearest common_date_begin common_date_end diff_days
1: 1 1117 2000-03-01 2006-12-03 2468
2: 1 920 2000-03-01 2004-11-04 1709
3: 1000 48 2000-03-01 2006-12-03 2468
4: 1000 1600 2000-03-01 2017-06-16 6316
5: 1000 937 2000-03-01 2017-01-22 6171
6: 1001 352 2007-07-10 2017-06-16 3629
7: 1001 324 2007-07-10 2017-06-16 3629
8: 1002 1338 2006-01-01 2017-06-16 4184
9: 1002 412 2006-01-01 2009-07-12 1288
10: 1002 1330 2006-01-01 2017-06-16 4184
11: 1002 1349 2006-01-01 2017-06-16 4184
12: 1009 801 2006-01-01 2017-01-22 4039
例如,仪表 ID 1
有两个近邻:ID 的 1117
和 920
。 1
和 1117
台站的重叠测量周期为 2000 年 3 月 1 日至 2006 年 12 月 3 日。
对于 neighbors
中的每个组合,我需要计算重叠日期内主要和周围雨量计之间的降雨量测量值的相关性。
例如,第一对的相关性计算如下:
cor(dat[ID==1 & date %between% c("2000-03-01", "2006-12-03")]$value,
dat[ID==1117 & date %between% c("2000-03-01", "2006-12-03")]$value)
cor(dat[ID==1 & date %between% c("2000-03-01", "2004-11-04")]$value,
dat[ID==920 & date %between% c("2000-03-01", "2004-11-04")]$value)
预期的输出是这样的:
ID ID_nearest correlation n
1 1117 0.55 2468
1 920 0.48 1709
1000 48 0.77 2468
1000 1600 0.52 6316
1000 937 0.84 6171
neighbors
中的每个 ID
依此类推。
但我很难想出一种编程方式来实现这一目标。
我该怎么做?提前致谢。
这是一种方法
> df <- do.call(rbind, lapply(unique(neighbors$ID), function(id) {
d <- neighbors[neighbors[, "ID"] %in% id, ]
main.vals <- dat %>%
dplyr::filter(ID == id & (date >= d$common_date_begin & date <= max(d$common_date_end))) %>%
dplyr::select(value)
main.vals <- main.vals$value
nearest.vals <- lapply(unique(d$ID_nearest), function(neigh.id) {
r <- d[d$ID_nearest== neigh.id, ]
vals <- dat[dat$ID == neigh.id & (dat$date >= r$common_date_begin & dat$date <= r$common_date_end), ]
return (vals$value)
})
d <- d %>%
dplyr::select(-c(common_date_begin, common_date_end)) %>%
dplyr::mutate(correlation = sapply(nearest.vals, cor, y = main.vals),
n = diff_days)
return(d)
}))
> df
# ID ID_nearest diff_days correlation n
# 1 1 1117 2468 0.527024 2468
# 2 1 920 1709 -0.469635 1709
我们循环遍历邻居数据中的每个唯一 ID
,从 dat
date.frame 中过滤掉它的值,随后过滤掉 [=] 中每个邻居的值14=]data.frame,查看主id对应的rainfall与各邻居id的相关性
我使用了以下数据(修改为将 ID_nearest
值添加到 dat
):
library(dplyr)
library(magrittr)
dat <- read.table(text = "
1 2000-03-01 55.3
1 2000-03-02 55.6
1 2005-03-03 48.3
920 2000-03-01 14.2
920 2000-04-02 68.3
920 2000-04-03 68.4
1117 2003-03-01 0.1
1117 2003-06-13 36.1
1117 2003-06-14 0.3
", col.names = c("ID", "date", "value"))
dat$date <- as.POSIXct(dat$date)
neighbors <- read.table(text = "
ID ID_nearest common_date_begin common_date_end diff_days
1 1117 2000-03-01 2006-12-03 2468
1 920 2000-03-01 2004-11-04 1709
", header = TRUE)
neighbors$common_date_begin <- as.POSIXct(neighbors$common_date_begin)
neighbors$common_date_end <- as.POSIXct(neighbors$common_date_end)
试试这个
library(data.table)
dat <- fread("https://www.dropbox.com/s/yub3db3739d80h2/dat.csv?dl=1")
neighbors <- fread("https://www.dropbox.com/s/phhskbhxsxmrxy1/neighbours.csv?dl=1")
results <- neighbors[, -c(3:4)]
i <- as.numeric(neighbors[1, 1])
correlations <- matrix(NA, nrow = nrow(neighbors), ncol =1)
ids <- unique(neighbors$ID)
x <- 1
for (i in ids) {
temp <- neighbors[ID==i]
for (id in 1:nrow(temp)){
near_id <- as.numeric(temp[id, 2])
beg_date <- temp[id, 3]
end_date <- temp[id, 4]
correlations[x,1] <- cor(dat[ID==i & date %between% c(beg_date, end_date)]$value,
dat[ID==near_id & date %between% c(beg_date, end_date)]$value)
x <- x + 1
}
}
results <- cbind(results[, 1], results[, 2], correlations, results[, 3])
colnames(results) <- c("ID", "ID_nearest", "correlation", "n")
你可以先试试这个:
DT <- rnfl[neighbors, on=.(ID, date>=common_date_begin, date<=common_date_end),
c(mget(paste0("i.", names(neighbors))),
by=.EACHI,
.(date=x.date, v1=x.value))][, (1L:3L) := NULL]
setnames(DT, names(DT), gsub("i.", "", names(DT), fixed=TRUE))
DT[rnfl, on=.(ID_nearest=ID, date), v2 := value]
DT[, .(correlation=cor(v1, v2)), names(neighbors)]
如果太慢,我们可以尝试其他方法。
数据以及来自上一个问题的数据:
library(data.table)
rnfl <- data.table(ID=c(1,1,1,1,1,2,2,2,2,2),
date=Sys.Date() + c(0:4, 2:6),
value=c(17.6, 5.6, 4.5, 8.3, 11.7, 10.7, 15.6, 11.6, 8.3, 2.3))
near <- data.table(ID=1, ID_nearest=2)
summ <- rnfl[, .(startdate=date[1L], enddate=date[.N]),
.(ID, g=cumsum(c(0L, diff(date)!=1L)))]
setkey(summ, startdate, enddate)
olap <- unique(foverlaps(summ, summ)[ID!=i.ID, .(
ID1=pmin(ID, i.ID),
ID2=pmax(ID, i.ID),
common_date_begin=pmax(startdate, i.startdate),
common_date_end=pmin(enddate, i.enddate))])
near[, c("ID1", "ID2") := .(pmin(ID, ID_nearest), pmax(ID, ID_nearest))]
cols <- c("common_date_begin", "common_date_end")
neighbors <- near[olap, on=.(ID1, ID2), (cols) := mget(paste0("i.", cols))][,
n := as.integer(common_date_end - common_date_begin)]
我有一个数据 table,其中包含多个雨量计的降雨量测量值。这是我的数据集的示例:
library(data.table)
dat <- fread("https://www.dropbox.com/s/yub3db3739d80h2/dat.csv?dl=1")
> dat
ID date value
1: 937 2000-01-01 14.2
2: 937 2000-01-02 68.3
3: 937 2000-01-03 28.4
4: 937 2000-01-04 30.2
5: 937 2000-01-05 12.8
---
33905: 1600 2017-06-12 0.1
33906: 1600 2017-06-13 36.1
33907: 1600 2017-06-14 0.3
33908: 1600 2017-06-15 0.0
33909: 1600 2017-06-16 0.0
我还有一个数据 table,其中包含每个仪表的 ID 以及最近的几个仪表的 ID,以及它们测量降雨量的共同日期:
neighbors <- fread("https://www.dropbox.com/s/phhskbhxsxmrxy1/neighbours.csv?dl=1")
> neighbors
ID ID_nearest common_date_begin common_date_end diff_days
1: 1 1117 2000-03-01 2006-12-03 2468
2: 1 920 2000-03-01 2004-11-04 1709
3: 1000 48 2000-03-01 2006-12-03 2468
4: 1000 1600 2000-03-01 2017-06-16 6316
5: 1000 937 2000-03-01 2017-01-22 6171
6: 1001 352 2007-07-10 2017-06-16 3629
7: 1001 324 2007-07-10 2017-06-16 3629
8: 1002 1338 2006-01-01 2017-06-16 4184
9: 1002 412 2006-01-01 2009-07-12 1288
10: 1002 1330 2006-01-01 2017-06-16 4184
11: 1002 1349 2006-01-01 2017-06-16 4184
12: 1009 801 2006-01-01 2017-01-22 4039
例如,仪表 ID 1
有两个近邻:ID 的 1117
和 920
。 1
和 1117
台站的重叠测量周期为 2000 年 3 月 1 日至 2006 年 12 月 3 日。
对于 neighbors
中的每个组合,我需要计算重叠日期内主要和周围雨量计之间的降雨量测量值的相关性。
例如,第一对的相关性计算如下:
cor(dat[ID==1 & date %between% c("2000-03-01", "2006-12-03")]$value,
dat[ID==1117 & date %between% c("2000-03-01", "2006-12-03")]$value)
cor(dat[ID==1 & date %between% c("2000-03-01", "2004-11-04")]$value,
dat[ID==920 & date %between% c("2000-03-01", "2004-11-04")]$value)
预期的输出是这样的:
ID ID_nearest correlation n
1 1117 0.55 2468
1 920 0.48 1709
1000 48 0.77 2468
1000 1600 0.52 6316
1000 937 0.84 6171
neighbors
中的每个 ID
依此类推。
但我很难想出一种编程方式来实现这一目标。
我该怎么做?提前致谢。
这是一种方法
> df <- do.call(rbind, lapply(unique(neighbors$ID), function(id) {
d <- neighbors[neighbors[, "ID"] %in% id, ]
main.vals <- dat %>%
dplyr::filter(ID == id & (date >= d$common_date_begin & date <= max(d$common_date_end))) %>%
dplyr::select(value)
main.vals <- main.vals$value
nearest.vals <- lapply(unique(d$ID_nearest), function(neigh.id) {
r <- d[d$ID_nearest== neigh.id, ]
vals <- dat[dat$ID == neigh.id & (dat$date >= r$common_date_begin & dat$date <= r$common_date_end), ]
return (vals$value)
})
d <- d %>%
dplyr::select(-c(common_date_begin, common_date_end)) %>%
dplyr::mutate(correlation = sapply(nearest.vals, cor, y = main.vals),
n = diff_days)
return(d)
}))
> df
# ID ID_nearest diff_days correlation n
# 1 1 1117 2468 0.527024 2468
# 2 1 920 1709 -0.469635 1709
我们循环遍历邻居数据中的每个唯一 ID
,从 dat
date.frame 中过滤掉它的值,随后过滤掉 [=] 中每个邻居的值14=]data.frame,查看主id对应的rainfall与各邻居id的相关性
我使用了以下数据(修改为将 ID_nearest
值添加到 dat
):
library(dplyr)
library(magrittr)
dat <- read.table(text = "
1 2000-03-01 55.3
1 2000-03-02 55.6
1 2005-03-03 48.3
920 2000-03-01 14.2
920 2000-04-02 68.3
920 2000-04-03 68.4
1117 2003-03-01 0.1
1117 2003-06-13 36.1
1117 2003-06-14 0.3
", col.names = c("ID", "date", "value"))
dat$date <- as.POSIXct(dat$date)
neighbors <- read.table(text = "
ID ID_nearest common_date_begin common_date_end diff_days
1 1117 2000-03-01 2006-12-03 2468
1 920 2000-03-01 2004-11-04 1709
", header = TRUE)
neighbors$common_date_begin <- as.POSIXct(neighbors$common_date_begin)
neighbors$common_date_end <- as.POSIXct(neighbors$common_date_end)
试试这个
library(data.table)
dat <- fread("https://www.dropbox.com/s/yub3db3739d80h2/dat.csv?dl=1")
neighbors <- fread("https://www.dropbox.com/s/phhskbhxsxmrxy1/neighbours.csv?dl=1")
results <- neighbors[, -c(3:4)]
i <- as.numeric(neighbors[1, 1])
correlations <- matrix(NA, nrow = nrow(neighbors), ncol =1)
ids <- unique(neighbors$ID)
x <- 1
for (i in ids) {
temp <- neighbors[ID==i]
for (id in 1:nrow(temp)){
near_id <- as.numeric(temp[id, 2])
beg_date <- temp[id, 3]
end_date <- temp[id, 4]
correlations[x,1] <- cor(dat[ID==i & date %between% c(beg_date, end_date)]$value,
dat[ID==near_id & date %between% c(beg_date, end_date)]$value)
x <- x + 1
}
}
results <- cbind(results[, 1], results[, 2], correlations, results[, 3])
colnames(results) <- c("ID", "ID_nearest", "correlation", "n")
你可以先试试这个:
DT <- rnfl[neighbors, on=.(ID, date>=common_date_begin, date<=common_date_end),
c(mget(paste0("i.", names(neighbors))),
by=.EACHI,
.(date=x.date, v1=x.value))][, (1L:3L) := NULL]
setnames(DT, names(DT), gsub("i.", "", names(DT), fixed=TRUE))
DT[rnfl, on=.(ID_nearest=ID, date), v2 := value]
DT[, .(correlation=cor(v1, v2)), names(neighbors)]
如果太慢,我们可以尝试其他方法。
数据以及来自上一个问题的数据:
library(data.table)
rnfl <- data.table(ID=c(1,1,1,1,1,2,2,2,2,2),
date=Sys.Date() + c(0:4, 2:6),
value=c(17.6, 5.6, 4.5, 8.3, 11.7, 10.7, 15.6, 11.6, 8.3, 2.3))
near <- data.table(ID=1, ID_nearest=2)
summ <- rnfl[, .(startdate=date[1L], enddate=date[.N]),
.(ID, g=cumsum(c(0L, diff(date)!=1L)))]
setkey(summ, startdate, enddate)
olap <- unique(foverlaps(summ, summ)[ID!=i.ID, .(
ID1=pmin(ID, i.ID),
ID2=pmax(ID, i.ID),
common_date_begin=pmax(startdate, i.startdate),
common_date_end=pmin(enddate, i.enddate))])
near[, c("ID1", "ID2") := .(pmin(ID, ID_nearest), pmax(ID, ID_nearest))]
cols <- c("common_date_begin", "common_date_end")
neighbors <- near[olap, on=.(ID1, ID2), (cols) := mget(paste0("i.", cols))][,
n := as.integer(common_date_end - common_date_begin)]