如何使用 left_join 计算不同类别的平均值并嵌套在 R 中?
How to compute the mean in different categories using left_join and nest in R?
我正在尝试使用 left_join
和 nest
计算分箱数据的平均值。
bin.size = 100
第一个数据框:
df = data.frame(x =c(300,400),
y = c("sca1","sca2"))
x y
1 300 sca1
2 400 sca2
第二个数据帧:
df2 = data.frame(snp = c(1,2,10,100,1,2,14,16,399),
sca = c("sca1","sca1","sca1","sca1","sca2","sca2","sca2","sca2","sca2"))
snp r2 sca
1 1 0.70 sca1
2 2 0.80 sca1
3 10 0.70 sca1
4 100 0.10 sca1
5 1 0.90 sca2
6 2 0.98 sca2
7 14 0.80 sca2
8 16 0.80 sca2
9 399 0.01 sca2
来自@r2evans 的代码:
output_bin_LD = df %>%
left_join(nest(df2, snp, .key = "snp"), by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = bin.size)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
data_frame(cut = names(z), count = z)
}
)
) %>%
select(y, tbls) %>%
unnest()
这个 正在做这个:
y cut count
1 sca1 (0,100] 4
2 sca1 (100,200] 0
3 sca1 (200,300] 0
4 sca2 (0,100] 4
5 sca2 (100,200] 0
6 sca2 (200,300] 0
7 sca2 (300,400] 1
最终目标是
y cut count mean
1 sca1 (0,100] 4 0.575
2 sca1 (100,200] 0 0
3 sca1 (200,300] 0 0
4 sca2 (0,100] 4 0.87
5 sca2 (100,200] 0 0
6 sca2 (200,300] 0 0
7 sca2 (300,400] 1 399
到目前为止我试过这个:
df %>%
left_join(nest(df2, snp, r2, .key = "snp"),
by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = 100)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
a <- mean(cut(xx$r2, breaks))
data_frame(cut = names(z), count = z, mean = a)
} # .f
) # closing pmap
) %>% # mutate
select(y, tbls) %>%
unnest()
但它输出我 NA
s 和一条警告消息:
y cut count mean
1 sca1 (0,100] 4 NA
2 sca1 (100,200] 0 NA
3 sca1 (200,300] 0 NA
4 sca2 (0,100] 4 NA
5 sca2 (100,200] 0 NA
6 sca2 (200,300] 0 NA
7 sca2 (300,400] 1 NA
Warning messages:
1: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA
2: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA
我应该如何解决这个问题?我需要双重嵌套 table 吗?
不确定你的方法,但这里有一个稍微简单的方法..如果你感兴趣的话,使用data.table
包。您需要最新版本(当前为 1.10.0)才能正常工作(因为它是一项新功能)。
require(data.table) ## v1.9.8+
and <- b[a, on=.(sca=y, snp>start, snp<=end), ## 1
.(count=.N, mean=mean(r2, na.rm=TRUE)), ## 2
by=.EACHI] ## 3
对于 a
中的每一行,在 b
中查找匹配的行索引,同时根据提供给 on
参数的条件进行匹配。
length(matching row indices)
== .N
给出 count
并且 mean()
给出那些匹配索引的 r2
的平均值。
对于a
中的每一行,(2)
中的表达式是运行。
其中,a
是:
require(data.table) ## v1.9.8+
a <- setDT(df)[, .(start=seq(0, x-1, by=bin.size),
end=seq(bin.size, x, by=bin.size)),
by=y]
b <- fread("snp r2 sca
1 0.70 sca1
2 0.80 sca1
10 0.70 sca1
100 0.10 sca1
1 0.90 sca2
2 0.98 sca2
14 0.80 sca2
16 0.80 sca2
399 0.01 sca2")
这里有一个 tidyverse 选项,在 dplyr 上比在 purrr 上更重,这使得它更具可读性:
library(tidyverse)
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(), # For each group, count rows
mean = mean(r2)) %>% # and calculate mean
# Add rows for every level of the cuts. Fill new rows with zeros.
complete(cut, fill = list(count = 0L, mean = 0)) %>%
separate(cut, c('from', 'to'), sep = ',') %>% # Split cut into two numbers
mutate_at(vars(from:to), parse_number) %>% # Extract numbers from strings
left_join(df, c(sca = 'y')) %>% # Join to get x value for each group
filter(to <= x) # Subset to rows where the max cut is within the range.
#> Source: local data frame [7 x 6]
#> Groups: sca [2]
#>
#> sca from to count mean x
#> <chr> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 sca1 0 100 4 0.575 300
#> 2 sca1 100 200 0 0.000 300
#> 3 sca1 200 300 0 0.000 300
#> 4 sca2 0 100 4 0.870 400
#> 5 sca2 100 200 0 0.000 400
#> 6 sca2 200 300 0 0.000 400
#> 7 sca2 300 400 1 0.010 400
您实际上可以通过一些正则表达式和子集来避免加入和弄乱剪切:
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(),
mean = mean(r2)) %>%
complete(cut, fill = list(count = 0L, mean = 0)) %>%
filter(as.integer(gsub('.*,(\d+).*', '\1', cut)) <= df$x[unique(sca) == df$y])
#> Source: local data frame [7 x 4]
#> Groups: sca [2]
#>
#> sca cut count mean
#> <chr> <fctr> <int> <dbl>
#> 1 sca1 (0,100] 4 0.575
#> 2 sca1 (100,200] 0 0.000
#> 3 sca1 (200,300] 0 0.000
#> 4 sca2 (0,100] 4 0.870
#> 5 sca2 (100,200] 0 0.000
#> 6 sca2 (200,300] 0 0.000
#> 7 sca2 (300,400] 1 0.010
数据
df <- structure(list(x = c(300, 400), y = c("sca1", "sca2")), .Names = c("x",
"y"), row.names = c(NA, -2L), class = "data.frame")
df2 <- structure(list(snp = c(1L, 2L, 10L, 100L, 1L, 2L, 14L, 16L, 399L
), r2 = c(0.7, 0.8, 0.7, 0.1, 0.9, 0.98, 0.8, 0.8, 0.01), sca = c("sca1",
"sca1", "sca1", "sca1", "sca2", "sca2", "sca2", "sca2", "sca2"
)), .Names = c("snp", "r2", "sca"), row.names = c(NA, -9L), class = "data.frame")
我正在尝试使用 left_join
和 nest
计算分箱数据的平均值。
bin.size = 100
第一个数据框:
df = data.frame(x =c(300,400),
y = c("sca1","sca2"))
x y
1 300 sca1
2 400 sca2
第二个数据帧:
df2 = data.frame(snp = c(1,2,10,100,1,2,14,16,399),
sca = c("sca1","sca1","sca1","sca1","sca2","sca2","sca2","sca2","sca2"))
snp r2 sca
1 1 0.70 sca1
2 2 0.80 sca1
3 10 0.70 sca1
4 100 0.10 sca1
5 1 0.90 sca2
6 2 0.98 sca2
7 14 0.80 sca2
8 16 0.80 sca2
9 399 0.01 sca2
来自@r2evans 的代码:
output_bin_LD = df %>%
left_join(nest(df2, snp, .key = "snp"), by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = bin.size)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
data_frame(cut = names(z), count = z)
}
)
) %>%
select(y, tbls) %>%
unnest()
这个
y cut count
1 sca1 (0,100] 4
2 sca1 (100,200] 0
3 sca1 (200,300] 0
4 sca2 (0,100] 4
5 sca2 (100,200] 0
6 sca2 (200,300] 0
7 sca2 (300,400] 1
最终目标是
y cut count mean
1 sca1 (0,100] 4 0.575
2 sca1 (100,200] 0 0
3 sca1 (200,300] 0 0
4 sca2 (0,100] 4 0.87
5 sca2 (100,200] 0 0
6 sca2 (200,300] 0 0
7 sca2 (300,400] 1 399
到目前为止我试过这个:
df %>%
left_join(nest(df2, snp, r2, .key = "snp"),
by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = 100)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
a <- mean(cut(xx$r2, breaks))
data_frame(cut = names(z), count = z, mean = a)
} # .f
) # closing pmap
) %>% # mutate
select(y, tbls) %>%
unnest()
但它输出我 NA
s 和一条警告消息:
y cut count mean
1 sca1 (0,100] 4 NA
2 sca1 (100,200] 0 NA
3 sca1 (200,300] 0 NA
4 sca2 (0,100] 4 NA
5 sca2 (100,200] 0 NA
6 sca2 (200,300] 0 NA
7 sca2 (300,400] 1 NA
Warning messages:
1: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA
2: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA
我应该如何解决这个问题?我需要双重嵌套 table 吗?
不确定你的方法,但这里有一个稍微简单的方法..如果你感兴趣的话,使用data.table
包。您需要最新版本(当前为 1.10.0)才能正常工作(因为它是一项新功能)。
require(data.table) ## v1.9.8+
and <- b[a, on=.(sca=y, snp>start, snp<=end), ## 1
.(count=.N, mean=mean(r2, na.rm=TRUE)), ## 2
by=.EACHI] ## 3
对于
a
中的每一行,在b
中查找匹配的行索引,同时根据提供给on
参数的条件进行匹配。length(matching row indices)
==.N
给出count
并且mean()
给出那些匹配索引的r2
的平均值。对于
a
中的每一行,(2)
中的表达式是运行。
其中,a
是:
require(data.table) ## v1.9.8+
a <- setDT(df)[, .(start=seq(0, x-1, by=bin.size),
end=seq(bin.size, x, by=bin.size)),
by=y]
b <- fread("snp r2 sca
1 0.70 sca1
2 0.80 sca1
10 0.70 sca1
100 0.10 sca1
1 0.90 sca2
2 0.98 sca2
14 0.80 sca2
16 0.80 sca2
399 0.01 sca2")
这里有一个 tidyverse 选项,在 dplyr 上比在 purrr 上更重,这使得它更具可读性:
library(tidyverse)
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(), # For each group, count rows
mean = mean(r2)) %>% # and calculate mean
# Add rows for every level of the cuts. Fill new rows with zeros.
complete(cut, fill = list(count = 0L, mean = 0)) %>%
separate(cut, c('from', 'to'), sep = ',') %>% # Split cut into two numbers
mutate_at(vars(from:to), parse_number) %>% # Extract numbers from strings
left_join(df, c(sca = 'y')) %>% # Join to get x value for each group
filter(to <= x) # Subset to rows where the max cut is within the range.
#> Source: local data frame [7 x 6]
#> Groups: sca [2]
#>
#> sca from to count mean x
#> <chr> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 sca1 0 100 4 0.575 300
#> 2 sca1 100 200 0 0.000 300
#> 3 sca1 200 300 0 0.000 300
#> 4 sca2 0 100 4 0.870 400
#> 5 sca2 100 200 0 0.000 400
#> 6 sca2 200 300 0 0.000 400
#> 7 sca2 300 400 1 0.010 400
您实际上可以通过一些正则表达式和子集来避免加入和弄乱剪切:
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(),
mean = mean(r2)) %>%
complete(cut, fill = list(count = 0L, mean = 0)) %>%
filter(as.integer(gsub('.*,(\d+).*', '\1', cut)) <= df$x[unique(sca) == df$y])
#> Source: local data frame [7 x 4]
#> Groups: sca [2]
#>
#> sca cut count mean
#> <chr> <fctr> <int> <dbl>
#> 1 sca1 (0,100] 4 0.575
#> 2 sca1 (100,200] 0 0.000
#> 3 sca1 (200,300] 0 0.000
#> 4 sca2 (0,100] 4 0.870
#> 5 sca2 (100,200] 0 0.000
#> 6 sca2 (200,300] 0 0.000
#> 7 sca2 (300,400] 1 0.010
数据
df <- structure(list(x = c(300, 400), y = c("sca1", "sca2")), .Names = c("x",
"y"), row.names = c(NA, -2L), class = "data.frame")
df2 <- structure(list(snp = c(1L, 2L, 10L, 100L, 1L, 2L, 14L, 16L, 399L
), r2 = c(0.7, 0.8, 0.7, 0.1, 0.9, 0.98, 0.8, 0.8, 0.01), sca = c("sca1",
"sca1", "sca1", "sca1", "sca2", "sca2", "sca2", "sca2", "sca2"
)), .Names = c("snp", "r2", "sca"), row.names = c(NA, -9L), class = "data.frame")