映射列并使用 map_dbl 提取第一个数字
map over columns and extract the first number using map_dbl
我有以下数据:
08001 08003 08005 08006 08007 08009 08010_AM 08014_AM 0801501
08001 4875.276, 4875.276 8.448, 8.448 16.876, 16.876 14.56, 14.56 NULL 15.066, 15.066 24.953, 24.953 7.229, 7.229 7.074, 7.074
08003 15.632 3663.7 15.794 129.029 13.371 145.019 NULL 12.382 180.696
08005 10.611 4.911 2474.452 29.844 NULL 43.95 NULL 347.076 NULL
08006 7.291 69.755 29.844 10841.55 4947.665 200.296 NULL 25.047 19.926
08007 NULL 33.67 17.269 4954.248 3875.372 111.159 NULL NULL 9.84
08009 32.811 106.313 58.019 145.959 158.566 2791.247 NULL 6.568 12.59
08010_AM 40.875, 40.875 NULL 4.341, 4.341 NULL NULL NULL 3039.333, 3039.333 4.341, 4.341 NULL
08014_AM 4.732, 4.732 10.249, 10.249 311.877, 311.877 6.568, 6.568 NULL 10.831, 10.831 4.341, 4.341 420.177, 420.177 6.172, 6.172
0801501 12.344 220.281 NULL 32.741 NULL 18.797 NULL 6.172 1069.609
0801502 23.21 293.464 13.865 34.51 5.624 29.219 NULL NULL 4779.908
有些列中有两个数值。我可以使用以下内容映射列并取平均值:
mutate(across(everything(), ~ map_dbl(.x, mean, na.rm = TRUE)))
但是,这在整个数据集上花费的时间太长。我注意到所有重复的列数字都是相同的,所以我的问题是如何更改 map
函数而不是计算两个数字的平均值,取第一个数值?
即在第 1 列 08001
中有值 40.875, 40.875
,所以我怎样才能只提取第一个数字,而不是取这两个数字的平均值?
数据:
df <- structure(list(`08001` = list(c(4875.276, 4875.276), 15.632,
10.611, 7.291, NULL, 32.811, c(40.875, 40.875), c(4.732,
4.732), 12.344, 23.21), `08003` = list(c(8.448, 8.448), 3663.7,
4.911, 69.755, 33.67, 106.313, NULL, c(10.249, 10.249), 220.281,
293.464), `08005` = list(c(16.876, 16.876), 15.794, 2474.452,
29.844, 17.269, 58.019, c(4.341, 4.341), c(311.877, 311.877
), NULL, 13.865), `08006` = list(c(14.56, 14.56), 129.029,
29.844, 10841.553, 4954.248, 145.959, NULL, c(6.568, 6.568
), 32.741, 34.51), `08007` = list(NULL, 13.371, NULL, 4947.665,
3875.372, 158.566, NULL, NULL, NULL, 5.624), `08009` = list(
c(15.066, 15.066), 145.019, 43.95, 200.296, 111.159, 2791.247,
NULL, c(10.831, 10.831), 18.797, 29.219), `08010_AM` = list(
c(24.953, 24.953), NULL, NULL, NULL, NULL, NULL, c(3039.333,
3039.333), c(4.341, 4.341), NULL, NULL), `08014_AM` = list(
c(7.229, 7.229), 12.382, 347.076, 25.047, NULL, 6.568, c(4.341,
4.341), c(420.177, 420.177), 6.172, NULL), `0801501` = list(
c(7.074, 7.074), 180.696, NULL, 19.926, 9.84, 12.59, NULL,
c(6.172, 6.172), 1069.609, 4779.908), `0801502` = list(c(26.747,
26.747), 305.18, 17.419, 27.801, 5.624, 20.512, NULL, NULL, 4948.395,
3647.42)), row.names = c("08001", "08003", "08005", "08006",
"08007", "08009", "08010_AM", "08014_AM", "0801501", "0801502"
), class = "data.frame")
可以使用imap_dfc
-
library(tibble)
library(purrr)
imap_dfc(df, function(x, y)
tibble(!!y := map_dbl(x, ~if(length(.x)) .x[1] else NA)))
# `08001` `08003` `08005` `08006` `08007` `08009` `08010_AM` `08014_AM`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 4875. 8.45 16.9 1.46e1 NA 15.1 25.0 7.23
# 2 15.6 3664. 15.8 1.29e2 13.4 145. NA 12.4
# 3 10.6 4.91 2474. 2.98e1 NA 44.0 NA 347.
# 4 7.29 69.8 29.8 1.08e4 4948. 200. NA 25.0
# 5 NA 33.7 17.3 4.95e3 3875. 111. NA NA
# 6 32.8 106. 58.0 1.46e2 159. 2791. NA 6.57
# 7 40.9 NA 4.34 NA NA NA 3039. 4.34
# 8 4.73 10.2 312. 6.57e0 NA 10.8 4.34 420.
# 9 12.3 220. NA 3.27e1 NA 18.8 NA 6.17
#10 23.2 293. 13.9 3.45e1 5.62 29.2 NA NA
# … with 2 more variables: 0801501 <dbl>, 0801502 <dbl>
上面的输出是一个小标题,小标题不支持行名。如果您想保留行名,请采用这种基本的 R 方法 -
df[] <- lapply(df, function(x)
sapply(x, function(y) if(length(y)) y[1] else NA))
df
# 08001 08003 08005 08006 08007 08009
#08001 4875.276 8.448 16.876 14.560 NA 15.066
#08003 15.632 3663.700 15.794 129.029 13.371 145.019
#08005 10.611 4.911 2474.452 29.844 NA 43.950
#08006 7.291 69.755 29.844 10841.553 4947.665 200.296
#08007 NA 33.670 17.269 4954.248 3875.372 111.159
#08009 32.811 106.313 58.019 145.959 158.566 2791.247
#08010_AM 40.875 NA 4.341 NA NA NA
#08014_AM 4.732 10.249 311.877 6.568 NA 10.831
#0801501 12.344 220.281 NA 32.741 NA 18.797
#0801502 23.210 293.464 13.865 34.510 5.624 29.219
# 08010_AM 08014_AM 0801501 0801502
#08001 24.953 7.229 7.074 26.747
#08003 NA 12.382 180.696 305.180
#08005 NA 347.076 NA 17.419
#08006 NA 25.047 19.926 27.801
#08007 NA NA 9.840 5.624
#08009 NA 6.568 12.590 20.512
#08010_AM 3039.333 4.341 NA NA
#08014_AM 4.341 420.177 6.172 NA
#0801501 NA 6.172 1069.609 4948.395
#0801502 NA NA 4779.908 3647.420
我们可以使用 map
和 summarise
library(dplyr)
library(purrr)
df %>%
summarise(across(everything(),
~ map_dbl(., function(x) if(is.null(x)) NA_real_ else x[1])))
08001 08003 08005 08006 08007 08009 08010_AM 08014_AM 0801501 0801502
1 4875.276 8.448 16.876 14.56 NA 15.066 24.953 7.229 7.074 26.747
2 15.632 3663.7 15.794 129.029 13.371 145.019 NA 12.382 180.696 305.18
3 10.611 4.911 2474.452 29.844 NA 43.95 NA 347.076 NA 17.419
4 7.291 69.755 29.844 10841.55 4947.665 200.296 NA 25.047 19.926 27.801
5 NA 33.67 17.269 4954.248 3875.372 111.159 NA NA 9.84 5.624
6 32.811 106.313 58.019 145.959 158.566 2791.247 NA 6.568 12.59 20.512
7 40.875 NA 4.341 NA NA NA 3039.333 4.341 NA NA
8 4.732 10.249 311.877 6.568 NA 10.831 4.341 420.177 6.172 NA
9 12.344 220.281 NA 32.741 NA 18.797 NA 6.172 1069.609 4948.395
10 23.21 293.464 13.865 34.51 5.624 29.219 NA NA 4779.908 3647.42
已更新 我对我的第一个解决方案做了一些修改以获得所需的输出,我没有注意到输出有问题。
library(dplyr)
library(purrr)
df %>%
map_dfc(~ .x %>% map_dbl(~ if(is.null(.x)) NA else .x[1]))
# A tibble: 10 x 10
`08001` `08003` `08005` `08006` `08007` `08009` `08010_AM` `08014_AM` `0801501` `0801502`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4875. 8.45 16.9 14.6 NA 15.1 25.0 7.23 7.07 26.7
2 15.6 3664. 15.8 129. 13.4 145. NA 12.4 181. 305.
3 10.6 4.91 2474. 29.8 NA 44.0 NA 347. NA 17.4
4 7.29 69.8 29.8 10842. 4948. 200. NA 25.0 19.9 27.8
5 NA 33.7 17.3 4954. 3875. 111. NA NA 9.84 5.62
6 32.8 106. 58.0 146. 159. 2791. NA 6.57 12.6 20.5
7 40.9 NA 4.34 NA NA NA 3039. 4.34 NA NA
8 4.73 10.2 312. 6.57 NA 10.8 4.34 420. 6.17 NA
9 12.3 220. NA 32.7 NA 18.8 NA 6.17 1070. 4948.
10 23.2 293. 13.9 34.5 5.62 29.2 NA NA 4780. 3647.
否则我们可以使用 map_depth
:
df %>%
map_depth(2, ~ if(is.null(.x)) NA_real_ else .x[1]) %>%
map_dfc(~ .x %>% unlist)
# A tibble: 10 x 10
`08001` `08003` `08005` `08006` `08007` `08009` `08010_AM` `08014_AM` `0801501` `0801502`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4875. 8.45 16.9 14.6 NA 15.1 25.0 7.23 7.07 26.7
2 15.6 3664. 15.8 129. 13.4 145. NA 12.4 181. 305.
3 10.6 4.91 2474. 29.8 NA 44.0 NA 347. NA 17.4
4 7.29 69.8 29.8 10842. 4948. 200. NA 25.0 19.9 27.8
5 NA 33.7 17.3 4954. 3875. 111. NA NA 9.84 5.62
6 32.8 106. 58.0 146. 159. 2791. NA 6.57 12.6 20.5
7 40.9 NA 4.34 NA NA NA 3039. 4.34 NA NA
8 4.73 10.2 312. 6.57 NA 10.8 4.34 420. 6.17 NA
9 12.3 220. NA 32.7 NA 18.8 NA 6.17 1070. 4948.
10 23.2 293. 13.9 34.5 5.62 29.2 NA NA 4780. 3647.
我有以下数据:
08001 08003 08005 08006 08007 08009 08010_AM 08014_AM 0801501
08001 4875.276, 4875.276 8.448, 8.448 16.876, 16.876 14.56, 14.56 NULL 15.066, 15.066 24.953, 24.953 7.229, 7.229 7.074, 7.074
08003 15.632 3663.7 15.794 129.029 13.371 145.019 NULL 12.382 180.696
08005 10.611 4.911 2474.452 29.844 NULL 43.95 NULL 347.076 NULL
08006 7.291 69.755 29.844 10841.55 4947.665 200.296 NULL 25.047 19.926
08007 NULL 33.67 17.269 4954.248 3875.372 111.159 NULL NULL 9.84
08009 32.811 106.313 58.019 145.959 158.566 2791.247 NULL 6.568 12.59
08010_AM 40.875, 40.875 NULL 4.341, 4.341 NULL NULL NULL 3039.333, 3039.333 4.341, 4.341 NULL
08014_AM 4.732, 4.732 10.249, 10.249 311.877, 311.877 6.568, 6.568 NULL 10.831, 10.831 4.341, 4.341 420.177, 420.177 6.172, 6.172
0801501 12.344 220.281 NULL 32.741 NULL 18.797 NULL 6.172 1069.609
0801502 23.21 293.464 13.865 34.51 5.624 29.219 NULL NULL 4779.908
有些列中有两个数值。我可以使用以下内容映射列并取平均值:
mutate(across(everything(), ~ map_dbl(.x, mean, na.rm = TRUE)))
但是,这在整个数据集上花费的时间太长。我注意到所有重复的列数字都是相同的,所以我的问题是如何更改 map
函数而不是计算两个数字的平均值,取第一个数值?
即在第 1 列 08001
中有值 40.875, 40.875
,所以我怎样才能只提取第一个数字,而不是取这两个数字的平均值?
数据:
df <- structure(list(`08001` = list(c(4875.276, 4875.276), 15.632,
10.611, 7.291, NULL, 32.811, c(40.875, 40.875), c(4.732,
4.732), 12.344, 23.21), `08003` = list(c(8.448, 8.448), 3663.7,
4.911, 69.755, 33.67, 106.313, NULL, c(10.249, 10.249), 220.281,
293.464), `08005` = list(c(16.876, 16.876), 15.794, 2474.452,
29.844, 17.269, 58.019, c(4.341, 4.341), c(311.877, 311.877
), NULL, 13.865), `08006` = list(c(14.56, 14.56), 129.029,
29.844, 10841.553, 4954.248, 145.959, NULL, c(6.568, 6.568
), 32.741, 34.51), `08007` = list(NULL, 13.371, NULL, 4947.665,
3875.372, 158.566, NULL, NULL, NULL, 5.624), `08009` = list(
c(15.066, 15.066), 145.019, 43.95, 200.296, 111.159, 2791.247,
NULL, c(10.831, 10.831), 18.797, 29.219), `08010_AM` = list(
c(24.953, 24.953), NULL, NULL, NULL, NULL, NULL, c(3039.333,
3039.333), c(4.341, 4.341), NULL, NULL), `08014_AM` = list(
c(7.229, 7.229), 12.382, 347.076, 25.047, NULL, 6.568, c(4.341,
4.341), c(420.177, 420.177), 6.172, NULL), `0801501` = list(
c(7.074, 7.074), 180.696, NULL, 19.926, 9.84, 12.59, NULL,
c(6.172, 6.172), 1069.609, 4779.908), `0801502` = list(c(26.747,
26.747), 305.18, 17.419, 27.801, 5.624, 20.512, NULL, NULL, 4948.395,
3647.42)), row.names = c("08001", "08003", "08005", "08006",
"08007", "08009", "08010_AM", "08014_AM", "0801501", "0801502"
), class = "data.frame")
可以使用imap_dfc
-
library(tibble)
library(purrr)
imap_dfc(df, function(x, y)
tibble(!!y := map_dbl(x, ~if(length(.x)) .x[1] else NA)))
# `08001` `08003` `08005` `08006` `08007` `08009` `08010_AM` `08014_AM`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 4875. 8.45 16.9 1.46e1 NA 15.1 25.0 7.23
# 2 15.6 3664. 15.8 1.29e2 13.4 145. NA 12.4
# 3 10.6 4.91 2474. 2.98e1 NA 44.0 NA 347.
# 4 7.29 69.8 29.8 1.08e4 4948. 200. NA 25.0
# 5 NA 33.7 17.3 4.95e3 3875. 111. NA NA
# 6 32.8 106. 58.0 1.46e2 159. 2791. NA 6.57
# 7 40.9 NA 4.34 NA NA NA 3039. 4.34
# 8 4.73 10.2 312. 6.57e0 NA 10.8 4.34 420.
# 9 12.3 220. NA 3.27e1 NA 18.8 NA 6.17
#10 23.2 293. 13.9 3.45e1 5.62 29.2 NA NA
# … with 2 more variables: 0801501 <dbl>, 0801502 <dbl>
上面的输出是一个小标题,小标题不支持行名。如果您想保留行名,请采用这种基本的 R 方法 -
df[] <- lapply(df, function(x)
sapply(x, function(y) if(length(y)) y[1] else NA))
df
# 08001 08003 08005 08006 08007 08009
#08001 4875.276 8.448 16.876 14.560 NA 15.066
#08003 15.632 3663.700 15.794 129.029 13.371 145.019
#08005 10.611 4.911 2474.452 29.844 NA 43.950
#08006 7.291 69.755 29.844 10841.553 4947.665 200.296
#08007 NA 33.670 17.269 4954.248 3875.372 111.159
#08009 32.811 106.313 58.019 145.959 158.566 2791.247
#08010_AM 40.875 NA 4.341 NA NA NA
#08014_AM 4.732 10.249 311.877 6.568 NA 10.831
#0801501 12.344 220.281 NA 32.741 NA 18.797
#0801502 23.210 293.464 13.865 34.510 5.624 29.219
# 08010_AM 08014_AM 0801501 0801502
#08001 24.953 7.229 7.074 26.747
#08003 NA 12.382 180.696 305.180
#08005 NA 347.076 NA 17.419
#08006 NA 25.047 19.926 27.801
#08007 NA NA 9.840 5.624
#08009 NA 6.568 12.590 20.512
#08010_AM 3039.333 4.341 NA NA
#08014_AM 4.341 420.177 6.172 NA
#0801501 NA 6.172 1069.609 4948.395
#0801502 NA NA 4779.908 3647.420
我们可以使用 map
和 summarise
library(dplyr)
library(purrr)
df %>%
summarise(across(everything(),
~ map_dbl(., function(x) if(is.null(x)) NA_real_ else x[1])))
08001 08003 08005 08006 08007 08009 08010_AM 08014_AM 0801501 0801502
1 4875.276 8.448 16.876 14.56 NA 15.066 24.953 7.229 7.074 26.747
2 15.632 3663.7 15.794 129.029 13.371 145.019 NA 12.382 180.696 305.18
3 10.611 4.911 2474.452 29.844 NA 43.95 NA 347.076 NA 17.419
4 7.291 69.755 29.844 10841.55 4947.665 200.296 NA 25.047 19.926 27.801
5 NA 33.67 17.269 4954.248 3875.372 111.159 NA NA 9.84 5.624
6 32.811 106.313 58.019 145.959 158.566 2791.247 NA 6.568 12.59 20.512
7 40.875 NA 4.341 NA NA NA 3039.333 4.341 NA NA
8 4.732 10.249 311.877 6.568 NA 10.831 4.341 420.177 6.172 NA
9 12.344 220.281 NA 32.741 NA 18.797 NA 6.172 1069.609 4948.395
10 23.21 293.464 13.865 34.51 5.624 29.219 NA NA 4779.908 3647.42
已更新 我对我的第一个解决方案做了一些修改以获得所需的输出,我没有注意到输出有问题。
library(dplyr)
library(purrr)
df %>%
map_dfc(~ .x %>% map_dbl(~ if(is.null(.x)) NA else .x[1]))
# A tibble: 10 x 10
`08001` `08003` `08005` `08006` `08007` `08009` `08010_AM` `08014_AM` `0801501` `0801502`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4875. 8.45 16.9 14.6 NA 15.1 25.0 7.23 7.07 26.7
2 15.6 3664. 15.8 129. 13.4 145. NA 12.4 181. 305.
3 10.6 4.91 2474. 29.8 NA 44.0 NA 347. NA 17.4
4 7.29 69.8 29.8 10842. 4948. 200. NA 25.0 19.9 27.8
5 NA 33.7 17.3 4954. 3875. 111. NA NA 9.84 5.62
6 32.8 106. 58.0 146. 159. 2791. NA 6.57 12.6 20.5
7 40.9 NA 4.34 NA NA NA 3039. 4.34 NA NA
8 4.73 10.2 312. 6.57 NA 10.8 4.34 420. 6.17 NA
9 12.3 220. NA 32.7 NA 18.8 NA 6.17 1070. 4948.
10 23.2 293. 13.9 34.5 5.62 29.2 NA NA 4780. 3647.
否则我们可以使用 map_depth
:
df %>%
map_depth(2, ~ if(is.null(.x)) NA_real_ else .x[1]) %>%
map_dfc(~ .x %>% unlist)
# A tibble: 10 x 10
`08001` `08003` `08005` `08006` `08007` `08009` `08010_AM` `08014_AM` `0801501` `0801502`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4875. 8.45 16.9 14.6 NA 15.1 25.0 7.23 7.07 26.7
2 15.6 3664. 15.8 129. 13.4 145. NA 12.4 181. 305.
3 10.6 4.91 2474. 29.8 NA 44.0 NA 347. NA 17.4
4 7.29 69.8 29.8 10842. 4948. 200. NA 25.0 19.9 27.8
5 NA 33.7 17.3 4954. 3875. 111. NA NA 9.84 5.62
6 32.8 106. 58.0 146. 159. 2791. NA 6.57 12.6 20.5
7 40.9 NA 4.34 NA NA NA 3039. 4.34 NA NA
8 4.73 10.2 312. 6.57 NA 10.8 4.34 420. 6.17 NA
9 12.3 220. NA 32.7 NA 18.8 NA 6.17 1070. 4948.
10 23.2 293. 13.9 34.5 5.62 29.2 NA NA 4780. 3647.