将一个组的值添加到R中的另一组

add values of one group into another group in R

我想知道如何把组内某一行的值加到同组的其余各行上,然后删除该行。例如:

# Example data: 42 rows. The 21-row Seed/Day/value pattern occurs once per
# year, so it is repeated explicitly rather than relying on data.frame()
# recycling the shorter columns.
seed_pattern <- c(rep(c(1, 2, 3, 99), each = 5), 99)
day_pattern <- c(rep(c(1, 2, 3, 4, 5), times = 4), 1)
value_pattern <- c(
  5, 2, 1, 2, 8,
  6, 7, 9, 3, 5,
  2, 1, 2, 8, 6,
  55, 66, 77, 88, 99,
  10
)

df <- data.frame(
  Year = rep(c(1, 2), each = 21),
  Cluster = c(rep("a", 20), "c", rep("b", 20), "d"),
  Seed = rep(seed_pattern, times = 2),
  Day = rep(day_pattern, times = 2),
  value = rep(value_pattern, times = 2)
)

在上面的示例中,我的数据按年(Year)、簇(Cluster)、种子(Seed)和日(Day)分组,其中需要将 Seed=99 的值加到同一(Year、Cluster、Day)组内的其他各行上,然后删除该 Seed=99 的行。例如:第 16 行属于 (Year=1, Cluster=a, Day=1, Seed=99) 组,其值 55 应加到第 1 行 (5+55)、第 6 行 (6+55) 和第 11 行 (2+55) 上,随后删除第 16 行。但是第 21 行位于 Cluster=c 且 Seed=99,应按原样保留在数据中,因为在 Year+Cluster+Day 组合中找不到任何与之匹配的其他行。

我的实际数据约有 100 万条记录,包含 10 年、80 个簇、500 天和 10+1 个种子(1 到 10 以及 99),所以我在寻找高效的解决方案。期望的输出如下:

     Year Cluster Seed Day value
1     1       a    1   1    60
2     1       a    1   2    68
3     1       a    1   3    78
4     1       a    1   4    90
5     1       a    1   5   107
6     1       a    2   1    61
7     1       a    2   2    73
8     1       a    2   3    86
9     1       a    2   4    91
10    1       a    2   5   104
11    1       a    3   1    57
12    1       a    3   2    67
13    1       a    3   3    79
14    1       a    3   4    96
15    1       a    3   5   105
16    1       c   99   1    10
17    2       b    1   1    60
18    2       b    1   2    68
19    2       b    1   3    78
20    2       b    1   4    90
21    2       b    1   5   107
22    2       b    2   1    61
23    2       b    2   2    73
24    2       b    2   3    86
25    2       b    2   4    91
26    2       b    2   5   104
27    2       b    3   1    57
28    2       b    3   2    67
29    2       b    3   3    79
30    2       b    3   4    96
31    2       b    3   5   105
32    2       d   99   1    10

一种使用 data.table 的方法:

library(data.table)

# Fold every Seed == 99 row into the other rows of its (Year, Cluster, Day)
# group, then drop the folded rows. setDT() converts df by reference.
df <- setDT(df)[
  ,
  `:=`(
    # sum() reduces the Seed == 99 values to a single number and yields 0 when
    # the group has none. The bare subset value[Seed == 99] is length 0 in such
    # a group, which makes ifelse() return NA for every row of the group.
    value = ifelse(Seed != 99, value + sum(value[Seed == 99]), value),
    # Flag Seed == 99 rows whose group holds nothing else; there is no row to
    # add them to, so they are kept unchanged.
    flag = Seed == 99 & all(Seed == 99)
  ),
  by = .(Year, Cluster, Day)
][Seed != 99 | flag][, flag := NULL]

输出:

df[]

    Year Cluster Seed Day value
 1:    1       a    1   1    60
 2:    1       a    1   2    68
 3:    1       a    1   3    78
 4:    1       a    1   4    90
 5:    1       a    1   5   107
 6:    1       a    2   1    61
 7:    1       a    2   2    73
 8:    1       a    2   3    86
 9:    1       a    2   4    91
10:    1       a    2   5   104
11:    1       a    3   1    57
12:    1       a    3   2    67
13:    1       a    3   3    79
14:    1       a    3   4    96
15:    1       a    3   5   105
16:    1       c   99   1    10
17:    2       b    1   1    60
18:    2       b    1   2    68
19:    2       b    1   3    78
20:    2       b    1   4    90
21:    2       b    1   5   107
22:    2       b    2   1    61
23:    2       b    2   2    73
24:    2       b    2   3    86
25:    2       b    2   4    91
26:    2       b    2   5   104
27:    2       b    3   1    57
28:    2       b    3   2    67
29:    2       b    3   3    79
30:    2       b    3   4    96
31:    2       b    3   5   105
32:    2       d   99   1    10

下面是使用 tidyverse 的方法。如果您追求在一百万行数据上的运行速度,data.table 的解决方案可能表现更好。

library(tidyverse)

# Example data: 42 rows. The 21-row Seed/Day/value pattern occurs once per
# year, so it is repeated explicitly rather than relying on data.frame()
# recycling the shorter columns.
seed_pattern <- c(rep(c(1, 2, 3, 99), each = 5), 99)
day_pattern <- c(rep(c(1, 2, 3, 4, 5), times = 4), 1)
value_pattern <- c(
  5, 2, 1, 2, 8,
  6, 7, 9, 3, 5,
  2, 1, 2, 8, 6,
  55, 66, 77, 88, 99,
  10
)

df <- data.frame(
  Year = rep(c(1, 2), each = 21),
  Cluster = c(rep("a", 20), "c", rep("b", 20), "d"),
  Seed = rep(seed_pattern, times = 2),
  Day = rep(day_pattern, times = 2),
  value = rep(value_pattern, times = 2)
)

# Rows whose value is to be folded into the rest of their group.
seeds <- df %>%
  filter(Seed == 99)

# Rows that receive the Seed == 99 value of their (Year, Cluster, Day) group.
others <- df %>%
  filter(Seed != 99)

# One total per key, so several Seed == 99 rows in a group cannot duplicate
# the receiving rows in the join below.
seed_totals <- seeds %>%
  group_by(Year, Cluster, Day) %>%
  summarise(extra = sum(value)) %>%
  ungroup()

# left_join (not inner_join) keeps non-99 rows whose group has no Seed == 99
# row; their `extra` is NA after the join and their value is left unchanged.
matches <- others %>%
  left_join(seed_totals, by = c("Year", "Cluster", "Day")) %>%
  mutate(value = if_else(is.na(extra), value, value + extra)) %>%
  select(Year, Cluster, Seed, Day, value)

# Seed == 99 rows with no other row in their group are kept as they are.
no_matches <- anti_join(seeds, others, by = c("Year", "Cluster", "Day"))

bind_rows(matches, no_matches) %>%
  arrange(Year, Cluster, Seed, Day)
#>    Year Cluster Seed Day value
#> 1     1       a    1   1    60
#> 2     1       a    1   2    68
#> 3     1       a    1   3    78
#> 4     1       a    1   4    90
#> 5     1       a    1   5   107
#> 6     1       a    2   1    61
#> 7     1       a    2   2    73
#> 8     1       a    2   3    86
#> 9     1       a    2   4    91
#> 10    1       a    2   5   104
#> 11    1       a    3   1    57
#> 12    1       a    3   2    67
#> 13    1       a    3   3    79
#> 14    1       a    3   4    96
#> 15    1       a    3   5   105
#> 16    1       c   99   1    10
#> 17    2       b    1   1    60
#> 18    2       b    1   2    68
#> 19    2       b    1   3    78
#> 20    2       b    1   4    90
#> 21    2       b    1   5   107
#> 22    2       b    2   1    61
#> 23    2       b    2   2    73
#> 24    2       b    2   3    86
#> 25    2       b    2   4    91
#> 26    2       b    2   5   104
#> 27    2       b    3   1    57
#> 28    2       b    3   2    67
#> 29    2       b    3   3    79
#> 30    2       b    3   4    96
#> 31    2       b    3   5   105
#> 32    2       d   99   1    10

由 reprex 包 (v0.2.1) 创建于 2018-11-23