在 R 中根据另一个数据框的信息,向数据框添加一个计数总和的新列

Add a new column with sum of count to a dataframe according to informations from another in R

我需要帮助:根据另一个表 tab2,向名为 tab1 的表中添加一个计数列。

这是第一个表:

tab1

    Event_Groups Other_column
1      1_G1,2_G2            A
2           2_G1            B
3           4_G4            C
4 7_G5,8_G5,9_G5            D

如您在 Event_Groups 列中所见,每个元素包含 2 条信息(Event 编号和 Group 编号,以“_”分隔)。这些信息也可以在 tab2$Group 和 tab2$Event 中找到。我的想法是:针对 tab1 每一行中的每个元素(以逗号分隔),计算 tab2 中满足 VALUE1 < 10 AND VALUE2 > 30 的对应行数,然后将此计数添加到 tab1 中名为 Sum_count 的新列。

这是 tab2

  Group Event VALUE1 VALUE2
1    G1     1      5     50  <- VALUE1 < 10 & VALUE2 > 30 : count 1  
2    G1     2     6      20  <- VALUE2 < 30  : count 0 
3    G2     2     50     50  <- VALUE1 > 10  : count 0
4    G3     3      0      0
5    G4     1      0      0
6    G4     4      2     40  <- VALUE1 < 10 & VALUE2 > 30 : count 1 
7    G5     7      1     70  <- VALUE1 < 10 & VALUE2 > 30 : count 1 
8    G5     8      4     67  <- VALUE1 < 10 & VALUE2 > 30 : count 1 
9    G5     9      3     60  <- VALUE1 < 10 & VALUE2 > 30 : count 1 

示例:

这是预期的 tab1 数据框结果:

Event_Groups     Other_column Sum_count
1_G1,2_G2        A            1
2_G1             B            0
4_G4             C            1
7_G5,8_G5,9_G5   D            3

不知道我说的够不够清楚,有问题欢迎提问。


如果有帮助的话,这里是两个表的 dput 格式:

tab1

# dput() representation of tab1: a 4-row data.frame with two factor columns,
# Event_Groups (comma-separated "Event_Group" items) and Other_column.
# The expression is not assigned here; wrap it as `tab1 <- structure(...)`
# to recreate the table.
structure(list(Event_Groups = structure(1:4, .Label = c("1_G1,2_G2", 
"2_G1", "4_G4", "7_G5,8_G5,9_G5"), class = "factor"), Other_column =
structure(1:4, .Label = c("A",  "B", "C", "D"), class = "factor")),
class = "data.frame", row.names = c(NA, 
-4L))

tab2

# dput() representation of tab2: a 9-row data.frame with a factor column Group,
# integer columns Event and VALUE1, and a double column VALUE2.
# The expression is not assigned here; wrap it as `tab2 <- structure(...)`
# to recreate the table.
structure(list(Group = structure(c(1L, 1L, 2L, 3L, 4L, 4L, 5L, 
5L, 5L), .Label = c("G1", "G2", "G3", "G4", "G5"), class = "factor"), 
    Event = c(1L, 2L, 2L, 3L, 1L, 4L, 7L, 8L, 9L), VALUE1 = c(5L, 
    6L, 50L, 0L, 0L, 2L, 1L, 4L, 3L), VALUE2 = c(50, 20, 50, 
    0, 0, 40, 70, 67, 60)), class = "data.frame", row.names = c(NA, 
-9L))

你可以试试tidyverse

library(tidyverse)

# For every row of tab1, count how many of its comma-separated "Event_Group"
# items match a tab2 row where VALUE1 < 10 & VALUE2 > 30.
tab1 %>% 
  # Keep the original row index so the split items can be re-aggregated per row.
  rownames_to_column() %>% 
  # One row per comma-separated item.
  separate_rows(Event_Groups, sep = ",") %>% 
  # Split e.g. "1_G1" into Event and Group; convert = TRUE type-converts the
  # pieces so Event can be matched against the integer tab2$Event.
  separate(Event_Groups, into = c("Event", "Group"), sep = "_",
           convert = TRUE) %>% 
  left_join(tab2 %>% 
              # 1 when the tab2 row satisfies the condition, otherwise 0.
              mutate(count = as.numeric(VALUE1 < 10 & VALUE2 > 30)), 
            by = c("Event", "Group")) %>% 
  # Rebuild the "Event_Group" label before collapsing back to one row per
  # original tab1 row.
  unite(Event_Groups, Event, Group) %>% 
  group_by(rowname) %>% 
  summarise(Event_Groups = toString(Event_Groups),
            Other_column = unique(Other_column),
            # Items with no match in tab2 get count = NA from the left join;
            # na.rm = TRUE makes them contribute 0 instead of turning the
            # whole row's total into NA.
            count = sum(count, na.rm = TRUE))
# A tibble: 4 x 4
  rowname Event_Groups     Other_column count
  <chr>   <chr>            <chr>        <dbl>
1 1       1_G1, 2_G2       A                1
2 2       2_G1             B                0
3 3       4_G4             C                1
4 4       7_G5, 8_G5, 9_G5 D                3

这是一种方法:

library(dplyr)
library(tidyr)

# For every row of tab1, count how many of its comma-separated "Event_Group"
# items match a tab2 row where VALUE1 < 10 & VALUE2 > 30, and store the total
# in a new column Sum_count.
tab1 %>% 
  # Event_Groups is a factor in the dput data; work on plain strings.
  mutate(Event_Groups = as.character(Event_Groups)) %>% 
  # One row per comma-separated item.
  separate_rows(Event_Groups, sep = ",") %>% 
  # The join key is left implicit: the only shared column is Event_Groups,
  # which is built on the tab2 side by pasting Event and Group with "_".
  left_join(tab2 %>% 
              unite(col = "Event_Groups", Event, Group) %>% 
              # 1L when the tab2 row satisfies the condition, otherwise 0L.
              mutate(count = if_else(VALUE1 < 10 & VALUE2 > 30, 1L, 0L))) %>%
  # NOTE(review): grouping by Other_column assumes its values are unique per
  # tab1 row (true for the sample data); rows sharing a value would be merged.
  group_by(Other_column) %>% 
  summarise(Event_Groups = paste(unique(Event_Groups), collapse = ","),
            # Items with no match in tab2 get count = NA from the left join;
            # na.rm = TRUE makes them contribute 0 instead of turning the
            # whole row's total into NA.
            Sum_count = sum(count, na.rm = TRUE)) %>% 
  select(Event_Groups, everything())

#> Joining, by = "Event_Groups"
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 4 x 3
#>   Event_Groups   Other_column Sum_count
#>   <chr>          <fct>            <int>
#> 1 1_G1,2_G2      A                    1
#> 2 2_G1           B                    0
#> 3 4_G4           C                    1
#> 4 7_G5,8_G5,9_G5 D                    3

由 reprex package (v0.3.0) 于 2021-07-29 创建