在 R 中根据条件进行汇总并创建新变量
Summarizing and creating new variable according to condition in R
我做了这个虚拟数据集:
# Dummy data set: a single order with one Value per Condition code.
df <- data.frame(
  Order = "Order1",
  Condition = c("P", "A", "B", "C", "D", "E", "F"),
  Value = c(500, -10, -5, 0, 0, -10, 0)
)
假设某些条件属于不同的组。
# Named list mapping each group to the Condition codes that belong to it.
# NOTE(review): the variable is called `list`, which masks base::list when
# referenced as a variable; the name is kept because later snippets use it.
list <- list(
  Group1 = c("A", "B"),
  Group2 = c("C", "D"),
  Group3 = c("E", "F")
)
我需要按条件所属的分组对数据进行汇总,得到每组的总和与计数。
预期输出:
Order P Group1 Group2 Group3 Group1n Group2n Group3n
Order1 500 -15 0 -10 2 0 1
我在想:
# Asker's attempt: per Order, the sum of Value and a count for each group.
# Assumes dplyr is attached (for %>%, group_by and summarise).
df %>%
  group_by(Order) %>%
  summarise(
    Group1 = sum(Value[Condition == "A" | Condition == "B"]),
    Group2 = sum(Value[Condition == "C" | Condition == "D"]),
    Group3 = sum(Value[Condition == "E" | Condition == "F"]),
    # length() counts every row selected by the condition, including rows
    # whose Value is 0.
    Group1n = length(Value[Condition == "A" | Condition == "B"]),
    Group2n = length(Value[Condition == "C" | Condition == "D"]),
    Group3n = length(Value[Condition == "E" | Condition == "F"])
  )
我的输出:
# A tibble: 1 x 7
Order Group1 Group2 Group3 Group1n Group2n Group3n
<fct> <dbl> <dbl> <dbl> <dbl> <int> <int>
Order1 -15.0 0 -10.0 2 2 2
但我无法得到正确的计数。另外,有没有一种高效的方法,可以直接传入分组列表,而不必显式地写出 Condition == "A" | Condition == "B" 之类的条件?
谢谢
这应该能满足您的需求:
# Per Order: sum of Value for each group, plus the number of rows in each
# group (sum() over a logical vector counts its TRUE entries).
# Assumes dplyr is attached.
df %>%
  group_by(Order) %>%
  summarise(
    Group1 = sum(Value[Condition == "A" | Condition == "B"]),
    Group2 = sum(Value[Condition == "C" | Condition == "D"]),
    Group3 = sum(Value[Condition == "E" | Condition == "F"]),
    Group1n = sum(Condition == "A" | Condition == "B"),
    Group2n = sum(Condition == "C" | Condition == "D"),
    Group3n = sum(Condition == "E" | Condition == "F")
  )
您还可以进一步整理代码。下面这个版本只统计非零值的个数(而不是每个组中的所有行)。
# Give the group list a descriptive name instead of calling it `list`,
# which would mask the base function of the same name.
list_of_groups <- list(
  Group1 = c("A", "B"),
  Group2 = c("C", "D"),
  Group3 = c("E", "F")
)
# Per Order: sum of Value for each group, and the number of rows in each
# group whose Value is non-zero. Group membership is looked up in
# `list_of_groups` instead of being spelled out condition by condition.
# Assumes dplyr is attached.
df %>%
  group_by(Order) %>%
  summarise(
    Group1 = sum(Value[Condition %in% list_of_groups[["Group1"]]]),
    Group2 = sum(Value[Condition %in% list_of_groups[["Group2"]]]),
    Group3 = sum(Value[Condition %in% list_of_groups[["Group3"]]]),
    Group1n = sum(Condition %in% list_of_groups[["Group1"]] & Value != 0),
    Group2n = sum(Condition %in% list_of_groups[["Group2"]] & Value != 0),
    Group3n = sum(Condition %in% list_of_groups[["Group3"]] & Value != 0)
  )
这种写法直接利用了您的分组列表——这样在更改分组时,您就不必逐处修改条件(如果这对您有用的话)。
我推荐下面这种符合 tidy 风格的解决方案:
# Long lookup table with one row per (Condition, Group) pair, built from the
# named group list (`list` here is the user's group list, not base::list).
group_map <- data.frame(
  Condition = unlist(list),
  Group = rep(names(list), lengths(list)),
  stringsAsFactors = FALSE
)

# Attach the Group to every row, then summarise per Group and Order:
#   sums      - total of Value
#   n_nonzero - number of rows whose Value is not 0
# The inner join keeps only rows whose Condition appears in group_map.
# Assumes dplyr is attached.
result <- df %>%
  # Condition may be a factor; convert so it joins against a character key.
  mutate(Condition = as.character(Condition)) %>%
  # Name the key explicitly: this avoids the "Joining, by = ..." message and
  # keeps the join stable if either table later gains another shared column.
  inner_join(group_map, by = "Condition") %>%
  group_by(Group, Order) %>%
  summarize(sums = sum(Value), n_nonzero = sum(Value != 0))
result
# # A tibble: 3 x 4
# # Groups: Group [?]
# Group Order sums n_nonzero
# <chr> <fctr> <dbl> <int>
# 1 Group1 Order1 -15 2
# 2 Group2 Order1 0 0
# 3 Group3 Order1 -10 1
如果您需要宽格式,可以使用 data.table:
(参见:重塑多个列)
library(data.table)
# Convert `result` to a data.table by reference, then cast it to wide format:
# one row per Order and one column per (value variable, Group) combination.
setDT(result)
data.table::dcast(result, Order ~ Group, value.var = c("sums", "n_nonzero"))
# Order sums_Group1 sums_Group2 sums_Group3 n_nonzero_Group1 n_nonzero_Group2 n_nonzero_Group3
# 1: Order1 -15 0 -10 2 0 1
我做了这个虚拟数据集:
# Dummy data set: a single order with one Value per Condition code.
df <- data.frame(
  Order = "Order1",
  Condition = c("P", "A", "B", "C", "D", "E", "F"),
  Value = c(500, -10, -5, 0, 0, -10, 0)
)
假设某些条件属于不同的组。
# Named list mapping each group to the Condition codes that belong to it.
# NOTE(review): the variable is called `list`, which masks base::list when
# referenced as a variable; the name is kept because later snippets use it.
list <- list(
  Group1 = c("A", "B"),
  Group2 = c("C", "D"),
  Group3 = c("E", "F")
)
我需要按条件所属的分组对数据进行汇总,得到每组的总和与计数。
预期输出:
Order P Group1 Group2 Group3 Group1n Group2n Group3n
Order1 500 -15 0 -10 2 0 1
我在想:
# Asker's attempt: per Order, the sum of Value and a count for each group.
# Assumes dplyr is attached (for %>%, group_by and summarise).
df %>%
  group_by(Order) %>%
  summarise(
    Group1 = sum(Value[Condition == "A" | Condition == "B"]),
    Group2 = sum(Value[Condition == "C" | Condition == "D"]),
    Group3 = sum(Value[Condition == "E" | Condition == "F"]),
    # length() counts every row selected by the condition, including rows
    # whose Value is 0.
    Group1n = length(Value[Condition == "A" | Condition == "B"]),
    Group2n = length(Value[Condition == "C" | Condition == "D"]),
    Group3n = length(Value[Condition == "E" | Condition == "F"])
  )
我的输出:
# A tibble: 1 x 7
Order Group1 Group2 Group3 Group1n Group2n Group3n
<fct> <dbl> <dbl> <dbl> <dbl> <int> <int>
Order1 -15.0 0 -10.0 2 2 2
但我无法得到正确的计数。另外,有没有一种高效的方法,可以直接传入分组列表,而不必显式地写出 Condition == "A" | Condition == "B" 之类的条件?
谢谢
这应该能满足您的需求:
# Per Order: sum of Value for each group, plus the number of rows in each
# group (sum() over a logical vector counts its TRUE entries).
# Assumes dplyr is attached.
df %>%
  group_by(Order) %>%
  summarise(
    Group1 = sum(Value[Condition == "A" | Condition == "B"]),
    Group2 = sum(Value[Condition == "C" | Condition == "D"]),
    Group3 = sum(Value[Condition == "E" | Condition == "F"]),
    Group1n = sum(Condition == "A" | Condition == "B"),
    Group2n = sum(Condition == "C" | Condition == "D"),
    Group3n = sum(Condition == "E" | Condition == "F")
  )
您还可以进一步整理代码。下面这个版本只统计非零值的个数(而不是每个组中的所有行)。
# Give the group list a descriptive name instead of calling it `list`,
# which would mask the base function of the same name.
list_of_groups <- list(
  Group1 = c("A", "B"),
  Group2 = c("C", "D"),
  Group3 = c("E", "F")
)
# Per Order: sum of Value for each group, and the number of rows in each
# group whose Value is non-zero. Group membership is looked up in
# `list_of_groups` instead of being spelled out condition by condition.
# Assumes dplyr is attached.
df %>%
  group_by(Order) %>%
  summarise(
    Group1 = sum(Value[Condition %in% list_of_groups[["Group1"]]]),
    Group2 = sum(Value[Condition %in% list_of_groups[["Group2"]]]),
    Group3 = sum(Value[Condition %in% list_of_groups[["Group3"]]]),
    Group1n = sum(Condition %in% list_of_groups[["Group1"]] & Value != 0),
    Group2n = sum(Condition %in% list_of_groups[["Group2"]] & Value != 0),
    Group3n = sum(Condition %in% list_of_groups[["Group3"]] & Value != 0)
  )
这种写法直接利用了您的分组列表——这样在更改分组时,您就不必逐处修改条件(如果这对您有用的话)。
我推荐下面这种符合 tidy 风格的解决方案:
# Long lookup table with one row per (Condition, Group) pair, built from the
# named group list (`list` here is the user's group list, not base::list).
group_map <- data.frame(
  Condition = unlist(list),
  Group = rep(names(list), lengths(list)),
  stringsAsFactors = FALSE
)

# Attach the Group to every row, then summarise per Group and Order:
#   sums      - total of Value
#   n_nonzero - number of rows whose Value is not 0
# The inner join keeps only rows whose Condition appears in group_map.
# Assumes dplyr is attached.
result <- df %>%
  # Condition may be a factor; convert so it joins against a character key.
  mutate(Condition = as.character(Condition)) %>%
  # Name the key explicitly: this avoids the "Joining, by = ..." message and
  # keeps the join stable if either table later gains another shared column.
  inner_join(group_map, by = "Condition") %>%
  group_by(Group, Order) %>%
  summarize(sums = sum(Value), n_nonzero = sum(Value != 0))
result
# # A tibble: 3 x 4
# # Groups: Group [?]
# Group Order sums n_nonzero
# <chr> <fctr> <dbl> <int>
# 1 Group1 Order1 -15 2
# 2 Group2 Order1 0 0
# 3 Group3 Order1 -10 1
如果您需要宽格式,可以使用 data.table:
# Related topic: reshaping multiple columns (重塑多个列).
library(data.table)
# Convert `result` to a data.table by reference, then cast it to wide format:
# one row per Order and one column per (value variable, Group) combination.
setDT(result)
data.table::dcast(result, Order ~ Group, value.var = c("sums", "n_nonzero"))
# Order sums_Group1 sums_Group2 sums_Group3 n_nonzero_Group1 n_nonzero_Group2 n_nonzero_Group3
# 1: Order1 -15 0 -10 2 0 1