比较同一数据框中组中的行
Compare rows in groups in the same data frame
我的数据是这样的:
library(dplyr)
library(data.table)
# Example order data: one row per ordered item. Every column is built from
# string literals; `returned` is "1" for an item that was sent back, else "0".
df <- data.frame(
  customernumber = c(
    "111", "111", "111", "111", "111",
    "222", "222", "222", "222", "222", "222", "222"
  ),
  ordernumber = c(
    "1", "1", "1", "2", "2",
    "1", "1", "1", "1", "2", "2", "3"
  ),
  article = c(
    "JeansA", "JeansA", "ShirtA", "JeansA", "JeansB",
    "ShirtA", "ShirtB", "ShirtB", "JeansA", "JeansB", "ShirtA", "JeansB"
  ),
  size = c(
    "40", "42", "40", "42", "44",
    "36", "36", "40", "40", "38", "44", "36"
  ),
  returned = c(
    "1", "1", "0", "0", "1",
    "1", "1", "0", "0", "0", "0", "0"
  )
)
输出:
customernumber ordernumber article size returned
1 111 1 JeansA 40 1
2 111 1 JeansA 42 1
3 111 1 ShirtA 40 0
4 111 2 JeansA 42 0
5 111 2 JeansB 44 1
6 222 1 ShirtA 36 1
7 222 1 ShirtB 36 1
8 222 1 ShirtB 40 0
9 222 1 JeansA 40 0
10 222 2 JeansB 38 0
11 222 2 ShirtA 44 0
12 222 3 JeansB 36 0
现在我想标记每个客户的所有订单,其中一件商品已被退回,但在下一个订单中以不同的尺寸再次订购。因此,所有仅被交换而不能真正被视为新订单的物品。所以最终结果应该是这样的:
结果:
customernumber ordernumber article size returned changed
1 111 1 JeansA 40 1 0
2 111 1 JeansA 42 1 0
3 111 1 ShirtA 40 0 0
4 111 2 JeansA 42 0 1
5 111 2 JeansB 44 1 0
6 222 1 ShirtA 36 1 0
7 222 1 ShirtB 36 1 0
8 222 1 ShirtB 40 0 0
9 222 1 JeansA 40 0 0
10 222 2 JeansB 38 0 0
11 222 2 ShirtA 44 0 1
12 222 3 JeansB 36 0 0
我以为可以通过使用 dplyr(或 data.table)引入一个滞后变量来解决这个问题,但我只能在同一组内对该变量做滞后,却无法让它滞后到下一组。我的尝试如下:
# dplyr attempt: shift `size` by one row inside each customer/order/article
# group. Because `ordernumber` is one of the grouping keys, the lagged value
# never comes from a different order.
df %>%
  group_by(customernumber, ordernumber, article) %>%
  mutate(
    lag_size = lag(size, order_by = article)
  )
或:
# data.table attempt: sort the rows, then shift `size` by one row within each
# customer/order/article group. `ordernumber` is a grouping key here too, so
# the shifted value never comes from a different order.
df <- data.table(df)
setorder(df, customernumber, ordernumber, article)
df[, lag_size := shift(size),
   by = .(customernumber, ordernumber, article)]
我不想考虑 for 循环(甚至不确定它能否解决问题),因为数据集很大,运行会花很长时间。而且我实在是没有什么思路了。因此,非常感谢任何帮助。
谢谢!
补充:
我无意中遇到了与此案例相关的另一个问题。我只想把在紧接着的下一个订单中以另一个尺码(size)重新订购的商品标记为已更改;如果再次订购的是相同尺码的同一商品,则不标记。因此,changed 变量的判定标准为:
订单 n:returned == 1
订单 n+1:同一商品,尺码不同 --> changed == 1(否则 changed == 0)
这是更新后的例子:
# Updated example data (13 rows): customer 111 now has two JeansA rows in
# order 2, one of them in a size that order 1 already contained.
df <- data.frame(
  customernumber = c(
    "111", "111", "111", "111", "111", "111",
    "222", "222", "222", "222", "222", "222", "222"
  ),
  ordernumber = c(
    "1", "1", "1", "2", "2", "2",
    "1", "1", "1", "1", "2", "2", "3"
  ),
  article = c(
    "JeansA", "JeansA", "ShirtA", "JeansA", "JeansA", "JeansB",
    "ShirtA", "ShirtB", "ShirtB", "JeansA", "JeansB", "ShirtA", "JeansB"
  ),
  size = c(
    "40", "42", "40", "40", "44", "44",
    "36", "36", "40", "40", "38", "44", "36"
  ),
  returned = c(
    "1", "1", "0", "0", "1", "1",
    "1", "1", "0", "0", "0", "0", "0"
  )
)
输出:
customernumber ordernumber article size returned
1 111 1 JeansA 40 1
2 111 1 JeansA 42 1
3 111 1 ShirtA 40 0
4 111 2 JeansA 40 0
5 111 2 JeansA 44 1
6 111 2 JeansB 44 1
7 222 1 ShirtA 36 1
8 222 1 ShirtB 36 1
9 222 1 ShirtB 40 0
10 222 1 JeansA 40 0
11 222 2 JeansB 38 0
12 222 2 ShirtA 44 0
13 222 3 JeansB 36 0
结果:
customernumber ordernumber article size returned changed
1 111 1 JeansA 40 1 0
2 111 1 JeansA 42 1 0
3 111 1 ShirtA 40 0 0
4 111 2 JeansA 40 0 0
5 111 2 JeansA 44 1 1
6 111 2 JeansB 44 1 0
7 222 1 ShirtA 36 1 0
8 222 1 ShirtB 36 1 0
9 222 1 ShirtB 40 0 0
10 222 1 JeansA 40 0 0
11 222 2 JeansB 38 0 0
12 222 2 ShirtA 44 0 1
13 222 3 JeansB 36 0 0
抱歉造成混淆,实际上我在示例中犯了一个错误,错误地填充了已更改的变量。如果你还在帮助我,我将不胜感激。
谢谢!
您正在处理多个滞后条件,因此我们需要多个 `lag` 命令来创建这些条件。然后我们可以使用 `case_when` 创建 `changed` 列。
# Build `changed`: within each customer/article group, look at the previous
# row's `returned` and `ordernumber`. A row is flagged "1" when it was not
# returned itself, the previous row of the same customer/article was returned,
# and that previous row belongs to a different order. `size` is not compared.
df2 <- df %>%
  group_by(customernumber, article) %>%
  mutate(
    lag_ordernumber = lag(ordernumber),
    lag_returned = lag(returned)
  ) %>%
  ungroup() %>%
  mutate(
    changed = case_when(
      duplicated(article) &
        lag_returned %in% "1" &
        returned %in% "0" &
        ordernumber != lag_ordernumber ~ "1",
      TRUE ~ "0"
    )
  ) %>%
  # Drop the helper columns again (every column whose name starts with "lag").
  select(-starts_with("lag"))
df2
# # A tibble: 12 x 6
# customernumber ordernumber article size returned changed
# <fct> <fct> <fct> <fct> <fct> <chr>
# 1 111 1 JeansA 40 1 0
# 2 111 1 JeansA 42 1 0
# 3 111 1 ShirtA 40 0 0
# 4 111 2 JeansA 42 0 1
# 5 111 2 JeansB 44 1 0
# 6 222 1 ShirtA 36 1 0
# 7 222 1 ShirtB 36 1 0
# 8 222 1 ShirtB 40 0 0
# 9 222 1 JeansA 40 0 0
# 10 222 2 JeansB 38 0 0
# 11 222 2 ShirtA 44 0 1
# 12 222 3 JeansB 36 0 0
新答案:
使用 data.table 的一种可能解决方案:
library(data.table)
# Mark rows with changed = 1 by pairing rows of the same customer and article
# from different orders and comparing their sizes; all other rows stay 0.
# setDT() converts `df` to a data.table by reference so `:=` can update it.
setDT(df)
# Step 1: add `changed`, initialised to 0 for every row.
df[, changed := 0
# Step 2: self-join `df` on equal customernumber/article with a non-equi
# condition on ordernumber; nomatch = 0 drops rows without a partner.
# NOTE(review): ordernumber is built from strings in the example data; confirm
# that the non-equi `<` condition is supported for that column type and that
# it compares in the intended (numeric) order.
][df[df, on = .(customernumber, ordernumber < ordernumber, article), nomatch = 0
# Step 3: keep pairs with different sizes where `returned == 1`; then, per
# customer/order/article group, keep only rows whose `i.size` does not occur
# among the group's `size` values.
][size != i.size & returned == 1, .SD[!i.size %in% size], by = .(customernumber, ordernumber, article)
# Step 4: reduce to the key columns, taking `size` from `i.size`, and
# de-duplicate.
][, .(customernumber, ordernumber, article, size = i.size)][, unique(.SD)]
# Step 5: update join back onto `df`: rows matching on all four keys get
# changed = 1; the trailing `[]` prints the result.
, on = .(customernumber, ordernumber, article, size), changed := 1][]
给出:
customernumber ordernumber article size returned changed
1: 111 1 JeansA 40 1 0
2: 111 1 JeansA 42 1 0
3: 111 1 ShirtA 40 0 0
4: 111 2 JeansA 40 0 0
5: 111 2 JeansA 44 1 1
6: 111 2 JeansB 44 1 0
7: 222 1 ShirtA 36 1 0
8: 222 1 ShirtB 36 1 0
9: 222 1 ShirtB 40 0 0
10: 222 1 JeansA 40 0 0
11: 222 2 JeansB 38 0 0
12: 222 2 ShirtA 44 0 1
13: 222 3 JeansB 36 0 0
旧答案:
library(data.table)
# Earlier approach: flags non-returned rows whose customer/article also has a
# returned row in a different order. Sizes are not compared here.
setDT(df)
# Inner chain: join the non-returned rows (returned == 0) with the returned
# rows (returned == 1) on customernumber/article, then keep only pairs that
# come from different orders.
# The outer update join matches those pairs back onto `df` by
# customernumber/article/returned and writes `i.returned` into `changed`.
# NOTE(review): the joined table presumably carries both `returned` (from the
# non-returned side) and `i.returned` (from the returned side); the printed
# output suggests the latter is what gets assigned — confirm.
df[df[returned == 0][df[returned == 1]
, on = .(customernumber, article)
][ordernumber != i.ordernumber]
, on = .(customernumber, article, returned)
, changed := i.returned
# Rows that were not matched have NA in `changed`; replace those with 0 and
# print the result.
][, changed := replace(changed, is.na(changed), 0)][]
给出:
customernumber ordernumber article size returned changed
1: 111 1 JeansA 40 1 0
2: 111 1 JeansA 42 1 0
3: 111 1 ShirtA 40 0 0
4: 111 2 JeansA 42 0 1
5: 111 2 JeansB 44 1 0
6: 222 1 ShirtA 36 1 0
7: 222 1 ShirtB 36 1 0
8: 222 1 ShirtB 40 0 0
9: 222 1 JeansA 40 0 0
10: 222 2 JeansB 38 0 0
11: 222 2 ShirtA 44 0 1
12: 222 3 JeansB 36 0 0
我的数据是这样的:
library(dplyr)
library(data.table)
# Example order data: one row per ordered item. Every column is built from
# string literals; `returned` is "1" for an item that was sent back, else "0".
df <- data.frame(
  customernumber = c(
    "111", "111", "111", "111", "111",
    "222", "222", "222", "222", "222", "222", "222"
  ),
  ordernumber = c(
    "1", "1", "1", "2", "2",
    "1", "1", "1", "1", "2", "2", "3"
  ),
  article = c(
    "JeansA", "JeansA", "ShirtA", "JeansA", "JeansB",
    "ShirtA", "ShirtB", "ShirtB", "JeansA", "JeansB", "ShirtA", "JeansB"
  ),
  size = c(
    "40", "42", "40", "42", "44",
    "36", "36", "40", "40", "38", "44", "36"
  ),
  returned = c(
    "1", "1", "0", "0", "1",
    "1", "1", "0", "0", "0", "0", "0"
  )
)
输出:
customernumber ordernumber article size returned
1 111 1 JeansA 40 1
2 111 1 JeansA 42 1
3 111 1 ShirtA 40 0
4 111 2 JeansA 42 0
5 111 2 JeansB 44 1
6 222 1 ShirtA 36 1
7 222 1 ShirtB 36 1
8 222 1 ShirtB 40 0
9 222 1 JeansA 40 0
10 222 2 JeansB 38 0
11 222 2 ShirtA 44 0
12 222 3 JeansB 36 0
现在我想标记每个客户的所有订单,其中一件商品已被退回,但在下一个订单中以不同的尺寸再次订购。因此,所有仅被交换而不能真正被视为新订单的物品。所以最终结果应该是这样的:
结果:
customernumber ordernumber article size returned changed
1 111 1 JeansA 40 1 0
2 111 1 JeansA 42 1 0
3 111 1 ShirtA 40 0 0
4 111 2 JeansA 42 0 1
5 111 2 JeansB 44 1 0
6 222 1 ShirtA 36 1 0
7 222 1 ShirtB 36 1 0
8 222 1 ShirtB 40 0 0
9 222 1 JeansA 40 0 0
10 222 2 JeansB 38 0 0
11 222 2 ShirtA 44 0 1
12 222 3 JeansB 36 0 0
我以为可以通过使用 dplyr(或 data.table)引入一个滞后变量来解决这个问题,但我只能在同一组内对该变量做滞后,却无法让它滞后到下一组。我的尝试如下:
# dplyr attempt: shift `size` by one row inside each customer/order/article
# group. Because `ordernumber` is one of the grouping keys, the lagged value
# never comes from a different order.
df %>%
  group_by(customernumber, ordernumber, article) %>%
  mutate(
    lag_size = lag(size, order_by = article)
  )
或:
# data.table attempt: sort the rows, then shift `size` by one row within each
# customer/order/article group. `ordernumber` is a grouping key here too, so
# the shifted value never comes from a different order.
df <- data.table(df)
setorder(df, customernumber, ordernumber, article)
df[, lag_size := shift(size),
   by = .(customernumber, ordernumber, article)]
我不想考虑 for 循环(甚至不确定它能否解决问题),因为数据集很大,运行会花很长时间。而且我实在是没有什么思路了。因此,非常感谢任何帮助。
谢谢!
补充:
我无意中遇到了与此案例相关的另一个问题。我只想把在紧接着的下一个订单中以另一个尺码(size)重新订购的商品标记为已更改;如果再次订购的是相同尺码的同一商品,则不标记。因此,changed 变量的判定标准为:
订单 n:returned == 1
订单 n+1:同一商品,尺码不同 --> changed == 1(否则 changed == 0)
这是更新后的例子:
# Updated example data (13 rows): customer 111 now has two JeansA rows in
# order 2, one of them in a size that order 1 already contained.
df <- data.frame(
  customernumber = c(
    "111", "111", "111", "111", "111", "111",
    "222", "222", "222", "222", "222", "222", "222"
  ),
  ordernumber = c(
    "1", "1", "1", "2", "2", "2",
    "1", "1", "1", "1", "2", "2", "3"
  ),
  article = c(
    "JeansA", "JeansA", "ShirtA", "JeansA", "JeansA", "JeansB",
    "ShirtA", "ShirtB", "ShirtB", "JeansA", "JeansB", "ShirtA", "JeansB"
  ),
  size = c(
    "40", "42", "40", "40", "44", "44",
    "36", "36", "40", "40", "38", "44", "36"
  ),
  returned = c(
    "1", "1", "0", "0", "1", "1",
    "1", "1", "0", "0", "0", "0", "0"
  )
)
输出:
customernumber ordernumber article size returned
1 111 1 JeansA 40 1
2 111 1 JeansA 42 1
3 111 1 ShirtA 40 0
4 111 2 JeansA 40 0
5 111 2 JeansA 44 1
6 111 2 JeansB 44 1
7 222 1 ShirtA 36 1
8 222 1 ShirtB 36 1
9 222 1 ShirtB 40 0
10 222 1 JeansA 40 0
11 222 2 JeansB 38 0
12 222 2 ShirtA 44 0
13 222 3 JeansB 36 0
结果:
customernumber ordernumber article size returned changed
1 111 1 JeansA 40 1 0
2 111 1 JeansA 42 1 0
3 111 1 ShirtA 40 0 0
4 111 2 JeansA 40 0 0
5 111 2 JeansA 44 1 1
6 111 2 JeansB 44 1 0
7 222 1 ShirtA 36 1 0
8 222 1 ShirtB 36 1 0
9 222 1 ShirtB 40 0 0
10 222 1 JeansA 40 0 0
11 222 2 JeansB 38 0 0
12 222 2 ShirtA 44 0 1
13 222 3 JeansB 36 0 0
抱歉造成混淆,实际上我在示例中犯了一个错误,错误地填充了已更改的变量。如果你还在帮助我,我将不胜感激。
谢谢!
您正在处理多个滞后条件,因此我们需要多个 `lag` 命令来创建这些条件。然后我们可以使用 `case_when` 创建 `changed` 列。
# Build `changed`: within each customer/article group, look at the previous
# row's `returned` and `ordernumber`. A row is flagged "1" when it was not
# returned itself, the previous row of the same customer/article was returned,
# and that previous row belongs to a different order. `size` is not compared.
df2 <- df %>%
  group_by(customernumber, article) %>%
  mutate(
    lag_ordernumber = lag(ordernumber),
    lag_returned = lag(returned)
  ) %>%
  ungroup() %>%
  mutate(
    changed = case_when(
      duplicated(article) &
        lag_returned %in% "1" &
        returned %in% "0" &
        ordernumber != lag_ordernumber ~ "1",
      TRUE ~ "0"
    )
  ) %>%
  # Drop the helper columns again (every column whose name starts with "lag").
  select(-starts_with("lag"))
df2
# # A tibble: 12 x 6
# customernumber ordernumber article size returned changed
# <fct> <fct> <fct> <fct> <fct> <chr>
# 1 111 1 JeansA 40 1 0
# 2 111 1 JeansA 42 1 0
# 3 111 1 ShirtA 40 0 0
# 4 111 2 JeansA 42 0 1
# 5 111 2 JeansB 44 1 0
# 6 222 1 ShirtA 36 1 0
# 7 222 1 ShirtB 36 1 0
# 8 222 1 ShirtB 40 0 0
# 9 222 1 JeansA 40 0 0
# 10 222 2 JeansB 38 0 0
# 11 222 2 ShirtA 44 0 1
# 12 222 3 JeansB 36 0 0
新答案:
使用 data.table 的一种可能解决方案:
library(data.table)
# Mark rows with changed = 1 by pairing rows of the same customer and article
# from different orders and comparing their sizes; all other rows stay 0.
# setDT() converts `df` to a data.table by reference so `:=` can update it.
setDT(df)
# Step 1: add `changed`, initialised to 0 for every row.
df[, changed := 0
# Step 2: self-join `df` on equal customernumber/article with a non-equi
# condition on ordernumber; nomatch = 0 drops rows without a partner.
# NOTE(review): ordernumber is built from strings in the example data; confirm
# that the non-equi `<` condition is supported for that column type and that
# it compares in the intended (numeric) order.
][df[df, on = .(customernumber, ordernumber < ordernumber, article), nomatch = 0
# Step 3: keep pairs with different sizes where `returned == 1`; then, per
# customer/order/article group, keep only rows whose `i.size` does not occur
# among the group's `size` values.
][size != i.size & returned == 1, .SD[!i.size %in% size], by = .(customernumber, ordernumber, article)
# Step 4: reduce to the key columns, taking `size` from `i.size`, and
# de-duplicate.
][, .(customernumber, ordernumber, article, size = i.size)][, unique(.SD)]
# Step 5: update join back onto `df`: rows matching on all four keys get
# changed = 1; the trailing `[]` prints the result.
, on = .(customernumber, ordernumber, article, size), changed := 1][]
给出:
customernumber ordernumber article size returned changed
1: 111 1 JeansA 40 1 0
2: 111 1 JeansA 42 1 0
3: 111 1 ShirtA 40 0 0
4: 111 2 JeansA 40 0 0
5: 111 2 JeansA 44 1 1
6: 111 2 JeansB 44 1 0
7: 222 1 ShirtA 36 1 0
8: 222 1 ShirtB 36 1 0
9: 222 1 ShirtB 40 0 0
10: 222 1 JeansA 40 0 0
11: 222 2 JeansB 38 0 0
12: 222 2 ShirtA 44 0 1
13: 222 3 JeansB 36 0 0
旧答案:
library(data.table)
# Earlier approach: flags non-returned rows whose customer/article also has a
# returned row in a different order. Sizes are not compared here.
setDT(df)
# Inner chain: join the non-returned rows (returned == 0) with the returned
# rows (returned == 1) on customernumber/article, then keep only pairs that
# come from different orders.
# The outer update join matches those pairs back onto `df` by
# customernumber/article/returned and writes `i.returned` into `changed`.
# NOTE(review): the joined table presumably carries both `returned` (from the
# non-returned side) and `i.returned` (from the returned side); the printed
# output suggests the latter is what gets assigned — confirm.
df[df[returned == 0][df[returned == 1]
, on = .(customernumber, article)
][ordernumber != i.ordernumber]
, on = .(customernumber, article, returned)
, changed := i.returned
# Rows that were not matched have NA in `changed`; replace those with 0 and
# print the result.
][, changed := replace(changed, is.na(changed), 0)][]
给出:
customernumber ordernumber article size returned changed
1: 111 1 JeansA 40 1 0
2: 111 1 JeansA 42 1 0
3: 111 1 ShirtA 40 0 0
4: 111 2 JeansA 42 0 1
5: 111 2 JeansB 44 1 0
6: 222 1 ShirtA 36 1 0
7: 222 1 ShirtB 36 1 0
8: 222 1 ShirtB 40 0 0
9: 222 1 JeansA 40 0 0
10: 222 2 JeansB 38 0 0
11: 222 2 ShirtA 44 0 1
12: 222 3 JeansB 36 0 0