dplyr: group_by() 基于列中的值和位置
dplyr: group_by() based on value AND Position in column
我有一个包含 start、end 和 type 列的数据集:
structure(list(number = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), start = structure(c(1121472000,
1184544000, 1196467200, 1186963200, 1199145600, 1228089600, 1232582400,
1233446400, 1246406400, 1262304000, 1271030400, 1285891200, 1293840000,
1344211200, 1367366400, 1413158400, 1427846400, 1441065600, 1451952000,
1461542400, 1464739200, 1491004800, 1501545600, 1506816000, 1509494400,
1548979200, 1557705600, 1561939200, 1572825600, 1590969600, 1606780800
), tzone = "UTC", class = c("POSIXct", "POSIXt")), end = structure(c(1184457600,
1196380800, 253402214400, 1199059200, 1228003200, 1232496000,
1233360000, 1246320000, 1262217600, 1270944000, 1285804800, 1293753600,
1344124800, 1367280000, 1413072000, 1427760000, 1440979200, 1451865600,
1461456000, 1464652800, 1490918400, 1501459200, 1506729600, 1509408000,
1548892800, 1557619200, 1561852800, 1572739200, 1590883200, 1606694400,
253402214400), tzone = "UTC", class = c("POSIXct", "POSIXt")),
type = c("A", "A", "A", "B", "B", "B", "C", "D", "D", "D",
"D", "D", "D", "D", "E", "D", "E", "E", "F", "E", "E", "E",
"E", "E", "E", "E", "E", "E", "E", "E", "E")), row.names = c(NA,
-31L), class = c("tbl_df", "tbl", "data.frame"))
tail(data, 20)
number start end type
<dbl> <dttm> <dttm> <chr>
1 1 2010-10-01 00:00:00 2010-12-31 00:00:00 D
2 1 2011-01-01 00:00:00 2012-08-05 00:00:00 D
3 1 2012-08-06 00:00:00 2013-04-30 00:00:00 D
4 1 2013-05-01 00:00:00 2014-10-12 00:00:00 E
5 1 2014-10-13 00:00:00 2015-03-31 00:00:00 D
6 1 2015-04-01 00:00:00 2015-08-31 00:00:00 E
7 1 2015-09-01 00:00:00 2016-01-04 00:00:00 E
8 1 2016-01-05 00:00:00 2016-04-24 00:00:00 F
9 1 2016-04-25 00:00:00 2016-05-31 00:00:00 E
10 1 2016-06-01 00:00:00 2017-03-31 00:00:00 E
11 1 2017-04-01 00:00:00 2017-07-31 00:00:00 E
12 1 2017-08-01 00:00:00 2017-09-30 00:00:00 E
13 1 2017-10-01 00:00:00 2017-10-31 00:00:00 E
14 1 2017-11-01 00:00:00 2019-01-31 00:00:00 E
15 1 2019-02-01 00:00:00 2019-05-12 00:00:00 E
16 1 2019-05-13 00:00:00 2019-06-30 00:00:00 E
17 1 2019-07-01 00:00:00 2019-11-03 00:00:00 E
18 1 2019-11-04 00:00:00 2020-05-31 00:00:00 E
19 1 2020-06-01 00:00:00 2020-11-30 00:00:00 E
20 1 2020-12-01 00:00:00 9999-12-31 00:00:00 E
问题:
我想按 type 列的分组来汇总数据,但希望每个分组只包含取值相同且彼此相邻的行。
这里的相关列是 type:我不仅要根据值本身(例如“E”或“D”)进行分组,还要基于位置进行分组(即只把相邻的“D”、相邻的“E”等合并在一起)。
到目前为止,我显然可以使用:
> data_mre %>%
+ group_by(number, type) %>%
+ summarise(start = dplyr::first(start),
+ end = dplyr::last(end),
+ .groups = "drop")
# A tibble: 6 x 4
number type start end
<dbl> <chr> <dttm> <dttm>
1 1 A 2005-07-16 00:00:00 9999-12-31 00:00:00
2 1 B 2007-08-13 00:00:00 2009-01-21 00:00:00
3 1 C 2009-01-22 00:00:00 2009-01-31 00:00:00
4 1 D 2009-02-01 00:00:00 2015-03-31 00:00:00
5 1 E 2013-05-01 00:00:00 9999-12-31 00:00:00
6 1 F 2016-01-05 00:00:00 2016-04-24 00:00:00
但这样得到的各组在时间区间上相互重叠,而且没有考虑行在数据中的位置。有没有办法把行的位置也作为 group_by() 语句的分组依据?
这是我想象中的结果:
number type start end
<dbl> <chr> <dttm> <dttm>
1 1 A 2005-07-16 00:00:00 9999-12-31 00:00:00
2 1 B 2007-08-13 00:00:00 2009-01-21 00:00:00
3 1 C 2009-01-22 00:00:00 2009-01-31 00:00:00
4 1 D 2009-02-01 00:00:00 2013-04-30 00:00:00
5 1 E 2013-05-01 00:00:00 2014-10-12 00:00:00
6 1 D 2014-10-13 00:00:00 2015-03-31 00:00:00
5 1 E 2015-04-01 00:00:00 2016-01-04 00:00:00
6 1 F 2016-01-05 00:00:00 2016-04-24 00:00:00
5 1 E 2016-04-25 00:00:00 9999-12-31 00:00:00
您可以使用data.table::rleid
使用tidy/dplyr:
# Collapse each run of consecutive rows sharing the same `number` and `type`
# into one row that spans the run's first `start` to its last `end`.
# rleid() assigns a new id whenever the (number, type) pair changes from one
# row to the next, so a type that reappears later (D, E, D, ...) starts a new
# group instead of being merged with its earlier occurrence.
# `number` is part of the key so that a run never spans two different numbers.
# NOTE(review): assumes the rows are already in chronological order.
data %>%
  group_by(number, group = data.table::rleid(number, type)) %>%
  summarize(
    type = first(type),
    start = first(start),
    end = last(end),
    .groups = "drop"
  ) %>%
  select(!group)
输出:
number type start end
<dbl> <chr> <dttm> <dttm>
1 1 A 2005-07-16 00:00:00 9999-12-31 00:00:00
2 1 B 2007-08-13 00:00:00 2009-01-21 00:00:00
3 1 C 2009-01-22 00:00:00 2009-01-31 00:00:00
4 1 D 2009-02-01 00:00:00 2013-04-30 00:00:00
5 1 E 2013-05-01 00:00:00 2014-10-12 00:00:00
6 1 D 2014-10-13 00:00:00 2015-03-31 00:00:00
7 1 E 2015-04-01 00:00:00 2016-01-04 00:00:00
8 1 F 2016-01-05 00:00:00 2016-04-24 00:00:00
9 1 E 2016-04-25 00:00:00 9999-12-31 00:00:00
使用data.table
library(data.table)

# Collapse each run of consecutive rows sharing the same `number` and `type`
# into one row that spans the run's first `start` to its last `end`.
# rleid() assigns a new id whenever the (number, type) pair changes, so a type
# that reappears later forms a separate group; including `number` in `by`
# keeps a run from spanning two different numbers.
# Note: setDT() converts `data` to a data.table by reference (in place).
# NOTE(review): assumes the rows are already in chronological order.
setDT(data)[
  ,
  .(
    type = first(type),
    start = first(start),
    end = last(end)
  ),
  by = .(number, run = rleid(number, type))
][, .(number, type, start, end)]
输出
number type start end
<num> <char> <POSc> <POSc>
1: 1 A 2005-07-16 9999-12-31
2: 1 B 2007-08-13 2009-01-21
3: 1 C 2009-01-22 2009-01-31
4: 1 D 2009-02-01 2013-04-30
5: 1 E 2013-05-01 2014-10-12
6: 1 D 2014-10-13 2015-03-31
7: 1 E 2015-04-01 2016-01-04
8: 1 F 2016-01-05 2016-04-24
9: 1 E 2016-04-25 9999-12-31
我有一个包含 start、end 和 type 列的数据集:
structure(list(number = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), start = structure(c(1121472000,
1184544000, 1196467200, 1186963200, 1199145600, 1228089600, 1232582400,
1233446400, 1246406400, 1262304000, 1271030400, 1285891200, 1293840000,
1344211200, 1367366400, 1413158400, 1427846400, 1441065600, 1451952000,
1461542400, 1464739200, 1491004800, 1501545600, 1506816000, 1509494400,
1548979200, 1557705600, 1561939200, 1572825600, 1590969600, 1606780800
), tzone = "UTC", class = c("POSIXct", "POSIXt")), end = structure(c(1184457600,
1196380800, 253402214400, 1199059200, 1228003200, 1232496000,
1233360000, 1246320000, 1262217600, 1270944000, 1285804800, 1293753600,
1344124800, 1367280000, 1413072000, 1427760000, 1440979200, 1451865600,
1461456000, 1464652800, 1490918400, 1501459200, 1506729600, 1509408000,
1548892800, 1557619200, 1561852800, 1572739200, 1590883200, 1606694400,
253402214400), tzone = "UTC", class = c("POSIXct", "POSIXt")),
type = c("A", "A", "A", "B", "B", "B", "C", "D", "D", "D",
"D", "D", "D", "D", "E", "D", "E", "E", "F", "E", "E", "E",
"E", "E", "E", "E", "E", "E", "E", "E", "E")), row.names = c(NA,
-31L), class = c("tbl_df", "tbl", "data.frame"))
tail(data, 20)
number start end type
<dbl> <dttm> <dttm> <chr>
1 1 2010-10-01 00:00:00 2010-12-31 00:00:00 D
2 1 2011-01-01 00:00:00 2012-08-05 00:00:00 D
3 1 2012-08-06 00:00:00 2013-04-30 00:00:00 D
4 1 2013-05-01 00:00:00 2014-10-12 00:00:00 E
5 1 2014-10-13 00:00:00 2015-03-31 00:00:00 D
6 1 2015-04-01 00:00:00 2015-08-31 00:00:00 E
7 1 2015-09-01 00:00:00 2016-01-04 00:00:00 E
8 1 2016-01-05 00:00:00 2016-04-24 00:00:00 F
9 1 2016-04-25 00:00:00 2016-05-31 00:00:00 E
10 1 2016-06-01 00:00:00 2017-03-31 00:00:00 E
11 1 2017-04-01 00:00:00 2017-07-31 00:00:00 E
12 1 2017-08-01 00:00:00 2017-09-30 00:00:00 E
13 1 2017-10-01 00:00:00 2017-10-31 00:00:00 E
14 1 2017-11-01 00:00:00 2019-01-31 00:00:00 E
15 1 2019-02-01 00:00:00 2019-05-12 00:00:00 E
16 1 2019-05-13 00:00:00 2019-06-30 00:00:00 E
17 1 2019-07-01 00:00:00 2019-11-03 00:00:00 E
18 1 2019-11-04 00:00:00 2020-05-31 00:00:00 E
19 1 2020-06-01 00:00:00 2020-11-30 00:00:00 E
20 1 2020-12-01 00:00:00 9999-12-31 00:00:00 E
问题:
我想按 type 列的分组来汇总数据,但希望每个分组只包含取值相同且彼此相邻的行。
这里的相关列是 type:我不仅要根据值本身(例如“E”或“D”)进行分组,还要基于位置进行分组(即只把相邻的“D”、相邻的“E”等合并在一起)。
到目前为止,我显然可以使用:
> data_mre %>%
+ group_by(number, type) %>%
+ summarise(start = dplyr::first(start),
+ end = dplyr::last(end),
+ .groups = "drop")
# A tibble: 6 x 4
number type start end
<dbl> <chr> <dttm> <dttm>
1 1 A 2005-07-16 00:00:00 9999-12-31 00:00:00
2 1 B 2007-08-13 00:00:00 2009-01-21 00:00:00
3 1 C 2009-01-22 00:00:00 2009-01-31 00:00:00
4 1 D 2009-02-01 00:00:00 2015-03-31 00:00:00
5 1 E 2013-05-01 00:00:00 9999-12-31 00:00:00
6 1 F 2016-01-05 00:00:00 2016-04-24 00:00:00
但这样得到的各组在时间区间上相互重叠,而且没有考虑行在数据中的位置。有没有办法把行的位置也作为 group_by() 语句的分组依据?
这是我想象中的结果:
number type start end
<dbl> <chr> <dttm> <dttm>
1 1 A 2005-07-16 00:00:00 9999-12-31 00:00:00
2 1 B 2007-08-13 00:00:00 2009-01-21 00:00:00
3 1 C 2009-01-22 00:00:00 2009-01-31 00:00:00
4 1 D 2009-02-01 00:00:00 2013-04-30 00:00:00
5 1 E 2013-05-01 00:00:00 2014-10-12 00:00:00
6 1 D 2014-10-13 00:00:00 2015-03-31 00:00:00
5 1 E 2015-04-01 00:00:00 2016-01-04 00:00:00
6 1 F 2016-01-05 00:00:00 2016-04-24 00:00:00
5 1 E 2016-04-25 00:00:00 9999-12-31 00:00:00
您可以使用data.table::rleid
使用tidy/dplyr:
# Collapse each run of consecutive rows sharing the same `number` and `type`
# into one row that spans the run's first `start` to its last `end`.
# rleid() assigns a new id whenever the (number, type) pair changes from one
# row to the next, so a type that reappears later (D, E, D, ...) starts a new
# group instead of being merged with its earlier occurrence.
# `number` is part of the key so that a run never spans two different numbers.
# NOTE(review): assumes the rows are already in chronological order.
data %>%
  group_by(number, group = data.table::rleid(number, type)) %>%
  summarize(
    type = first(type),
    start = first(start),
    end = last(end),
    .groups = "drop"
  ) %>%
  select(!group)
输出:
number type start end
<dbl> <chr> <dttm> <dttm>
1 1 A 2005-07-16 00:00:00 9999-12-31 00:00:00
2 1 B 2007-08-13 00:00:00 2009-01-21 00:00:00
3 1 C 2009-01-22 00:00:00 2009-01-31 00:00:00
4 1 D 2009-02-01 00:00:00 2013-04-30 00:00:00
5 1 E 2013-05-01 00:00:00 2014-10-12 00:00:00
6 1 D 2014-10-13 00:00:00 2015-03-31 00:00:00
7 1 E 2015-04-01 00:00:00 2016-01-04 00:00:00
8 1 F 2016-01-05 00:00:00 2016-04-24 00:00:00
9 1 E 2016-04-25 00:00:00 9999-12-31 00:00:00
使用data.table
library(data.table)

# Collapse each run of consecutive rows sharing the same `number` and `type`
# into one row that spans the run's first `start` to its last `end`.
# rleid() assigns a new id whenever the (number, type) pair changes, so a type
# that reappears later forms a separate group; including `number` in `by`
# keeps a run from spanning two different numbers.
# Note: setDT() converts `data` to a data.table by reference (in place).
# NOTE(review): assumes the rows are already in chronological order.
setDT(data)[
  ,
  .(
    type = first(type),
    start = first(start),
    end = last(end)
  ),
  by = .(number, run = rleid(number, type))
][, .(number, type, start, end)]
输出
number type start end
<num> <char> <POSc> <POSc>
1: 1 A 2005-07-16 9999-12-31
2: 1 B 2007-08-13 2009-01-21
3: 1 C 2009-01-22 2009-01-31
4: 1 D 2009-02-01 2013-04-30
5: 1 E 2013-05-01 2014-10-12
6: 1 D 2014-10-13 2015-03-31
7: 1 E 2015-04-01 2016-01-04
8: 1 F 2016-01-05 2016-04-24
9: 1 E 2016-04-25 9999-12-31