删除不需要的数据条目
Remove unwanted data entries
抱歉标题起得不好,这个问题很难用一句话描述清楚。
我从客户那里收到数据,他们在现场手动启动和停止测试,但有时他们双击启动测试或停止测试,这让我很难自动输出。
这是一些数据
dput(test)
structure(list(time = structure(c(1645290963, 1645291107, 1645291112,
1645291203, 1645291306, 1645291441, 1645291532, 1645291689, 1645291721,
1645291866, 1645292051, 1645292182, 1645292444, 1645292539, 1645292557,
1645292935, 1645293077, 1645293117, 1645293229, 1645293275, 1645293425,
1645293429, 1645293555, 1645293584, 1645293735), tzone = "", class = c("POSIXct",
"POSIXt")), type = c("StartTesting", "StopTesting", "StartTesting",
"StopTesting", "StartTesting", "StopTesting", "StartTesting",
"StopTesting", "StartTesting", "StopTesting", "StartTesting",
"StopTesting", "StartTesting", "StopTesting", "StopTesting",
"StartTesting", "StopTesting", "StartTesting", "StopTesting",
"StartTesting", "StopTesting", "StartTesting", "StopTesting",
"StartTesting", "StopTesting"), comments = c("", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "")), row.names = c("event.12", "event.13", "event.14",
"event.15", "event.16", "event.17", "event.18", "event.19", "event.20",
"event.21", "event.22", "event.23", "event.25", "event.26", "event.27",
"event.28", "event.29", "event.30", "event.31", "event.32", "event.33",
"event.34", "event.35", "event.36", "event.37"), class = "data.frame")
虽然不容易发现,但在行名 event.26 处他们按了两次 StopTesting。
我需要找到一种方法,确保在数据处理过程中启动和停止的数量相同。
我尝试按如下方式清理数据,但这样我就无法用 cbind 把数据重新组合在一起。
# Split the event log into start and stop events, giving each half its own
# column names so the two can sit side by side.
testdatstart <- test %>%
  filter(type == "StartTesting") %>%
  rename(TestStart = time, StartComments = comments)
testdatfin <- test %>%
  filter(type == "StopTesting") %>%
  rename(TestStop = time, StopComments = comments)
# cbind() pairs rows by position, so both halves must have the same number
# of rows for this step to work.
testdat <- cbind(testdatstart, testdatfin) %>%
  select(TestStart, TestStop, StartComments, StopComments)
我想要的输出是这样的(我手动删除了 event.26 来实现这个)
dput(testdat)
structure(list(TestStart = structure(c(1645290963, 1645291112,
1645291306, 1645291532, 1645291721, 1645292051, 1645292444, 1645292935,
1645293117, 1645293275, 1645293429, 1645293584), tzone = "", class = c("POSIXct",
"POSIXt")), TestStop = structure(c(1645291107, 1645291203, 1645291441,
1645291689, 1645291866, 1645292182, 1645292539, 1645293077, 1645293229,
1645293425, 1645293555, 1645293735), tzone = "", class = c("POSIXct",
"POSIXt")), StartComments = c("", "", "", "", "", "", "", "",
"", "", "", ""), StopComments = c("", "", "", "", "", "", "",
"", "", "", "", "")), class = "data.frame", row.names = c("event.12",
"event.14", "event.16", "event.18", "event.20", "event.22", "event.25",
"event.28", "event.30", "event.32", "event.34", "event.36"))
所以我正在寻找一种方法来识别所需的模式(应该按 开始、停止、开始、停止 的顺序交替出现),然后删除重复条目中的第二个。
您可以使用 dplyr 包并对 type 列使用 filter:我们只需使用 lag 检查 type 是否等于之前的 type。
之后,您可以使用 summarise 转换数据:
library(dplyr)
# Drop any event whose type repeats the row before it, number the remaining
# events by the StartTesting that opens each test, then collapse each test
# to one row holding its start and stop times.
df %>%
  as_tibble(rownames = "rownames") %>%
  filter(lag(type, default = "") != type) %>%
  mutate(gr = cumsum(type == "StartTesting")) %>%
  group_by(gr) %>%
  summarise(
    rownames = first(rownames),
    TestStart = first(time),
    TestStop = nth(time, 2),
    .groups = "drop"
  ) %>%
  select(-gr)
# A tibble: 12 x 3
rownames TestStart TestStop
<chr> <dttm> <dttm>
1 event.12 2022-02-19 18:16:03 2022-02-19 18:18:27
2 event.14 2022-02-19 18:18:32 2022-02-19 18:20:03
3 event.16 2022-02-19 18:21:46 2022-02-19 18:24:01
4 event.18 2022-02-19 18:25:32 2022-02-19 18:28:09
5 event.20 2022-02-19 18:28:41 2022-02-19 18:31:06
6 event.22 2022-02-19 18:34:11 2022-02-19 18:36:22
7 event.25 2022-02-19 18:40:44 2022-02-19 18:42:19
8 event.28 2022-02-19 18:48:55 2022-02-19 18:51:17
9 event.30 2022-02-19 18:51:57 2022-02-19 18:53:49
10 event.32 2022-02-19 18:54:35 2022-02-19 18:57:05
11 event.34 2022-02-19 18:57:09 2022-02-19 18:59:15
12 event.36 2022-02-19 18:59:44 2022-02-19 19:02:15
library(tidyverse)
# Number each test by counting StartTesting events, keep one row per
# (id, type), then spread the start/stop rows into paired columns.
df %>%
  mutate(id = cumsum(type == "StartTesting")) %>%
  # desc(time) puts the latest event first within each (id, type), so
  # distinct() keeps the later of two repeated presses; sort by `time`
  # instead to keep the earlier one.
  arrange(id, type, desc(time)) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  distinct(id, type, .keep_all = TRUE) %>%
  pivot_wider(names_from = type, values_from = c(time, comments))
id time_StartTesting time_StopTesting comments_StartTesting comments_StopTesting
<int> <dttm> <dttm> <chr> <chr>
1 1 2022-02-19 18:16:03 2022-02-19 18:18:27 "" ""
2 2 2022-02-19 18:18:32 2022-02-19 18:20:03 "" ""
3 3 2022-02-19 18:21:46 2022-02-19 18:24:01 "" ""
4 4 2022-02-19 18:25:32 2022-02-19 18:28:09 "" ""
5 5 2022-02-19 18:28:41 2022-02-19 18:31:06 "" ""
6 6 2022-02-19 18:34:11 2022-02-19 18:36:22 "" ""
7 7 2022-02-19 18:40:44 2022-02-19 18:42:37 "" ""
8 8 2022-02-19 18:48:55 2022-02-19 18:51:17 "" ""
9 9 2022-02-19 18:51:57 2022-02-19 18:53:49 "" ""
10 10 2022-02-19 18:54:35 2022-02-19 18:57:05 "" ""
11 11 2022-02-19 18:57:09 2022-02-19 18:59:15 "" ""
12 12 2022-02-19 18:59:44 2022-02-19 19:02:15 "" ""
抱歉标题起得不好,这个问题很难用一句话描述清楚。
我从客户那里收到数据,他们在现场手动启动和停止测试,但有时他们双击启动测试或停止测试,这让我很难自动输出。
这是一些数据
dput(test)
structure(list(time = structure(c(1645290963, 1645291107, 1645291112,
1645291203, 1645291306, 1645291441, 1645291532, 1645291689, 1645291721,
1645291866, 1645292051, 1645292182, 1645292444, 1645292539, 1645292557,
1645292935, 1645293077, 1645293117, 1645293229, 1645293275, 1645293425,
1645293429, 1645293555, 1645293584, 1645293735), tzone = "", class = c("POSIXct",
"POSIXt")), type = c("StartTesting", "StopTesting", "StartTesting",
"StopTesting", "StartTesting", "StopTesting", "StartTesting",
"StopTesting", "StartTesting", "StopTesting", "StartTesting",
"StopTesting", "StartTesting", "StopTesting", "StopTesting",
"StartTesting", "StopTesting", "StartTesting", "StopTesting",
"StartTesting", "StopTesting", "StartTesting", "StopTesting",
"StartTesting", "StopTesting"), comments = c("", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "")), row.names = c("event.12", "event.13", "event.14",
"event.15", "event.16", "event.17", "event.18", "event.19", "event.20",
"event.21", "event.22", "event.23", "event.25", "event.26", "event.27",
"event.28", "event.29", "event.30", "event.31", "event.32", "event.33",
"event.34", "event.35", "event.36", "event.37"), class = "data.frame")
虽然不容易发现,但在行名 event.26 处他们按了两次 StopTesting。
我需要找到一种方法,确保在数据处理过程中启动和停止的数量相同。
我尝试按如下方式清理数据,但这样我就无法用 cbind 把数据重新组合在一起。
# Split the event log into start and stop events, giving each half its own
# column names so the two can sit side by side.
testdatstart <- test %>%
  filter(type == "StartTesting") %>%
  rename(TestStart = time, StartComments = comments)
testdatfin <- test %>%
  filter(type == "StopTesting") %>%
  rename(TestStop = time, StopComments = comments)
# cbind() pairs rows by position, so both halves must have the same number
# of rows for this step to work.
testdat <- cbind(testdatstart, testdatfin) %>%
  select(TestStart, TestStop, StartComments, StopComments)
我想要的输出是这样的(我手动删除了 event.26 来实现这个)
dput(testdat)
structure(list(TestStart = structure(c(1645290963, 1645291112,
1645291306, 1645291532, 1645291721, 1645292051, 1645292444, 1645292935,
1645293117, 1645293275, 1645293429, 1645293584), tzone = "", class = c("POSIXct",
"POSIXt")), TestStop = structure(c(1645291107, 1645291203, 1645291441,
1645291689, 1645291866, 1645292182, 1645292539, 1645293077, 1645293229,
1645293425, 1645293555, 1645293735), tzone = "", class = c("POSIXct",
"POSIXt")), StartComments = c("", "", "", "", "", "", "", "",
"", "", "", ""), StopComments = c("", "", "", "", "", "", "",
"", "", "", "", "")), class = "data.frame", row.names = c("event.12",
"event.14", "event.16", "event.18", "event.20", "event.22", "event.25",
"event.28", "event.30", "event.32", "event.34", "event.36"))
所以我正在寻找一种方法来识别所需的模式(应该按 开始、停止、开始、停止 的顺序交替出现),然后删除重复条目中的第二个。
您可以使用 dplyr 包并对 type 列使用 filter:我们只需使用 lag 检查 type 是否等于之前的 type。
之后,您可以使用 summarise 转换数据:
library(dplyr)
# Drop any event whose type repeats the row before it, number the remaining
# events by the StartTesting that opens each test, then collapse each test
# to one row holding its start and stop times.
df %>%
  as_tibble(rownames = "rownames") %>%
  filter(lag(type, default = "") != type) %>%
  mutate(gr = cumsum(type == "StartTesting")) %>%
  group_by(gr) %>%
  summarise(
    rownames = first(rownames),
    TestStart = first(time),
    TestStop = nth(time, 2),
    .groups = "drop"
  ) %>%
  select(-gr)
# A tibble: 12 x 3
rownames TestStart TestStop
<chr> <dttm> <dttm>
1 event.12 2022-02-19 18:16:03 2022-02-19 18:18:27
2 event.14 2022-02-19 18:18:32 2022-02-19 18:20:03
3 event.16 2022-02-19 18:21:46 2022-02-19 18:24:01
4 event.18 2022-02-19 18:25:32 2022-02-19 18:28:09
5 event.20 2022-02-19 18:28:41 2022-02-19 18:31:06
6 event.22 2022-02-19 18:34:11 2022-02-19 18:36:22
7 event.25 2022-02-19 18:40:44 2022-02-19 18:42:19
8 event.28 2022-02-19 18:48:55 2022-02-19 18:51:17
9 event.30 2022-02-19 18:51:57 2022-02-19 18:53:49
10 event.32 2022-02-19 18:54:35 2022-02-19 18:57:05
11 event.34 2022-02-19 18:57:09 2022-02-19 18:59:15
12 event.36 2022-02-19 18:59:44 2022-02-19 19:02:15
library(tidyverse)
# Number each test by counting StartTesting events, keep one row per
# (id, type), then spread the start/stop rows into paired columns.
df %>%
  mutate(id = cumsum(type == "StartTesting")) %>%
  # desc(time) puts the latest event first within each (id, type), so
  # distinct() keeps the later of two repeated presses; sort by `time`
  # instead to keep the earlier one.
  arrange(id, type, desc(time)) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  distinct(id, type, .keep_all = TRUE) %>%
  pivot_wider(names_from = type, values_from = c(time, comments))
id time_StartTesting time_StopTesting comments_StartTesting comments_StopTesting
<int> <dttm> <dttm> <chr> <chr>
1 1 2022-02-19 18:16:03 2022-02-19 18:18:27 "" ""
2 2 2022-02-19 18:18:32 2022-02-19 18:20:03 "" ""
3 3 2022-02-19 18:21:46 2022-02-19 18:24:01 "" ""
4 4 2022-02-19 18:25:32 2022-02-19 18:28:09 "" ""
5 5 2022-02-19 18:28:41 2022-02-19 18:31:06 "" ""
6 6 2022-02-19 18:34:11 2022-02-19 18:36:22 "" ""
7 7 2022-02-19 18:40:44 2022-02-19 18:42:37 "" ""
8 8 2022-02-19 18:48:55 2022-02-19 18:51:17 "" ""
9 9 2022-02-19 18:51:57 2022-02-19 18:53:49 "" ""
10 10 2022-02-19 18:54:35 2022-02-19 18:57:05 "" ""
11 11 2022-02-19 18:57:09 2022-02-19 18:59:15 "" ""
12 12 2022-02-19 18:59:44 2022-02-19 19:02:15 "" ""