删除不需要的数据条目

Remove unwanted data entries

为垃圾标题道歉,但很难用措辞表达问题。

我从客户那里收到数据,他们在现场手动启动和停止测试,但有时他们双击启动测试或停止测试,这让我很难自动输出。

这是一些数据

dput(test)
structure(list(time = structure(c(1645290963, 1645291107, 1645291112, 
1645291203, 1645291306, 1645291441, 1645291532, 1645291689, 1645291721, 
1645291866, 1645292051, 1645292182, 1645292444, 1645292539, 1645292557, 
1645292935, 1645293077, 1645293117, 1645293229, 1645293275, 1645293425, 
1645293429, 1645293555, 1645293584, 1645293735), tzone = "", class = c("POSIXct", 
"POSIXt")), type = c("StartTesting", "StopTesting", "StartTesting", 
"StopTesting", "StartTesting", "StopTesting", "StartTesting", 
"StopTesting", "StartTesting", "StopTesting", "StartTesting", 
"StopTesting", "StartTesting", "StopTesting", "StopTesting", 
"StartTesting", "StopTesting", "StartTesting", "StopTesting", 
"StartTesting", "StopTesting", "StartTesting", "StopTesting", 
"StartTesting", "StopTesting"), comments = c("", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "")), row.names = c("event.12", "event.13", "event.14", 
"event.15", "event.16", "event.17", "event.18", "event.19", "event.20", 
"event.21", "event.22", "event.23", "event.25", "event.26", "event.27", 
"event.28", "event.29", "event.30", "event.31", "event.32", "event.33", 
"event.34", "event.35", "event.36", "event.37"), class = "data.frame")

很难发现,但在 rowname - event.26 他们按了两次 StopTesting。我需要找到一种方法来确保在我的数据处理过程中有相同数量的启动和停止我尝试如下所示清理数据,但它阻止了我 cbind 数据重新组合在一起。

testdatstart <- filter(test, type =="StartTesting")
names(testdatstart)[names(testdatstart) == 'time'] <- 'TestStart'
names(testdatstart)[names(testdatstart) == 'comments'] <- 'StartComments'

testdatfin <- filter(test, type =="StopTesting")
names(testdatfin)[names(testdatfin) == 'time'] <- 'TestStop'
names(testdatfin)[names(testdatfin) == 'comments'] <- 'StopComments'

testdat <- cbind(testdatstart, testdatfin)
testdat <- testdat %>%
  select(TestStart,TestStop, StartComments, StopComments)

我想要的输出是这样的(我手动删除了 event.26 来实现这个)

dput(testdat)
structure(list(TestStart = structure(c(1645290963, 1645291112, 
1645291306, 1645291532, 1645291721, 1645292051, 1645292444, 1645292935, 
1645293117, 1645293275, 1645293429, 1645293584), tzone = "", class = c("POSIXct", 
"POSIXt")), TestStop = structure(c(1645291107, 1645291203, 1645291441, 
1645291689, 1645291866, 1645292182, 1645292539, 1645293077, 1645293229, 
1645293425, 1645293555, 1645293735), tzone = "", class = c("POSIXct", 
"POSIXt")), StartComments = c("", "", "", "", "", "", "", "", 
"", "", "", ""), StopComments = c("", "", "", "", "", "", "", 
"", "", "", "", "")), class = "data.frame", row.names = c("event.12", 
"event.14", "event.16", "event.18", "event.20", "event.22", "event.25", 
"event.28", "event.30", "event.32", "event.34", "event.36"))

所以我正在努力寻找一种方法来识别所需的模式,应该 运行 开始、停止、开始、停止等,然后从双重条目中删除第二个。

您可以使用 dplyr 包和 filter 类型列:我们只需使用 [=15= 检查 type 是否等于之前的 type ]. 之后,您可以使用 summarise:

转换数据
library(dplyr)
df %>%
  as_tibble(rownames = "rownames") %>% 
  filter(type != lag(type, default = "")) %>%
  group_by(gr = cumsum(type == "StartTesting")) %>%
  summarise(rownames = first(rownames), TestStart = time[1], TestStop = time[2], .groups = "drop") %>%
  select(-gr)

# A tibble: 12 x 3
   rownames TestStart           TestStop           
   <chr>    <dttm>              <dttm>             
 1 event.12 2022-02-19 18:16:03 2022-02-19 18:18:27
 2 event.14 2022-02-19 18:18:32 2022-02-19 18:20:03
 3 event.16 2022-02-19 18:21:46 2022-02-19 18:24:01
 4 event.18 2022-02-19 18:25:32 2022-02-19 18:28:09
 5 event.20 2022-02-19 18:28:41 2022-02-19 18:31:06
 6 event.22 2022-02-19 18:34:11 2022-02-19 18:36:22
 7 event.25 2022-02-19 18:40:44 2022-02-19 18:42:19
 8 event.28 2022-02-19 18:48:55 2022-02-19 18:51:17
 9 event.30 2022-02-19 18:51:57 2022-02-19 18:53:49
10 event.32 2022-02-19 18:54:35 2022-02-19 18:57:05
11 event.34 2022-02-19 18:57:09 2022-02-19 18:59:15
12 event.36 2022-02-19 18:59:44 2022-02-19 19:02:15
library(tidyverse)
df %>% 
  mutate(id = cumsum(type == "StartTesting")) %>% 
  arrange(id, type, desc(time)) %>% 
  distinct(id, type, .keep_all = T) %>% 
  pivot_wider(names_from = type, values_from = c(time, comments))  

      id time_StartTesting   time_StopTesting    comments_StartTesting comments_StopTesting
   <int> <dttm>              <dttm>              <chr>                 <chr>               
 1     1 2022-02-19 18:16:03 2022-02-19 18:18:27 ""                    ""                  
 2     2 2022-02-19 18:18:32 2022-02-19 18:20:03 ""                    ""                  
 3     3 2022-02-19 18:21:46 2022-02-19 18:24:01 ""                    ""                  
 4     4 2022-02-19 18:25:32 2022-02-19 18:28:09 ""                    ""                  
 5     5 2022-02-19 18:28:41 2022-02-19 18:31:06 ""                    ""                  
 6     6 2022-02-19 18:34:11 2022-02-19 18:36:22 ""                    ""                  
 7     7 2022-02-19 18:40:44 2022-02-19 18:42:37 ""                    ""                  
 8     8 2022-02-19 18:48:55 2022-02-19 18:51:17 ""                    ""                  
 9     9 2022-02-19 18:51:57 2022-02-19 18:53:49 ""                    ""                  
10    10 2022-02-19 18:54:35 2022-02-19 18:57:05 ""                    ""                  
11    11 2022-02-19 18:57:09 2022-02-19 18:59:15 ""                    ""                  
12    12 2022-02-19 18:59:44 2022-02-19 19:02:15 ""                    ""