将值分隔到 R 中的现有列中

Separating values into existing column in R

我正在整理我使用 tabulizer 从 PDF 读入 R 的一些数据。不幸的是,有些单元格没有被正确读取。在第 9 列(37.1 公里处的第 5 段)中,第 3 行和第 4 行包含本应在第 10 列(最终时间)中结束的信息。

如何仅为这些行分隔列 (9) 并将必要的数据粘贴到现有的列 (10) 中?

我知道如何使用 tidyr::separate 函数,但不知道如何(如果)在这里应用它。任何帮助和指导将不胜感激。

# Sample data for the question (looks like dput() output): a 4-row data frame
# of race results in which every column is a character vector.
# In rows 3 and 4 the `Split 5 at 37.1km` value also contains the final time
# (e.g. "50:21.82(25)1:00:28.39 (25)") and `Final Time` is an empty string.
# NOTE: the expression is not assigned here; the answers that follow refer to
# this data frame as `df` or `data`.
structure(list(Rank = c("23", "24", "25", "26"), `Race Number` = c("13", 
"11", "29", "30"), Name = c("FOSS Tobias S.", "McNULTY Brandon", 
"BENNETT George", "KUKRLE Michael"), `NOC Code` = c("NOR", "USA", 
"NZL", "CZE"), `Split 1 at 9.7km` = c("13:47.65(22)", "13:28.23(15)", 
"14:05.46(30)", "14:05.81(32)"), `Split 2 at 15.0km` = c("19:21.16(22)", 
"19:04.80(18)", "19:47.53(31)", "19:48.77(32)"), `Split 3 at 22.1km` = c("29:17.44(24)", 
"29:01.94(20)", "29:58.88(28)", "29:58.09(27)"), `Split 4 at 31.8km` = c("44:06.82(24)", 
"43:51.67(23)", "44:40.28(25)", "44:42.74(26)"), `Split 5 at 37.1km` = c("49:49.65(24)", 
"49:40.49(23)", "50:21.82(25)1:00:28.39 (25)", "50:30.02(26)1:00:41.55 (26)"
), `Final Time` = c("59:51.68 (23)", "59:57.73 (24)", "", ""), 
    `Time Behind` = c("+4:47.49", "+4:53.54", "+5:24.20", "+5:37.36"
    ), `Average Speed` = c("44.302", "44.228", "43.854", "43.696"
    )), class = "data.frame", row.names = c(NA, -4L))

假设您的数据框名为 df:

library(tidyr)
library(dplyr)
# Move the final time that was glued onto `Split 5 at 37.1km` into
# `Final Time`, leaving rows that already have a final time untouched.
df %>%
  # Cut the cell at the first ")". The parenthesis is a regex metacharacter and
  # the backslash must itself be doubled inside an R string literal ("\\)");
  # a single "\)" is a parse error. Rows holding two times produce a third,
  # empty piece after the last ")", which `extra = "drop"` discards silently.
  separate(
    `Split 5 at 37.1km`,
    into = c("Split 5 at 37.1km", "aux"),
    sep = "\\)",
    extra = "drop"
  ) %>%
  mutate(
    # Keep an existing final time; otherwise use the piece split off above.
    # The ")" consumed by the separator is appended again.
    `Final Time` = coalesce(
      if_else(`Final Time` != "", `Final Time`, NA_character_),
      paste0(aux, ")")
    ),
    # The helper column is no longer needed.
    aux = NULL,
    # Restore the ")" that the separator removed from the split time.
    `Split 5 at 37.1km` = paste0(`Split 5 at 37.1km`, ")")
  )

  Rank Race Number            Name NOC Code Split 1 at 9.7km Split 2 at 15.0km Split 3 at 22.1km Split 4 at 31.8km Split 5 at 37.1km      Final Time
1   23          13  FOSS Tobias S.      NOR     13:47.65(22)      19:21.16(22)      29:17.44(24)      44:06.82(24)      49:49.65(24)   59:51.68 (23)
2   24          11 McNULTY Brandon      USA     13:28.23(15)      19:04.80(18)      29:01.94(20)      43:51.67(23)      49:40.49(23)   59:57.73 (24)
3   25          29  BENNETT George      NZL     14:05.46(30)      19:47.53(31)      29:58.88(28)      44:40.28(25)      50:21.82(25) 1:00:28.39 (25)
4   26          30  KUKRLE Michael      CZE     14:05.81(32)      19:48.77(32)      29:58.09(27)      44:42.74(26)      50:30.02(26) 1:00:41.55 (26)
  Time Behind Average Speed
1    +4:47.49        44.302
2    +4:53.54        44.228
3    +5:24.20        43.854
4    +5:37.36        43.696

我的回答不是很花哨,但它对最后时间列中的任何数字都有效。只要末尾的括号中始终有数字,它就可以工作。

# Dummy data: in rows 3 and 4 the final time is glued onto the split time and
# the `final` column is empty.
df <- data.frame(
  split = c(
    "49:49.65(24)", "49:40.49(23)",
    "50:21.82(25)1:00:28.39 (25)", "50:30.02(26)1:00:41.55 (26)"
  ),
  final = c("59:51.68 (23)", "59:57.73 (24)", "", "")
)

# Concatenate both columns so that every row reads "<split>)<final>)", then
# cut at each literal ")" to recover the two times.
merge_strings <- paste0(df$split, df$final)
# fixed = TRUE treats ")" as a plain character instead of relying on the regex
# engine to tolerate an unescaped closing parenthesis.
split_strings <- strsplit(merge_strings, ")", fixed = TRUE)
# vapply() guarantees a character vector with exactly one value per row; the
# ")" removed by the split is appended again.
df$split <- paste0(vapply(split_strings, `[[`, character(1), 1L), ")")
df$final <- paste0(vapply(split_strings, `[[`, character(1), 2L), ")")

这给出:

         split           final
1 49:49.65(24)   59:51.68 (23)
2 49:40.49(23)   59:57.73 (24)
3 50:21.82(25) 1:00:28.39 (25)
4 50:30.02(26) 1:00:41.55 (26)

您可以使用 dplyr 和 stringr:

library(dplyr)
library(stringr)

# Regex for one split time such as "50:21.82(25)": minutes, seconds with a
# decimal part, and the rank in parentheses. Backslashes are doubled because
# this is an R string literal; single backslashes ("\d") are a parse error.
split_time_pattern <- "\\d+:\\d+\\.\\d+\\(\\d+\\)"

data %>%
  mutate(
    # Where the final time is empty, take whatever remains of the split cell
    # once the first split-time match has been removed.
    `Final Time` = ifelse(
      `Final Time` == "",
      str_remove(`Split 5 at 37.1km`, split_time_pattern),
      `Final Time`
    ),
    # Keep only the first split-time match in the split column.
    `Split 5 at 37.1km` = str_extract(`Split 5 at 37.1km`, split_time_pattern)
  )

返回结果如下:

  Rank Race Number            Name NOC Code Split 1 at 9.7km Split 2 at 15.0km Split 3 at 22.1km Split 4 at 31.8km
1   23          13  FOSS Tobias S.      NOR     13:47.65(22)      19:21.16(22)      29:17.44(24)      44:06.82(24)
2   24          11 McNULTY Brandon      USA     13:28.23(15)      19:04.80(18)      29:01.94(20)      43:51.67(23)
3   25          29  BENNETT George      NZL     14:05.46(30)      19:47.53(31)      29:58.88(28)      44:40.28(25)
4   26          30  KUKRLE Michael      CZE     14:05.81(32)      19:48.77(32)      29:58.09(27)      44:42.74(26)
  Split 5 at 37.1km      Final Time Time Behind Average Speed
1      49:49.65(24)   59:51.68 (23)    +4:47.49        44.302
2      49:40.49(23)   59:57.73 (24)    +4:53.54        44.228
3      50:21.82(25) 1:00:28.39 (25)    +5:24.20        43.854
4      50:30.02(26) 1:00:41.55 (26)    +5:37.36        43.696

我喜欢使用正则表达式和 stringr。虽然这里的代码不算最优,但关键步骤是 str_match()。利用它可以提取我们想要的两个子串,即第一个时间和第二个时间。如果其中任何一个时间不存在,对应位置就会得到缺失值(NA)。因此,我们可以根据缺失值出现的位置来填写各列。

正则表达式如下:^((\d+:)?\d{2}:\d{2}\.\d{2}\(\d+\))((\d+:)?\d{2}:\d{2}\.\d{2} ?\(\d+\))$(在 R 的字符串中,每个反斜杠都需要写成 \\)。这里有 4 个捕获组:第一组和第三组分别捕获两个完整的时间;第二组和第四组匹配可选的小时部分(这确保超过一小时的时间也能被完整捕获)。此外,第二个时间中括号前的空格是可选的。

我的代码如下:

library(tidyverse)

# Sample race results, one character column per line. check.names = FALSE
# keeps the column names containing spaces exactly as written.
# In rows 3 and 4 the `Split 5 at 37.1km` value also contains the final time
# and `Final Time` is an empty string.
data <- data.frame(
  Rank = c("23", "24", "25", "26"),
  `Race Number` = c("13", "11", "29", "30"),
  Name = c(
    "FOSS Tobias S.", "McNULTY Brandon", "BENNETT George", "KUKRLE Michael"
  ),
  `NOC Code` = c("NOR", "USA", "NZL", "CZE"),
  `Split 1 at 9.7km` = c(
    "13:47.65(22)", "13:28.23(15)", "14:05.46(30)", "14:05.81(32)"
  ),
  `Split 2 at 15.0km` = c(
    "19:21.16(22)", "19:04.80(18)", "19:47.53(31)", "19:48.77(32)"
  ),
  `Split 3 at 22.1km` = c(
    "29:17.44(24)", "29:01.94(20)", "29:58.88(28)", "29:58.09(27)"
  ),
  `Split 4 at 31.8km` = c(
    "44:06.82(24)", "43:51.67(23)", "44:40.28(25)", "44:42.74(26)"
  ),
  `Split 5 at 37.1km` = c(
    "49:49.65(24)", "49:40.49(23)",
    "50:21.82(25)1:00:28.39 (25)", "50:30.02(26)1:00:41.55 (26)"
  ),
  `Final Time` = c("59:51.68 (23)", "59:57.73 (24)", "", ""),
  `Time Behind` = c("+4:47.49", "+4:53.54", "+5:24.20", "+5:37.36"),
  `Average Speed` = c("44.302", "44.228", "43.854", "43.696"),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# A cell that also holds the final time looks like
# "50:21.82(25)1:00:28.39 (25)": two "[h:]mm:ss.ss(rank)" times glued together,
# the second one with an optional space before its rank.
# Capture groups: 1 = whole first time, 2 = its optional hours,
#                 3 = whole second time, 4 = its optional hours.
# Backslashes are doubled because this is an R string literal, and the decimal
# point is escaped so that it matches only a literal ".".
time_pair_pattern <- paste0(
  "^((\\d+:)?\\d{2}:\\d{2}\\.\\d{2}\\(\\d+\\))",
  "((\\d+:)?\\d{2}:\\d{2}\\.\\d{2} ?\\(\\d+\\))$"
)

# str_match() is vectorised: it returns one row per input string, with the full
# match in column 1 and the capture groups in the columns after it. Rows that
# do not match the pattern are all NA.
time_pairs <- str_match(data[["Split 5 at 37.1km"]], time_pair_pattern)
split_time <- time_pairs[, 2] # capture group 1: first (split) time
final_time <- time_pairs[, 4] # capture group 3: second (final) time

# Where a pair was found, overwrite both columns; otherwise keep the existing
# values. Both columns stay plain character vectors.
data |>
  mutate(
    `Split 5 at 37.1km` = ifelse(
      is.na(split_time), `Split 5 at 37.1km`, split_time
    ),
    `Final Time` = ifelse(is.na(final_time), `Final Time`, final_time)
  )
#>   Rank Race Number            Name NOC Code Split 1 at 9.7km Split 2 at 15.0km
#> 1   23          13  FOSS Tobias S.      NOR     13:47.65(22)      19:21.16(22)
#> 2   24          11 McNULTY Brandon      USA     13:28.23(15)      19:04.80(18)
#> 3   25          29  BENNETT George      NZL     14:05.46(30)      19:47.53(31)
#> 4   26          30  KUKRLE Michael      CZE     14:05.81(32)      19:48.77(32)
#>   Split 3 at 22.1km Split 4 at 31.8km Split 5 at 37.1km      Final Time
#> 1      29:17.44(24)      44:06.82(24)      49:49.65(24)   59:51.68 (23)
#> 2      29:01.94(20)      43:51.67(23)      49:40.49(23)   59:57.73 (24)
#> 3      29:58.88(28)      44:40.28(25)      50:21.82(25) 1:00:28.39 (25)
#> 4      29:58.09(27)      44:42.74(26)      50:30.02(26) 1:00:41.55 (26)
#>   Time Behind Average Speed
#> 1    +4:47.49        44.302
#> 2    +4:53.54        44.228
#> 3    +5:24.20        43.854
#> 4    +5:37.36        43.696

reprex package (v2.0.0)

于 2021-07-28 创建

注意:上面我使用的是 R 4.1 及以上版本提供的原生管道 |>。如果您使用的是较早的 R 版本,只需将其替换为 magrittr 的管道 %>% 即可。