R:将 youtube 视频持续时间格式化为适当的时间(秒)
R: Formatting youtube video duration into proper time (seconds)
我有向量(列数据),其中包含 R 中字符串格式的 youtube 播放持续时间。
x <- c(PT1H8S, PT9M55S, PT13M57S, PT1M5S, PT30M12S, PT1H21M5S, PT6M48S, PT31S, PT2M)
如何删除 PT
然后以秒格式获取总持续时间?
结果向量应该是c(3608, 595, 837, 65, 1812, 4865, 408, 31, 120)
示例:PT1H21M5S
秒的形式 = 4865。
(计算为1H = 1*3600
、21M = 21*60
、5S = 5*1
)
我用正则表达式命令编写了一个小应用循环,删除了秒、分钟或小时以外的所有内容,然后将所有内容转换为秒。
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S")
x2 <- sapply(x, function(i){
t <- as.numeric(gsub("^(.*)M|^(.*)H|S$", "", i))
if(grepl("M", i)) t <- t + as.numeric(gsub("^(.*)PT|^(.*)H|M(.*)$", "",i)) * 60
if(grepl("H", i)) t <- t + as.numeric(gsub("^(.*)PT|H(.*)$", "",i)) * 3600
t
})
x2
PT1H8S PT9M55S PT13M57S PT1M5S PT30M12S PT1H21M5S PT6M48S
3608 595 837 65 1812 4865 408
编辑:根据请求
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S", "PT31S", "PT2M")
x2 <- sapply(x, function(i){
t <- 0
if(grepl("S", i)) t <- t + as.numeric(gsub("^(.*)PT|^(.*)M|^(.*)H|S$", "", i))
if(grepl("M", i)) t <- t + as.numeric(gsub("^(.*)PT|^(.*)H|M(.*)$", "",i)) * 60
if(grepl("H", i)) t <- t + as.numeric(gsub("^(.*)PT|H(.*)$", "",i)) * 3600
t
})
x2
PT1H8S PT9M55S PT13M57S PT1M5S PT30M12S PT1H21M5S PT6M48S PT31S PT2M
3608 595 837 65 1812 4865 408 31 120
这应该涵盖所有情况。如果有更多,诀窍是改变正则表达式。 ^
是字符向量的开始,$
是结束。 (.*)
就是一切。所以 ^(.*)H
表示从开始到 H 之间的所有内容。我们将其替换为空。
您可以使用 Lubridate 包:
library(lubridate)
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S")
x2 <- as.numeric(duration(x))
x2
[1] 3608 595 837 65 1812 4865 408
这里有一个 dplyr
和 stringr
的解决方案:
df %>%
# extract hours, minutes, and seconds and convert to numeric:
mutate(
h = as.numeric(str_extract(x, "(?<=PT)\d+(?=H)")),
m = as.numeric(str_extract(x, "(?<=PT|H)\d+(?=M)")),
s = as.numeric(str_extract(x, "(?<=PT|H|M)\d+(?=S)"))
) %>%
# replace NA with 0:
mutate(
across(everything(), replace_na, 0)
) %>%
# calculate time in seconds:
mutate(sec = h*3600+m*60+s)
x h m s sec
1 PT1H8S 1 0 8 3608
2 PT9M55S 0 9 55 595
3 PT13M57S 0 13 57 837
4 PT1M5S 0 1 5 65
5 PT30M12S 0 30 12 1812
6 PT1H21M5S 1 21 5 4865
7 PT6M48S 0 6 48 408
8 PT31S 0 0 31 31
9 PT2M 0 2 0 120
数据:
df <- data.frame(x = c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S", "PT31S", "PT2M"))
我有向量(列数据),其中包含 R 中字符串格式的 youtube 播放持续时间。
x <- c(PT1H8S, PT9M55S, PT13M57S, PT1M5S, PT30M12S, PT1H21M5S, PT6M48S, PT31S, PT2M)
如何删除 PT
然后以秒格式获取总持续时间?
结果向量应该是c(3608, 595, 837, 65, 1812, 4865, 408, 31, 120)
示例:PT1H21M5S
秒的形式 = 4865。
(计算为1H = 1*3600
、21M = 21*60
、5S = 5*1
)
我用正则表达式命令编写了一个小应用循环,删除了秒、分钟或小时以外的所有内容,然后将所有内容转换为秒。
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S")
x2 <- sapply(x, function(i){
t <- as.numeric(gsub("^(.*)M|^(.*)H|S$", "", i))
if(grepl("M", i)) t <- t + as.numeric(gsub("^(.*)PT|^(.*)H|M(.*)$", "",i)) * 60
if(grepl("H", i)) t <- t + as.numeric(gsub("^(.*)PT|H(.*)$", "",i)) * 3600
t
})
x2
PT1H8S PT9M55S PT13M57S PT1M5S PT30M12S PT1H21M5S PT6M48S
3608 595 837 65 1812 4865 408
编辑:根据请求
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S", "PT31S", "PT2M")
x2 <- sapply(x, function(i){
t <- 0
if(grepl("S", i)) t <- t + as.numeric(gsub("^(.*)PT|^(.*)M|^(.*)H|S$", "", i))
if(grepl("M", i)) t <- t + as.numeric(gsub("^(.*)PT|^(.*)H|M(.*)$", "",i)) * 60
if(grepl("H", i)) t <- t + as.numeric(gsub("^(.*)PT|H(.*)$", "",i)) * 3600
t
})
x2
PT1H8S PT9M55S PT13M57S PT1M5S PT30M12S PT1H21M5S PT6M48S PT31S PT2M
3608 595 837 65 1812 4865 408 31 120
这应该涵盖所有情况。如果有更多,诀窍是改变正则表达式。 ^
是字符向量的开始,$
是结束。 (.*)
就是一切。所以 ^(.*)H
表示从开始到 H 之间的所有内容。我们将其替换为空。
您可以使用 Lubridate 包:
library(lubridate)
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S")
x2 <- as.numeric(duration(x))
x2
[1] 3608 595 837 65 1812 4865 408
这里有一个 dplyr
和 stringr
的解决方案:
df %>%
# extract hours, minutes, and seconds and convert to numeric:
mutate(
h = as.numeric(str_extract(x, "(?<=PT)\d+(?=H)")),
m = as.numeric(str_extract(x, "(?<=PT|H)\d+(?=M)")),
s = as.numeric(str_extract(x, "(?<=PT|H|M)\d+(?=S)"))
) %>%
# replace NA with 0:
mutate(
across(everything(), replace_na, 0)
) %>%
# calculate time in seconds:
mutate(sec = h*3600+m*60+s)
x h m s sec
1 PT1H8S 1 0 8 3608
2 PT9M55S 0 9 55 595
3 PT13M57S 0 13 57 837
4 PT1M5S 0 1 5 65
5 PT30M12S 0 30 12 1812
6 PT1H21M5S 1 21 5 4865
7 PT6M48S 0 6 48 408
8 PT31S 0 0 31 31
9 PT2M 0 2 0 120
数据:
df <- data.frame(x = c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S", "PT31S", "PT2M"))