将向量转换为 R 中具有两列的数据框
Transformation of a vector to a data frame with two columns in R
我有以下数据集,它由 ID(整数,尽管所有都存储为一个因子)和时间单位组成,属于上面的 ID(请参阅下面的数据摘录,总数据有 300' 000+ 个值)。
10000007
01:33:28
00:00:05
10000010
00:00:39
00:02:21
00:05:59
00:40:40
00:09:53
10000014
00:09:37
00:00:00
00:00:15
00:00:02
00:40:51
10000022
00:41:44
01:32:32
10000023
01:07:14
01:25:27
1000003
00:00:58
00:03:42
00:05:58
00:00:29
00:00:01
00:04:02
00:00:10
00:01:08
00:07:52
00:10:18
00:12:52
00:09:39
00:07:48
00:04:29
我想要的是一个数据框,其中一列是ID,另一列是时间。像这样:
10000007 01:33:28
10000007 00:00:05
10000010 00:00:39
10000010 00:02:21
10000010 00:05:59
10000010 00:40:40
10000010 00:09:53
10000014 00:09:37
10000014 00:00:00
10000014 00:00:15
10000014 00:00:02
10000014 00:40:51
...
我苦恼于不同 ID 之间的时间戳数量不同,以及我无法有效地将格式更改为数字,这可能会使操作更方便。
最终,我们的想法是将时间汇总为每个 ID 只有一次。非常感谢您!
一个dplyr
选项可以是:
df %>%
group_by(grp = cumsum(grepl("^100", V1))) %>%
mutate(V2 = first(V1)) %>%
slice(-1) %>%
ungroup() %>%
select(-grp)
V1 V2
<chr> <chr>
1 01:33:28 10000007
2 00:00:05 10000007
3 00:00:39 10000010
4 00:02:21 10000010
5 00:05:59 10000010
6 00:40:40 10000010
7 00:09:53 10000010
8 00:09:37 10000014
9 00:00:00 10000014
10 00:00:15 10000014
# … with 20 more rows
在base R
中,我们可以使用stack
i1 <- !grepl(":", df$V1)
out <- stack(setNames(split(df$V1[!i1], cumsum(i1)[!i1]), df$V1[i1]))[2:1]
head(out)
# ind values
#1 10000007 01:33:28
#2 10000007 00:00:05
#3 10000010 00:00:39
#4 10000010 00:02:21
#5 10000010 00:05:59
#6 10000010 00:40:40
dim(out)
#[1] 30 2
或者可以通过 data.frame
调用
来完成
data.frame(V1 = df$V1[i1][cumsum(i1)], V2 = df$V1)[!i1,]
或 transform
transform(df, V2 = V1[i1][cumsum(i1)])[!i1,]
数据
df <- structure(list(V1 = c("10000007", "01:33:28", "00:00:05", "10000010",
"00:00:39", "00:02:21", "00:05:59", "00:40:40", "00:09:53", "10000014",
"00:09:37", "00:00:00", "00:00:15", "00:00:02", "00:40:51", "10000022",
"00:41:44", "01:32:32", "10000023", "01:07:14", "01:25:27", "1000003",
"00:00:58", "00:03:42", "00:05:58", "00:00:29", "00:00:01", "00:04:02",
"00:00:10", "00:01:08", "00:07:52", "00:10:18", "00:12:52", "00:09:39",
"00:07:48", "00:04:29")), class = "data.frame", row.names = c(NA,
-36L))
在基础 R 中,拆分为 as.POSIXct
产生 NA
的那些值,然后 rbind
结果。
res <- do.call(rbind.data.frame,
by(x, cumsum(is.na(as.POSIXct(x, format="%T"))), function(x)
cbind(as.character(x)[1], as.character(x)[-1])))
head(res)
# V1 V2
# 1.1 10000007 01:33:28
# 1.2 10000007 00:00:05
# 2.1 10000010 00:00:39
# 2.2 10000010 00:02:21
# 2.3 10000010 00:05:59
# 2.4 10000010 00:40:40
数据:
x <- structure(c(31L, 30L, 4L, 32L, 8L, 11L, 16L, 24L, 21L, 33L, 19L,
1L, 6L, 3L, 25L, 34L, 26L, 29L, 35L, 27L, 28L, 36L, 9L, 12L,
15L, 7L, 2L, 13L, 5L, 10L, 18L, 22L, 23L, 20L, 17L, 14L), .Label = c("00:00:00",
"00:00:01", "00:00:02", "00:00:05", "00:00:10", "00:00:15", "00:00:29",
"00:00:39", "00:00:58", "00:01:08", "00:02:21", "00:03:42", "00:04:02",
"00:04:29", "00:05:58", "00:05:59", "00:07:48", "00:07:52", "00:09:37",
"00:09:39", "00:09:53", "00:10:18", "00:12:52", "00:40:40", "00:40:51",
"00:41:44", "01:07:14", "01:25:27", "01:32:32", "01:33:28", "10000007",
"10000010", "10000014", "10000022", "10000023", "1000003"), class = "factor")
我有以下数据集,它由 ID(整数,尽管所有都存储为一个因子)和时间单位组成,属于上面的 ID(请参阅下面的数据摘录,总数据有 300' 000+ 个值)。
10000007
01:33:28
00:00:05
10000010
00:00:39
00:02:21
00:05:59
00:40:40
00:09:53
10000014
00:09:37
00:00:00
00:00:15
00:00:02
00:40:51
10000022
00:41:44
01:32:32
10000023
01:07:14
01:25:27
1000003
00:00:58
00:03:42
00:05:58
00:00:29
00:00:01
00:04:02
00:00:10
00:01:08
00:07:52
00:10:18
00:12:52
00:09:39
00:07:48
00:04:29
我想要的是一个数据框,其中一列是ID,另一列是时间。像这样:
10000007 01:33:28
10000007 00:00:05
10000010 00:00:39
10000010 00:02:21
10000010 00:05:59
10000010 00:40:40
10000010 00:09:53
10000014 00:09:37
10000014 00:00:00
10000014 00:00:15
10000014 00:00:02
10000014 00:40:51
...
我苦恼于不同 ID 之间的时间戳数量不同,以及我无法有效地将格式更改为数字,这可能会使操作更方便。
最终,我们的想法是将时间汇总为每个 ID 只有一次。非常感谢您!
一个dplyr
选项可以是:
df %>%
group_by(grp = cumsum(grepl("^100", V1))) %>%
mutate(V2 = first(V1)) %>%
slice(-1) %>%
ungroup() %>%
select(-grp)
V1 V2
<chr> <chr>
1 01:33:28 10000007
2 00:00:05 10000007
3 00:00:39 10000010
4 00:02:21 10000010
5 00:05:59 10000010
6 00:40:40 10000010
7 00:09:53 10000010
8 00:09:37 10000014
9 00:00:00 10000014
10 00:00:15 10000014
# … with 20 more rows
在base R
中,我们可以使用stack
i1 <- !grepl(":", df$V1)
out <- stack(setNames(split(df$V1[!i1], cumsum(i1)[!i1]), df$V1[i1]))[2:1]
head(out)
# ind values
#1 10000007 01:33:28
#2 10000007 00:00:05
#3 10000010 00:00:39
#4 10000010 00:02:21
#5 10000010 00:05:59
#6 10000010 00:40:40
dim(out)
#[1] 30 2
或者可以通过 data.frame
调用
data.frame(V1 = df$V1[i1][cumsum(i1)], V2 = df$V1)[!i1,]
或 transform
transform(df, V2 = V1[i1][cumsum(i1)])[!i1,]
数据
df <- structure(list(V1 = c("10000007", "01:33:28", "00:00:05", "10000010",
"00:00:39", "00:02:21", "00:05:59", "00:40:40", "00:09:53", "10000014",
"00:09:37", "00:00:00", "00:00:15", "00:00:02", "00:40:51", "10000022",
"00:41:44", "01:32:32", "10000023", "01:07:14", "01:25:27", "1000003",
"00:00:58", "00:03:42", "00:05:58", "00:00:29", "00:00:01", "00:04:02",
"00:00:10", "00:01:08", "00:07:52", "00:10:18", "00:12:52", "00:09:39",
"00:07:48", "00:04:29")), class = "data.frame", row.names = c(NA,
-36L))
在基础 R 中,拆分为 as.POSIXct
产生 NA
的那些值,然后 rbind
结果。
res <- do.call(rbind.data.frame,
by(x, cumsum(is.na(as.POSIXct(x, format="%T"))), function(x)
cbind(as.character(x)[1], as.character(x)[-1])))
head(res)
# V1 V2
# 1.1 10000007 01:33:28
# 1.2 10000007 00:00:05
# 2.1 10000010 00:00:39
# 2.2 10000010 00:02:21
# 2.3 10000010 00:05:59
# 2.4 10000010 00:40:40
数据:
x <- structure(c(31L, 30L, 4L, 32L, 8L, 11L, 16L, 24L, 21L, 33L, 19L,
1L, 6L, 3L, 25L, 34L, 26L, 29L, 35L, 27L, 28L, 36L, 9L, 12L,
15L, 7L, 2L, 13L, 5L, 10L, 18L, 22L, 23L, 20L, 17L, 14L), .Label = c("00:00:00",
"00:00:01", "00:00:02", "00:00:05", "00:00:10", "00:00:15", "00:00:29",
"00:00:39", "00:00:58", "00:01:08", "00:02:21", "00:03:42", "00:04:02",
"00:04:29", "00:05:58", "00:05:59", "00:07:48", "00:07:52", "00:09:37",
"00:09:39", "00:09:53", "00:10:18", "00:12:52", "00:40:40", "00:40:51",
"00:41:44", "01:07:14", "01:25:27", "01:32:32", "01:33:28", "10000007",
"10000010", "10000014", "10000022", "10000023", "1000003"), class = "factor")