使用 R 将行转换为新列
Converting rows into new columns using R
样本是这样的:
# A tibble: 10 x 18
trip_id position_lon_1 position_lat_1 position_lon_2 position_lat_2 position_lon_3 position_lat_3 position_lon_4 position_lat_4 position_lon_5
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
2 2 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
3 3 101. 13.8 101. 13.7 101. 13.7 101. 13.7 101.
4 4 101. 13.8 101. 13.8 101. 13.8 101. 13.7 101.
5 5 101. 13.8 101. 13.8 101. 13.8 101. 13.8 101.
6 6 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
7 7 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
8 1977 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
9 1978 100. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
10 1979 101. 13.8 101. 13.8 101. 13.8 101. 13.7 101.
# ... with 8 more variables: position_lat_5 <dbl>, position_lon_6 <dbl>, position_lat_6 <dbl>, start_time1 <dbl>, end_time1 <dbl>, length <dbl>,
# purpose <chr>, clusterNum <dbl>
example <- structure(list(trip_id = c(1, 2, 3, 4, 5, 6, 7, 1977, 1978, 1979), position_lon_1 = c(100.5423, 100.53881, 100.5824, 100.53724,100.52715, 100.50491, 100.51813, 100.57668, 100.49228, 100.52679), position_lat_1 = c(13.7444, 13.71867, 13.75446, 13.76453,13.76981, 13.73128, 13.73133, 13.73969, 13.7447, 13.76592), position_lon_2 = c(100.54767,100.53822, 100.57764, 100.53057, 100.53335, 100.50947, 100.52216,100.57951, 100.50324, 100.52661), position_lat_2 = c(13.74409,13.72446, 13.74911, 13.76827, 13.75848, 13.72012, 13.73132, 13.73407,13.74693, 13.76381), position_lon_3 = c(100.54828, 100.54081,100.58012, 100.53755, 100.53717, 100.53354, 100.52927, 100.58263,100.51611, 100.52553), position_lat_3 = c(13.74762, 13.72764,13.74582, 13.75122, 13.76324, 13.72289, 13.73678, 13.73423, 13.73462,13.75167), position_lon_4 = c(100.54839, 100.55916, 100.58443,100.52854, 100.53642, 100.53843, 100.53061, 100.58309, 100.5224,100.53361), position_lat_4 = c(13.7487, 13.72082, 13.7449, 13.73238,13.76578, 13.73429, 13.74567, 13.73556, 13.72415, 13.74591),position_lon_5 = c(100.54841, 100.54585, 100.59693, 100.51346,100.54466, 100.54338, 100.53061, 100.58449, 100.5156, 100.53465), position_lat_5 = c(13.74902, 13.74356, 13.74322, 13.73616,13.76328, 13.76289, 13.74567, 13.73958, 13.71989, 13.74595), position_lon_6 = c(100.54841, 100.54585, 100.59693, 100.51346,100.54466, 100.54338, 100.53061, 100.58449, 100.5156, 100.53465), position_lat_6 = c(13.74902, 13.74356, 13.74322, 13.73616,13.76328, 13.76289, 13.74567, 13.73958, 13.71989, 13.74595), start_time1 = c(6.72472222222222, 6.82194444444444, 7.57361111111111,6.56611111111111, 6.31111111111111, 6.29916666666667, 6.59555555555556,16.3666666666667, 16.1305555555556, 16.0111111111111), end_time1 = c(6.82027777777778,7.30527777777778, 7.79777777777778, 7.13111111111111, 6.66111111111111,6.89916666666667, 6.77388888888889, 16.55, 16.6925, 16.7827777777778), length = c(835.421673262, 2857.08440862, 2003.94967386,4057.83640791, 2026.71013038, 5435.16534982, 2083.02205084,844.774412796, 3727.75857424, 2367.29993395), purpose = c("Bank/ATM","Business", "Infrastructure/Hydro", "Service", "Bank/ATM","Automobile", "Food", "Food", "Food", "Business"), clusterNum = c(2,2, 1, 3, 2, 2, 2, 4, 3, 1)), row.names = c(NA, -10L), class = c("tbl_df","tbl", "data.frame"))
我想通过将行数据转换为新列来创建新的数据框。这就是我预期的输出结果;
R 中的任何包和函数都可以实现这一点吗?
感谢您的帮助。
这可能对你有用
library(tidyverse)
example %>%
pivot_longer(cols = starts_with("position"),
names_to = c(".value", "position"),
names_pattern = "^position_(.*)_(.)",
values_drop_na = TRUE)
我希望这是你的想法:
library(dplyr)
library(tidyr)
example %>%
pivot_longer(starts_with("position"), names_to = c(NA, ".value", "position"),
names_pattern = "(\w+)_(\w+)_(\d)")
# A tibble: 60 x 9
trip_id start_time1 end_time1 length purpose clusterNum position lon lat
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 1 6.72 6.82 835. Bank/ATM 2 1 101. 13.7
2 1 6.72 6.82 835. Bank/ATM 2 2 101. 13.7
3 1 6.72 6.82 835. Bank/ATM 2 3 101. 13.7
4 1 6.72 6.82 835. Bank/ATM 2 4 101. 13.7
5 1 6.72 6.82 835. Bank/ATM 2 5 101. 13.7
6 1 6.72 6.82 835. Bank/ATM 2 6 101. 13.7
7 2 6.82 7.31 2857. Business 2 1 101. 13.7
8 2 6.82 7.31 2857. Business 2 2 101. 13.7
9 2 6.82 7.31 2857. Business 2 3 101. 13.7
10 2 6.82 7.31 2857. Business 2 4 101. 13.7
# ... with 50 more rows
您正在寻找的是一种将列转换为行的方法(而不是相反),但要保持“位置”标识符并将经度和纬度存储在单独的列中。 tidyr 的 pivot_longer
函数可以让你做到这一点!查看文档 (here) 以获取对不同参数的详细解释。
library(tidyverse)
example %>%
pivot_longer(
starts_with("position"),names_sep = "_", names_to = c(NA, ".value", "pos")
)
#> # A tibble: 60 x 9
#> trip_id start_time1 end_time1 length purpose clusterNum pos lon lat
#> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
#> 1 1 6.72 6.82 835. Bank/ATM 2 1 101. 13.7
#> 2 1 6.72 6.82 835. Bank/ATM 2 2 101. 13.7
#> 3 1 6.72 6.82 835. Bank/ATM 2 3 101. 13.7
#> 4 1 6.72 6.82 835. Bank/ATM 2 4 101. 13.7
#> 5 1 6.72 6.82 835. Bank/ATM 2 5 101. 13.7
#> 6 1 6.72 6.82 835. Bank/ATM 2 6 101. 13.7
#> 7 2 6.82 7.31 2857. Business 2 1 101. 13.7
#> 8 2 6.82 7.31 2857. Business 2 2 101. 13.7
#> 9 2 6.82 7.31 2857. Business 2 3 101. 13.7
#> 10 2 6.82 7.31 2857. Business 2 4 101. 13.7
#> # ... with 50 more rows
为了完整起见,这里有一个使用 melt()
和 measure()
函数的解决方案(data.table 1.14.1 版新增):
library(data.table) # development version 1.14.1 used
long <- melt(setDT(example), measure.vars =
measure(value.name, position, pattern = "(lon|lat)_(\d)"))[order(trip_id)]
head(long, 12)
trip_id start_time1 end_time1 length purpose clusterNum position lon lat
1: 1 6.724722 6.820278 835.4217 Bank/ATM 2 1 100.5423 13.74440
2: 1 6.724722 6.820278 835.4217 Bank/ATM 2 2 100.5477 13.74409
3: 1 6.724722 6.820278 835.4217 Bank/ATM 2 3 100.5483 13.74762
4: 1 6.724722 6.820278 835.4217 Bank/ATM 2 4 100.5484 13.74870
5: 1 6.724722 6.820278 835.4217 Bank/ATM 2 5 100.5484 13.74902
6: 1 6.724722 6.820278 835.4217 Bank/ATM 2 6 100.5484 13.74902
7: 2 6.821944 7.305278 2857.0844 Business 2 1 100.5388 13.71867
8: 2 6.821944 7.305278 2857.0844 Business 2 2 100.5382 13.72446
9: 2 6.821944 7.305278 2857.0844 Business 2 3 100.5408 13.72764
10: 2 6.821944 7.305278 2857.0844 Business 2 4 100.5592 13.72082
11: 2 6.821944 7.305278 2857.0844 Business 2 5 100.5459 13.74356
12: 2 6.821944 7.305278 2857.0844 Business 2 6 100.5459 13.74356
如果需要,可以通过
更改列顺序
setcolorder(long, c("trip_id", "lon", "lat", "position"))[]
head(long, 12)
trip_id lon lat position start_time1 end_time1 length purpose clusterNum
1: 1 100.5423 13.74440 1 6.724722 6.820278 835.4217 Bank/ATM 2
2: 1 100.5477 13.74409 2 6.724722 6.820278 835.4217 Bank/ATM 2
3: 1 100.5483 13.74762 3 6.724722 6.820278 835.4217 Bank/ATM 2
4: 1 100.5484 13.74870 4 6.724722 6.820278 835.4217 Bank/ATM 2
5: 1 100.5484 13.74902 5 6.724722 6.820278 835.4217 Bank/ATM 2
6: 1 100.5484 13.74902 6 6.724722 6.820278 835.4217 Bank/ATM 2
7: 2 100.5388 13.71867 1 6.821944 7.305278 2857.0844 Business 2
8: 2 100.5382 13.72446 2 6.821944 7.305278 2857.0844 Business 2
9: 2 100.5408 13.72764 3 6.821944 7.305278 2857.0844 Business 2
10: 2 100.5592 13.72082 4 6.821944 7.305278 2857.0844 Business 2
11: 2 100.5459 13.74356 5 6.821944 7.305278 2857.0844 Business 2
12: 2 100.5459 13.74356 6 6.821944 7.305278 2857.0844 Business 2
样本是这样的:
# A tibble: 10 x 18
trip_id position_lon_1 position_lat_1 position_lon_2 position_lat_2 position_lon_3 position_lat_3 position_lon_4 position_lat_4 position_lon_5
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
2 2 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
3 3 101. 13.8 101. 13.7 101. 13.7 101. 13.7 101.
4 4 101. 13.8 101. 13.8 101. 13.8 101. 13.7 101.
5 5 101. 13.8 101. 13.8 101. 13.8 101. 13.8 101.
6 6 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
7 7 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
8 1977 101. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
9 1978 100. 13.7 101. 13.7 101. 13.7 101. 13.7 101.
10 1979 101. 13.8 101. 13.8 101. 13.8 101. 13.7 101.
# ... with 8 more variables: position_lat_5 <dbl>, position_lon_6 <dbl>, position_lat_6 <dbl>, start_time1 <dbl>, end_time1 <dbl>, length <dbl>,
# purpose <chr>, clusterNum <dbl>
example <- structure(list(trip_id = c(1, 2, 3, 4, 5, 6, 7, 1977, 1978, 1979), position_lon_1 = c(100.5423, 100.53881, 100.5824, 100.53724,100.52715, 100.50491, 100.51813, 100.57668, 100.49228, 100.52679), position_lat_1 = c(13.7444, 13.71867, 13.75446, 13.76453,13.76981, 13.73128, 13.73133, 13.73969, 13.7447, 13.76592), position_lon_2 = c(100.54767,100.53822, 100.57764, 100.53057, 100.53335, 100.50947, 100.52216,100.57951, 100.50324, 100.52661), position_lat_2 = c(13.74409,13.72446, 13.74911, 13.76827, 13.75848, 13.72012, 13.73132, 13.73407,13.74693, 13.76381), position_lon_3 = c(100.54828, 100.54081,100.58012, 100.53755, 100.53717, 100.53354, 100.52927, 100.58263,100.51611, 100.52553), position_lat_3 = c(13.74762, 13.72764,13.74582, 13.75122, 13.76324, 13.72289, 13.73678, 13.73423, 13.73462,13.75167), position_lon_4 = c(100.54839, 100.55916, 100.58443,100.52854, 100.53642, 100.53843, 100.53061, 100.58309, 100.5224,100.53361), position_lat_4 = c(13.7487, 13.72082, 13.7449, 13.73238,13.76578, 13.73429, 13.74567, 13.73556, 13.72415, 13.74591),position_lon_5 = c(100.54841, 100.54585, 100.59693, 100.51346,100.54466, 100.54338, 100.53061, 100.58449, 100.5156, 100.53465), position_lat_5 = c(13.74902, 13.74356, 13.74322, 13.73616,13.76328, 13.76289, 13.74567, 13.73958, 13.71989, 13.74595), position_lon_6 = c(100.54841, 100.54585, 100.59693, 100.51346,100.54466, 100.54338, 100.53061, 100.58449, 100.5156, 100.53465), position_lat_6 = c(13.74902, 13.74356, 13.74322, 13.73616,13.76328, 13.76289, 13.74567, 13.73958, 13.71989, 13.74595), start_time1 = c(6.72472222222222, 6.82194444444444, 7.57361111111111,6.56611111111111, 6.31111111111111, 6.29916666666667, 6.59555555555556,16.3666666666667, 16.1305555555556, 16.0111111111111), end_time1 = c(6.82027777777778,7.30527777777778, 7.79777777777778, 7.13111111111111, 6.66111111111111,6.89916666666667, 6.77388888888889, 16.55, 16.6925, 16.7827777777778), length = c(835.421673262, 2857.08440862, 2003.94967386,4057.83640791, 2026.71013038, 5435.16534982, 2083.02205084,844.774412796, 3727.75857424, 2367.29993395), purpose = c("Bank/ATM","Business", "Infrastructure/Hydro", "Service", "Bank/ATM","Automobile", "Food", "Food", "Food", "Business"), clusterNum = c(2,2, 1, 3, 2, 2, 2, 4, 3, 1)), row.names = c(NA, -10L), class = c("tbl_df","tbl", "data.frame"))
我想通过将行数据转换为新列来创建新的数据框。这就是我预期的输出结果;
R 中的任何包和函数都可以实现这一点吗? 感谢您的帮助。
这可能对你有用
library(tidyverse)
example %>%
pivot_longer(cols = starts_with("position"),
names_to = c(".value", "position"),
names_pattern = "^position_(.*)_(.)",
values_drop_na = TRUE)
我希望这是你的想法:
library(dplyr)
library(tidyr)
example %>%
pivot_longer(starts_with("position"), names_to = c(NA, ".value", "position"),
names_pattern = "(\w+)_(\w+)_(\d)")
# A tibble: 60 x 9
trip_id start_time1 end_time1 length purpose clusterNum position lon lat
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 1 6.72 6.82 835. Bank/ATM 2 1 101. 13.7
2 1 6.72 6.82 835. Bank/ATM 2 2 101. 13.7
3 1 6.72 6.82 835. Bank/ATM 2 3 101. 13.7
4 1 6.72 6.82 835. Bank/ATM 2 4 101. 13.7
5 1 6.72 6.82 835. Bank/ATM 2 5 101. 13.7
6 1 6.72 6.82 835. Bank/ATM 2 6 101. 13.7
7 2 6.82 7.31 2857. Business 2 1 101. 13.7
8 2 6.82 7.31 2857. Business 2 2 101. 13.7
9 2 6.82 7.31 2857. Business 2 3 101. 13.7
10 2 6.82 7.31 2857. Business 2 4 101. 13.7
# ... with 50 more rows
您正在寻找的是一种将列转换为行的方法(而不是相反),但要保持“位置”标识符并将经度和纬度存储在单独的列中。 tidyr 的 pivot_longer
函数可以让你做到这一点!查看文档 (here) 以获取对不同参数的详细解释。
library(tidyverse)
example %>%
pivot_longer(
starts_with("position"),names_sep = "_", names_to = c(NA, ".value", "pos")
)
#> # A tibble: 60 x 9
#> trip_id start_time1 end_time1 length purpose clusterNum pos lon lat
#> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
#> 1 1 6.72 6.82 835. Bank/ATM 2 1 101. 13.7
#> 2 1 6.72 6.82 835. Bank/ATM 2 2 101. 13.7
#> 3 1 6.72 6.82 835. Bank/ATM 2 3 101. 13.7
#> 4 1 6.72 6.82 835. Bank/ATM 2 4 101. 13.7
#> 5 1 6.72 6.82 835. Bank/ATM 2 5 101. 13.7
#> 6 1 6.72 6.82 835. Bank/ATM 2 6 101. 13.7
#> 7 2 6.82 7.31 2857. Business 2 1 101. 13.7
#> 8 2 6.82 7.31 2857. Business 2 2 101. 13.7
#> 9 2 6.82 7.31 2857. Business 2 3 101. 13.7
#> 10 2 6.82 7.31 2857. Business 2 4 101. 13.7
#> # ... with 50 more rows
为了完整起见,这里有一个使用 melt()
和 measure()
函数的解决方案(data.table 1.14.1 版新增):
library(data.table) # development version 1.14.1 used
long <- melt(setDT(example), measure.vars =
measure(value.name, position, pattern = "(lon|lat)_(\d)"))[order(trip_id)]
head(long, 12)
trip_id start_time1 end_time1 length purpose clusterNum position lon lat 1: 1 6.724722 6.820278 835.4217 Bank/ATM 2 1 100.5423 13.74440 2: 1 6.724722 6.820278 835.4217 Bank/ATM 2 2 100.5477 13.74409 3: 1 6.724722 6.820278 835.4217 Bank/ATM 2 3 100.5483 13.74762 4: 1 6.724722 6.820278 835.4217 Bank/ATM 2 4 100.5484 13.74870 5: 1 6.724722 6.820278 835.4217 Bank/ATM 2 5 100.5484 13.74902 6: 1 6.724722 6.820278 835.4217 Bank/ATM 2 6 100.5484 13.74902 7: 2 6.821944 7.305278 2857.0844 Business 2 1 100.5388 13.71867 8: 2 6.821944 7.305278 2857.0844 Business 2 2 100.5382 13.72446 9: 2 6.821944 7.305278 2857.0844 Business 2 3 100.5408 13.72764 10: 2 6.821944 7.305278 2857.0844 Business 2 4 100.5592 13.72082 11: 2 6.821944 7.305278 2857.0844 Business 2 5 100.5459 13.74356 12: 2 6.821944 7.305278 2857.0844 Business 2 6 100.5459 13.74356
如果需要,可以通过
更改列顺序setcolorder(long, c("trip_id", "lon", "lat", "position"))[]
head(long, 12)
trip_id lon lat position start_time1 end_time1 length purpose clusterNum 1: 1 100.5423 13.74440 1 6.724722 6.820278 835.4217 Bank/ATM 2 2: 1 100.5477 13.74409 2 6.724722 6.820278 835.4217 Bank/ATM 2 3: 1 100.5483 13.74762 3 6.724722 6.820278 835.4217 Bank/ATM 2 4: 1 100.5484 13.74870 4 6.724722 6.820278 835.4217 Bank/ATM 2 5: 1 100.5484 13.74902 5 6.724722 6.820278 835.4217 Bank/ATM 2 6: 1 100.5484 13.74902 6 6.724722 6.820278 835.4217 Bank/ATM 2 7: 2 100.5388 13.71867 1 6.821944 7.305278 2857.0844 Business 2 8: 2 100.5382 13.72446 2 6.821944 7.305278 2857.0844 Business 2 9: 2 100.5408 13.72764 3 6.821944 7.305278 2857.0844 Business 2 10: 2 100.5592 13.72082 4 6.821944 7.305278 2857.0844 Business 2 11: 2 100.5459 13.74356 5 6.821944 7.305278 2857.0844 Business 2 12: 2 100.5459 13.74356 6 6.821944 7.305278 2857.0844 Business 2