R 在 data.frame 的所有行上应用距离函数
R Apply distance function on all rows of data.frame
我有 data.frame(见下文)机场代码。我正在尝试 运行 a (airportr::airport_distance) 来获取每对之间的距离。当我尝试在完整数据帧上 运行 时出现错误(请参见下面的代码)。知道为什么这行不通吗?
df1 <- structure(list(orig_station = c("LAX", "BUF", "ATL", "DEN", "ORD",
"DEN", "MEM", "TYS", "IAH", "CID"), dest_station = c("SFO", "MIA",
"CAE", "DEN", "IND", "DEN", "MEM", "TPA", "IAH", "PDX")), row.names = c(NA,
10L), class = "data.frame")
df1$dist <- airport_distance(df1$orig_station, df1$dest_station)
我们可以使用 Map
或 mapply
因为函数不是 Vectorize
d.
library(airportr)
df1$dist <- unlist(Map(airport_distance, df1$orig_station, df1$dest_station))
或 apply
df1$dist <- apply(df1[c('orig_station', 'dest_station')], 1,
function(x) airport_distance(x[1], x[2]))
或者另一种选择是 Vectorize
Vectorize(airport_distance)(df1$orig_station, df1$dest_station)
# LAX BUF ATL DEN ORD DEN MEM TYS IAH CID
# 543.3598 1912.5540 307.6851 0.0000 285.6848 0.0000 0.0000 882.3557 0.0000 2500.2793
或使用tidyverse
library(dplyr)
library(purrr)
df1 %>%
mutate(dist = map2_dbl(orig_station, dest_station, airport_distance))
-输出
# orig_station dest_station dist
#1 LAX SFO 543.3598
#2 BUF MIA 1912.5540
#3 ATL CAE 307.6851
#4 DEN DEN 0.0000
#5 ORD IND 285.6848
#6 DEN DEN 0.0000
#7 MEM MEM 0.0000
#8 TYS TPA 882.3557
#9 IAH IAH 0.0000
#10 CID PDX 2500.2793
或使用rowwise
df1 %>%
rowwise %>%
mutate(dist = airport_distance(orig_station, dest_station)) %>%
ungroup
查看了 airport_distance
函数,发现它没有向量化。这不好,因为对于大型数据集,您将无法计算距离。您可能应该考虑编写矢量化函数。一个简单的例子是:
vec_dist <- function(df){
air <- unlist(df)
match1 <- dplyr::filter(airports, IATA%in%unique(air))
point <- match(air, match1$IATA)
lon <- matrix((match1$Longitude * pi/180)[point], ncol = 2)
lat <- matrix((match1$Latitude * pi/180)[point], ncol = 2)
radius <- 6373
dlon = lon[,2] - lon[,1]
dlat = lat[,2] - lat[,1]
a = (sin(dlat/2))^2 + cos(lat[,1]) * cos(lat[,2]) * (sin(dlon/2))^2
b = 2 * atan2(sqrt(a), sqrt(1 - a))
cbind(df, dist= radius * b)
}
vec_dist(df1)
orig_station dest_station dist
1 LAX SFO 543.3598
2 BUF MIA 1912.5540
3 ATL CAE 307.6851
4 DEN DEN 0.0000
5 ORD IND 285.6848
6 DEN DEN 0.0000
7 MEM MEM 0.0000
8 TYS TPA 882.3557
9 IAH IAH 0.0000
10 CID PDX 2500.2793
为什么我会考虑编写自己的函数?一个快速的基准给你想法:
microbenchmark::microbenchmark(vec_dist(df1),
unlist_Map=unlist(Map(airport_distance, df1$orig_station, df1$dest_station)),
apply_=apply(df1[c('orig_station', 'dest_station')], 1, function(x) airport_distance(x[1], x[2])),
vectorize=Vectorize(airport_distance)(df1$orig_station, df1$dest_station), times=2)
Unit: milliseconds
expr min lq mean median uq max neval
vec_dist(df1) 3.176101 3.176101 3.536051 3.536051 3.896001 3.896001 2
unlist_Map 431.611700 431.611700 498.710251 498.710251 565.808801 565.808801 2
apply_ 572.807201 572.807201 577.864401 577.864401 582.921601 582.921601 2
vectorize 483.825801 483.825801 528.993851 528.993851 574.161900 574.161900 2
然而,这是 运行 它在 10 行的数据上。如果数据以几乎相似的点增加会发生什么?
df1 <- df1[rep(1:10, each=100), ]
Unit: milliseconds
expr min lq mean median uq max neval
vec_dist(df1) 7.084901 7.084901 8.564601 8.564601 10.0443 10.0443 2
unlist_Map 45161.593601 45161.593601 45229.421051 45229.421051 45297.2485 45297.2485 2
apply_ 45536.644800 45536.644800 53869.454001 53869.454001 62202.2632 62202.2632 2
vectorize 45286.505601 45286.505601 51775.855502 51775.855502 58265.2054 58265.2054 2
我有 data.frame(见下文)机场代码。我正在尝试 运行 a (airportr::airport_distance) 来获取每对之间的距离。当我尝试在完整数据帧上 运行 时出现错误(请参见下面的代码)。知道为什么这行不通吗?
df1 <- structure(list(orig_station = c("LAX", "BUF", "ATL", "DEN", "ORD",
"DEN", "MEM", "TYS", "IAH", "CID"), dest_station = c("SFO", "MIA",
"CAE", "DEN", "IND", "DEN", "MEM", "TPA", "IAH", "PDX")), row.names = c(NA,
10L), class = "data.frame")
df1$dist <- airport_distance(df1$orig_station, df1$dest_station)
我们可以使用 Map
或 mapply
因为函数不是 Vectorize
d.
library(airportr)
df1$dist <- unlist(Map(airport_distance, df1$orig_station, df1$dest_station))
或 apply
df1$dist <- apply(df1[c('orig_station', 'dest_station')], 1,
function(x) airport_distance(x[1], x[2]))
或者另一种选择是 Vectorize
Vectorize(airport_distance)(df1$orig_station, df1$dest_station)
# LAX BUF ATL DEN ORD DEN MEM TYS IAH CID
# 543.3598 1912.5540 307.6851 0.0000 285.6848 0.0000 0.0000 882.3557 0.0000 2500.2793
或使用tidyverse
library(dplyr)
library(purrr)
df1 %>%
mutate(dist = map2_dbl(orig_station, dest_station, airport_distance))
-输出
# orig_station dest_station dist
#1 LAX SFO 543.3598
#2 BUF MIA 1912.5540
#3 ATL CAE 307.6851
#4 DEN DEN 0.0000
#5 ORD IND 285.6848
#6 DEN DEN 0.0000
#7 MEM MEM 0.0000
#8 TYS TPA 882.3557
#9 IAH IAH 0.0000
#10 CID PDX 2500.2793
或使用rowwise
df1 %>%
rowwise %>%
mutate(dist = airport_distance(orig_station, dest_station)) %>%
ungroup
查看了 airport_distance
函数,发现它没有向量化。这不好,因为对于大型数据集,您将无法计算距离。您可能应该考虑编写矢量化函数。一个简单的例子是:
vec_dist <- function(df){
air <- unlist(df)
match1 <- dplyr::filter(airports, IATA%in%unique(air))
point <- match(air, match1$IATA)
lon <- matrix((match1$Longitude * pi/180)[point], ncol = 2)
lat <- matrix((match1$Latitude * pi/180)[point], ncol = 2)
radius <- 6373
dlon = lon[,2] - lon[,1]
dlat = lat[,2] - lat[,1]
a = (sin(dlat/2))^2 + cos(lat[,1]) * cos(lat[,2]) * (sin(dlon/2))^2
b = 2 * atan2(sqrt(a), sqrt(1 - a))
cbind(df, dist= radius * b)
}
vec_dist(df1)
orig_station dest_station dist
1 LAX SFO 543.3598
2 BUF MIA 1912.5540
3 ATL CAE 307.6851
4 DEN DEN 0.0000
5 ORD IND 285.6848
6 DEN DEN 0.0000
7 MEM MEM 0.0000
8 TYS TPA 882.3557
9 IAH IAH 0.0000
10 CID PDX 2500.2793
为什么我会考虑编写自己的函数?一个快速的基准给你想法:
microbenchmark::microbenchmark(vec_dist(df1),
unlist_Map=unlist(Map(airport_distance, df1$orig_station, df1$dest_station)),
apply_=apply(df1[c('orig_station', 'dest_station')], 1, function(x) airport_distance(x[1], x[2])),
vectorize=Vectorize(airport_distance)(df1$orig_station, df1$dest_station), times=2)
Unit: milliseconds
expr min lq mean median uq max neval
vec_dist(df1) 3.176101 3.176101 3.536051 3.536051 3.896001 3.896001 2
unlist_Map 431.611700 431.611700 498.710251 498.710251 565.808801 565.808801 2
apply_ 572.807201 572.807201 577.864401 577.864401 582.921601 582.921601 2
vectorize 483.825801 483.825801 528.993851 528.993851 574.161900 574.161900 2
然而,这是 运行 它在 10 行的数据上。如果数据以几乎相似的点增加会发生什么?
df1 <- df1[rep(1:10, each=100), ]
Unit: milliseconds
expr min lq mean median uq max neval
vec_dist(df1) 7.084901 7.084901 8.564601 8.564601 10.0443 10.0443 2
unlist_Map 45161.593601 45161.593601 45229.421051 45229.421051 45297.2485 45297.2485 2
apply_ 45536.644800 45536.644800 53869.454001 53869.454001 62202.2632 62202.2632 2
vectorize 45286.505601 45286.505601 51775.855502 51775.855502 58265.2054 58265.2054 2