在 R 中,对两个数据框中的值逐一计算函数
In R, evaluate function between two data frames
我正在尝试对两个数据框中的值进行评估,并使用结果创建一个新的数据框。我是 R 的新手,我正在努力避免旧的编码习惯。换句话说,我拼命试图避免使用循环,但在这种情况下无法弄清楚 plyr 或类似的东西。
在示例中,我创建了机场、飞行员和一个以公里为单位获取距离的函数。我的问题是试图确定每个飞行员离哪个主要机场最近以及与每个机场的距离。
#Build Airports
# One row per airport: code plus latitude/longitude in degrees.
# NOTE(review): these longitudes are positive while the pilots' longitudes
# below are negative; both tables need the same sign convention before
# distances between them are meaningful.
code <- c("IAH", "DFW", "Denver", "STL")
lat <- c(29.97, 32.90, 39.75, 38.75)
long <- c(95.35, 97.03, 104.87, 90.37)
airports <- data.frame(code, lat, long)
#Build Pilots
# `lat` and `long` are reassigned here; `airports` already holds its own
# copies, so overwriting the globals does not change it.
names <- c("James", "Fiona", "Seamus")
lat <- c(32.335131, 44.913223, 28.849631)
long <- c(-84.989067, -97.151334, -96.917240)
pilots <- data.frame(names, lat, long)
#Create distance function
# Great-circle distance in kilometres between two points (haversine formula).
#
# lat1, long1: latitude and longitude of the first point, in degrees.
# lat2, long2: latitude and longitude of the second point, in degrees.
# Returns the distance in km. All arithmetic is element-wise, so vectors of
# coordinates are accepted and recycled by the usual R rules.
distInKm <- function(lat1, long1, lat2, long2) {
  deg_to_rad <- pi / 180
  phi1 <- lat1 * deg_to_rad
  phi2 <- lat2 * deg_to_rad
  dlat <- phi2 - phi1
  dlong <- (long2 - long1) * deg_to_rad
  # Haversine term: both cosine factors take latitudes. Using the longitude
  # of the second point here yields wrong (and asymmetric) distances.
  step1 <- sin(dlat / 2)^2 + cos(phi1) * cos(phi2) * sin(dlong / 2)^2
  # Clamp to [0, 1] so rounding error can never make sqrt(1 - step1) NaN.
  step1 <- pmin(1, pmax(0, step1))
  step2 <- 2 * atan2(sqrt(step1), sqrt(1 - step1))
  # 6372.798 km is the Earth radius used here (40041.47 / (2 * pi)).
  6372.798 * step2
}
感谢您的宝贵时间。
首先,您的机场经度应该为负,而实际为正,这会影响结果。让我们修复它们,使结果更有意义:
# Negate the airport longitudes so they share the pilots' sign convention.
# Not idempotent: running this line twice restores the original sign.
airports$long <- -airports$long
现在,您可以使用 `apply` 对每个飞行员计算其到所有机场的距离。`geosphere` 包提供了几个计算直线距离的函数,包括 `distGeo` 和 `distHaversine`。
library(geosphere)
# pilots[, 3:2] picks the columns in (long, lat) order, and airports[, 3:2]
# does the same for the airports. apply(..., 1, ...) then passes each pilot's
# (long, lat) pair to the function as x.
pilots$closest_airport <- apply(pilots[, 3:2], 1, function(x){
# distances from this pilot to every airport; keep the code of the nearest
airports[which.min(distGeo(x, airports[, 3:2])), 'code']
})
# same distances again, this time keeping the smallest value itself
pilots$airport_distance <- apply(pilots[, 3:2], 1, function(x){
min(distGeo(x, airports[, 3:2])) / 1000 # /1000 to convert m to km
})
# print the augmented table
pilots
## names lat long closest_airport airport_distance
## 1 James 32.33513 -84.98907 STL 862.5394
## 2 Fiona 44.91322 -97.15133 Denver 855.8088
## 3 Seamus 28.84963 -96.91724 IAH 196.3559
或者,如果您想要所有距离而不只是最小距离,可以用 `cbind` 把 `apply` 产生的矩阵合并进来:
# Append one distance column (in km) per airport. apply() returns one column
# per pilot here, so t() turns the result into one row per pilot for cbind().
pilots <- cbind(pilots, t(apply(pilots[, 3:2], 1, function(x){
# name each distance after its airport so the new columns are labelled
setNames(distGeo(x, airports[, 3:2]) / 1000, airports$code)
})))
pilots
## names lat long closest_airport IAH DFW Denver STL
## 1 James 32.33513 -84.98907 STL 1021.6523 1131.2129 1965.6586 862.5394
## 2 Fiona 44.91322 -97.15133 Denver 1666.0359 1333.6842 855.8088 885.8480
## 3 Seamus 28.84963 -96.91724 IAH 196.3559 449.1838 1412.0664 1253.4874
改写成 `dplyr`(`plyr` 的继任者)的写法:
library(dplyr)
# rowwise() makes mutate() evaluate its expressions one pilot at a time, so
# c(long, lat) is a single point rather than two whole columns.
pilots %>% rowwise() %>%
mutate(closest_airport = airports[which.min(distGeo(c(long, lat), airports[, 3:2])), 'code'],
# smallest distance to any airport, divided by 1000 to go from m to km
airport_distance = min(distGeo(c(long, lat), airports[, 3:2])) / 1000)
## Source: local data frame [3 x 5]
## Groups: <by row>
##
## # A tibble: 3 × 5
## names lat long closest_airport airport_distance
## <fctr> <dbl> <dbl> <fctr> <dbl>
## 1 James 32.33513 -84.98907 STL 862.5394
## 2 Fiona 44.91322 -97.15133 Denver 855.8088
## 3 Seamus 28.84963 -96.91724 IAH 196.3559
或者,要得到所有距离,可以将 `bind_cols` 与上述方法结合使用,或者用 `unnest` 展开列表列后再重塑:
library(tidyverse)
pilots %>% rowwise() %>%
mutate(closest_airport = airports[which.min(distGeo(c(long, lat), airports[, 3:2])), 'code'],
# list column: one small table per pilot with a row for every airport
# NOTE(review): data_frame() is deprecated in current tibble releases in
# favour of tibble() — confirm against the installed version
data = list(data_frame(airport = airports$code,
distance = distGeo(c(long, lat), airports[, 3:2]) / 1000))) %>%
# expand the list column into long form, one row per pilot/airport pair
unnest() %>%
# reshape to wide form, one distance column per airport
# NOTE(review): spread() is superseded by pivot_wider() in current tidyr —
# confirm against the installed version
spread(airport, distance)
## # A tibble: 3 × 8
## names lat long closest_airport Denver DFW IAH STL
## * <fctr> <dbl> <dbl> <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 Fiona 44.91322 -97.15133 Denver 855.8088 1333.6842 1666.0359 885.8480
## 2 James 32.33513 -84.98907 STL 1965.6586 1131.2129 1021.6523 862.5394
## 3 Seamus 28.84963 -96.91724 IAH 1412.0664 449.1838 196.3559 1253.4874
或者更直接、但可读性稍差的写法:
pilots %>% rowwise() %>%
mutate(closest_airport = airports[which.min(distGeo(c(long, lat), airports[, 3:2])), 'code'],
# named distance vector -> 1-row matrix via t() -> 1-row table, wrapped in
# a list so each pilot carries its own table
# NOTE(review): as_data_frame() is deprecated in current tibble releases in
# favour of as_tibble() — confirm against the installed version
data = (distGeo(c(long, lat), airports[, 3:2]) / 1000) %>%
setNames(airports$code) %>% t() %>% as_data_frame() %>% list()) %>%
# unnesting the 1-row tables yields one distance column per airport
unnest()
## # A tibble: 3 × 8
## names lat long closest_airport IAH DFW Denver STL
## <fctr> <dbl> <dbl> <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 James 32.33513 -84.98907 STL 1021.6523 1131.2129 1965.6586 862.5394
## 2 Fiona 44.91322 -97.15133 Denver 1666.0359 1333.6842 855.8088 885.8480
## 3 Seamus 28.84963 -96.91724 IAH 196.3559 449.1838 1412.0664 1253.4874
我正在尝试对两个数据框中的值进行评估,并使用结果创建一个新的数据框。我是 R 的新手,我正在努力避免旧的编码习惯。换句话说,我拼命试图避免使用循环,但在这种情况下无法弄清楚 plyr 或类似的东西。
在示例中,我创建了机场、飞行员和一个以公里为单位获取距离的函数。我的问题是试图确定每个飞行员离哪个主要机场最近以及与每个机场的距离。
#Build Airports
# One row per airport: code plus latitude/longitude in degrees.
# NOTE(review): these longitudes are positive while the pilots' longitudes
# below are negative; both tables need the same sign convention before
# distances between them are meaningful.
code <- c("IAH", "DFW", "Denver", "STL")
lat <- c(29.97, 32.90, 39.75, 38.75)
long <- c(95.35, 97.03, 104.87, 90.37)
airports <- data.frame(code, lat, long)
#Build Pilots
# `lat` and `long` are reassigned here; `airports` already holds its own
# copies, so overwriting the globals does not change it.
names <- c("James", "Fiona", "Seamus")
lat <- c(32.335131, 44.913223, 28.849631)
long <- c(-84.989067, -97.151334, -96.917240)
pilots <- data.frame(names, lat, long)
#Create distance function
# Great-circle distance in kilometres between two points (haversine formula).
#
# lat1, long1: latitude and longitude of the first point, in degrees.
# lat2, long2: latitude and longitude of the second point, in degrees.
# Returns the distance in km. All arithmetic is element-wise, so vectors of
# coordinates are accepted and recycled by the usual R rules.
distInKm <- function(lat1, long1, lat2, long2) {
  deg_to_rad <- pi / 180
  phi1 <- lat1 * deg_to_rad
  phi2 <- lat2 * deg_to_rad
  dlat <- phi2 - phi1
  dlong <- (long2 - long1) * deg_to_rad
  # Haversine term: both cosine factors take latitudes. Using the longitude
  # of the second point here yields wrong (and asymmetric) distances.
  step1 <- sin(dlat / 2)^2 + cos(phi1) * cos(phi2) * sin(dlong / 2)^2
  # Clamp to [0, 1] so rounding error can never make sqrt(1 - step1) NaN.
  step1 <- pmin(1, pmax(0, step1))
  step2 <- 2 * atan2(sqrt(step1), sqrt(1 - step1))
  # 6372.798 km is the Earth radius used here (40041.47 / (2 * pi)).
  6372.798 * step2
}
感谢您的宝贵时间。
首先,您的机场经度应该为负,而实际为正,这会影响结果。让我们修复它们,使结果更有意义:
# Negate the airport longitudes so they share the pilots' sign convention.
# Not idempotent: running this line twice restores the original sign.
airports$long <- -airports$long
现在,您可以使用 `apply` 对每个飞行员计算其到所有机场的距离。`geosphere` 包提供了几个计算直线距离的函数,包括 `distGeo` 和 `distHaversine`。
library(geosphere)
# pilots[, 3:2] picks the columns in (long, lat) order, and airports[, 3:2]
# does the same for the airports. apply(..., 1, ...) then passes each pilot's
# (long, lat) pair to the function as x.
pilots$closest_airport <- apply(pilots[, 3:2], 1, function(x){
# distances from this pilot to every airport; keep the code of the nearest
airports[which.min(distGeo(x, airports[, 3:2])), 'code']
})
# same distances again, this time keeping the smallest value itself
pilots$airport_distance <- apply(pilots[, 3:2], 1, function(x){
min(distGeo(x, airports[, 3:2])) / 1000 # /1000 to convert m to km
})
# print the augmented table
pilots
## names lat long closest_airport airport_distance
## 1 James 32.33513 -84.98907 STL 862.5394
## 2 Fiona 44.91322 -97.15133 Denver 855.8088
## 3 Seamus 28.84963 -96.91724 IAH 196.3559
或者,如果您想要所有距离而不只是最小距离,可以用 `cbind` 把 `apply` 产生的矩阵合并进来:
# Append one distance column (in km) per airport. apply() returns one column
# per pilot here, so t() turns the result into one row per pilot for cbind().
pilots <- cbind(pilots, t(apply(pilots[, 3:2], 1, function(x){
# name each distance after its airport so the new columns are labelled
setNames(distGeo(x, airports[, 3:2]) / 1000, airports$code)
})))
pilots
## names lat long closest_airport IAH DFW Denver STL
## 1 James 32.33513 -84.98907 STL 1021.6523 1131.2129 1965.6586 862.5394
## 2 Fiona 44.91322 -97.15133 Denver 1666.0359 1333.6842 855.8088 885.8480
## 3 Seamus 28.84963 -96.91724 IAH 196.3559 449.1838 1412.0664 1253.4874
改写成 `dplyr`(`plyr` 的继任者)的写法:
library(dplyr)
# rowwise() makes mutate() evaluate its expressions one pilot at a time, so
# c(long, lat) is a single point rather than two whole columns.
pilots %>% rowwise() %>%
mutate(closest_airport = airports[which.min(distGeo(c(long, lat), airports[, 3:2])), 'code'],
# smallest distance to any airport, divided by 1000 to go from m to km
airport_distance = min(distGeo(c(long, lat), airports[, 3:2])) / 1000)
## Source: local data frame [3 x 5]
## Groups: <by row>
##
## # A tibble: 3 × 5
## names lat long closest_airport airport_distance
## <fctr> <dbl> <dbl> <fctr> <dbl>
## 1 James 32.33513 -84.98907 STL 862.5394
## 2 Fiona 44.91322 -97.15133 Denver 855.8088
## 3 Seamus 28.84963 -96.91724 IAH 196.3559
或者,要得到所有距离,可以将 `bind_cols` 与上述方法结合使用,或者用 `unnest` 展开列表列后再重塑:
library(tidyverse)
pilots %>% rowwise() %>%
mutate(closest_airport = airports[which.min(distGeo(c(long, lat), airports[, 3:2])), 'code'],
# list column: one small table per pilot with a row for every airport
# NOTE(review): data_frame() is deprecated in current tibble releases in
# favour of tibble() — confirm against the installed version
data = list(data_frame(airport = airports$code,
distance = distGeo(c(long, lat), airports[, 3:2]) / 1000))) %>%
# expand the list column into long form, one row per pilot/airport pair
unnest() %>%
# reshape to wide form, one distance column per airport
# NOTE(review): spread() is superseded by pivot_wider() in current tidyr —
# confirm against the installed version
spread(airport, distance)
## # A tibble: 3 × 8
## names lat long closest_airport Denver DFW IAH STL
## * <fctr> <dbl> <dbl> <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 Fiona 44.91322 -97.15133 Denver 855.8088 1333.6842 1666.0359 885.8480
## 2 James 32.33513 -84.98907 STL 1965.6586 1131.2129 1021.6523 862.5394
## 3 Seamus 28.84963 -96.91724 IAH 1412.0664 449.1838 196.3559 1253.4874
或者更直接、但可读性稍差的写法:
pilots %>% rowwise() %>%
mutate(closest_airport = airports[which.min(distGeo(c(long, lat), airports[, 3:2])), 'code'],
# named distance vector -> 1-row matrix via t() -> 1-row table, wrapped in
# a list so each pilot carries its own table
# NOTE(review): as_data_frame() is deprecated in current tibble releases in
# favour of as_tibble() — confirm against the installed version
data = (distGeo(c(long, lat), airports[, 3:2]) / 1000) %>%
setNames(airports$code) %>% t() %>% as_data_frame() %>% list()) %>%
# unnesting the 1-row tables yields one distance column per airport
unnest()
## # A tibble: 3 × 8
## names lat long closest_airport IAH DFW Denver STL
## <fctr> <dbl> <dbl> <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 James 32.33513 -84.98907 STL 1021.6523 1131.2129 1965.6586 862.5394
## 2 Fiona 44.91322 -97.15133 Denver 1666.0359 1333.6842 855.8088 885.8480
## 3 Seamus 28.84963 -96.91724 IAH 196.3559 449.1838 1412.0664 1253.4874