使用 geopandas 查找地理边界 N 米范围内的所有点
Find all points within N meters of a geographic border using geopandas
我需要找到 U.S 的 N 米范围内的所有点(名称、纬度、经度)。边界。我的做法是:
- 创建一个地理数据框,其中包含一个多边形,其中每个多边形都是一个缓冲边界。
# Get the U.S. borders
border = gpd.read_file('cb_2018_us_nation_5m')
border = border.to_crs("EPSG:32636")
# Get just the boundary
just_border = border.boundary
# Put a 10km buffer around the line. We now have multiple polygons (AK, HI, CONUS), each that look like a donut.
border_buffered = just_border.buffer(10000, cap_style=3, join_style=2)
然后将我的数据框转换为地理数据框。
gdf = gpd.GeoDataFrame(temp, geometry=gpd.points_from_xy(temp.Longitude, temp.Latitude))
gdf = gdf.set_crs("EPSG:32636")
然后剪裁多边形内的点。
ngdf = gpd.clip(gdf, border_buffered, keep_geom_type=False)
找不到点:
ngdf.shape
(0, 15)
所以我尝试使用面具:
gdf.reset_index(inplace=True)
gdf.shape
(25746326, 17)
pip_mask = gdf.within(border_buffered.loc[0, 'geometry'])
Traceback (most recent call last):
File "/Users/kovar/work/a50_utils/foo.py", line 45, in <module>
pip_mask = gdf.within(border_buffered.loc[0, 'geometry'])
File "/Users/kovar/miniforge3/envs/a50-dev/lib/python3.9/site-packages/pandas/core/indexing.py", line 889, in __getitem__
return self._getitem_tuple(key)
File "/Users/kovar/miniforge3/envs/a50-dev/lib/python3.9/site-packages/pandas/core/indexing.py", line 1063, in _getitem_tuple
self._has_valid_tuple(tup)
File "/Users/kovar/miniforge3/envs/a50-dev/lib/python3.9/site-packages/pandas/core/indexing.py", line 720, in _has_valid_tuple
self._validate_key_length(key)
File "/Users/kovar/miniforge3/envs/a50-dev/lib/python3.9/site-packages/pandas/core/indexing.py", line 761, in _validate_key_length
raise IndexingError("Too many indexers")
pandas.core.indexing.IndexingError: Too many indexers
我猜我的多边形太多了,或者缓冲线太复杂了。
- 你的问题明明是N米。这需要使用 UTM CRS 几何。美国边界穿过多个UTM CRS区域
- 首先将边界的一部分与 UTM 区域(线段)相关联
- 对每个线段使用适当的 UTM CRS 并添加缓冲区。此示例创建一个 50KM 缓冲区。存在一些问题,因此如果有必要,可以减少缓冲区的大小
- 现在拥有包含缓冲区的多边形地理数据框,以及显示实际使用的线串和实际缓冲区的列
import geopandas as gpd
import shapely.geometry
import numpy as np
import plotly.express as px
import requests, io
from pathlib import Path
from zipfile import ZipFile
import urllib
import pandas as pd
# fmt: off
# US geometry
url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_nation_5m.zip"
f = Path.cwd().joinpath(urllib.parse.urlparse(url).path.split("/")[-1])
if not f.exists():
r = requests.get(url, stream=True, headers={"User-Agent": "XY"})
with open(f, "wb") as fd:
for chunk in r.iter_content(chunk_size=128):
fd.write(chunk)
zfile = ZipFile(f)
zfile.extractall(f.stem)
gdf2 = gpd.read_file(list(f.parent.joinpath(f.stem).glob("*.shp"))[0])
gdf2["geometry"] = gdf2["geometry"].apply(lambda g: g.boundary)
# get UTM zones
utm = gpd.GeoDataFrame.from_features(requests.get("https://opendata.arcgis.com/datasets/b294795270aa4fb3bd25286bf09edc51_0.geojson").json()).set_crs(gdf2.crs)
utm = utm.loc[~utm["ZONE"].eq(0)]
# fmt: on
# join US boundary to UTM zomes using spatial join, i.e. segments of boundary that pass through multiple zone
us_utm = gpd.sjoin(utm, gdf2)
us_utm = us_utm.merge(
gdf2["geometry"], left_on="index_right", right_index=True, suffixes=("", "_right")
)
# create linestrings for each zone / boundary segment
us_utm_sects = us_utm["geometry"].intersection(gpd.GeoSeries(us_utm["geometry_right"]))
# function to create a buffer on line seqment
def buffer(g, buffer=1):
gdf_sect = gpd.GeoDataFrame(geometry=[g], crs="EPSG:4326")
utm_crs = gdf_sect.estimate_utm_crs()
while True:
g2 = gdf_sect.to_crs(utm_crs).buffer(buffer, cap_style=2).to_crs("EPSG:4326")
tb = g2.total_bounds[[0,2]]
bounds = abs(tb[0] - tb[1])
# make sure epsg:4326 geometry makes sense, if not reduce size of buffer on this segment
if bounds < 60:
break
buffer = buffer / 2
return {"geometry": g2.values[0], "utm_crs": str(utm_crs), "line_geometry":g, "buffer":buffer}
# create geo data frame of us border with 50km buffer
us_buf = gpd.GeoDataFrame(us_utm_sects.apply(buffer, buffer=5 * 10 ** 4).apply(pd.Series))
us_buf["color"] = pd.factorize(us_buf["utm_crs"])[0]
使用它
- 已经用世界范围的地震来证明
- 在点和缓冲的美国边界之间进行空间连接的简单案例
- 已将鼠标悬停在墨西哥靠近美国边境的地震上以证明它有效
gdf_eq = gpd.GeoDataFrame.from_features(
requests.get(
"https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"
).json()
)
# earthquakes near US border...
us_eq = gpd.sjoin(gdf_eq, us_buf)
px.scatter_mapbox(us_eq, lat=us_eq.geometry.y, lon=us_eq.geometry.x, hover_data=["title"]).update_layout(
mapbox={
"style": "carto-positron",
"zoom":1,
"center":{"lat":sum(us_eq.total_bounds[[0,2]])/2,"lon":sum(us_eq.total_bounds[[1,3]])/2},
"layers": [
{
"source": us_buf.geometry.__geo_interface__,
"type": "fill",
"color": "yellow",
"opacity": 0.2,
}
],
},
margin={"l":0,"t":0,"b":0,"r":0}
)
只过滤墨西哥边境
- 简单地空间连接到墨西哥多边形
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")).set_crs("EPSG:4326")
us_buf = gpd.sjoin(us_buf, world.loc[world["iso_a3"].eq("MEX")]).drop(columns=["index_right"])
将 Michael 的评论提升为答案。
“看起来 border_buffered 是 GeoSeries,而不是 GeoDataFrame,因为 GeoDataFrame.boundary 只是 returns GeoSeries,所以只需要 border_buffered.loc[0]”。 =11=]
这解决了问题。
我需要找到 U.S 的 N 米范围内的所有点(名称、纬度、经度)。边界。我的做法是:
- 创建一个地理数据框,其中包含一个多边形,其中每个多边形都是一个缓冲边界。
# Get the U.S. borders
border = gpd.read_file('cb_2018_us_nation_5m')
border = border.to_crs("EPSG:32636")
# Get just the boundary
just_border = border.boundary
# Put a 10km buffer around the line. We now have multiple polygons (AK, HI, CONUS), each that look like a donut.
border_buffered = just_border.buffer(10000, cap_style=3, join_style=2)
然后将我的数据框转换为地理数据框。
gdf = gpd.GeoDataFrame(temp, geometry=gpd.points_from_xy(temp.Longitude, temp.Latitude))
gdf = gdf.set_crs("EPSG:32636")
然后剪裁多边形内的点。
ngdf = gpd.clip(gdf, border_buffered, keep_geom_type=False)
找不到点:
ngdf.shape
(0, 15)
所以我尝试使用面具:
gdf.reset_index(inplace=True)
gdf.shape
(25746326, 17)
pip_mask = gdf.within(border_buffered.loc[0, 'geometry'])
Traceback (most recent call last):
File "/Users/kovar/work/a50_utils/foo.py", line 45, in <module>
pip_mask = gdf.within(border_buffered.loc[0, 'geometry'])
File "/Users/kovar/miniforge3/envs/a50-dev/lib/python3.9/site-packages/pandas/core/indexing.py", line 889, in __getitem__
return self._getitem_tuple(key)
File "/Users/kovar/miniforge3/envs/a50-dev/lib/python3.9/site-packages/pandas/core/indexing.py", line 1063, in _getitem_tuple
self._has_valid_tuple(tup)
File "/Users/kovar/miniforge3/envs/a50-dev/lib/python3.9/site-packages/pandas/core/indexing.py", line 720, in _has_valid_tuple
self._validate_key_length(key)
File "/Users/kovar/miniforge3/envs/a50-dev/lib/python3.9/site-packages/pandas/core/indexing.py", line 761, in _validate_key_length
raise IndexingError("Too many indexers")
pandas.core.indexing.IndexingError: Too many indexers
我猜我的多边形太多了,或者缓冲线太复杂了。
- 你的问题明明是N米。这需要使用 UTM CRS 几何。美国边界穿过多个UTM CRS区域
- 首先将边界的一部分与 UTM 区域(线段)相关联
- 对每个线段使用适当的 UTM CRS 并添加缓冲区。此示例创建一个 50KM 缓冲区。存在一些问题,因此如果有必要,可以减少缓冲区的大小
- 现在拥有包含缓冲区的多边形地理数据框,以及显示实际使用的线串和实际缓冲区的列
import geopandas as gpd
import shapely.geometry
import numpy as np
import plotly.express as px
import requests, io
from pathlib import Path
from zipfile import ZipFile
import urllib
import pandas as pd
# fmt: off
# US geometry
url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_nation_5m.zip"
f = Path.cwd().joinpath(urllib.parse.urlparse(url).path.split("/")[-1])
if not f.exists():
r = requests.get(url, stream=True, headers={"User-Agent": "XY"})
with open(f, "wb") as fd:
for chunk in r.iter_content(chunk_size=128):
fd.write(chunk)
zfile = ZipFile(f)
zfile.extractall(f.stem)
gdf2 = gpd.read_file(list(f.parent.joinpath(f.stem).glob("*.shp"))[0])
gdf2["geometry"] = gdf2["geometry"].apply(lambda g: g.boundary)
# get UTM zones
utm = gpd.GeoDataFrame.from_features(requests.get("https://opendata.arcgis.com/datasets/b294795270aa4fb3bd25286bf09edc51_0.geojson").json()).set_crs(gdf2.crs)
utm = utm.loc[~utm["ZONE"].eq(0)]
# fmt: on
# join US boundary to UTM zomes using spatial join, i.e. segments of boundary that pass through multiple zone
us_utm = gpd.sjoin(utm, gdf2)
us_utm = us_utm.merge(
gdf2["geometry"], left_on="index_right", right_index=True, suffixes=("", "_right")
)
# create linestrings for each zone / boundary segment
us_utm_sects = us_utm["geometry"].intersection(gpd.GeoSeries(us_utm["geometry_right"]))
# function to create a buffer on line seqment
def buffer(g, buffer=1):
gdf_sect = gpd.GeoDataFrame(geometry=[g], crs="EPSG:4326")
utm_crs = gdf_sect.estimate_utm_crs()
while True:
g2 = gdf_sect.to_crs(utm_crs).buffer(buffer, cap_style=2).to_crs("EPSG:4326")
tb = g2.total_bounds[[0,2]]
bounds = abs(tb[0] - tb[1])
# make sure epsg:4326 geometry makes sense, if not reduce size of buffer on this segment
if bounds < 60:
break
buffer = buffer / 2
return {"geometry": g2.values[0], "utm_crs": str(utm_crs), "line_geometry":g, "buffer":buffer}
# create geo data frame of us border with 50km buffer
us_buf = gpd.GeoDataFrame(us_utm_sects.apply(buffer, buffer=5 * 10 ** 4).apply(pd.Series))
us_buf["color"] = pd.factorize(us_buf["utm_crs"])[0]
使用它
- 已经用世界范围的地震来证明
- 在点和缓冲的美国边界之间进行空间连接的简单案例
- 已将鼠标悬停在墨西哥靠近美国边境的地震上以证明它有效
gdf_eq = gpd.GeoDataFrame.from_features(
requests.get(
"https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"
).json()
)
# earthquakes near US border...
us_eq = gpd.sjoin(gdf_eq, us_buf)
px.scatter_mapbox(us_eq, lat=us_eq.geometry.y, lon=us_eq.geometry.x, hover_data=["title"]).update_layout(
mapbox={
"style": "carto-positron",
"zoom":1,
"center":{"lat":sum(us_eq.total_bounds[[0,2]])/2,"lon":sum(us_eq.total_bounds[[1,3]])/2},
"layers": [
{
"source": us_buf.geometry.__geo_interface__,
"type": "fill",
"color": "yellow",
"opacity": 0.2,
}
],
},
margin={"l":0,"t":0,"b":0,"r":0}
)
只过滤墨西哥边境
- 简单地空间连接到墨西哥多边形
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")).set_crs("EPSG:4326")
us_buf = gpd.sjoin(us_buf, world.loc[world["iso_a3"].eq("MEX")]).drop(columns=["index_right"])
将 Michael 的评论提升为答案。
“看起来 border_buffered 是 GeoSeries,而不是 GeoDataFrame,因为 GeoDataFrame.boundary 只是 returns GeoSeries,所以只需要 border_buffered.loc[0]”。 =11=]
这解决了问题。