xarray 等效于 pandas `qcut()` 函数
xarray equivalent of pandas `qcut()` function
我想计算 Decile Index - 请参阅 ex1-Calculate Decile Index (DI) with Python.ipynb
。
pandas
实现非常简单,但我需要帮助使用 groupby_bins()
功能将 bin 标签应用到新的 variable
/ coordinate
。
工作示例(测试数据集)
import pandas as pd
import numpy as np
import xarray as xr
time = pd.date_range('2010-01-01','2011-12-31',freq='M')
lat = np.linspace(-5.175003, -4.7250023, 10)
lon = np.linspace(33.524994, 33.97499, 10)
precip = np.random.normal(0, 1, size=(len(time), len(lat), len(lon)))
ds = xr.Dataset(
{'precip': (['time', 'lat', 'lon'], precip)},
coords={
'lon': lon,
'lat': lat,
'time': time,
}
)
这看起来像:
Out[]:
<xarray.Dataset>
Dimensions: (lat: 10, lon: 10, time: 24)
Coordinates:
* lon (lon) float64 33.52 33.57 33.62 33.67 ... 33.82 33.87 33.92 33.97
* lat (lat) float64 -5.175 -5.125 -5.075 -5.025 ... -4.825 -4.775 -4.725
* time (time) datetime64[ns] 2010-01-31 2010-02-28 ... 2011-12-31
Data variables:
precip (time, lat, lon) float64 0.1638 -1.031 0.2087 ... -0.1147 -0.6863
计算累积频率分布(归一化排名)
# calculate a cumsum over some window size
rolling_window = 3
ds_window = (
ds.rolling(time=rolling_window, center=True)
.sum()
.dropna(dim='time', how='all')
)
# construct a cumulative frequency distribution ranking the precip values
# per month
def rank_norm(ds, dim='time'):
return (ds.rank(dim=dim) - 1) / (ds.sizes[dim] - 1) * 100
result = ds_window.groupby('time.month').apply(rank_norm, args=('time',))
result = result.rename({variable:'rank_norm'}).drop('month')
Out[]:
<xarray.Dataset>
Dimensions: (lat: 10, lon: 10, time: 108)
Coordinates:
* lat (lat) float64 -5.175 -5.125 -5.075 ... -4.825 -4.775 -4.725
* lon (lon) float64 33.52 33.57 33.62 33.67 ... 33.82 33.87 33.92 33.97
* time (time) datetime64[ns] 2010-01-31 2010-02-28 ... 2018-12-31
Data variables:
rank_norm (time, lat, lon) float64 75.0 75.0 12.5 100.0 ... 87.5 0.0 25.0
Pandas 解决方案
我想创建一个变量,它将创建一个新的 variable
或 coordinate
在 ds
中,将具有与 bins = [20., 40., 60., 80., np.Inf]
.
中的 bin 对应的整数
在 Pandas 中使用 .qcut
功能相对简单。
test = result.to_dataframe()
bins = pd.qcut(test['rank_norm'], 5, labels=[1, 2, 3, 4, 5])
result = bins.to_xarray().to_dataset().rename({'rank_norm': 'rank_bins'})
Out[]:
<xarray.Dataset>
Dimensions: (lat: 10, lon: 10, time: 108)
Coordinates:
* lat (lat) float64 -5.175 -5.125 -5.075 -5.025 ... -4.825 -4.775 -4.725
* lon (lon) float64 33.52 33.57 33.62 33.67 ... 33.82 33.87 33.92 33.97
* time (time) datetime64[ns] 2010-01-31 2010-02-28 ... 2018-12-31
Data variables:
rank_bins (lat, lon, time) int64 4 4 1 4 3 4 5 1 1 2 ... 2 1 1 4 2 4 3 1 2 2
我的xarray
尝试
# assign bins to variable xarray
bins = [20., 40., 60., 80., np.Inf]
decile_index_gpby = rank_norm.groupby_bins('rank_norm', bins=bins)
out = decile_index_gpby.assign() # assign_coords()
我得到的错误信息如下:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-166-8d48b9fc1d56> in <module>
1 bins = [20., 40., 60., 80., np.Inf]
2 decile_index_gpby = rank_norm.groupby_bins('rank_norm', bins=bins)
----> 3 out = decile_index_gpby.assign() # assign_coords()
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in assign(self, **kwargs)
772 Dataset.assign
773 """
--> 774 return self.apply(lambda ds: ds.assign(**kwargs))
775
776
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in apply(self, func, args, **kwargs)
684 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
685 applied = (func(ds, *args, **kwargs) for ds in self._iter_grouped())
--> 686 return self._combine(applied)
687
688 def _combine(self, applied):
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in _combine(self, applied)
691 coord, dim, positions = self._infer_concat_args(applied_example)
692 combined = concat(applied, dim)
--> 693 combined = _maybe_reorder(combined, dim, positions)
694 if coord is not None:
695 combined[coord.name] = coord
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in _maybe_reorder(xarray_obj, dim, positions)
468
469 def _maybe_reorder(xarray_obj, dim, positions):
--> 470 order = _inverse_permutation_indices(positions)
471
472 if order is None:
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in _inverse_permutation_indices(positions)
110 positions = [np.arange(sl.start, sl.stop, sl.step) for sl in positions]
111
--> 112 indices = nputils.inverse_permutation(np.concatenate(positions))
113 return indices
114
~/miniconda3/lib/python3.7/site-packages/xarray/core/nputils.py in inverse_permutation(indices)
58 # use intp instead of int64 because of windows :(
59 inverse_permutation = np.empty(len(indices), dtype=np.intp)
---> 60 inverse_permutation[indices] = np.arange(len(indices), dtype=np.intp)
61 return inverse_permutation
62
IndexError: index 1304 is out of bounds for axis 0 with size 1000
看起来如果您使用 scalar
来定义您的 bins
那么它只会生成 4 个范围。您可以通过查看结果 GroupBy 对象的 length
和 groups
的 keys
的名称来检查这一点:
mybins = [20., 40., 60., 80., np.inf]
decile_index_gpby = rank_norm.groupby_bins('rank_norm', bins=mybins)
len(decile_index_gpby.groups)
=> 4
decile_index_gpby.groups.keys()
=> [Interval(80.0, inf, closed='right'),
Interval(20.0, 40.0, closed='right'),
Interval(60.0, 80.0, closed='right'),
Interval(40.0, 60.0, closed='right')]
为了防止丢失 1/5 的值,您必须将 mybins
的定义更改为:
mybins = [np.NINF, 20., 40., 60., np.inf]
这不是你想要的。
所以使用 bins=5
代替:
decile_index_gpby = rank_norm.groupby_bins('rank_norm', bins=5)
len(decile_index_gpby.groups)
=> 5
decile_index_gpby.groups.keys()
=> [Interval(80.0, 100.0, closed='right'),
Interval(20.0, 40.0, closed='right'),
Interval(60.0, 80.0, closed='right'),
Interval(40.0, 60.0, closed='right'),
Interval(-0.1, 20.0, closed='right')]
我不确定 pandas.qcut
是否完全符合您的期望;例如在您的示例中查看 returns 的垃圾箱:
>>> test = result.to_dataframe()
>>> binned, bins = pd.qcut(test['rank_norm'], 5, labels=[1, 2, 3, 4, 5], retbins=True)
>>> bins
array([ 0. , 12.5, 37.5, 62.5, 87.5, 100. ])
如果我没理解错的话,您希望根据每个点所在的 bin 为每个点分配一个整数值。即:
0.0 <= x < 20.0
: 1
20.0 <= x < 40.0
: 2
40.0 <= x < 60.0
: 3
60.0 <= x < 80.0
: 4
80.0 <= x
: 5
对于这个任务,我可能会推荐使用 numpy.digitize
applied via xarray.apply_ufunc
:
>>> bins = [0., 20., 40., 60., 80., np.inf]
>>> result = xr.apply_ufunc(np.digitize, result, kwargs={'bins': bins})
我想计算 Decile Index - 请参阅 ex1-Calculate Decile Index (DI) with Python.ipynb
。
pandas
实现非常简单,但我需要帮助使用 groupby_bins()
功能将 bin 标签应用到新的 variable
/ coordinate
。
工作示例(测试数据集)
import pandas as pd
import numpy as np
import xarray as xr
time = pd.date_range('2010-01-01','2011-12-31',freq='M')
lat = np.linspace(-5.175003, -4.7250023, 10)
lon = np.linspace(33.524994, 33.97499, 10)
precip = np.random.normal(0, 1, size=(len(time), len(lat), len(lon)))
ds = xr.Dataset(
{'precip': (['time', 'lat', 'lon'], precip)},
coords={
'lon': lon,
'lat': lat,
'time': time,
}
)
这看起来像:
Out[]:
<xarray.Dataset>
Dimensions: (lat: 10, lon: 10, time: 24)
Coordinates:
* lon (lon) float64 33.52 33.57 33.62 33.67 ... 33.82 33.87 33.92 33.97
* lat (lat) float64 -5.175 -5.125 -5.075 -5.025 ... -4.825 -4.775 -4.725
* time (time) datetime64[ns] 2010-01-31 2010-02-28 ... 2011-12-31
Data variables:
precip (time, lat, lon) float64 0.1638 -1.031 0.2087 ... -0.1147 -0.6863
计算累积频率分布(归一化排名)
# calculate a cumsum over some window size
rolling_window = 3
ds_window = (
ds.rolling(time=rolling_window, center=True)
.sum()
.dropna(dim='time', how='all')
)
# construct a cumulative frequency distribution ranking the precip values
# per month
def rank_norm(ds, dim='time'):
return (ds.rank(dim=dim) - 1) / (ds.sizes[dim] - 1) * 100
result = ds_window.groupby('time.month').apply(rank_norm, args=('time',))
result = result.rename({variable:'rank_norm'}).drop('month')
Out[]:
<xarray.Dataset>
Dimensions: (lat: 10, lon: 10, time: 108)
Coordinates:
* lat (lat) float64 -5.175 -5.125 -5.075 ... -4.825 -4.775 -4.725
* lon (lon) float64 33.52 33.57 33.62 33.67 ... 33.82 33.87 33.92 33.97
* time (time) datetime64[ns] 2010-01-31 2010-02-28 ... 2018-12-31
Data variables:
rank_norm (time, lat, lon) float64 75.0 75.0 12.5 100.0 ... 87.5 0.0 25.0
Pandas 解决方案
我想创建一个变量,它将创建一个新的 variable
或 coordinate
在 ds
中,将具有与 bins = [20., 40., 60., 80., np.Inf]
.
在 Pandas 中使用 .qcut
功能相对简单。
test = result.to_dataframe()
bins = pd.qcut(test['rank_norm'], 5, labels=[1, 2, 3, 4, 5])
result = bins.to_xarray().to_dataset().rename({'rank_norm': 'rank_bins'})
Out[]:
<xarray.Dataset>
Dimensions: (lat: 10, lon: 10, time: 108)
Coordinates:
* lat (lat) float64 -5.175 -5.125 -5.075 -5.025 ... -4.825 -4.775 -4.725
* lon (lon) float64 33.52 33.57 33.62 33.67 ... 33.82 33.87 33.92 33.97
* time (time) datetime64[ns] 2010-01-31 2010-02-28 ... 2018-12-31
Data variables:
rank_bins (lat, lon, time) int64 4 4 1 4 3 4 5 1 1 2 ... 2 1 1 4 2 4 3 1 2 2
我的xarray
尝试
# assign bins to variable xarray
bins = [20., 40., 60., 80., np.Inf]
decile_index_gpby = rank_norm.groupby_bins('rank_norm', bins=bins)
out = decile_index_gpby.assign() # assign_coords()
我得到的错误信息如下:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-166-8d48b9fc1d56> in <module>
1 bins = [20., 40., 60., 80., np.Inf]
2 decile_index_gpby = rank_norm.groupby_bins('rank_norm', bins=bins)
----> 3 out = decile_index_gpby.assign() # assign_coords()
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in assign(self, **kwargs)
772 Dataset.assign
773 """
--> 774 return self.apply(lambda ds: ds.assign(**kwargs))
775
776
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in apply(self, func, args, **kwargs)
684 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
685 applied = (func(ds, *args, **kwargs) for ds in self._iter_grouped())
--> 686 return self._combine(applied)
687
688 def _combine(self, applied):
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in _combine(self, applied)
691 coord, dim, positions = self._infer_concat_args(applied_example)
692 combined = concat(applied, dim)
--> 693 combined = _maybe_reorder(combined, dim, positions)
694 if coord is not None:
695 combined[coord.name] = coord
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in _maybe_reorder(xarray_obj, dim, positions)
468
469 def _maybe_reorder(xarray_obj, dim, positions):
--> 470 order = _inverse_permutation_indices(positions)
471
472 if order is None:
~/miniconda3/lib/python3.7/site-packages/xarray/core/groupby.py in _inverse_permutation_indices(positions)
110 positions = [np.arange(sl.start, sl.stop, sl.step) for sl in positions]
111
--> 112 indices = nputils.inverse_permutation(np.concatenate(positions))
113 return indices
114
~/miniconda3/lib/python3.7/site-packages/xarray/core/nputils.py in inverse_permutation(indices)
58 # use intp instead of int64 because of windows :(
59 inverse_permutation = np.empty(len(indices), dtype=np.intp)
---> 60 inverse_permutation[indices] = np.arange(len(indices), dtype=np.intp)
61 return inverse_permutation
62
IndexError: index 1304 is out of bounds for axis 0 with size 1000
看起来如果您使用 scalar
来定义您的 bins
那么它只会生成 4 个范围。您可以通过查看结果 GroupBy 对象的 length
和 groups
的 keys
的名称来检查这一点:
mybins = [20., 40., 60., 80., np.inf]
decile_index_gpby = rank_norm.groupby_bins('rank_norm', bins=mybins)
len(decile_index_gpby.groups)
=> 4
decile_index_gpby.groups.keys()
=> [Interval(80.0, inf, closed='right'),
Interval(20.0, 40.0, closed='right'),
Interval(60.0, 80.0, closed='right'),
Interval(40.0, 60.0, closed='right')]
为了防止丢失 1/5 的值,您必须将 mybins
的定义更改为:
mybins = [np.NINF, 20., 40., 60., np.inf]
这不是你想要的。
所以使用 bins=5
代替:
decile_index_gpby = rank_norm.groupby_bins('rank_norm', bins=5)
len(decile_index_gpby.groups)
=> 5
decile_index_gpby.groups.keys()
=> [Interval(80.0, 100.0, closed='right'),
Interval(20.0, 40.0, closed='right'),
Interval(60.0, 80.0, closed='right'),
Interval(40.0, 60.0, closed='right'),
Interval(-0.1, 20.0, closed='right')]
我不确定 pandas.qcut
是否完全符合您的期望;例如在您的示例中查看 returns 的垃圾箱:
>>> test = result.to_dataframe()
>>> binned, bins = pd.qcut(test['rank_norm'], 5, labels=[1, 2, 3, 4, 5], retbins=True)
>>> bins
array([ 0. , 12.5, 37.5, 62.5, 87.5, 100. ])
如果我没理解错的话,您希望根据每个点所在的 bin 为每个点分配一个整数值。即:
0.0 <= x < 20.0
: 120.0 <= x < 40.0
: 240.0 <= x < 60.0
: 360.0 <= x < 80.0
: 480.0 <= x
: 5
对于这个任务,我可能会推荐使用 numpy.digitize
applied via xarray.apply_ufunc
:
>>> bins = [0., 20., 40., 60., 80., np.inf]
>>> result = xr.apply_ufunc(np.digitize, result, kwargs={'bins': bins})