如何将 Pandas Series 中的连续 NaN 值分组为一组切片?
How to group consecutive NaN values from a Pandas Series in a set of slices?
我想将连续的 NaN 值合并到切片中。有没有一种使用 numpy 或 pandas 的简单方法?
# Sample data: (index label, value) pairs; np.nan marks a missing value.
# The labels are not all consecutive (e.g. 1000 -> 1100, 1300 -> 1302),
# so two NaN entries can be adjacent in the list without being adjacent
# by label.
l = [
(996, np.nan), (997, np.nan), (998, np.nan),
(999, -47.3), (1000, -72.5), (1100, -97.7),
(1200, np.nan), (1201, np.nan), (1205, -97.8),
(1300, np.nan), (1302, np.nan), (1305, -97.9),
(1400, np.nan), (1405, -97.10), (1408, np.nan)
]
# Build a Series whose index is the first element of each pair.
l = pd.Series(dict(l))
预期结果:
[
(slice(996, 999, None), array([nan, nan, nan])),
(999, -47.3),
(1000, -72.5),
(1100, -97.7),
(slice(1200, 1202, None), array([nan, nan])),
(1205, -97.8),
(slice(1300, 1301, None), array([nan])),
(slice(1302, 1303, None), array([nan])),
(1305, -97.9),
(slice(1400, 1401, None), array([nan])),
(1405, -97.1),
(slice(1408, 1409, None), array([nan]))
]
二维的 numpy 数组也可以,而不是元组列表
更新 2019/05/31:我刚刚意识到,如果我只使用字典而不是 Pandas Series,算法的效率会更高
你的需求充满了特殊情况:NaN 之间要视为相等,每对的第一个元素可以是切片或单个值,第二个元素可以是 np.array 或单个值。
对于如此复杂的需求,我只会依赖一种简单的 Python 非矢量化方式:
def trans(ser):
    """Collapse runs of equal values at consecutive index labels.

    The series is walked once in index order. A run is a stretch of
    entries whose labels each equal the previous label + 1 and whose
    values are equal, with NaN treated as equal to NaN.

    Args:
        ser: pandas Series. Its index labels must support ``+ 1`` and its
            values must be accepted by ``np.isnan`` (integer labels and
            float values in the accompanying example).

    Returns:
        A list with one 2-tuple per run:

        * ``(label, value)`` for a run of one non-NaN entry;
        * ``(slice(first_label, last_label + 1), array)`` for every other
          run, where ``array`` repeats the run's value once per entry.

        An empty series yields an empty list.
    """
    def build(start, stop, val):
        # A single non-NaN entry stays a plain (label, value) pair.
        if stop == start + 1 and not np.isnan(val):
            return (start, val)
        # NaN runs of any length and multi-entry runs become a half-open
        # slice plus the value repeated once per covered label.
        return (slice(start, stop), np.full(stop - start, val))

    # Nothing to group; avoids indexing position 0 of an empty series.
    if ser.empty:
        return []

    run_val = ser.iloc[0]
    prev_label = run_start = ser.index[0]
    result = []
    for label in ser.index[1:]:
        val = ser.loc[label]
        both_nan = np.isnan(val) and np.isnan(run_val)
        value_changed = (val != run_val) and not both_nan
        # A run ends when the value changes or the labels skip ahead.
        if value_changed or label != prev_label + 1:
            result.append(build(run_start, prev_label + 1, run_val))
            run_start = label
            run_val = val
        prev_label = label
    # Flush the run that is still open after the last entry.
    result.append(build(run_start, prev_label + 1, run_val))
    return result
它给出了一些接近预期的结果:
[(slice(996, 999, None), array([nan, nan, nan])),
(999, -47.3),
(1000, -72.5),
(1100, -97.7),
(slice(1200, 1202, None), array([nan, nan])),
(1205, -97.8),
(slice(1300, 1301, None), array([nan])),
(slice(1302, 1303, None), array([nan])),
(1305, -97.9),
(slice(1400, 1401, None), array([nan])),
(1405, -97.1),
(slice(1408, 1409, None), array([nan]))]
按 notnull 的 cumsum 分组是个好主意,但我们需要把每个子序列中的第一个非空值过滤掉,因此可以按 (cumsum, notnull) 这一对键进行 groupby:
# Convert the series to a one-column frame named 'val'
# (the author notes that grouping the bare series did not work for them).
df = l.to_frame(name='val')
# Flag the rows that hold a real (non-NaN) value.
df['notnull'] = df['val'].notnull()
# The running count of non-null rows only grows on a non-null row, so a
# run of NaNs shares the counter value of the non-null row before it.
# The second key, 'notnull', then separates that row from the NaN run.
g = df.groupby([ df['notnull'].cumsum(), 'notnull']).val
# One (index, values) pair per group.
[(v.index, v.values) for i, v in g]
输出:
[(Int64Index([996, 997, 998], dtype='int64'), array([nan, nan, nan])),
(Int64Index([1200, 1201], dtype='int64'), array([nan, nan])),
(Int64Index([1300, 1302, 1400, 1402], dtype='int64'),
array([nan, nan, nan, nan])),
(Int64Index([999], dtype='int64'), array([-47.3])),
(Int64Index([1000], dtype='int64'), array([-72.5])),
(Int64Index([1100], dtype='int64'), array([-97.7])),
(Int64Index([1202], dtype='int64'), array([-97.1]))]
编辑:考虑连续索引并更新切片:
# convert group to slices
def get_slice(x):
    """Convert one group of values into a ``(key, values)`` pair.

    Args:
        x: pandas Series holding the rows of one group.

    Returns:
        * ``(slice(min_label, max_label + 1), x.values)`` when the group
          has more than one row;
        * ``(slice(label, label + 1), x.values)`` when the group is a
          single NaN;
        * ``(label, value)`` when the group is a single non-null value.
    """
    idx_min, idx_max = x.index.min(), x.index.max()
    if len(x) > 1:
        # Half-open slice covering every label of the group.
        return (slice(idx_min, idx_max + 1), x.values)
    if x.isna().any():
        # A lone NaN is still reported as a one-element slice.
        return (slice(idx_min, idx_min + 1), x.values)
    # A lone real value is reported as a plain (label, value) pair;
    # .loc makes the label-based lookup explicit.
    return (idx_min, x.loc[idx_min])
# Flag the rows that hold a real (non-NaN) value.
df['notnull'] = df['val'].notnull()
# 'sep' is a block id that increases whenever a label is not the
# previous label + 1, i.e. at every break in index continuity.
df['sep'] = (df.index != df.index.to_series().shift() + 1).cumsum()
# Group by continuity block first, then by the (cumsum, notnull) pair,
# so a NaN run never spans a gap in the index.
g = df.groupby(['sep', df['notnull'].cumsum(), 'notnull']).val
# Turn each group into a (slice-or-label, values) pair and collect the
# pairs in a plain list.
g.apply(get_slice).values.tolist()
给出:
[(slice(996, 999, None), array([nan, nan, nan])),
(999, -47.3),
(1000, -72.5),
(1100, -97.7),
(slice(1200, 1202, None), array([nan, nan])),
(1205, -97.8),
(slice(1300, 1301, None), array([nan])),
(slice(1302, 1303, None), array([nan])),
(1305, -97.9),
(slice(1400, 1401, None), array([nan])),
(1405, -97.1),
(slice(1408, 1409, None), array([nan]))]
我想将连续的 NaN 值合并到切片中。有没有一种使用 numpy 或 pandas 的简单方法?
# Sample data: (index label, value) pairs; np.nan marks a missing value.
# The labels are not all consecutive (e.g. 1000 -> 1100, 1300 -> 1302),
# so two NaN entries can be adjacent in the list without being adjacent
# by label.
l = [
(996, np.nan), (997, np.nan), (998, np.nan),
(999, -47.3), (1000, -72.5), (1100, -97.7),
(1200, np.nan), (1201, np.nan), (1205, -97.8),
(1300, np.nan), (1302, np.nan), (1305, -97.9),
(1400, np.nan), (1405, -97.10), (1408, np.nan)
]
# Build a Series whose index is the first element of each pair.
l = pd.Series(dict(l))
预期结果:
[
(slice(996, 999, None), array([nan, nan, nan])),
(999, -47.3),
(1000, -72.5),
(1100, -97.7),
(slice(1200, 1202, None), array([nan, nan])),
(1205, -97.8),
(slice(1300, 1301, None), array([nan])),
(slice(1302, 1303, None), array([nan])),
(1305, -97.9),
(slice(1400, 1401, None), array([nan])),
(1405, -97.1),
(slice(1408, 1409, None), array([nan]))
]
二维的 numpy 数组也可以,而不是元组列表
更新 2019/05/31:我刚刚意识到,如果我只使用字典而不是 Pandas Series,算法的效率会更高
你的需求充满了特殊情况:NaN 之间要视为相等,每对的第一个元素可以是切片或单个值,第二个元素可以是 np.array 或单个值。
对于如此复杂的需求,我只会依赖一种简单的 Python 非矢量化方式:
def trans(ser):
    """Collapse runs of equal values at consecutive index labels.

    The series is walked once in index order. A run is a stretch of
    entries whose labels each equal the previous label + 1 and whose
    values are equal, with NaN treated as equal to NaN.

    Args:
        ser: pandas Series. Its index labels must support ``+ 1`` and its
            values must be accepted by ``np.isnan`` (integer labels and
            float values in the accompanying example).

    Returns:
        A list with one 2-tuple per run:

        * ``(label, value)`` for a run of one non-NaN entry;
        * ``(slice(first_label, last_label + 1), array)`` for every other
          run, where ``array`` repeats the run's value once per entry.

        An empty series yields an empty list.
    """
    def build(start, stop, val):
        # A single non-NaN entry stays a plain (label, value) pair.
        if stop == start + 1 and not np.isnan(val):
            return (start, val)
        # NaN runs of any length and multi-entry runs become a half-open
        # slice plus the value repeated once per covered label.
        return (slice(start, stop), np.full(stop - start, val))

    # Nothing to group; avoids indexing position 0 of an empty series.
    if ser.empty:
        return []

    run_val = ser.iloc[0]
    prev_label = run_start = ser.index[0]
    result = []
    for label in ser.index[1:]:
        val = ser.loc[label]
        both_nan = np.isnan(val) and np.isnan(run_val)
        value_changed = (val != run_val) and not both_nan
        # A run ends when the value changes or the labels skip ahead.
        if value_changed or label != prev_label + 1:
            result.append(build(run_start, prev_label + 1, run_val))
            run_start = label
            run_val = val
        prev_label = label
    # Flush the run that is still open after the last entry.
    result.append(build(run_start, prev_label + 1, run_val))
    return result
它给出了一些接近预期的结果:
[(slice(996, 999, None), array([nan, nan, nan])),
(999, -47.3),
(1000, -72.5),
(1100, -97.7),
(slice(1200, 1202, None), array([nan, nan])),
(1205, -97.8),
(slice(1300, 1301, None), array([nan])),
(slice(1302, 1303, None), array([nan])),
(1305, -97.9),
(slice(1400, 1401, None), array([nan])),
(1405, -97.1),
(slice(1408, 1409, None), array([nan]))]
按 notnull 的 cumsum 分组是个好主意,但我们需要把每个子序列中的第一个非空值过滤掉,因此可以按 (cumsum, notnull) 这一对键进行 groupby:
# Convert the series to a one-column frame named 'val'
# (the author notes that grouping the bare series did not work for them).
df = l.to_frame(name='val')
# Flag the rows that hold a real (non-NaN) value.
df['notnull'] = df['val'].notnull()
# The running count of non-null rows only grows on a non-null row, so a
# run of NaNs shares the counter value of the non-null row before it.
# The second key, 'notnull', then separates that row from the NaN run.
g = df.groupby([ df['notnull'].cumsum(), 'notnull']).val
# One (index, values) pair per group.
[(v.index, v.values) for i, v in g]
输出:
[(Int64Index([996, 997, 998], dtype='int64'), array([nan, nan, nan])),
(Int64Index([1200, 1201], dtype='int64'), array([nan, nan])),
(Int64Index([1300, 1302, 1400, 1402], dtype='int64'),
array([nan, nan, nan, nan])),
(Int64Index([999], dtype='int64'), array([-47.3])),
(Int64Index([1000], dtype='int64'), array([-72.5])),
(Int64Index([1100], dtype='int64'), array([-97.7])),
(Int64Index([1202], dtype='int64'), array([-97.1]))]
编辑:考虑连续索引并更新切片:
# convert group to slices
def get_slice(x):
    """Convert one group of values into a ``(key, values)`` pair.

    Args:
        x: pandas Series holding the rows of one group.

    Returns:
        * ``(slice(min_label, max_label + 1), x.values)`` when the group
          has more than one row;
        * ``(slice(label, label + 1), x.values)`` when the group is a
          single NaN;
        * ``(label, value)`` when the group is a single non-null value.
    """
    idx_min, idx_max = x.index.min(), x.index.max()
    if len(x) > 1:
        # Half-open slice covering every label of the group.
        return (slice(idx_min, idx_max + 1), x.values)
    if x.isna().any():
        # A lone NaN is still reported as a one-element slice.
        return (slice(idx_min, idx_min + 1), x.values)
    # A lone real value is reported as a plain (label, value) pair;
    # .loc makes the label-based lookup explicit.
    return (idx_min, x.loc[idx_min])
# Flag the rows that hold a real (non-NaN) value.
df['notnull'] = df['val'].notnull()
# 'sep' is a block id that increases whenever a label is not the
# previous label + 1, i.e. at every break in index continuity.
df['sep'] = (df.index != df.index.to_series().shift() + 1).cumsum()
# Group by continuity block first, then by the (cumsum, notnull) pair,
# so a NaN run never spans a gap in the index.
g = df.groupby(['sep', df['notnull'].cumsum(), 'notnull']).val
# Turn each group into a (slice-or-label, values) pair and collect the
# pairs in a plain list.
g.apply(get_slice).values.tolist()
给出:
[(slice(996, 999, None), array([nan, nan, nan])),
(999, -47.3),
(1000, -72.5),
(1100, -97.7),
(slice(1200, 1202, None), array([nan, nan])),
(1205, -97.8),
(slice(1300, 1301, None), array([nan])),
(slice(1302, 1303, None), array([nan])),
(1305, -97.9),
(slice(1400, 1401, None), array([nan])),
(1405, -97.1),
(slice(1408, 1409, None), array([nan]))]