移动 pandas MultiIndex(多级索引)的值,使非空标签的下方不再出现空白标签
shift pandas multiindex values such that they don't have blanks below non blanks
考虑以下 df
和 pd.MultiIndex
# Example frame: two statistic columns labelled only on the top level,
# followed by four columns that carry a label on all three levels.
col = pd.MultiIndex.from_tuples(
    [
        ('stat1', '', ''),
        ('stat2', '', ''),
        ('A', 'mean', 'blue'),
        ('A', 'mean', 'red'),
        ('B', 'var', 'blue'),
        ('B', 'var', 'red'),
    ]
)
df = pd.DataFrame(
    data=np.arange(24).reshape(4, 6),
    index=list('abcd'),
    columns=col,
)
df
我不希望 stat1 和 stat2 显示在最上层。我希望它们像下面这样位于最底层:
一个夸张的例子更能说明问题
# Exaggerated example: blanks appear at every possible position of the
# three-level column labels.
col = pd.MultiIndex.from_tuples(
    [
        ('a', '', ''),
        ('', 'b', ''),
        ('c', 'd', ''),
        ('e', '', 'f'),
        ('g', 'h', 'i'),
        ('', 'j', 'k'),
    ]
)
df = pd.DataFrame(
    data=np.arange(24).reshape(4, 6),
    index=list('abcd'),
    columns=col,
)
df
应该看起来像:
我试过:
# Naive attempt: sort the labels of every column tuple so that '' comes
# first.  np.sort orders each row lexicographically, so the non-blank
# labels get reordered as well, not just the blanks.
c = np.array(col.values.tolist())
c_ = pd.MultiIndex.from_tuples(np.sort(c, axis=-1).tolist())
pd.DataFrame(data=df.values, index=df.index, columns=c_)
这是不正确的,因为某些列以我不希望的方式排序。
计时
到目前为止,@root 的方案在计时上胜出。我并不太在意更大的索引,这主要是用于生成报告。
代码
def pir(df):
    """Return a new frame whose column labels are pushed to the lowest levels.

    Blank labels ('') in each column tuple are moved above the non-blank
    ones; the relative order of the non-blank labels is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a MultiIndex using '' as the blank label.

    Returns
    -------
    pandas.DataFrame
        Built from ``df.values`` and ``df.index`` with the rearranged
        column MultiIndex.
    """
    n_levels = df.columns.nlevels
    # One row per column, one column per level.
    base = pd.DataFrame(df.columns.tolist())
    # Mark blanks as missing so dropna() can discard them.
    rplc = pd.DataFrame(np.where(base == '', None, base))
    # For each column: keep the non-blank labels, bottom label first.
    data = {
        k: v.dropna().iloc[::-1].reset_index(drop=True)
        for k, v in rplc.iterrows()
    }
    # reindex() guarantees one row per level even when no column has all
    # of its levels filled; otherwise the result would lose levels.
    aligned = pd.DataFrame(data).reindex(range(n_levels))
    # Flip back to top-to-bottom order; missing slots are now on top.
    labels = aligned.iloc[::-1].fillna('').T.values.tolist()
    new_col = pd.MultiIndex.from_tuples([tuple(row) for row in labels])
    return pd.DataFrame(df.values, df.index, new_col)
def kartik(df):
    """Return a new frame whose column labels are pushed to the lowest levels.

    For every column tuple the labels are stably sorted on "is blank", so
    all '' labels move to the top levels while the non-blank labels keep
    their relative order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a MultiIndex using '' as the blank label.

    Returns
    -------
    pandas.DataFrame
        Built from ``df.values`` and ``df.index``; the new columns are
        passed as one list of labels per level.
    """
    new_cols = []
    for col in df.columns:
        # 0 for blanks, 1 for real labels; a stable sort is required so
        # that the real labels are not reordered among themselves.
        order = np.argsort([1 if v != '' else 0 for v in col], kind='stable')
        new_cols.append([col[i] for i in order])
    # Transpose from one list per column to one list per level.
    return pd.DataFrame(df.values, df.index, list(map(list, zip(*new_cols))))
def so44(df):
    """Return a new frame whose column labels are pushed to the lowest levels.

    Hand-written case analysis for column tuples of exactly three labels
    (it indexes positions 0, 1 and 2 directly).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with three-level MultiIndex columns using '' as the blank
        label.

    Returns
    -------
    pandas.DataFrame
        Built from ``df.values`` and ``df.index`` with the rearranged
        column MultiIndex.
    """
    tuple_list = df.columns.values
    new_tuple_list = []
    for t in tuple_list:
        if t[2] == '':
            if t[1] == '':
                # (a, '', '') -> ('', '', a)
                t = (t[1], t[2], t[0])
            else:
                # (a, b, '') -> ('', a, b)
                t = (t[2], t[0], t[1])
        elif t[1] == '' and t[0] != '':
            # (a, '', c) -> ('', a, c)
            t = (t[1], t[0], t[2])
        # Every other shape already has its blanks on top.
        new_tuple_list.append(t)
    return pd.DataFrame(df.values, df.index, pd.MultiIndex.from_tuples(new_tuple_list))
def root(df):
    """Return a new frame whose column labels are pushed to the lowest levels.

    Works for any number of levels: each column tuple is rebuilt as its
    blanks ('') followed by its non-blank labels in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a MultiIndex using '' as the blank label.

    Returns
    -------
    pandas.DataFrame
        Built from ``df.values`` and ``df.index`` with the rearranged
        column MultiIndex.
    """
    new_cols = [
        # Compare against '' explicitly: a truthiness test would also drop
        # labels such as 0, while count('') would not pad for them, and the
        # tuple would come out too short.
        tuple([''] * col.count('') + [x for x in col if x != ''])
        for col in df.columns.values
    ]
    return pd.DataFrame(df.values, df.index, pd.MultiIndex.from_tuples(new_cols))
piRSquared,试试这个:
In [1]: import pandas as pd
import numpy as np
In [2]: col = pd.MultiIndex.from_tuples([('a', '', ''), ('', 'b', ''),
('c', 'd', ''), ('e', '', 'f'),
('g', 'h', 'i'), ('', 'j', 'k')])
df = pd.DataFrame(np.arange(24).reshape(4, 6), list('abcd'), col)
df
Out[2]: a c e g
b d h j
f i k
a 0 1 2 3 4 5
b 6 7 8 9 10 11
c 12 13 14 15 16 17
d 18 19 20 21 22 23
In [3]: new_cols = []
for col in df.columns:
    # 0 for blanks, 1 for real labels; kind='stable' keeps the real labels
    # in their original relative order.
    b = np.argsort([1 if (v != '') else 0 for v in col], kind='stable')
    col = [col[i] for i in b]
    new_cols.append(col)
# Transpose from one list per column to one list per level.
df.columns = list(map(list, zip(*new_cols)))
df
Out[3]: g
c e h j
a b d f i k
a 0 1 2 3 4 5
b 6 7 8 9 10 11
c 12 13 14 15 16 17
d 18 19 20 21 22 23
计时
只运行一次循环,原因是:
- 多次运行会出现"结果可能被缓存"的警告
- OP 很可能只会运行一次,由缓存带来的加速没有意义
# Case analysis for column tuples of exactly three labels: move the
# blank ('') labels to the top levels.
tuple_list = df.columns.values
new_tuple_list = []
for t in tuple_list:
    if t[2] == '':
        if t[1] == '':
            # (a, '', '') -> ('', '', a)
            t = (t[1], t[2], t[0])
        else:
            # (a, b, '') -> ('', a, b)
            t = (t[2], t[0], t[1])
    elif t[1] == '' and t[0] != '':
        # (a, '', c) -> ('', a, c)
        t = (t[1], t[0], t[2])
    new_tuple_list.append(t)
df.columns = pd.MultiIndex.from_tuples(new_tuple_list)
下面这个版本更好,应该也适用于层级更多的表头:
# Rebuild every column tuple as "blanks first, then the real labels".
tuple_list = df.columns.values
new_tuple_list = []
for t in tuple_list:
    header_only = [x for x in t if x != '']
    # Pad up to the tuple's own length rather than a fixed 3 so the
    # snippet works for any number of levels.
    leading_empty = [''] * (len(t) - len(header_only))
    new_tuple = tuple(leading_empty + header_only)
    new_tuple_list.append(new_tuple)
df.columns = pd.MultiIndex.from_tuples(new_tuple_list)
我会根据审美和运行时间来筛选答案
我不会选择我的答案!
这是我想到的方案:
# One row per column, one column per level.
base = pd.DataFrame(df.columns.tolist())
# Mark blanks as missing so dropna() can discard them.
rplc = pd.DataFrame(np.where(base == '', None, base))
# For each column: keep the non-blank labels, bottom label first.
data = {
    k: v.dropna().iloc[::-1].reset_index(drop=True)
    for k, v in rplc.iterrows()
}
# reindex() keeps one row per level even when no column has all of its
# levels filled; flipping back then leaves the missing slots on top.
new_col = pd.MultiIndex.from_tuples(
    pd.DataFrame(data)
    .reindex(range(df.columns.nlevels))
    .iloc[::-1]
    .fillna('')
    .T.values.tolist()
)
df.columns = new_col
df
重新排列每列元组的通用方法:
# For every column tuple: one '' per blank label, followed by the
# non-blank labels in their original order.
new_cols = [
    [''] * labels.count('') + [label for label in labels if label != '']
    for labels in df.columns.values
]
df.columns = pd.MultiIndex.from_tuples(new_cols)
它可能不如针对特定级别数调整的其他方法那么快,但它很简洁并且应该适用于任意数量的级别。
考虑以下 df
和 pd.MultiIndex
# Example frame: two statistic columns labelled only on the top level,
# followed by four columns that carry a label on all three levels.
col = pd.MultiIndex.from_tuples(
    [
        ('stat1', '', ''),
        ('stat2', '', ''),
        ('A', 'mean', 'blue'),
        ('A', 'mean', 'red'),
        ('B', 'var', 'blue'),
        ('B', 'var', 'red'),
    ]
)
df = pd.DataFrame(
    data=np.arange(24).reshape(4, 6),
    index=list('abcd'),
    columns=col,
)
df
我不希望 stat1 和 stat2 显示在最上层。我希望它们像下面这样位于最底层:
一个夸张的例子更能说明问题
# Exaggerated example: blanks appear at every possible position of the
# three-level column labels.
col = pd.MultiIndex.from_tuples(
    [
        ('a', '', ''),
        ('', 'b', ''),
        ('c', 'd', ''),
        ('e', '', 'f'),
        ('g', 'h', 'i'),
        ('', 'j', 'k'),
    ]
)
df = pd.DataFrame(
    data=np.arange(24).reshape(4, 6),
    index=list('abcd'),
    columns=col,
)
df
应该看起来像:
我试过:
# Naive attempt: sort the labels of every column tuple so that '' comes
# first.  np.sort orders each row lexicographically, so the non-blank
# labels get reordered as well, not just the blanks.
c = np.array(col.values.tolist())
c_ = pd.MultiIndex.from_tuples(np.sort(c, axis=-1).tolist())
pd.DataFrame(data=df.values, index=df.index, columns=c_)
这是不正确的,因为某些列以我不希望的方式排序。
计时
到目前为止,@root 的方案在计时上胜出。我并不太在意更大的索引,这主要是用于生成报告。
代码
def pir(df):
    """Return a new frame whose column labels are pushed to the lowest levels.

    Blank labels ('') in each column tuple are moved above the non-blank
    ones; the relative order of the non-blank labels is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a MultiIndex using '' as the blank label.

    Returns
    -------
    pandas.DataFrame
        Built from ``df.values`` and ``df.index`` with the rearranged
        column MultiIndex.
    """
    n_levels = df.columns.nlevels
    # One row per column, one column per level.
    base = pd.DataFrame(df.columns.tolist())
    # Mark blanks as missing so dropna() can discard them.
    rplc = pd.DataFrame(np.where(base == '', None, base))
    # For each column: keep the non-blank labels, bottom label first.
    data = {
        k: v.dropna().iloc[::-1].reset_index(drop=True)
        for k, v in rplc.iterrows()
    }
    # reindex() guarantees one row per level even when no column has all
    # of its levels filled; otherwise the result would lose levels.
    aligned = pd.DataFrame(data).reindex(range(n_levels))
    # Flip back to top-to-bottom order; missing slots are now on top.
    labels = aligned.iloc[::-1].fillna('').T.values.tolist()
    new_col = pd.MultiIndex.from_tuples([tuple(row) for row in labels])
    return pd.DataFrame(df.values, df.index, new_col)
def kartik(df):
    """Return a new frame whose column labels are pushed to the lowest levels.

    For every column tuple the labels are stably sorted on "is blank", so
    all '' labels move to the top levels while the non-blank labels keep
    their relative order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a MultiIndex using '' as the blank label.

    Returns
    -------
    pandas.DataFrame
        Built from ``df.values`` and ``df.index``; the new columns are
        passed as one list of labels per level.
    """
    new_cols = []
    for col in df.columns:
        # 0 for blanks, 1 for real labels; a stable sort is required so
        # that the real labels are not reordered among themselves.
        order = np.argsort([1 if v != '' else 0 for v in col], kind='stable')
        new_cols.append([col[i] for i in order])
    # Transpose from one list per column to one list per level.
    return pd.DataFrame(df.values, df.index, list(map(list, zip(*new_cols))))
def so44(df):
    """Return a new frame whose column labels are pushed to the lowest levels.

    Hand-written case analysis for column tuples of exactly three labels
    (it indexes positions 0, 1 and 2 directly).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with three-level MultiIndex columns using '' as the blank
        label.

    Returns
    -------
    pandas.DataFrame
        Built from ``df.values`` and ``df.index`` with the rearranged
        column MultiIndex.
    """
    tuple_list = df.columns.values
    new_tuple_list = []
    for t in tuple_list:
        if t[2] == '':
            if t[1] == '':
                # (a, '', '') -> ('', '', a)
                t = (t[1], t[2], t[0])
            else:
                # (a, b, '') -> ('', a, b)
                t = (t[2], t[0], t[1])
        elif t[1] == '' and t[0] != '':
            # (a, '', c) -> ('', a, c)
            t = (t[1], t[0], t[2])
        # Every other shape already has its blanks on top.
        new_tuple_list.append(t)
    return pd.DataFrame(df.values, df.index, pd.MultiIndex.from_tuples(new_tuple_list))
def root(df):
    """Return a new frame whose column labels are pushed to the lowest levels.

    Works for any number of levels: each column tuple is rebuilt as its
    blanks ('') followed by its non-blank labels in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a MultiIndex using '' as the blank label.

    Returns
    -------
    pandas.DataFrame
        Built from ``df.values`` and ``df.index`` with the rearranged
        column MultiIndex.
    """
    new_cols = [
        # Compare against '' explicitly: a truthiness test would also drop
        # labels such as 0, while count('') would not pad for them, and the
        # tuple would come out too short.
        tuple([''] * col.count('') + [x for x in col if x != ''])
        for col in df.columns.values
    ]
    return pd.DataFrame(df.values, df.index, pd.MultiIndex.from_tuples(new_cols))
piRSquared,试试这个:
In [1]: import pandas as pd
import numpy as np
In [2]: col = pd.MultiIndex.from_tuples([('a', '', ''), ('', 'b', ''),
('c', 'd', ''), ('e', '', 'f'),
('g', 'h', 'i'), ('', 'j', 'k')])
df = pd.DataFrame(np.arange(24).reshape(4, 6), list('abcd'), col)
df
Out[2]: a c e g
b d h j
f i k
a 0 1 2 3 4 5
b 6 7 8 9 10 11
c 12 13 14 15 16 17
d 18 19 20 21 22 23
In [3]: new_cols = []
for col in df.columns:
    # 0 for blanks, 1 for real labels; kind='stable' keeps the real labels
    # in their original relative order.
    b = np.argsort([1 if (v != '') else 0 for v in col], kind='stable')
    col = [col[i] for i in b]
    new_cols.append(col)
# Transpose from one list per column to one list per level.
df.columns = list(map(list, zip(*new_cols)))
df
Out[3]: g
c e h j
a b d f i k
a 0 1 2 3 4 5
b 6 7 8 9 10 11
c 12 13 14 15 16 17
d 18 19 20 21 22 23
计时
只运行一次循环,原因是:
- 多次运行会出现"结果可能被缓存"的警告
- OP 很可能只会运行一次,由缓存带来的加速没有意义
# Case analysis for column tuples of exactly three labels: move the
# blank ('') labels to the top levels.
tuple_list = df.columns.values
new_tuple_list = []
for t in tuple_list:
    if t[2] == '':
        if t[1] == '':
            # (a, '', '') -> ('', '', a)
            t = (t[1], t[2], t[0])
        else:
            # (a, b, '') -> ('', a, b)
            t = (t[2], t[0], t[1])
    elif t[1] == '' and t[0] != '':
        # (a, '', c) -> ('', a, c)
        t = (t[1], t[0], t[2])
    new_tuple_list.append(t)
df.columns = pd.MultiIndex.from_tuples(new_tuple_list)
下面这个版本更好,应该也适用于层级更多的表头:
# Rebuild every column tuple as "blanks first, then the real labels".
tuple_list = df.columns.values
new_tuple_list = []
for t in tuple_list:
    header_only = [x for x in t if x != '']
    # Pad up to the tuple's own length rather than a fixed 3 so the
    # snippet works for any number of levels.
    leading_empty = [''] * (len(t) - len(header_only))
    new_tuple = tuple(leading_empty + header_only)
    new_tuple_list.append(new_tuple)
df.columns = pd.MultiIndex.from_tuples(new_tuple_list)
我会根据审美和运行时间来筛选答案
我不会选择我的答案!
这是我想到的方案:
# One row per column, one column per level.
base = pd.DataFrame(df.columns.tolist())
# Mark blanks as missing so dropna() can discard them.
rplc = pd.DataFrame(np.where(base == '', None, base))
# For each column: keep the non-blank labels, bottom label first.
data = {
    k: v.dropna().iloc[::-1].reset_index(drop=True)
    for k, v in rplc.iterrows()
}
# reindex() keeps one row per level even when no column has all of its
# levels filled; flipping back then leaves the missing slots on top.
new_col = pd.MultiIndex.from_tuples(
    pd.DataFrame(data)
    .reindex(range(df.columns.nlevels))
    .iloc[::-1]
    .fillna('')
    .T.values.tolist()
)
df.columns = new_col
df
重新排列每列元组的通用方法:
# For every column tuple: one '' per blank label, followed by the
# non-blank labels in their original order.
new_cols = [
    [''] * labels.count('') + [label for label in labels if label != '']
    for labels in df.columns.values
]
df.columns = pd.MultiIndex.from_tuples(new_cols)
它可能不如针对特定级别数调整的其他方法那么快,但它很简洁并且应该适用于任意数量的级别。