如何让 apply 返回格式正确的 pandas 数据框?
How to return correctly formatted pandas dataframe from apply?
假设我们有以下数据框:
import pandas as pd
import numpy as np
# Candidate values for the two categorical columns.
years = [2005, 2006]
location = ['city', 'suburb']
# 100 synthetic rows: year and location are picked with one randint call per
# row; durations are drawn from [100, 600) and costs from [1, 10).
dft = pd.DataFrame({
    'year': [years[np.random.randint(2)] for _ in range(100)],
    'location': [location[np.random.randint(2)] for _ in range(100)],
    'days_to_complete': np.random.randint(100, 600, size=100),
    'cost_in_millions': np.random.randint(1, 10, size=100),
})
按年份和位置分组,然后应用如下函数:
def get_custom_summary(group):
    """Summarise one group of rows, split at 200 days to complete.

    Rows are bucketed by ``days_to_complete``: strictly above 200 and
    strictly below 200 (a value of exactly 200 falls into neither bucket).

    NOTE: this is the version that reproduces the problem being asked about.
    The two buckets use differently-cased row labels, so each bucket column
    is NaN on the other bucket's labels, and the scalar ``lt_200_prop`` is
    repeated on every row label.

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        DataFrame with columns ``gt_200``, ``lt_200`` and ``lt_200_prop``,
        indexed by the labels ``AVG_DAYS``, ``AVG_COST``, ``avg_days`` and
        ``avg_cost``.
    """
    gt_200 = group.days_to_complete > 200
    lt_200 = group.days_to_complete < 200
    avg_days_gt200 = group[gt_200].days_to_complete.mean()
    avg_cost_gt200 = group[gt_200].cost_in_millions.mean()
    avg_days_lt200 = group[lt_200].days_to_complete.mean()
    avg_cost_lt200 = group[lt_200].cost_in_millions.mean()
    # Share of bucketed rows that fall below 200 days.
    lt_200_prop = lt_200.sum() / (gt_200.sum() + lt_200.sum())
    return pd.DataFrame({
        'gt_200': {'AVG_DAYS': avg_days_gt200, 'AVG_COST': avg_cost_gt200},
        'lt_200': {'avg_days': avg_days_lt200, 'avg_cost': avg_cost_lt200},
        'lt_200_prop': lt_200_prop,
    })
# Run the summary once per (year, location) group and combine the per-group frames.
result = dft.groupby(['year', 'location']).apply(get_custom_summary)
对结果调用 unstack(2) 我们得到以下输出:
# Move index level 2 (the metric labels returned by the summary) into the columns.
print(result.unstack(2))
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days AVG_COST AVG_DAYS avg_cost avg_days AVG_COST AVG_DAYS avg_cost avg_days
year location
2005 city 4.818182 415.636364 NaN NaN NaN NaN 7.250000 165.50 0.153846 0.153846 0.153846 0.153846
suburb 5.631579 336.631579 NaN NaN NaN NaN 5.166667 140.50 0.240000 0.240000 0.240000 0.240000
2006 city 4.130435 396.913043 NaN NaN NaN NaN 5.750000 150.75 0.258065 0.258065 0.258065 0.258065
suburb 5.294118 392.823529 NaN NaN NaN NaN 1.000000 128.00 0.055556 0.055556 0.055556 0.055556
对 gt_200 和 lt_200 两列调用 dropna(axis=1) 可以删除全为 NaN 的列,但 lt_200_prop 下的各列仍然挂着错误的子列名。怎样才能让 get_custom_summary 返回的 DataFrame 不把子列(AVG_COST、AVG_DAYS、avg_cost、avg_days)广播(如果这个词用得对的话)到每一个列(gt_200、lt_200、lt_200_prop)上?
编辑:
期望的输出:
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days
year location
2005 city 4.818182 415.636364 7.250000 165.50 0.153846
suburb 5.631579 336.631579 5.166667 140.50 0.240000
2006 city 4.130435 396.913043 5.750000 150.75 0.258065
suburb 5.294118 392.823529 1.000000 128.00 0.055556
我的解决方案是:在函数 get_custom_summary 中让 gt_200 和 lt_200 使用相同的行标签,然后用 str.lower 把 lt_200 的子列名改成小写,并给最后一列加上自定义列名 col。由于列是 MultiIndex,需要用 MultiIndex.from_tuples 创建新的列索引:
# Candidate values for the two categorical columns.
years = [2005, 2006]
location = ['city', 'suburb']
# Fix the seed so the generated frame is reproducible.
np.random.seed(1234)
# Draw order matters for reproducibility: all years first, then all
# locations, then the two numeric columns.
dft = pd.DataFrame({
    'year': [years[np.random.randint(2)] for _ in range(100)],
    'location': [location[np.random.randint(2)] for _ in range(100)],
    'days_to_complete': np.random.randint(100, 600, size=100),
    'cost_in_millions': np.random.randint(1, 10, size=100),
})
def get_custom_summary(group):
    """Summarise one group of rows, split at 200 days to complete.

    Rows are bucketed by ``days_to_complete``: strictly above 200 and
    strictly below 200 (a value of exactly 200 falls into neither bucket).
    Both buckets share the row labels ``AVG_DAYS`` and ``AVG_COST``, so the
    bucket columns contain no label-mismatch NaNs. The scalar
    ``lt_200_prop`` is still repeated on both rows.

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        DataFrame with columns ``gt_200``, ``lt_200`` and ``lt_200_prop``,
        indexed by ``AVG_DAYS`` and ``AVG_COST``.
    """
    gt_200 = group.days_to_complete > 200
    lt_200 = group.days_to_complete < 200
    avg_days_gt200 = group[gt_200].days_to_complete.mean()
    avg_cost_gt200 = group[gt_200].cost_in_millions.mean()
    avg_days_lt200 = group[lt_200].days_to_complete.mean()
    avg_cost_lt200 = group[lt_200].cost_in_millions.mean()
    # Share of bucketed rows that fall below 200 days.
    lt_200_prop = lt_200.sum() / (gt_200.sum() + lt_200.sum())
    return pd.DataFrame({
        'gt_200': {'AVG_DAYS': avg_days_gt200, 'AVG_COST': avg_cost_gt200},
        'lt_200': {'AVG_DAYS': avg_days_lt200, 'AVG_COST': avg_cost_lt200},
        'lt_200_prop': lt_200_prop,
    })
result = dft.groupby(['year', 'location']).apply(get_custom_summary).unstack(2)
# Drop the last column: it repeats the lt_200_prop value of the column before it.
result = result.drop(result.columns[[-1]], axis=1)
# Rebuild the level-1 labels by position: keep the two gt_200 labels,
# lower-case the two lt_200 labels, and name the remaining column 'col'.
# Plain concatenation is used instead of Index.union, which sorts and
# de-duplicates and therefore lines up with level 0 only by accident.
a = result.columns.get_level_values(1)
level1 = list(a[:2]) + list(a[2:4].str.lower()) + ['col']
cols = list(zip(result.columns.get_level_values(0), level1))
result.columns = pd.MultiIndex.from_tuples(cols)
print(result)
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days col
year location
2005 city 5.238095 392.095238 5.500000 144.666667 0.222222
suburb 4.428571 427.095238 4.000000 167.666667 0.125000
2006 city 4.368421 406.789474 4.571429 150.142857 0.269231
suburb 4.000000 439.062500 4.142857 145.142857 0.304348
更简单的解决方案是删除列:
# NOTE(review): the printed output below shows both upper- and lower-case
# sub-columns, so this presumably runs with the original mixed-case
# get_custom_summary - confirm.
result = dft.groupby(['year', 'location']).apply(get_custom_summary).unstack(2)
# Drop the last three lt_200_prop columns (they repeat the same value), then
# drop the columns that are NaN for every group. how='all' keeps a real
# metric column even when a single group has an empty bucket (NaN mean).
result = result.drop(result.columns[[-1, -2, -3]], axis=1).dropna(axis=1, how='all')
print(result)
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days AVG_COST
year location
2005 city 5.238095 392.095238 5.500000 144.666667 0.222222
suburb 4.428571 427.095238 4.000000 167.666667 0.125000
2006 city 4.368421 406.789474 4.571429 150.142857 0.269231
suburb 4.000000 439.062500 4.142857 145.142857 0.304348
返回一个以 MultiIndex 作为列的 DataFrame。
from collections import OrderedDict
def get_multi_index(ordered_dict):
    """Build a ``pd.MultiIndex`` from a mapping of level name to level values.

    Each key becomes a level name, in the mapping's iteration order, and
    position ``i`` of every value sequence forms the ``i``-th index entry.

    Args:
        ordered_dict: mapping of level name to an equally long sequence of
            labels for that level.

    Returns:
        A ``pd.MultiIndex`` whose ``names`` are the mapping's keys.

    Raises:
        ValueError: if the mapping is empty or the sequences differ in length.
    """
    if not ordered_dict:
        raise ValueError("ordered_dict must contain at least one level")
    names = list(ordered_dict)
    # Plain lists keep each label's own type; np.array would coerce a level
    # that mixes, say, ints and strings to all strings.
    arrays = [list(ordered_dict[name]) for name in names]
    length = len(arrays[0])
    # Explicit check rather than assert, which is stripped under ``python -O``.
    if any(len(values) != length for values in arrays):
        raise ValueError("all levels must have the same number of values")
    return pd.MultiIndex.from_arrays(arrays, names=names)
def get_custom_summary(group):
    """Summarise one group of rows as a single row with two-level columns.

    Rows are bucketed by ``days_to_complete``: strictly above 200 and
    strictly below 200 (a value of exactly 200 falls into neither bucket).

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        One-row DataFrame whose columns are a MultiIndex with levels
        ``first`` (``lt_200``, ``gt_200``, ``lt_200_prop``) and ``second``
        (the metric name within that bucket).
    """
    gt_200 = group.days_to_complete > 200
    lt_200 = group.days_to_complete < 200
    avg_days_gt_200 = group[gt_200].days_to_complete.mean()
    avg_cost_gt_200 = group[gt_200].cost_in_millions.mean()
    avg_days_lt_200 = group[lt_200].days_to_complete.mean()
    avg_cost_lt_200 = group[lt_200].cost_in_millions.mean()
    # Share of bucketed rows that fall below 200 days.
    lt_200_prop = lt_200.sum() / (gt_200.sum() + lt_200.sum())
    # Column labels, level by level; position i here pairs with data[0][i].
    ordered_dict = OrderedDict()
    ordered_dict['first'] = ['lt_200', 'lt_200', 'gt_200', 'gt_200', 'lt_200_prop']
    ordered_dict['second'] = ['avg_cost', 'avg_days', 'AVG_COST', 'AVG_DAYS', 'prop']
    data = [[avg_cost_lt_200, avg_days_lt_200, avg_cost_gt_200, avg_days_gt_200, lt_200_prop]]
    return pd.DataFrame(data, columns=get_multi_index(ordered_dict))
获取并打印结果:
# Each group returns a one-row frame whose row label is 0, which ends up as
# index level 2; xs(0, level=2) selects that label, leaving (year, location).
result = dft.groupby(['year', 'location']).apply(get_custom_summary).xs(0, level=2)
print(result)
输出:
first lt_200 gt_200 lt_200_prop
second avg_cost avg_days AVG_COST AVG_DAYS prop
year location
2005 city 7.555556 135.444444 5.300000 363.750000 0.310345
suburb 5.000000 137.333333 5.555556 444.222222 0.250000
2006 city 6.250000 169.000000 4.714286 422.380952 0.160000
suburb 4.428571 133.142857 4.333333 445.666667 0.318182
假设我们有以下数据框:
import pandas as pd
import numpy as np
# Candidate values for the two categorical columns.
years = [2005, 2006]
location = ['city', 'suburb']
# 100 synthetic rows: year and location are picked with one randint call per
# row; durations are drawn from [100, 600) and costs from [1, 10).
dft = pd.DataFrame({
    'year': [years[np.random.randint(2)] for _ in range(100)],
    'location': [location[np.random.randint(2)] for _ in range(100)],
    'days_to_complete': np.random.randint(100, 600, size=100),
    'cost_in_millions': np.random.randint(1, 10, size=100),
})
按年份和位置分组,然后应用如下函数:
def get_custom_summary(group):
    """Summarise one group of rows, split at 200 days to complete.

    Rows are bucketed by ``days_to_complete``: strictly above 200 and
    strictly below 200 (a value of exactly 200 falls into neither bucket).

    NOTE: this is the version that reproduces the problem being asked about.
    The two buckets use differently-cased row labels, so each bucket column
    is NaN on the other bucket's labels, and the scalar ``lt_200_prop`` is
    repeated on every row label.

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        DataFrame with columns ``gt_200``, ``lt_200`` and ``lt_200_prop``,
        indexed by the labels ``AVG_DAYS``, ``AVG_COST``, ``avg_days`` and
        ``avg_cost``.
    """
    gt_200 = group.days_to_complete > 200
    lt_200 = group.days_to_complete < 200
    avg_days_gt200 = group[gt_200].days_to_complete.mean()
    avg_cost_gt200 = group[gt_200].cost_in_millions.mean()
    avg_days_lt200 = group[lt_200].days_to_complete.mean()
    avg_cost_lt200 = group[lt_200].cost_in_millions.mean()
    # Share of bucketed rows that fall below 200 days.
    lt_200_prop = lt_200.sum() / (gt_200.sum() + lt_200.sum())
    return pd.DataFrame({
        'gt_200': {'AVG_DAYS': avg_days_gt200, 'AVG_COST': avg_cost_gt200},
        'lt_200': {'avg_days': avg_days_lt200, 'avg_cost': avg_cost_lt200},
        'lt_200_prop': lt_200_prop,
    })
# Run the summary once per (year, location) group and combine the per-group frames.
result = dft.groupby(['year', 'location']).apply(get_custom_summary)
对结果调用 unstack(2) 我们得到以下输出:
# Move index level 2 (the metric labels returned by the summary) into the columns.
print(result.unstack(2))
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days AVG_COST AVG_DAYS avg_cost avg_days AVG_COST AVG_DAYS avg_cost avg_days
year location
2005 city 4.818182 415.636364 NaN NaN NaN NaN 7.250000 165.50 0.153846 0.153846 0.153846 0.153846
suburb 5.631579 336.631579 NaN NaN NaN NaN 5.166667 140.50 0.240000 0.240000 0.240000 0.240000
2006 city 4.130435 396.913043 NaN NaN NaN NaN 5.750000 150.75 0.258065 0.258065 0.258065 0.258065
suburb 5.294118 392.823529 NaN NaN NaN NaN 1.000000 128.00 0.055556 0.055556 0.055556 0.055556
对 gt_200 和 lt_200 两列调用 dropna(axis=1) 可以删除全为 NaN 的列,但 lt_200_prop 下的各列仍然挂着错误的子列名。怎样才能让 get_custom_summary 返回的 DataFrame 不把子列(AVG_COST、AVG_DAYS、avg_cost、avg_days)广播(如果这个词用得对的话)到每一个列(gt_200、lt_200、lt_200_prop)上?
编辑:
期望的输出:
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days
year location
2005 city 4.818182 415.636364 7.250000 165.50 0.153846
suburb 5.631579 336.631579 5.166667 140.50 0.240000
2006 city 4.130435 396.913043 5.750000 150.75 0.258065
suburb 5.294118 392.823529 1.000000 128.00 0.055556
我的解决方案是:在函数 get_custom_summary 中让 gt_200 和 lt_200 使用相同的行标签,然后用 str.lower 把 lt_200 的子列名改成小写,并给最后一列加上自定义列名 col。由于列是 MultiIndex,需要用 MultiIndex.from_tuples 创建新的列索引:
# Candidate values for the two categorical columns.
years = [2005, 2006]
location = ['city', 'suburb']
# Fix the seed so the generated frame is reproducible.
np.random.seed(1234)
# Draw order matters for reproducibility: all years first, then all
# locations, then the two numeric columns.
dft = pd.DataFrame({
    'year': [years[np.random.randint(2)] for _ in range(100)],
    'location': [location[np.random.randint(2)] for _ in range(100)],
    'days_to_complete': np.random.randint(100, 600, size=100),
    'cost_in_millions': np.random.randint(1, 10, size=100),
})
def get_custom_summary(group):
    """Summarise one group of rows, split at 200 days to complete.

    Rows are bucketed by ``days_to_complete``: strictly above 200 and
    strictly below 200 (a value of exactly 200 falls into neither bucket).
    Both buckets share the row labels ``AVG_DAYS`` and ``AVG_COST``, so the
    bucket columns contain no label-mismatch NaNs. The scalar
    ``lt_200_prop`` is still repeated on both rows.

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        DataFrame with columns ``gt_200``, ``lt_200`` and ``lt_200_prop``,
        indexed by ``AVG_DAYS`` and ``AVG_COST``.
    """
    gt_200 = group.days_to_complete > 200
    lt_200 = group.days_to_complete < 200
    avg_days_gt200 = group[gt_200].days_to_complete.mean()
    avg_cost_gt200 = group[gt_200].cost_in_millions.mean()
    avg_days_lt200 = group[lt_200].days_to_complete.mean()
    avg_cost_lt200 = group[lt_200].cost_in_millions.mean()
    # Share of bucketed rows that fall below 200 days.
    lt_200_prop = lt_200.sum() / (gt_200.sum() + lt_200.sum())
    return pd.DataFrame({
        'gt_200': {'AVG_DAYS': avg_days_gt200, 'AVG_COST': avg_cost_gt200},
        'lt_200': {'AVG_DAYS': avg_days_lt200, 'AVG_COST': avg_cost_lt200},
        'lt_200_prop': lt_200_prop,
    })
result = dft.groupby(['year', 'location']).apply(get_custom_summary).unstack(2)
# Drop the last column: it repeats the lt_200_prop value of the column before it.
result = result.drop(result.columns[[-1]], axis=1)
# Rebuild the level-1 labels by position: keep the two gt_200 labels,
# lower-case the two lt_200 labels, and name the remaining column 'col'.
# Plain concatenation is used instead of Index.union, which sorts and
# de-duplicates and therefore lines up with level 0 only by accident.
a = result.columns.get_level_values(1)
level1 = list(a[:2]) + list(a[2:4].str.lower()) + ['col']
cols = list(zip(result.columns.get_level_values(0), level1))
result.columns = pd.MultiIndex.from_tuples(cols)
print(result)
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days col
year location
2005 city 5.238095 392.095238 5.500000 144.666667 0.222222
suburb 4.428571 427.095238 4.000000 167.666667 0.125000
2006 city 4.368421 406.789474 4.571429 150.142857 0.269231
suburb 4.000000 439.062500 4.142857 145.142857 0.304348
更简单的解决方案是删除列:
# NOTE(review): the printed output below shows both upper- and lower-case
# sub-columns, so this presumably runs with the original mixed-case
# get_custom_summary - confirm.
result = dft.groupby(['year', 'location']).apply(get_custom_summary).unstack(2)
# Drop the last three lt_200_prop columns (they repeat the same value), then
# drop the columns that are NaN for every group. how='all' keeps a real
# metric column even when a single group has an empty bucket (NaN mean).
result = result.drop(result.columns[[-1, -2, -3]], axis=1).dropna(axis=1, how='all')
print(result)
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days AVG_COST
year location
2005 city 5.238095 392.095238 5.500000 144.666667 0.222222
suburb 4.428571 427.095238 4.000000 167.666667 0.125000
2006 city 4.368421 406.789474 4.571429 150.142857 0.269231
suburb 4.000000 439.062500 4.142857 145.142857 0.304348
返回一个以 MultiIndex 作为列的 DataFrame。
from collections import OrderedDict
def get_multi_index(ordered_dict):
    """Build a ``pd.MultiIndex`` from a mapping of level name to level values.

    Each key becomes a level name, in the mapping's iteration order, and
    position ``i`` of every value sequence forms the ``i``-th index entry.

    Args:
        ordered_dict: mapping of level name to an equally long sequence of
            labels for that level.

    Returns:
        A ``pd.MultiIndex`` whose ``names`` are the mapping's keys.

    Raises:
        ValueError: if the mapping is empty or the sequences differ in length.
    """
    if not ordered_dict:
        raise ValueError("ordered_dict must contain at least one level")
    names = list(ordered_dict)
    # Plain lists keep each label's own type; np.array would coerce a level
    # that mixes, say, ints and strings to all strings.
    arrays = [list(ordered_dict[name]) for name in names]
    length = len(arrays[0])
    # Explicit check rather than assert, which is stripped under ``python -O``.
    if any(len(values) != length for values in arrays):
        raise ValueError("all levels must have the same number of values")
    return pd.MultiIndex.from_arrays(arrays, names=names)
def get_custom_summary(group):
    """Summarise one group of rows as a single row with two-level columns.

    Rows are bucketed by ``days_to_complete``: strictly above 200 and
    strictly below 200 (a value of exactly 200 falls into neither bucket).

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        One-row DataFrame whose columns are a MultiIndex with levels
        ``first`` (``lt_200``, ``gt_200``, ``lt_200_prop``) and ``second``
        (the metric name within that bucket).
    """
    gt_200 = group.days_to_complete > 200
    lt_200 = group.days_to_complete < 200
    avg_days_gt_200 = group[gt_200].days_to_complete.mean()
    avg_cost_gt_200 = group[gt_200].cost_in_millions.mean()
    avg_days_lt_200 = group[lt_200].days_to_complete.mean()
    avg_cost_lt_200 = group[lt_200].cost_in_millions.mean()
    # Share of bucketed rows that fall below 200 days.
    lt_200_prop = lt_200.sum() / (gt_200.sum() + lt_200.sum())
    # Column labels, level by level; position i here pairs with data[0][i].
    ordered_dict = OrderedDict()
    ordered_dict['first'] = ['lt_200', 'lt_200', 'gt_200', 'gt_200', 'lt_200_prop']
    ordered_dict['second'] = ['avg_cost', 'avg_days', 'AVG_COST', 'AVG_DAYS', 'prop']
    data = [[avg_cost_lt_200, avg_days_lt_200, avg_cost_gt_200, avg_days_gt_200, lt_200_prop]]
    return pd.DataFrame(data, columns=get_multi_index(ordered_dict))
获取并打印结果:
# Each group returns a one-row frame whose row label is 0, which ends up as
# index level 2; xs(0, level=2) selects that label, leaving (year, location).
result = dft.groupby(['year', 'location']).apply(get_custom_summary).xs(0, level=2)
print(result)
输出:
first lt_200 gt_200 lt_200_prop
second avg_cost avg_days AVG_COST AVG_DAYS prop
year location
2005 city 7.555556 135.444444 5.300000 363.750000 0.310345
suburb 5.000000 137.333333 5.555556 444.222222 0.250000
2006 city 6.250000 169.000000 4.714286 422.380952 0.160000
suburb 4.428571 133.142857 4.333333 445.666667 0.318182