如何让 apply 返回格式正确的 pandas 数据框?

How to return correctly formatted pandas dataframe from apply?

假设我们有以下数据框:

import pandas as pd
import numpy as np

# Candidate values for the two categorical columns.
years = [2005, 2006]
location = ['city', 'suburb']

# 100 synthetic rows: a randomly picked year and location per row, plus
# integer days in [100, 600) and integer cost in [1, 10).
dft = pd.DataFrame(dict(
    year=[years[np.random.randint(0, 2)] for _ in range(100)],
    location=[location[np.random.randint(0, 2)] for _ in range(100)],
    days_to_complete=np.random.randint(100, high=600, size=100),
    cost_in_millions=np.random.randint(1, high=10, size=100),
))

按年份和位置分组,然后应用如下函数:

def get_custom_summary(group):
    """Summarise one group, split around the 200-day completion mark.

    Returns a DataFrame with the columns ``gt_200``, ``lt_200`` and
    ``lt_200_prop``.  The ``gt_200`` averages are labelled
    ``AVG_DAYS``/``AVG_COST`` while the ``lt_200`` averages are labelled
    ``avg_days``/``avg_cost``; ``lt_200_prop`` is a single scalar that the
    DataFrame constructor repeats for every row label.  Rows with exactly
    200 days belong to neither bucket.
    """
    days = group.days_to_complete
    is_slow = days > 200
    is_fast = days < 200

    slow_rows = group[is_slow]
    fast_rows = group[is_fast]

    # Share of "fast" rows among all rows that landed in either bucket.
    n_fast = is_fast.sum()
    n_slow = is_slow.sum()
    fast_share = n_fast / (n_slow + n_fast)

    summary = {
        'gt_200': {
            'AVG_DAYS': slow_rows.days_to_complete.mean(),
            'AVG_COST': slow_rows.cost_in_millions.mean(),
        },
        'lt_200': {
            'avg_days': fast_rows.days_to_complete.mean(),
            'avg_cost': fast_rows.cost_in_millions.mean(),
        },
        'lt_200_prop': fast_share,
    }
    return pd.DataFrame(summary)

# Run the summary once per (year, location) group and collect the
# per-group frames into a single result.
result = dft.groupby(['year', 'location']).apply(get_custom_summary)

对结果调用 unstack(2) 我们得到以下输出:

# Pivot index level 2 into the columns and print the result.
print(result.unstack(2))

                 gt_200                                 lt_200                             lt_200_prop                              
               AVG_COST    AVG_DAYS avg_cost avg_days AVG_COST AVG_DAYS  avg_cost avg_days    AVG_COST  AVG_DAYS  avg_cost  avg_days
year location                                                                                                                       
2005 city      4.818182  415.636364      NaN      NaN      NaN      NaN  7.250000   165.50    0.153846  0.153846  0.153846  0.153846
     suburb    5.631579  336.631579      NaN      NaN      NaN      NaN  5.166667   140.50    0.240000  0.240000  0.240000  0.240000
2006 city      4.130435  396.913043      NaN      NaN      NaN      NaN  5.750000   150.75    0.258065  0.258065  0.258065  0.258065
     suburb    5.294118  392.823529      NaN      NaN      NaN      NaN  1.000000   128.00    0.055556  0.055556  0.055556  0.055556

对于 gt_200 和 lt_200 两列,调用 dropna(axis=1) 可以删除填满 NaN 的列,但 lt_200_prop 列仍然挂在错误的子列名之下。我怎样才能让 get_custom_summary 返回的 DataFrame 不把子列(AVG_COST、AVG_DAYS、avg_cost、avg_days)广播(如果这个词用得对的话)到每一列(gt_200、lt_200、lt_200_prop)?

编辑:

期望的输出:

                 gt_200               lt_200          lt_200_prop                              
               AVG_COST    AVG_DAYS avg_cost avg_days
year location                                                                                                                       
2005 city      4.818182  415.636364 7.250000   165.50    0.153846
     suburb    5.631579  336.631579 5.166667   140.50    0.240000
2006 city      4.130435  396.913043 5.750000   150.75    0.258065
     suburb    5.294118  392.823529 1.000000   128.00    0.055556

我的解决方案是:在函数 get_custom_summary 中为 gt_200 和 lt_200 使用相同的子列名,之后再用 str.lower 重命名 lt_200 的子列,并为最后一列添加自定义列名 col。

但由于结果的列是 MultiIndex,所以需要通过 MultiIndex.from_tuples 创建新的列索引:

# Candidate values for the two categorical columns.
years = [2005, 2006]
location = ['city', 'suburb']
# Seed NumPy's global RNG before any of the random draws below.
np.random.seed(1234)
# 100 rows: a randomly picked year and location per row, plus integer
# days in [100, 600) and integer cost in [1, 10).
dft = pd.DataFrame({
    'year': [years[np.random.randint(0, 1+1)] for _ in range(100)],
    'location': [location[np.random.randint(0, 1+1)] for _ in range(100)],
    'days_to_complete': np.random.randint(100, high=600, size=100),
    'cost_in_millions': np.random.randint(1, high=10, size=100)
})

def get_custom_summary(group):
    """Summarise one group, split around the 200-day completion mark.

    Returns a DataFrame with the columns ``gt_200``, ``lt_200`` and
    ``lt_200_prop``.  Both average columns share the row labels
    ``AVG_DAYS`` and ``AVG_COST``; ``lt_200_prop`` is a single scalar that
    the DataFrame constructor repeats for every row label.  Rows with
    exactly 200 days belong to neither bucket.
    """
    over_mask = group.days_to_complete > 200
    under_mask = group.days_to_complete < 200

    over, under = group[over_mask], group[under_mask]

    # Share of "under 200" rows among all rows that fell in either bucket.
    n_over = over_mask.sum()
    n_under = under_mask.sum()

    columns = {}
    columns['gt_200'] = {
        'AVG_DAYS': over.days_to_complete.mean(),
        'AVG_COST': over.cost_in_millions.mean(),
    }
    columns['lt_200'] = {
        'AVG_DAYS': under.days_to_complete.mean(),
        'AVG_COST': under.cost_in_millions.mean(),
    }
    columns['lt_200_prop'] = n_under / (n_over + n_under)
    return pd.DataFrame(columns)
# Summarise each (year, location) group, then pivot index level 2 into
# the columns.
result = dft.groupby(['year', 'location']).apply(get_custom_summary).unstack(2)
# Drop the last column (selected by position), which holds duplicated values.
result = result.drop(result.columns[[-1]], axis=1)

# Rename the labels of column level 1.
a = (result.columns.get_level_values(1))
# Keep the first two labels, lower-case the next two and add 'col'.
# NOTE(review): this appears to rely on Index.union returning the labels
# in sorted order so that they line up with the columns - confirm.
level1 = a[:2].union(a[2:4].str.lower().union(['col']))
# Pair every level-0 label with its new level-1 label.
cols = list(zip(result.columns.get_level_values(0),level1))
result.columns = pd.MultiIndex.from_tuples(cols)

print (result)
                 gt_200                lt_200             lt_200_prop
               AVG_COST    AVG_DAYS  avg_cost    avg_days         col
year location                                                        
2005 city      5.238095  392.095238  5.500000  144.666667    0.222222
     suburb    4.428571  427.095238  4.000000  167.666667    0.125000
2006 city      4.368421  406.789474  4.571429  150.142857    0.269231
     suburb    4.000000  439.062500  4.142857  145.142857    0.304348

更简单的解决方案是保留问题中原始的 get_custom_summary(gt_200 与 lt_200 的子列名大小写不同),然后直接删除多余的列:

# Summarise each (year, location) group, then pivot index level 2 into
# the columns.
# NOTE(review): the printed output presumably requires the version of
# get_custom_summary with mixed-case row labels - confirm.
result = dft.groupby(['year', 'location']).apply(get_custom_summary).unstack(2)
# Drop the last three columns by position, then drop every remaining
# column that contains NaN.
result = result.drop(result.columns[[-1, -2, -3]], axis=1).dropna(axis=1)
print (result)
                 gt_200                lt_200             lt_200_prop
               AVG_COST    AVG_DAYS  avg_cost    avg_days    AVG_COST
year location                                                        
2005 city      5.238095  392.095238  5.500000  144.666667    0.222222
     suburb    4.428571  427.095238  4.000000  167.666667    0.125000
2006 city      4.368421  406.789474  4.571429  150.142857    0.269231
     suburb    4.000000  439.062500  4.142857  145.142857    0.304348

另一种做法:让函数返回一个列被设置为 MultiIndex 的 DataFrame。

from collections import OrderedDict

def get_multi_index(ordered_dict):
    """Build a ``pd.MultiIndex`` from a mapping of level name to labels.

    Parameters
    ----------
    ordered_dict : mapping
        Keys become the level names, in iteration order.  Each value is a
        sequence holding that level's labels; all sequences must have the
        same length.

    Returns
    -------
    pd.MultiIndex
        One entry per position, combining the labels of every level.

    Raises
    ------
    ValueError
        If the mapping is empty or the label sequences differ in length.
    """
    if not ordered_dict:
        raise ValueError("ordered_dict must contain at least one level")

    # Validate explicitly: an ``assert`` is stripped under ``python -O``,
    # after which ``zip`` would silently truncate to the shortest level.
    lengths = {name: len(labels) for name, labels in ordered_dict.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            "all levels must have the same number of labels, got {}".format(lengths)
        )

    names = list(ordered_dict)
    arrays = [np.array(ordered_dict[name]) for name in names]

    # Transpose the per-level arrays into one tuple per index entry.
    tuples = list(zip(*arrays))
    return pd.MultiIndex.from_tuples(tuples, names=names)

def get_custom_summary(group):
    """Summarise one group as a single-row frame with MultiIndex columns.

    The group is split into rows with more than 200 days and rows with
    fewer than 200 days (rows with exactly 200 days belong to neither).
    The returned one-row DataFrame has two column levels, ``first`` and
    ``second``, holding the average cost and days of each bucket plus the
    share of "fewer than 200 days" rows among both buckets.
    """
    gt_200 = group.days_to_complete > 200
    lt_200 = group.days_to_complete < 200

    avg_days_gt_200 = group[gt_200].days_to_complete.mean()
    avg_cost_gt_200 = group[gt_200].cost_in_millions.mean()

    avg_days_lt_200 = group[lt_200].days_to_complete.mean()
    avg_cost_lt_200 = group[lt_200].cost_in_millions.mean()

    lt_200_prop = lt_200.sum() / (gt_200.sum() + lt_200.sum())

    # Column labels, level by level; position i in each list describes
    # column i of ``data`` below.
    ordered_dict = OrderedDict()
    ordered_dict['first'] = ['lt_200', 'lt_200', 'gt_200', 'gt_200', 'lt_200_prop']
    ordered_dict['second'] = ['avg_cost', 'avg_days', 'AVG_COST', 'AVG_DAYS', 'prop']

    # A single row, so every value sits under exactly one column label
    # instead of being broadcast.
    data = [[avg_cost_lt_200, avg_days_lt_200, avg_cost_gt_200, avg_days_gt_200, lt_200_prop]]
    return pd.DataFrame(data, columns=get_multi_index(ordered_dict))

获取并打印结果:

# Summarise each (year, location) group, then select label 0 on index
# level 2, which also removes that level from the result.
result = dft.groupby(['year', 'location']).apply(get_custom_summary).xs(0, level=2)
print(result)

输出:

first            lt_200                gt_200             lt_200_prop
second         avg_cost    avg_days  AVG_COST    AVG_DAYS        prop
year location                                                        
2005 city      7.555556  135.444444  5.300000  363.750000    0.310345
     suburb    5.000000  137.333333  5.555556  444.222222    0.250000
2006 city      6.250000  169.000000  4.714286  422.380952    0.160000
     suburb    4.428571  133.142857  4.333333  445.666667    0.318182