基于列和行列表重构 numpy 数组(如 numpy 数组中的 pivot table)

Restructure numpy array based on column and row list (Like pivot table in numpy array)

这是示例 numpy 数据源

     col    row1   row2   row3  row4  columns
[[(  11.2, '689', '197', 'value_2', 0, 1)]
 [(  56.4, '689', '197', 'value_3', 0, 1)]
 [(  195.7, '689', '197', 'value_2', 0, 2)]
 [(  565.2, '689', '197', 'value_3', 0, 2)]
 [(  227.6, '689', '197', 'value_2', 0, 3)]
 [(  1347.6, '689', '197', 'value_2', 0, 3)]
 [( 613.5, '689', '196', 'value_2', 0, 1)]
 [(139. , '689', '196', 'value_3', 0, 1)]
 [( 6011. , '689', '196', 'value_2', 0, 2)]
 [(103. , '689', '196', 'value_3', 0, 2)]
 [( 6860. , '689', '196', 'value_2', 0, 3)]
 [(1302. , '689', '196', 'value_3', 0, 3)]
 [( 1787.9, '622', '197', 'value_2', 0, 1)]
 [( 632.5, '622', '197', 'value_3', 0, 1)]
 [( 178.8, '622', '197', 'value_2', 0, 2)]
 [( 6360.5, '622', '197', 'value_3', 0, 2)]
 [( 228. , '622', '196', 'value_2', 0, 1)]
 [(672. , '622', '196', 'value_3', 0, 2)]
 ]

所以从这个预期的输出应该是

                                   1       2       3

row1   row2    row3        row4
689    197     value_2     0       11.2    195.7   227.6
689    197     value_3     0       56.4    565     1347
689    196     value_2     0       613.5   6011    6860
689    196     value_3     0       139     103     1302
622    197     value_2     0       1787    178     
622    197     value_3     0       632     6360

以上1 2 3列是从numpy数组中的一列中得到的,即rank

根据给定的数据,row1 总是只有一个,但它对应多个 row2、row3 和 row4。对于 row1 中的每个值,都应找到与之对应的行,并按照上面输出所示的方式进行填充。

我尝试了下面的代码,但无法正确获取 (1, 2, 3) 列的值,因为这些值分散在不同的行中,我不知道如何把它们写入 numpy 数组。

# Placeholder: stands in for the numpy array shown in the question.
new_temp_arr = 'actual_data_given'
# Position of the row that is compared against `value` in the inner loop.
m = 1
row_list = ['row1', 'row2', 'row3', 'row4']
# Column list taken from the array based on rank column
column_list = [1, 2, 3]
# Receives every row that passes both the key check and the rank check below.
sample_list = []

for value in new_temp_arr:
    for new_value in new_temp_arr:
        # NOTE(review): m is never reset, so after the first outer iteration
        # has pushed it to len(new_temp_arr) every later pass breaks at once.
        if m >= len(new_temp_arr):
            break
        # Replaces the loop variable with the row at position m.
        new_value = new_temp_arr[m]
        # Checking all the values for the rows matches with one another
        condition = [value[row] == new_value[row] for row in row_list]
        if all(condition):
            # Looping through all the column list and getting the float value
            # I'm stuck here, how to store the values with properly matched data
            for per in column_list:
                # NOTE(review): compares against the one-element list [per],
                # not per itself - presumably unintended; confirm.
                if new_value['rank'] == [per]:
                    # float_value is read but not used afterwards; the whole
                    # row is appended instead.
                    float_value = new_value['float_value']
                    sample_list.append(new_value)
        m += 1

我不认为你可以用 numpy 高效地做到这一点,特别是因为你的数据中有重复项,简单的数据透视表会因此失败(看起来你保留了第一个值,但我不完全确定,请澄清这一点)。

此外,看起来你的输出是一个 DataFrame,那么为什么不直接使用 pandas 的 pivot_table 并指定 aggfunc='first' 呢?

# Source records, one tuple per observation, in the field order listed
# in `cols` below: (col, row1, row2, row3, row4, columns).
records = [
    (11.2, '689', '197', 'value_2', 0, 1),
    (56.4, '689', '197', 'value_3', 0, 1),
    (195.7, '689', '197', 'value_2', 0, 2),
    (565.2, '689', '197', 'value_3', 0, 2),
    (227.6, '689', '197', 'value_2', 0, 3),
    (1347.6, '689', '197', 'value_2', 0, 3),
    (613.5, '689', '196', 'value_2', 0, 1),
    (139.0, '689', '196', 'value_3', 0, 1),
    (6011.0, '689', '196', 'value_2', 0, 2),
    (103.0, '689', '196', 'value_3', 0, 2),
    (6860.0, '689', '196', 'value_2', 0, 3),
    (1302.0, '689', '196', 'value_3', 0, 3),
    (1787.9, '622', '197', 'value_2', 0, 1),
    (632.5, '622', '197', 'value_3', 0, 1),
    (178.8, '622', '197', 'value_2', 0, 2),
    (6360.5, '622', '197', 'value_3', 0, 2),
    (228.0, '622', '196', 'value_2', 0, 1),
    (672.0, '622', '196', 'value_3', 0, 2),
]

# Each record sits in its own one-element list, so the array has a
# singleton middle axis that is sliced away again below.
a = np.array([[record] for record in records])

cols = ['col', 'row1', 'row2', 'row3', 'row4', 'columns']
row_keys = ['row1', 'row2', 'row3', 'row4']

# Drop the middle axis to get a 2-D table and label its six fields.
frame = pd.DataFrame(a[:, 0, :], columns=cols)

# One output row per key combination, one output column per `columns`
# value; aggfunc='first' keeps the first `col` seen for a duplicated cell.
frame.pivot_table(index=row_keys, columns='columns', values='col', aggfunc='first')

输出:

columns                      1       2       3
row1 row2 row3    row4                        
622  196  value_2 0      228.0     NaN     NaN
          value_3 0        NaN   672.0     NaN
     197  value_2 0     1787.9   178.8     NaN
          value_3 0      632.5  6360.5     NaN
689  196  value_2 0      613.5  6011.0  6860.0
          value_3 0      139.0   103.0  1302.0
     197  value_2 0       11.2   195.7   227.6
          value_3 0       56.4   565.2     NaN

如果顺序很重要,你可以用 reindex 恢复到原来的顺序:

cols = ['col', 'row1', 'row2', 'row3', 'row4', 'columns']
row_keys = ['row1', 'row2', 'row3', 'row4']
df = pd.DataFrame(a[:, 0, :], columns=cols)

# Key combinations in the order they first appear in the data; later
# repeats of the same combination are masked out.
all_keys = df.set_index(row_keys).index
idx = all_keys[~all_keys.duplicated(keep='first')]

# Pivot as before, then put the rows back into first-appearance order.
pivoted = df.pivot_table(index=row_keys, columns='columns', values='col', aggfunc='first')
pivoted.reindex(idx)

输出:

columns                      1       2       3
row1 row2 row3    row4                        
689  197  value_2 0       11.2   195.7   227.6
          value_3 0       56.4   565.2     NaN
     196  value_2 0      613.5  6011.0  6860.0
          value_3 0      139.0   103.0  1302.0
622  197  value_2 0     1787.9   178.8     NaN
          value_3 0      632.5  6360.5     NaN
     196  value_2 0      228.0     NaN     NaN
          value_3 0        NaN   672.0     NaN
def get_list(arr, row1, row_column_values, row_list, column_list, index):
    """Collect the per-rank values of every record matching one key set.

    The records of ``arr`` are visited starting at position ``index`` and
    wrapping around to position 0 after the last one. For each record whose
    ``row_list`` fields all equal the values in ``row_column_values``, the
    record's ``float_value`` is stored under its ``rank``. A later match
    with the same rank overwrites an earlier one.

    Args:
        arr: Indexable sequence of records. Each record is read as
            ``record[field][0]``, so every field must itself be indexable.
        row1: Key under which the collected values are returned.
        row_column_values: Mapping of field name to the value a record
            must have in that field to count as a match.
        row_list: Field names that take part in the match.
        column_list: Ranks to pre-populate; a rank with no matching record
            keeps an empty list as its value.
        index: Position in ``arr`` at which the scan starts.

    Returns:
        ``{row1: {rank: value, ...}}`` with one entry per rank in
        ``column_list`` plus any other rank found on a matching record.
    """
    collected = {rank: [] for rank in column_list}
    dic = {row1: collected}
    # One pass per record; the loop variable is unused because the record
    # actually inspected is chosen by the wrapping `index`.
    for _ in range(len(arr)):
        if index == len(arr):
            index = 0
        value = arr[index]
        if all(value[row][0] == row_column_values[row] for row in row_list):
            dic[row1][int(value['rank'][0])] = value['float_value'][0]
            # A match at position 0 ends the scan.
            if index == 0:
                break
        index += 1
    # The original built `dic` but never returned it, so callers got None.
    return dic
        
        
# Placeholder: stands in for the numpy array from the question.
new_temp_arr = 'actual_data_given'
# Start position handed to get_list for its scan over the array.
m = 1
row_list = ['row1', 'row2', 'row3', 'row4']
# Column list taken from the array based on rank column
column_list = [1, 2, 3]
# NOTE(review): placeholder - np.zeros() is called without a shape here;
# presumably a shape and a structured dtype are meant to be filled in.
out_array = np.zeros() #Numpy array with type
dic = {}
    
for value in new_temp_arr:
    # Key fields of the current row, used to find the rows that share them.
    row_values = {row: value[row][0] for row in row_list}
    # NOTE(review): the next line indexes the result, so this assumes
    # get_list returns the dict it builds - confirm.
    dic = get_list(new_temp_arr, value['row1'][0], row_values, row_list, column_list, m)
    float_value = list(dic[value['row1'][0]].values())
    # NOTE(review): out_index is not defined anywhere in this snippet -
    # confirm where it is set and advanced.
    out_array[out_index] = tuple(list(value[row_list][0]) + float_value)

# NOTE(review): `return` at module level - presumably this snippet was
# lifted from inside a function; confirm.
return out_array
 

上面的代码得到了我在问题中提到的预期结果。