基于列和行列表重构 numpy 数组(如 numpy 数组中的 pivot table)
Restructure numpy array based on column and row list (Like pivot table in numpy array)
这是示例 numpy 数据源
col row1 row2 row3 row4 columns
[[( 11.2, '689', '197', 'value_2', 0, 1)]
[( 56.4, '689', '197', 'value_3', 0, 1)]
[( 195.7, '689', '197', 'value_2', 0, 2)]
[( 565.2, '689', '197', 'value_3', 0, 2)]
[( 227.6, '689', '197', 'value_2', 0, 3)]
[( 1347.6, '689', '197', 'value_2', 0, 3)]
[( 613.5, '689', '196', 'value_2', 0, 1)]
[(139. , '689', '196', 'value_3', 0, 1)]
[( 6011. , '689', '196', 'value_2', 0, 2)]
[(103. , '689', '196', 'value_3', 0, 2)]
[( 6860. , '689', '196', 'value_2', 0, 3)]
[(1302. , '689', '196', 'value_3', 0, 3)]
[( 1787.9, '622', '197', 'value_2', 0, 1)]
[( 632.5, '622', '197', 'value_3', 0, 1)]
[( 178.8, '622', '197', 'value_2', 0, 2)]
[( 6360.5, '622', '197', 'value_3', 0, 2)]
[( 228. , '622', '196', 'value_2', 0, 1)]
[(672. , '622', '196', 'value_3', 0, 2)]
]
所以从这个预期的输出应该是
1 2 3
row1 row2 row3 row4
689 197 value_2 0 11.2 195.7 227.6
689 197 value_3 0 56.4 565 1347
689 196 value_2 0 613.5 6011 6860
689 196 value_3 0 139 103 1302
622 197 value_2 0 1787 178
622 197 value_3 0 632 6360
以上1 2 3列是从numpy数组中的一列中得到的,即rank
根据给定的数据,row1 始终只有一个值,但它对应多个 row2、row3 和 row4 的组合。
对于 row1 中的每个值,都应找到与之匹配的行,并按照上面输出所示的格式进行填充。
我尝试了下面的代码,但无法正确获取 (1, 2, 3) 列的值,因为它在不同的地方,我无法在 numpy 数组中写入。
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the loops and conditionals below has to be inferred from the statements.
# Placeholder standing in for the numpy array shown above.
# NOTE(review): the code reads fields 'rank' and 'float_value', while the
# table header above labels them 'columns' and 'col' - confirm the real names.
new_temp_arr = 'actual_data_given'
# Position of the record compared against `value`; it is never reset.
m = 1
# Fields that must all be equal for two records to share one output row.
row_list = ['row1', 'row2', 'row3', 'row4']
# Column list taken from the array based on rank column
column_list = [1, 2, 3]
# Collects the records that matched on every field in row_list.
sample_list = []
for value in new_temp_arr:
for new_value in new_temp_arr:
# Stop once m has run past the last record.
if m >= len(new_temp_arr):
break
# Replaces the loop variable with the record at position m.
new_value = new_temp_arr[m]
# Checking all the values for the rows matches with one another
condition = [value[row] == new_value[row] for row in row_list]
if all(condition):
# Looping through all the column list and getting the float value
# I'm stuck here, how to store the values with properly matched data
for per in column_list:
if new_value['rank'] == [per]:
# float_value is overwritten on each match and not stored anywhere else.
float_value = new_value['float_value']
sample_list.append(new_value)
m += 1
我认为仅用 numpy 无法高效地做到这一点,尤其是因为你的数据中存在重复项,简单的数据透视(pivot)会因此失败(看起来你保留的是第一个值,不过我不完全确定,请澄清这一点)。
此外,你的输出看起来就是一个 DataFrame,那么为什么不直接使用 pandas 的 `pivot_table` 并指定 `aggfunc='first'` 呢?
a = np.array([[( 11.2, '689', '197', 'value_2', 0, 1)],
[( 56.4, '689', '197', 'value_3', 0, 1)],
[( 195.7, '689', '197', 'value_2', 0, 2)],
[( 565.2, '689', '197', 'value_3', 0, 2)],
[( 227.6, '689', '197', 'value_2', 0, 3)],
[( 1347.6, '689', '197', 'value_2', 0, 3)],
[( 613.5, '689', '196', 'value_2', 0, 1)],
[(139. , '689', '196', 'value_3', 0, 1)],
[( 6011. , '689', '196', 'value_2', 0, 2)],
[(103. , '689', '196', 'value_3', 0, 2)],
[( 6860. , '689', '196', 'value_2', 0, 3)],
[(1302. , '689', '196', 'value_3', 0, 3)],
[( 1787.9, '622', '197', 'value_2', 0, 1)],
[( 632.5, '622', '197', 'value_3', 0, 1)],
[( 178.8, '622', '197', 'value_2', 0, 2)],
[( 6360.5, '622', '197', 'value_3', 0, 2)],
[( 228. , '622', '196', 'value_2', 0, 1)],
[(672. , '622', '196', 'value_3', 0, 2)],
])
# Assumes `import numpy as np` and `import pandas as pd` were run beforehand.
# Names for the six values of each record, in the order they appear in `a`.
cols = ['col', 'row1', 'row2', 'row3', 'row4', 'columns']
# Every record in `a` is wrapped in its own one-element list, which adds a
# middle axis of length 1; `a[:,0,:]` drops it to get one row per record.
# NOTE(review): `a` is built without a dtype, so numpy chooses a common type
# for the mixed values - confirm the resulting column types.
# aggfunc='first' keeps the first value when several records share the same
# row key and column, e.g. the two rank-3 records for
# ('689', '197', 'value_2', 0).
(pd.DataFrame(a[:,0,:], columns=cols)
.pivot_table(index=['row1', 'row2', 'row3', 'row4'], columns='columns', values='col', aggfunc='first')
)
输出:
columns 1 2 3
row1 row2 row3 row4
622 196 value_2 0 228.0 NaN NaN
value_3 0 NaN 672.0 NaN
197 value_2 0 1787.9 178.8 NaN
value_3 0 632.5 6360.5 NaN
689 196 value_2 0 613.5 6011.0 6860.0
value_3 0 139.0 103.0 1302.0
197 value_2 0 11.2 195.7 227.6
value_3 0 56.4 565.2 NaN
如果顺序很重要,你可以用 `reindex` 恢复到原来的顺序:
# Same pivot as before, but with the rows put back in the order in which
# each row key first appears in the data.
cols = ['col', 'row1', 'row2', 'row3', 'row4', 'columns']
df = pd.DataFrame(a[:,0,:], columns=cols)
# Index of row keys, one entry per record, in the original record order.
idx = df.set_index(['row1', 'row2', 'row3', 'row4']).index
# Keep only the first occurrence of each key so every key appears once.
idx = idx[~idx.duplicated(keep='first')]
# pivot_table returns its rows sorted by key; reindex(idx) reorders them.
(df.pivot_table(index=['row1', 'row2', 'row3', 'row4'], columns='columns', values='col', aggfunc='first')
.reindex(idx)
)
输出:
columns 1 2 3
row1 row2 row3 row4
689 197 value_2 0 11.2 195.7 227.6
value_3 0 56.4 565.2 NaN
196 value_2 0 613.5 6011.0 6860.0
value_3 0 139.0 103.0 1302.0
622 197 value_2 0 1787.9 178.8 NaN
value_3 0 632.5 6360.5 NaN
196 value_2 0 228.0 NaN NaN
value_3 0 NaN 672.0 NaN
def get_list(arr, row1, row_column_values, row_list, column_list, index):
    """Collect the float value of every record that matches one row key.

    The records of ``arr`` are visited starting at position ``index``,
    continuing to the end, then wrapping around to position 0, where the
    scan stops. Records before ``index`` (other than position 0) are
    therefore not visited.

    Args:
        arr: Sequence of records; each record is indexed as
            ``record[field][0]`` for the fields in ``row_list`` as well as
            ``'rank'`` and ``'float_value'``.
        row1: Key under which the collected values are stored in the result.
        row_column_values: Mapping of field name to the value a record must
            have in that field to count as a match.
        row_list: Field names that must all match.
        column_list: Rank values to pre-populate in the result; each one
            starts out as an empty list meaning "no value found".
        index: Position in ``arr`` at which the scan starts.

    Returns:
        ``{row1: {rank: value, ...}}``. When several matching records share
        a rank, the one visited last wins. A matching record whose rank is
        not in ``column_list`` adds a new key.
    """
    dic = {row1: {i: [] for i in column_list}}
    # The loop variable only bounds the number of iterations; the record that
    # is actually inspected is selected through ``index``.
    for _ in arr:
        if index == len(arr):
            # Ran off the end: wrap around to the first record.
            index = 0
        value = arr[index]
        if all(value[row][0] == row_column_values[row] for row in row_list):
            dic[row1][int(value['rank'][0])] = value['float_value'][0]
        if index == 0:
            # Position 0 is always the last record visited.
            break
        index += 1
    # The original snippet omitted this return although its caller indexes
    # the result.
    return dic
# NOTE(review): the indentation of this snippet was lost, and the trailing
# `return` implies it is the body of a function whose header is not shown.
# Placeholder standing in for the numpy array from the question.
new_temp_arr = 'actual_data_given'
# Start position handed to get_list for its scan over the array.
m = 1
# Fields that together identify one output row.
row_list = ['row1', 'row2', 'row3', 'row4']
# Column list taken from the array based on rank column
column_list = [1, 2, 3]
# NOTE(review): np.zeros() is written without a shape or dtype here; the
# trailing comment suggests the real call passes both - confirm.
out_array = np.zeros() #Numpy array with type
dic = {}
for value in new_temp_arr:
# Values of the row-key fields for the current record.
row_values = {row: value[row][0] for row in row_list}
# Relies on get_list returning the dictionary it builds.
dic = get_list(new_temp_arr, value['row1'][0], row_values, row_list, column_list, m)
# Collected values in the dictionary's key order.
float_value = list(dic[value['row1'][0]].values())
# NOTE(review): out_index is not defined anywhere in this snippet.
out_array[out_index] = tuple(list(value[row_list][0]) + float_value)
return out_array
上面的代码得到了我在问题中提到的预期结果。
这是示例 numpy 数据源
col row1 row2 row3 row4 columns
[[( 11.2, '689', '197', 'value_2', 0, 1)]
[( 56.4, '689', '197', 'value_3', 0, 1)]
[( 195.7, '689', '197', 'value_2', 0, 2)]
[( 565.2, '689', '197', 'value_3', 0, 2)]
[( 227.6, '689', '197', 'value_2', 0, 3)]
[( 1347.6, '689', '197', 'value_2', 0, 3)]
[( 613.5, '689', '196', 'value_2', 0, 1)]
[(139. , '689', '196', 'value_3', 0, 1)]
[( 6011. , '689', '196', 'value_2', 0, 2)]
[(103. , '689', '196', 'value_3', 0, 2)]
[( 6860. , '689', '196', 'value_2', 0, 3)]
[(1302. , '689', '196', 'value_3', 0, 3)]
[( 1787.9, '622', '197', 'value_2', 0, 1)]
[( 632.5, '622', '197', 'value_3', 0, 1)]
[( 178.8, '622', '197', 'value_2', 0, 2)]
[( 6360.5, '622', '197', 'value_3', 0, 2)]
[( 228. , '622', '196', 'value_2', 0, 1)]
[(672. , '622', '196', 'value_3', 0, 2)]
]
所以从这个预期的输出应该是
1 2 3
row1 row2 row3 row4
689 197 value_2 0 11.2 195.7 227.6
689 197 value_3 0 56.4 565 1347
689 196 value_2 0 613.5 6011 6860
689 196 value_3 0 139 103 1302
622 197 value_2 0 1787 178
622 197 value_3 0 632 6360
以上1 2 3列是从numpy数组中的一列中得到的,即rank
根据给定的数据,row1 始终只有一个值,但它对应多个 row2、row3 和 row4 的组合。 对于 row1 中的每个值,都应找到与之匹配的行,并按照上面输出所示的格式进行填充。
我尝试了下面的代码,但无法正确获取 (1, 2, 3) 列的值,因为它在不同的地方,我无法在 numpy 数组中写入。
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the loops and conditionals below has to be inferred from the statements.
# Placeholder standing in for the numpy array shown above.
# NOTE(review): the code reads fields 'rank' and 'float_value', while the
# table header above labels them 'columns' and 'col' - confirm the real names.
new_temp_arr = 'actual_data_given'
# Position of the record compared against `value`; it is never reset.
m = 1
# Fields that must all be equal for two records to share one output row.
row_list = ['row1', 'row2', 'row3', 'row4']
# Column list taken from the array based on rank column
column_list = [1, 2, 3]
# Collects the records that matched on every field in row_list.
sample_list = []
for value in new_temp_arr:
for new_value in new_temp_arr:
# Stop once m has run past the last record.
if m >= len(new_temp_arr):
break
# Replaces the loop variable with the record at position m.
new_value = new_temp_arr[m]
# Checking all the values for the rows matches with one another
condition = [value[row] == new_value[row] for row in row_list]
if all(condition):
# Looping through all the column list and getting the float value
# I'm stuck here, how to store the values with properly matched data
for per in column_list:
if new_value['rank'] == [per]:
# float_value is overwritten on each match and not stored anywhere else.
float_value = new_value['float_value']
sample_list.append(new_value)
m += 1
我认为仅用 numpy 无法高效地做到这一点,尤其是因为你的数据中存在重复项,简单的数据透视(pivot)会因此失败(看起来你保留的是第一个值,不过我不完全确定,请澄清这一点)。
此外,你的输出看起来就是一个 DataFrame,那么为什么不直接使用 pandas 的 `pivot_table` 并指定 `aggfunc='first'` 呢?
a = np.array([[( 11.2, '689', '197', 'value_2', 0, 1)],
[( 56.4, '689', '197', 'value_3', 0, 1)],
[( 195.7, '689', '197', 'value_2', 0, 2)],
[( 565.2, '689', '197', 'value_3', 0, 2)],
[( 227.6, '689', '197', 'value_2', 0, 3)],
[( 1347.6, '689', '197', 'value_2', 0, 3)],
[( 613.5, '689', '196', 'value_2', 0, 1)],
[(139. , '689', '196', 'value_3', 0, 1)],
[( 6011. , '689', '196', 'value_2', 0, 2)],
[(103. , '689', '196', 'value_3', 0, 2)],
[( 6860. , '689', '196', 'value_2', 0, 3)],
[(1302. , '689', '196', 'value_3', 0, 3)],
[( 1787.9, '622', '197', 'value_2', 0, 1)],
[( 632.5, '622', '197', 'value_3', 0, 1)],
[( 178.8, '622', '197', 'value_2', 0, 2)],
[( 6360.5, '622', '197', 'value_3', 0, 2)],
[( 228. , '622', '196', 'value_2', 0, 1)],
[(672. , '622', '196', 'value_3', 0, 2)],
])
# Assumes `import numpy as np` and `import pandas as pd` were run beforehand.
# Names for the six values of each record, in the order they appear in `a`.
cols = ['col', 'row1', 'row2', 'row3', 'row4', 'columns']
# Every record in `a` is wrapped in its own one-element list, which adds a
# middle axis of length 1; `a[:,0,:]` drops it to get one row per record.
# NOTE(review): `a` is built without a dtype, so numpy chooses a common type
# for the mixed values - confirm the resulting column types.
# aggfunc='first' keeps the first value when several records share the same
# row key and column, e.g. the two rank-3 records for
# ('689', '197', 'value_2', 0).
(pd.DataFrame(a[:,0,:], columns=cols)
.pivot_table(index=['row1', 'row2', 'row3', 'row4'], columns='columns', values='col', aggfunc='first')
)
输出:
columns 1 2 3
row1 row2 row3 row4
622 196 value_2 0 228.0 NaN NaN
value_3 0 NaN 672.0 NaN
197 value_2 0 1787.9 178.8 NaN
value_3 0 632.5 6360.5 NaN
689 196 value_2 0 613.5 6011.0 6860.0
value_3 0 139.0 103.0 1302.0
197 value_2 0 11.2 195.7 227.6
value_3 0 56.4 565.2 NaN
如果顺序很重要,你可以用 `reindex` 恢复到原来的顺序:
# Same pivot as before, but with the rows put back in the order in which
# each row key first appears in the data.
cols = ['col', 'row1', 'row2', 'row3', 'row4', 'columns']
df = pd.DataFrame(a[:,0,:], columns=cols)
# Index of row keys, one entry per record, in the original record order.
idx = df.set_index(['row1', 'row2', 'row3', 'row4']).index
# Keep only the first occurrence of each key so every key appears once.
idx = idx[~idx.duplicated(keep='first')]
# pivot_table returns its rows sorted by key; reindex(idx) reorders them.
(df.pivot_table(index=['row1', 'row2', 'row3', 'row4'], columns='columns', values='col', aggfunc='first')
.reindex(idx)
)
输出:
columns 1 2 3
row1 row2 row3 row4
689 197 value_2 0 11.2 195.7 227.6
value_3 0 56.4 565.2 NaN
196 value_2 0 613.5 6011.0 6860.0
value_3 0 139.0 103.0 1302.0
622 197 value_2 0 1787.9 178.8 NaN
value_3 0 632.5 6360.5 NaN
196 value_2 0 228.0 NaN NaN
value_3 0 NaN 672.0 NaN
def get_list(arr, row1, row_column_values, row_list, column_list, index):
    """Collect the float value of every record that matches one row key.

    The records of ``arr`` are visited starting at position ``index``,
    continuing to the end, then wrapping around to position 0, where the
    scan stops. Records before ``index`` (other than position 0) are
    therefore not visited.

    Args:
        arr: Sequence of records; each record is indexed as
            ``record[field][0]`` for the fields in ``row_list`` as well as
            ``'rank'`` and ``'float_value'``.
        row1: Key under which the collected values are stored in the result.
        row_column_values: Mapping of field name to the value a record must
            have in that field to count as a match.
        row_list: Field names that must all match.
        column_list: Rank values to pre-populate in the result; each one
            starts out as an empty list meaning "no value found".
        index: Position in ``arr`` at which the scan starts.

    Returns:
        ``{row1: {rank: value, ...}}``. When several matching records share
        a rank, the one visited last wins. A matching record whose rank is
        not in ``column_list`` adds a new key.
    """
    dic = {row1: {i: [] for i in column_list}}
    # The loop variable only bounds the number of iterations; the record that
    # is actually inspected is selected through ``index``.
    for _ in arr:
        if index == len(arr):
            # Ran off the end: wrap around to the first record.
            index = 0
        value = arr[index]
        if all(value[row][0] == row_column_values[row] for row in row_list):
            dic[row1][int(value['rank'][0])] = value['float_value'][0]
        if index == 0:
            # Position 0 is always the last record visited.
            break
        index += 1
    # The original snippet omitted this return although its caller indexes
    # the result.
    return dic
# NOTE(review): the indentation of this snippet was lost, and the trailing
# `return` implies it is the body of a function whose header is not shown.
# Placeholder standing in for the numpy array from the question.
new_temp_arr = 'actual_data_given'
# Start position handed to get_list for its scan over the array.
m = 1
# Fields that together identify one output row.
row_list = ['row1', 'row2', 'row3', 'row4']
# Column list taken from the array based on rank column
column_list = [1, 2, 3]
# NOTE(review): np.zeros() is written without a shape or dtype here; the
# trailing comment suggests the real call passes both - confirm.
out_array = np.zeros() #Numpy array with type
dic = {}
for value in new_temp_arr:
# Values of the row-key fields for the current record.
row_values = {row: value[row][0] for row in row_list}
# Relies on get_list returning the dictionary it builds.
dic = get_list(new_temp_arr, value['row1'][0], row_values, row_list, column_list, m)
# Collected values in the dictionary's key order.
float_value = list(dic[value['row1'][0]].values())
# NOTE(review): out_index is not defined anywhere in this snippet.
out_array[out_index] = tuple(list(value[row_list][0]) + float_value)
return out_array
上面的代码得到了我在问题中提到的预期结果。