对于给定的 bin,如何确定一个数组中的任何值是否低于另一个数组中的任何值?
How to determine if any value in one array, is lower than any value in another array, for a given bin?
我正在尝试比较不同的曲线,以判断一条曲线是否始终高于另一条;如果不是,则找出这种变化发生在哪个 x 处。
如果各条曲线具有相同的 x 值和相同的长度,那将非常容易,只需逐点比较各曲线的 y 值即可。
但是我的各条曲线具有不同的 x 值,向量的长度也不相同,不过所有曲线的 x 间隔都是相同的。
作为一个非常简单的示例,我使用以下数据:
# curve 1: 9 samples, integer x positions 5..13
x1 = np.array([5,6,7,8,9,10,11,12,13])
y1 = np.array([100,101,110,130,132,170,190,192,210])
# curve 2: 10 samples, integer x positions 3..12
x2 = np.array([3,4,5,6,7,8,9,10,11,12])
y2 = np.array([90,210,211,250,260,261,265,180,200,210])
# curve 3: 8 samples, same spacing of 1 but offset by 0.3 (7.3..14.3)
x3 = np.array([7.3,8.3,9.3,10.3,11.3,12.3,13.3,14.3])
y3 = np.array([300,250,270,350,380,400,390,380])
它们应该是 2 条回归线。在这个简单的例子中,预期结果是:曲线 2 在整个 x 范围内的值都高于曲线 1。
我的想法是在 2.5–12.5 的范围内对 x 进行分箱(箱宽为 1),然后比较每个箱中对应的 y 值。
我的实际数据很大,这个比较需要做很多次,所以我需要找到一个不需要太多时间的解决方案。
绘图
- 按原始 x 轴绘制的数据图
# Plot every curve against its own, unaligned x values.
plt.figure(figsize=(6, 6))
for xs, ys, name in ((x1, y1, 'y1'), (x2, y2, 'y2'), (x3, y3, 'y3')):
    plt.plot(xs, ys, marker='o', label=name)
plt.xticks(range(15))
plt.grid()
# One legend call is enough: the original drew a default legend and then
# immediately replaced it with this one placed outside the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
函数
- `get_new_x` 使用 `np.digitize` 将 x 轴的值重新分箱(re-bin)。
- `get_comparison` 为每一对相邻的列添加一列布尔值比较结果。
  - 目前,每个新列都会添加到主数据框 `df` 中,但也可以改为写入单独的 `comparison` 数据框。
  - `combs` 是列组合的列表,例如:
[Index(['y1', 'y2'], dtype='object'), Index(['y2', 'y3'], dtype='object')]
# function to create the bins
def get_bins(x_arrays: List[np.ndarray]) -> np.ndarray:
    """Return evenly spaced bin values spanning all of the given x arrays.

    The bin length is taken from the spacing of the first two points of the
    first array; every curve is expected to share that spacing.

    Args:
        x_arrays: One x array per curve. The first one needs at least two
            points so the spacing can be measured.

    Returns:
        ``np.arange(min_x, max_x + bin_len, bin_len)`` where ``min_x`` and
        ``max_x`` are the extremes over all arrays.

    Raises:
        ValueError: If the first array has fewer than two points.
    """
    first = np.asarray(x_arrays[0])
    if first.size < 2:
        raise ValueError(
            "the first x array needs at least two points to infer the bin length"
        )
    # Plain scalar difference: np.diff would hand np.arange a 1-element array
    # for both its stop and step arguments.
    bin_len = first[1] - first[0]
    all_x = np.concatenate(x_arrays)  # join arrays
    min_x = all_x.min()
    max_x = all_x.max()
    # np.arange excludes the stop value, so extend it by one bin to make sure
    # the largest x is still covered.
    return np.arange(min_x, max_x + bin_len, bin_len)
# function using np.digitize to bin the old x-axis into new bins
def get_new_x(x_arrays: List[np.array]) -> List[np.array]:
    """Map every x array onto the shared bin values from ``get_bins``.

    Each x is rounded with ``np.round`` and then looked up with
    ``np.digitize(..., right=True)``; the returned arrays hold the bin value
    selected for each original point, in the original order.
    """
    edges = get_bins(x_arrays)
    return [
        edges[np.digitize(np.round(xs), edges, right=True)]
        for xs in x_arrays
    ]
# function to create dataframe for arrays with new x-axis as index
def get_df(x_arrays: List[np.array], y_arrays: List[np.array]) -> pd.DataFrame:
    """Build one dataframe with a ``y1``, ``y2``, ... column per curve.

    Every column is indexed by the re-binned x values of its curve, so
    concatenating along axis 1 lines the curves up on the shared bins.
    """
    x_new = get_new_x(x_arrays)
    frames = []
    for pos, ys in enumerate(y_arrays):
        frame = pd.DataFrame(ys, columns=[f'y{pos + 1}'], index=x_new[pos])
        frames.append(frame)
    return pd.concat(frames, axis=1)
# compare each successive column of the dataframe
# if the left column is greater than the right column, then True
def get_comparison(df: pd.DataFrame) -> None:
    """Add a boolean ``'<left> > <right>'`` column for each adjacent column pair.

    The dataframe is modified in place and nothing is returned. Rows where
    either side is NaN compare as False.

    Args:
        df: Dataframe whose columns are compared pairwise, left to right.
    """
    # Snapshot the column index before adding anything to the dataframe.
    cols = df.columns
    # Adjacent pairs only: (0, 1), (1, 2), ... so the last start is len - 2.
    combs = [cols[i:i + 2] for i in range(len(cols) - 1)]
    for comb in combs:
        left, right = comb[0], comb[1]
        df[f'{left} > {right}'] = df[left] > df[right]
调用函数:
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# put the arrays into lists, one entry per curve
y = [y1, y2, y3]
x = [x1, x2, x3]
# build the dataframe of y values aligned on the re-binned x axis
df = get_df(x, y)
# add the boolean comparison columns to df in place
get_comparison(df)
# The comparison columns follow the len(y) data columns, so slice by the
# number of curves instead of a hard-coded 3.
for col in df.columns[len(y):]:
    # Boolean indexing keeps only the index values where the comparison is True.
    vals = df.index[df[col]].tolist()
    if vals:
        print(f'{col}: {vals}')
[out]:
y2 > y3: [8.0]
display(df)
y1 y2 y3 y1 > y2 y2 > y3
3.0 NaN 90.0 NaN False False
4.0 NaN 210.0 NaN False False
5.0 100.0 211.0 NaN False False
6.0 101.0 250.0 NaN False False
7.0 110.0 260.0 300.0 False False
8.0 130.0 261.0 250.0 False True
9.0 132.0 265.0 270.0 False False
10.0 170.0 180.0 350.0 False False
11.0 190.0 200.0 380.0 False False
12.0 192.0 210.0 400.0 False False
13.0 210.0 NaN 390.0 False False
14.0 NaN NaN 380.0 False False
绘图
fig, ax = plt.subplots(figsize=(8, 6))
# Highlight the points where a comparison column is True. Comparison column
# number i (counting from 1) marks values of data column i, i.e. the
# right-hand curve of that pair.
for i, col in enumerate(df.columns[3:], 1):
    flagged = df[col]
    vals = df.iloc[:, i][flagged]
    if vals.empty:
        continue
    ax.scatter(vals.index, vals.values, color='red', s=110, label='bad')
# Draw the three curves on the same axes.
df.iloc[:, :3].plot(marker='o', ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.set_xticks(range(16))
ax.set_title('y-values plotted against rebinned x-values')
ax.grid()
plt.show()
这是我第一次问这个问题时想到的答案,但当时没能实现。我的想法是:先根据 x 对 y1 和 y2 进行分箱,然后在每个分箱中比较二者。作为示例,这里有 3 条我想要比较的曲线。这些曲线唯一的共同点是 delta x(即 bin 长度),这里为 1。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# curve 1
x1 = np.array([5,6,7,8,9,10,11,12,13])
y1 = np.array([100,101,110,130,132,170,190,192,210])
# curve 2
x2 = np.array([3,4,5,6,7,8,9,10,11,12])
y2 = np.array([90,210,211,250,260,261,265,180,200,210])
# curve 3
x3 = np.array([7.3,8.3,9.3,10.3,11.3,12.3,13.3,14.3])
y3 = np.array([300,250,270,350,380,400,390,380])

bin_length = 1
# All curves share the same x spacing. The edges start half a bin below the
# smallest first x and run past half a bin above the largest last x.
x_min = min(x1[0], x2[0], x3[0]) - bin_length / 2
x_max = max(x1[-1], x2[-1], x3[-1]) + bin_length / 2
# np.arange excludes the stop value, hence the extra bin_length.
bins = np.arange(x_min, x_max + bin_length, bin_length)
# bin mid points to use as index: the average of each pair of adjacent edges
bin_mid = [(low + high) / 2 for low, high in zip(bins[:-1], bins[1:])]
# This function bins y based on binning x
def bin_fun(x, y, bins, bin_length):
    """Group the y values by the bin that their x value falls into.

    Bin ``i`` covers the half-open interval ``bins[i] <= x < bins[i + 1]``.
    Points outside every bin are dropped.

    Args:
        x: x coordinates of the points.
        y: y values, paired with ``x`` position by position.
        bins: Bin edges, assumed to be sorted in ascending order (as produced
            by ``np.arange``).
        bin_length: Unused; kept so existing calls keep working.

    Returns:
        A DataFrame with one row per bin (``len(bins) - 1`` rows). Each row
        holds the y values of that bin in their original order; shorter rows
        are padded with NaN.
    """
    edges = np.asarray(bins)
    n_bins = max(len(edges) - 1, 0)
    members = [[] for _ in range(n_bins)]
    # side="right" yields k with edges[k - 1] <= x < edges[k], so k - 1 is the
    # bin number. One binary search per point replaces a scan over every bin.
    slots = np.searchsorted(edges, np.asarray(x), side="right") - 1
    for slot, value in zip(slots, y):
        # slot is -1 below the first edge and n_bins at or above the last one.
        if 0 <= slot < n_bins:
            members[slot].append(value)
    return pd.DataFrame(members)
# Bin every curve on the shared edges; each result has one row per bin.
Y1 = bin_fun(x1, y1, bins, bin_length)
Y1.columns = [1]
Y2 = bin_fun(x2, y2, bins, bin_length)
Y2.columns = [2]
Y3 = bin_fun(x3, y3, bins, bin_length)
Y3.columns = [3]
# DataFrame.append was removed in pandas 2.0. The three frames share the same
# row positions, so a column-wise concat builds the same table.
binned_y = pd.concat([Y1, Y2, Y3], axis=1)
binned_y.index = bin_mid
print(binned_y)
# comparing curve 2 and curve 1: print the bins where curve 2 is below curve 1
# (bins with a NaN on either side never match)
for i in binned_y.index[binned_y[2] < binned_y[1]]:
    print(i)
# comparing curve 3 and curve 2: print the bins where curve 3 is below curve 2
for i in binned_y.index[binned_y[3] < binned_y[2]]:
    print(i)
这将输出 8.0,即 y3 低于 y2 的那个分箱的索引。
binned_y
1 2 3
3.0 NaN 90.0 NaN
4.0 NaN 210.0 NaN
5.0 100.0 211.0 NaN
6.0 101.0 250.0 NaN
7.0 110.0 260.0 300.0
8.0 130.0 261.0 250.0
9.0 132.0 265.0 270.0
10.0 170.0 180.0 350.0
11.0 190.0 200.0 380.0
12.0 192.0 210.0 400.0
13.0 210.0 NaN 390.0
14.0 NaN NaN 380.0
15.0 NaN NaN NaN
绘图
# Plot the binned curves and keep the axes that pandas returns, so the
# legend, ticks and grid are set on that axes object.
axes = binned_y.plot(marker='o', figsize=(6, 6))
axes.legend(labels=['y1', 'y2', 'y3'], bbox_to_anchor=(1.05, 1), loc='upper left')
axes.set_xticks(range(16))
axes.grid()
我正在尝试比较不同的曲线,以判断一条曲线是否始终高于另一条;如果不是,则找出这种变化发生在哪个 x 处。
如果各条曲线具有相同的 x 值和相同的长度,那将非常容易,只需逐点比较各曲线的 y 值即可。
但是我的各条曲线具有不同的 x 值,向量的长度也不相同,不过所有曲线的 x 间隔都是相同的。
作为一个非常简单的示例,我使用以下数据:
# curve 1: 9 samples, integer x positions 5..13
x1 = np.array([5,6,7,8,9,10,11,12,13])
y1 = np.array([100,101,110,130,132,170,190,192,210])
# curve 2: 10 samples, integer x positions 3..12
x2 = np.array([3,4,5,6,7,8,9,10,11,12])
y2 = np.array([90,210,211,250,260,261,265,180,200,210])
# curve 3: 8 samples, same spacing of 1 but offset by 0.3 (7.3..14.3)
x3 = np.array([7.3,8.3,9.3,10.3,11.3,12.3,13.3,14.3])
y3 = np.array([300,250,270,350,380,400,390,380])
它们应该是 2 条回归线。在这个简单的例子中,预期结果是:曲线 2 在整个 x 范围内的值都高于曲线 1。
我的想法是在 2.5–12.5 的范围内对 x 进行分箱(箱宽为 1),然后比较每个箱中对应的 y 值。
我的实际数据很大,这个比较需要做很多次,所以我需要找到一个不需要太多时间的解决方案。
绘图
- 按原始 x 轴绘制的数据图
# Plot every curve against its own, unaligned x values.
plt.figure(figsize=(6, 6))
for xs, ys, name in ((x1, y1, 'y1'), (x2, y2, 'y2'), (x3, y3, 'y3')):
    plt.plot(xs, ys, marker='o', label=name)
plt.xticks(range(15))
plt.grid()
# One legend call is enough: the original drew a default legend and then
# immediately replaced it with this one placed outside the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
函数
- `get_new_x` 使用 `np.digitize` 将 x 轴的值重新分箱(re-bin)。
- `get_comparison` 为每一对相邻的列添加一列布尔值比较结果。
  - 目前,每个新列都会添加到主数据框 `df` 中,但也可以改为写入单独的 `comparison` 数据框。
  - `combs` 是列组合的列表,例如:
[Index(['y1', 'y2'], dtype='object'), Index(['y2', 'y3'], dtype='object')]
# function to create the bins
def get_bins(x_arrays: List[np.ndarray]) -> np.ndarray:
    """Return evenly spaced bin values spanning all of the given x arrays.

    The bin length is taken from the spacing of the first two points of the
    first array; every curve is expected to share that spacing.

    Args:
        x_arrays: One x array per curve. The first one needs at least two
            points so the spacing can be measured.

    Returns:
        ``np.arange(min_x, max_x + bin_len, bin_len)`` where ``min_x`` and
        ``max_x`` are the extremes over all arrays.

    Raises:
        ValueError: If the first array has fewer than two points.
    """
    first = np.asarray(x_arrays[0])
    if first.size < 2:
        raise ValueError(
            "the first x array needs at least two points to infer the bin length"
        )
    # Plain scalar difference: np.diff would hand np.arange a 1-element array
    # for both its stop and step arguments.
    bin_len = first[1] - first[0]
    all_x = np.concatenate(x_arrays)  # join arrays
    min_x = all_x.min()
    max_x = all_x.max()
    # np.arange excludes the stop value, so extend it by one bin to make sure
    # the largest x is still covered.
    return np.arange(min_x, max_x + bin_len, bin_len)
# function using np.digitize to bin the old x-axis into new bins
def get_new_x(x_arrays: List[np.array]) -> List[np.array]:
    """Map every x array onto the shared bin values from ``get_bins``.

    Each x is rounded with ``np.round`` and then looked up with
    ``np.digitize(..., right=True)``; the returned arrays hold the bin value
    selected for each original point, in the original order.
    """
    edges = get_bins(x_arrays)
    return [
        edges[np.digitize(np.round(xs), edges, right=True)]
        for xs in x_arrays
    ]
# function to create dataframe for arrays with new x-axis as index
def get_df(x_arrays: List[np.array], y_arrays: List[np.array]) -> pd.DataFrame:
    """Build one dataframe with a ``y1``, ``y2``, ... column per curve.

    Every column is indexed by the re-binned x values of its curve, so
    concatenating along axis 1 lines the curves up on the shared bins.
    """
    x_new = get_new_x(x_arrays)
    frames = []
    for pos, ys in enumerate(y_arrays):
        frame = pd.DataFrame(ys, columns=[f'y{pos + 1}'], index=x_new[pos])
        frames.append(frame)
    return pd.concat(frames, axis=1)
# compare each successive column of the dataframe
# if the left column is greater than the right column, then True
def get_comparison(df: pd.DataFrame) -> None:
    """Add a boolean ``'<left> > <right>'`` column for each adjacent column pair.

    The dataframe is modified in place and nothing is returned. Rows where
    either side is NaN compare as False.

    Args:
        df: Dataframe whose columns are compared pairwise, left to right.
    """
    # Snapshot the column index before adding anything to the dataframe.
    cols = df.columns
    # Adjacent pairs only: (0, 1), (1, 2), ... so the last start is len - 2.
    combs = [cols[i:i + 2] for i in range(len(cols) - 1)]
    for comb in combs:
        left, right = comb[0], comb[1]
        df[f'{left} > {right}'] = df[left] > df[right]
调用函数:
import numpy as np
import pandas as pd
# put the arrays into lists, one entry per curve
y = [y1, y2, y3]
x = [x1, x2, x3]
# build the dataframe of y values aligned on the re-binned x axis
df = get_df(x, y)
# add the boolean comparison columns to df in place
get_comparison(df)
# The comparison columns follow the len(y) data columns, so slice by the
# number of curves instead of a hard-coded 3.
for col in df.columns[len(y):]:
    # Boolean indexing keeps only the index values where the comparison is True.
    vals = df.index[df[col]].tolist()
    if vals:
        print(f'{col}: {vals}')
[out]:
y2 > y3: [8.0]
display(df)
y1 y2 y3 y1 > y2 y2 > y3
3.0 NaN 90.0 NaN False False
4.0 NaN 210.0 NaN False False
5.0 100.0 211.0 NaN False False
6.0 101.0 250.0 NaN False False
7.0 110.0 260.0 300.0 False False
8.0 130.0 261.0 250.0 False True
9.0 132.0 265.0 270.0 False False
10.0 170.0 180.0 350.0 False False
11.0 190.0 200.0 380.0 False False
12.0 192.0 210.0 400.0 False False
13.0 210.0 NaN 390.0 False False
14.0 NaN NaN 380.0 False False
绘图
fig, ax = plt.subplots(figsize=(8, 6))
# Highlight the points where a comparison column is True. Comparison column
# number i (counting from 1) marks values of data column i, i.e. the
# right-hand curve of that pair.
for i, col in enumerate(df.columns[3:], 1):
    flagged = df[col]
    vals = df.iloc[:, i][flagged]
    if vals.empty:
        continue
    ax.scatter(vals.index, vals.values, color='red', s=110, label='bad')
# Draw the three curves on the same axes.
df.iloc[:, :3].plot(marker='o', ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.set_xticks(range(16))
ax.set_title('y-values plotted against rebinned x-values')
ax.grid()
plt.show()
这是我第一次问这个问题时想到的答案,但当时没能实现。我的想法是:先根据 x 对 y1 和 y2 进行分箱,然后在每个分箱中比较二者。作为示例,这里有 3 条我想要比较的曲线。这些曲线唯一的共同点是 delta x(即 bin 长度),这里为 1。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# curve 1
x1 = np.array([5,6,7,8,9,10,11,12,13])
y1 = np.array([100,101,110,130,132,170,190,192,210])
# curve 2
x2 = np.array([3,4,5,6,7,8,9,10,11,12])
y2 = np.array([90,210,211,250,260,261,265,180,200,210])
# curve 3
x3 = np.array([7.3,8.3,9.3,10.3,11.3,12.3,13.3,14.3])
y3 = np.array([300,250,270,350,380,400,390,380])

bin_length = 1
# All curves share the same x spacing. The edges start half a bin below the
# smallest first x and run past half a bin above the largest last x.
x_min = min(x1[0], x2[0], x3[0]) - bin_length / 2
x_max = max(x1[-1], x2[-1], x3[-1]) + bin_length / 2
# np.arange excludes the stop value, hence the extra bin_length.
bins = np.arange(x_min, x_max + bin_length, bin_length)
# bin mid points to use as index: the average of each pair of adjacent edges
bin_mid = [(low + high) / 2 for low, high in zip(bins[:-1], bins[1:])]
# This function bins y based on binning x
def bin_fun(x, y, bins, bin_length):
    """Group the y values by the bin that their x value falls into.

    Bin ``i`` covers the half-open interval ``bins[i] <= x < bins[i + 1]``.
    Points outside every bin are dropped.

    Args:
        x: x coordinates of the points.
        y: y values, paired with ``x`` position by position.
        bins: Bin edges, assumed to be sorted in ascending order (as produced
            by ``np.arange``).
        bin_length: Unused; kept so existing calls keep working.

    Returns:
        A DataFrame with one row per bin (``len(bins) - 1`` rows). Each row
        holds the y values of that bin in their original order; shorter rows
        are padded with NaN.
    """
    edges = np.asarray(bins)
    n_bins = max(len(edges) - 1, 0)
    members = [[] for _ in range(n_bins)]
    # side="right" yields k with edges[k - 1] <= x < edges[k], so k - 1 is the
    # bin number. One binary search per point replaces a scan over every bin.
    slots = np.searchsorted(edges, np.asarray(x), side="right") - 1
    for slot, value in zip(slots, y):
        # slot is -1 below the first edge and n_bins at or above the last one.
        if 0 <= slot < n_bins:
            members[slot].append(value)
    return pd.DataFrame(members)
# Bin every curve on the shared edges; each result has one row per bin.
Y1 = bin_fun(x1, y1, bins, bin_length)
Y1.columns = [1]
Y2 = bin_fun(x2, y2, bins, bin_length)
Y2.columns = [2]
Y3 = bin_fun(x3, y3, bins, bin_length)
Y3.columns = [3]
# DataFrame.append was removed in pandas 2.0. The three frames share the same
# row positions, so a column-wise concat builds the same table.
binned_y = pd.concat([Y1, Y2, Y3], axis=1)
binned_y.index = bin_mid
print(binned_y)
# comparing curve 2 and curve 1: print the bins where curve 2 is below curve 1
# (bins with a NaN on either side never match)
for i in binned_y.index[binned_y[2] < binned_y[1]]:
    print(i)
# comparing curve 3 and curve 2: print the bins where curve 3 is below curve 2
for i in binned_y.index[binned_y[3] < binned_y[2]]:
    print(i)
这将输出 8.0,即 y3 低于 y2 的那个分箱的索引。
binned_y
1 2 3
3.0 NaN 90.0 NaN
4.0 NaN 210.0 NaN
5.0 100.0 211.0 NaN
6.0 101.0 250.0 NaN
7.0 110.0 260.0 300.0
8.0 130.0 261.0 250.0
9.0 132.0 265.0 270.0
10.0 170.0 180.0 350.0
11.0 190.0 200.0 380.0
12.0 192.0 210.0 400.0
13.0 210.0 NaN 390.0
14.0 NaN NaN 380.0
15.0 NaN NaN NaN
绘图
# Plot the binned curves and keep the axes that pandas returns, so the
# legend, ticks and grid are set on that axes object.
axes = binned_y.plot(marker='o', figsize=(6, 6))
axes.legend(labels=['y1', 'y2', 'y3'], bbox_to_anchor=(1.05, 1), loc='upper left')
axes.set_xticks(range(16))
axes.grid()