如何进行约束线性回归 - scikit 学习？

Question

我正在尝试使用一些约束来执行线性回归主题以获得特定预测。我想让模型预测一半的线性预测，最后一半的线性预测接近上半部分的最后一个值，使用非常窄的范围（使用约束）类似于图中的绿线。

完整代码：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None  # default='warn'
data = [5.269, 5.346, 5.375, 5.482, 5.519, 5.57, 5.593999999999999, 5.627000000000001, 5.724, 5.818, 5.792999999999999, 5.817, 5.8389999999999995, 5.882000000000001, 5.92, 6.025, 6.064, 6.111000000000001, 6.1160000000000005, 6.138, 6.247000000000001, 6.279, 6.332000000000001, 6.3389999999999995, 6.3420000000000005, 6.412999999999999, 6.442, 6.519, 6.596, 6.603, 6.627999999999999, 6.76, 6.837000000000001, 6.781000000000001, 6.8260000000000005, 6.849, 6.875, 6.982, 7.018, 7.042000000000001, 7.068, 7.091, 7.204, 7.228, 7.261, 7.3420000000000005, 7.414, 7.44, 7.516, 7.542000000000001, 7.627000000000001, 7.667000000000001, 7.821000000000001, 7.792999999999999, 7.756, 7.871, 8.006, 8.078, 7.916, 7.974, 8.074, 8.119, 8.228, 7.976, 8.045, 8.312999999999999, 8.335, 8.388, 8.437999999999999, 8.456, 8.227, 8.266, 8.277999999999999, 8.289, 8.299, 8.318, 8.332, 8.34, 8.349, 8.36, 8.363999999999999, 8.368, 8.282, 8.283999999999999]
time = range(1,85,1)   
x=int(0.7*len(data))
df = pd.DataFrame(list(zip(*[time, data])))
df.columns = ['time', 'data']
# print df
x=int(0.7*len(df))
train = df[:x]
valid = df[x:]
models = []
names = []
tr_x_ax = []
va_x_ax = []
pr_x_ax = []
tr_y_ax = []
va_y_ax = []
pr_y_ax = []
time_model = []
models.append(('LR', LinearRegression()))

for name, model in models:
    x_train=df.iloc[:, 0][:x].values
    y_train=df.iloc[:, 1][:x].values
    x_valid=df.iloc[:, 0][x:].values
    y_valid=df.iloc[:, 1][x:].values

    model = LinearRegression()
    # poly = PolynomialFeatures(5)
    x_train= x_train.reshape(-1, 1)
    y_train= y_train.reshape(-1, 1)
    x_valid = x_valid.reshape(-1, 1)
    y_valid = y_valid.reshape(-1, 1)
    # model.fit(x_train,y_train)
    model.fit(x_train,y_train.ravel())
    # score = model.score(x_train,y_train.ravel())
    # print 'score', score
    preds = model.predict(x_valid)
    tr_x_ax.extend(train['data'])
    va_x_ax.extend(valid['data'])
    pr_x_ax.extend(preds)

    valid['Predictions'] = preds
    valid.index = df[x:].index
    train.index = df[:x].index
    plt.figure(figsize=(5,5))
    # plt.plot(train['data'],label='data')
    # plt.plot(valid[['Close', 'Predictions']])
    x = valid['data']
    # print x
    # plt.plot(valid['data'],label='validation')
    plt.plot(valid['Predictions'],label='Predictions before',color='orange')



y =range(0,58)
y1 =range(58,84)
for index, item in enumerate(pr_x_ax):
    if index >13:
        pr_x_ax[index] = pr_x_ax[13]
pr_x_ax = list([float(i) for i in pr_x_ax])
va_x_ax = list([float(i) for i in va_x_ax])
tr_x_ax = list([float(i) for i in tr_x_ax])
plt.plot(y,tr_x_ax,  label='train' , color='red',  linewidth=2)
plt.plot(y1,va_x_ax,  label='validation1' , color='blue',  linewidth=2)
plt.plot(y1,pr_x_ax,  label='Predictions after' , color='green',  linewidth=2)
plt.xlabel("time")
plt.ylabel("data")
plt.xticks(rotation=45)
plt.legend()
plt.show()

如果你看到这个数字：

标签：Predictions before，模型没有任何限制地预测了它（我不需要这个结果）。

标签：Predictions after，模型在约束范围内预测它，但这是在模型预测之后并且所有值都等于 index = 71 , item 8.56 的最后一个值。

我在 line:64 中使用了循环 for index, item in enumerate(pr_x_ax):，如您所见，从时间 71 到 85 秒的曲线是直线，以便向您展示我如何需要模型工作。

我可以构建模型而不是 for 循环给出相同的结果吗？？？

请多多指教

Answer 1

我希望在你的问题中通过画绿线，你真的希望训练有素的模型预测线性水平向右转弯。但是当前训练的模型只绘制直线橙色线。

对于任何算法和类型的任何经过训练的模型来说，为了学习行为模型中的一些异常变化，确实需要至少具有该异常变化的一些样本。或者至少观察到的数据中的一些隐藏含义应该指向有这种不寻常的变化。

换句话说，为了让您的模型了解绿线上的右转，模型应该在训练数据集中包含与右转相关的点。但是你只将第一个（最左边的）70% 的数据作为训练数据 train = df[:int(0.7 * len(df))] 并且训练数据没有这样的右转并且这个训练数据看起来接近一条直线。

因此您需要 re-sample 您的数据以不同的方式进行训练和验证 - 从 X 的整个范围内随机抽取 70% 的样本，其余的用于验证。这样在你的训练数据中也包含了右转的样本。

第二件事是 LinearRegression 模型总是只用一条直线对预测进行建模，而这条直线不能右转。为了右转，您需要一些更复杂的模型。

模型右转的一种方法是piece-wise-linear，即有几条连接的直线。我没有在 sklearn 中找到 ready-made 分段线性模型，仅使用其他 pip 模型。所以我决定实现我自己的简单 class PieceWiseLinearRegression，它使用 np.piecewise() 和 scipy.optimize.curve_fit() 来建模分段线性函数。

下一张图片显示了应用上面提到的两个东西的结果，代码在后面，re-sampling 数据集以不同的方式和建模 piece-wise-linear 函数。您当前的线性模型 LR 仍然仅使用一条蓝色直线进行预测，而我的分段线性 PWLR2 橙色线由两段组成并正确预测右转：

为了看清楚一张PWLR2图我也做了下一张图:

我的 class PieceWiseLinearRegression 在创建对象时只接受一个参数 n - 用于预测的线性段数。对于上图，使用了 n = 2。

import sys, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
np.random.seed(0)

class PieceWiseLinearRegression:
    @classmethod
    def nargs_func(cls, f, n):
        return eval('lambda ' + ', '.join([f'a{i}'for i in range(n)]) + ': f(' + ', '.join([f'a{i}'for i in range(n)]) + ')', locals())
        
    @classmethod
    def piecewise_linear(cls, n):
        condlist = lambda xs, xa: [(lambda x: (
            (xs[i] <= x if i > 0 else np.full_like(x, True, dtype = np.bool_)) &
            (x < xs[i + 1] if i < n - 1 else np.full_like(x, True, dtype = np.bool_))
        ))(xa) for i in range(n)]
        funclist = lambda xs, ys: [(lambda i: (
            lambda x: (
                (x - xs[i]) * (ys[i + 1] - ys[i]) / (
                    (xs[i + 1] - xs[i]) if abs(xs[i + 1] - xs[i]) > 10 ** -7 else 10 ** -7 * (-1, 1)[xs[i + 1] - xs[i] >= 0]
                ) + ys[i]
            )
        ))(j) for j in range(n)]
        def f(x, *pargs):
            assert len(pargs) == (n + 1) * 2, (n, pargs)
            xs, ys = pargs[0::2], pargs[1::2]
            xa = x.ravel().astype(np.float64)
            ya = np.piecewise(x = xa, condlist = condlist(xs, xa), funclist = funclist(xs, ys)).ravel()
            #print('xs', xs, 'ys', ys, 'xa', xa, 'ya', ya)
            return ya
        return cls.nargs_func(f, 1 + (n + 1) * 2)
        
    def __init__(self, n):
        self.n = n
        self.f = self.piecewise_linear(self.n)

    def fit(self, x, y):
        from scipy import optimize
        self.p, self.e = optimize.curve_fit(self.f, x, y, p0 = [j for i in range(self.n + 1) for j in (np.amin(x) + i * (np.amax(x) - np.amin(x)) / self.n, 1)])
        #print('p', self.p)
        
    def predict(self, x):
        return self.f(x, *self.p)

data = [5.269, 5.346, 5.375, 5.482, 5.519, 5.57, 5.593999999999999, 5.627000000000001, 5.724, 5.818, 5.792999999999999, 5.817, 5.8389999999999995, 5.882000000000001, 5.92, 6.025, 6.064, 6.111000000000001, 6.1160000000000005, 6.138, 6.247000000000001, 6.279, 6.332000000000001, 6.3389999999999995, 6.3420000000000005, 6.412999999999999, 6.442, 6.519, 6.596, 6.603, 6.627999999999999, 6.76, 6.837000000000001, 6.781000000000001, 6.8260000000000005, 6.849, 6.875, 6.982, 7.018, 7.042000000000001, 7.068, 7.091, 7.204, 7.228, 7.261, 7.3420000000000005, 7.414, 7.44, 7.516, 7.542000000000001, 7.627000000000001, 7.667000000000001, 7.821000000000001, 7.792999999999999, 7.756, 7.871, 8.006, 8.078, 7.916, 7.974, 8.074, 8.119, 8.228, 7.976, 8.045, 8.312999999999999, 8.335, 8.388, 8.437999999999999, 8.456, 8.227, 8.266, 8.277999999999999, 8.289, 8.299, 8.318, 8.332, 8.34, 8.349, 8.36, 8.363999999999999, 8.368, 8.282, 8.283999999999999]
time = list(range(1, 85))
df = pd.DataFrame(list(zip(time, data)), columns = ['time', 'data'])

choose_train = np.random.uniform(size = (len(df),)) < 0.8
choose_valid = ~choose_train

x_all = df.iloc[:, 0].values
y_all = df.iloc[:, 1].values
x_train = df.iloc[:, 0][choose_train].values
y_train = df.iloc[:, 1][choose_train].values
x_valid = df.iloc[:, 0][choose_valid].values
y_valid = df.iloc[:, 1][choose_valid].values
x_all_lin = np.linspace(np.amin(x_all), np.amax(x_all), 500)

models = []
models.append(('LR', LinearRegression()))
models.append(('PWLR2', PieceWiseLinearRegression(2)))
        
for imodel, (name, model) in enumerate(models):
    model.fit(x_train[:, None], y_train)
    x_all_lin_pred = model.predict(x_all_lin[:, None])
    plt.plot(x_all_lin, x_all_lin_pred, label = f'pred {name}')

plt.plot(x_train, y_train, label='train')
plt.plot(x_valid, y_valid, label='valid')
plt.xlabel('time')
plt.ylabel('data')
plt.legend()
plt.show()

如何进行约束线性回归 - scikit 学习？

How to do Constrained Linear Regression - scikit learn?

python

machine-learning

linear-regression

scikit-learn