Adjusting shape of a data array to perform optimization in SciPy

I have some code that performs an optimization to infer a parameter:

import numpy as np
from scipy.integrate import odeint
import matplotlib.pyplot as plt
from scipy.optimize import root
from scipy.optimize import minimize
import pandas as pd
    
    
d = {'Week': [1, 2,3,4,5,6,7,8,9,10,11], 'incidence': [206.1705794,2813.420201,11827.9453,30497.58655,10757.66954,7071.878779,3046.752723,1314.222882,765.9763902,201.3800578,109.8982006]}
df = pd.DataFrame(data=d)
    
def peak_infections(beta, df):
    
    # Weeks for which the ODE system will be solved
    weeks = df.Week.to_numpy()
    
    # Total population, N.
    N = 100000
    # Initial number of infected and recovered individuals, I0 and R0.
    I0, R0 = 10, 0
    # Everyone else, S0, is susceptible to infection initially.
    S0 = N - I0 - R0
    J0 = I0
    # Contact rate, beta, and mean recovery rate, gamma, (in 1/days).
    #reproductive no. R zero is beta/gamma
    gamma = 1/7 * 7 #rate should be in weeks now
    # A grid of time points 
    t = np.linspace(0, weeks[-1], weeks[-1] + 1)
    
    # The SIR model differential equations.
    def deriv(y, t, N, beta, gamma):
        S, I, R, J = y
        dS = ((-beta * S * I) / N)
        dI = ((beta * S * I) / N) - (gamma * I)
        dR = (gamma * I)
        dJ = ((beta * S * I) / N)
        return dS, dI, dR, dJ
    
    # Initial conditions are S0, I0, R0
    # Integrate the SIR equations over the time grid, t.
    solve = odeint(deriv, (S0, I0, R0, J0), t, args=(N, beta, gamma))
    S, I, R, J = solve.T
    
    return I/N
    
def residual(x, df):
    
    # Total population, N.
    N = 100000
    incidence = df.incidence.to_numpy()/N
    return np.sum((peak_infections(x, df)[1:] - incidence) ** 2)
    
x0 = 0.5
res = minimize(residual, x0, args=(df), method="Nelder-Mead").x
print(res)

However, it does not give the correct value, so in the line d = {'Week': [1, 2,3,4,5,6,7,8,9,10,11], 'incidence': [206.1705794,2813.420201,11827.9453,30497.58655,10757.66954,7071.878779,3046.752723,1314.222882,765.9763902,201.3800578,109.8982006]} I do not want the weeks numbered 1, 2, 3, ...; I want to use days instead, so that Python has clearer information to work with. I want to slice the linspace of days into weekly intervals. However, I am running into a shape-alignment problem:

import numpy as np
from scipy.integrate import odeint
import matplotlib.pyplot as plt
from scipy.optimize import root
from scipy.optimize import minimize
import pandas as pd
    
time = np.linspace(0, 77, 77 + 1)
d = {'Week': [time[7],time[14],time[21],time[28],time[35],time[42],time[49],time[56],time[63],time[70],time[77]], 'incidence': [206.1705794,2813.420201,11827.9453,30497.58655,10757.66954,7071.878779,3046.752723,1314.222882,765.9763902,201.3800578,109.8982006]}
#d = {'Week': [1, 2,3,4,5,6,7,8,9,10,11], 'incidence': [206.1705794,2813.420201,11827.9453,30497.58655,10757.66954,7071.878779,3046.752723,1314.222882,765.9763902,201.3800578,109.8982006]}
df = pd.DataFrame(data=d)
    
def peak_infections(beta, df):
    
    # Weeks for which the ODE system will be solved
    weeks = df.Week.to_numpy()
    
    # Total population, N.
    N = 100000
    # Initial number of infected and recovered individuals, I0 and R0.
    I0, R0 = 10, 0
    # Everyone else, S0, is susceptible to infection initially.
    S0 = N - I0 - R0
    J0 = I0
    # Contact rate, beta, and mean recovery rate, gamma, (in 1/days).
    #reproductive no. R zero is beta/gamma
    gamma = 1/7 * 7 #rate should be in weeks now
    # A grid of time points 
    t = np.linspace(0, 77, 77 + 1)
    
    # The SIR model differential equations.
    def deriv(y, t, N, beta, gamma):
        S, I, R, J = y
        dS = ((-beta * S * I) / N)
        dI = ((beta * S * I) / N) - (gamma * I)
        dR = (gamma * I)
        dJ = ((beta * S * I) / N)
        return dS, dI, dR, dJ
    
    # Initial conditions are S0, I0, R0
    # Integrate the SIR equations over the time grid, t.
    solve = odeint(deriv, (S0, I0, R0, J0), t, args=(N, beta, gamma))
    S, I, R, J = solve.T
    
    return I/N
    
def residual(x, df):
    
    # Total population, N.
    N = 100000
    incidence = df.incidence.to_numpy()/N
    return np.sum((peak_infections(x, df)[1:] - incidence) ** 2)
    
x0 = 0.5
res = minimize(residual, x0, args=(df), method="Nelder-Mead").x
print(res)

What I tried here was to recreate the dataframe by slicing time, i.e. 77 days, which is 11 weeks. It still returns a shape error, 77 versus 11 elements, in my residual function at the line return np.sum((peak_infections(x, df)[1:] - incidence) ** 2). Where is my approach going wrong?
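As a quick check of where the shapes come from (using the code above, with the initial guess beta = 0.5 purely for illustration): on the daily grid the solver returns one value per day, so peak_infections gives 78 values against 11 incidence values. Indexing the daily solution at the Week days (cast to int, since on this grid the array index equals the day) is the kind of weekly slicing I had in mind:

I_over_N = peak_infections(0.5, df)            # daily solution, shape (78,)
obs = df.incidence.to_numpy() / 100000         # observations, shape (11,)
print(I_over_N.shape, obs.shape)               # (78,) vs (11,): the mismatch

week_days = df.Week.to_numpy().astype(int)     # [ 7, 14, ..., 77]
weekly_I = I_over_N[week_days]                 # shape (11,), matches obs
print(weekly_I.shape)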

------------EDIT---------- Updated code

import numpy as np
from scipy.integrate import odeint
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import pandas as pd

t = np.arange(7,84,7)
d = {'Week': t, 'incidence': [206.1705794,2813.420201,11827.9453,30497.58655,10757.66954,7071.878779,3046.752723,1314.222882,765.9763902,201.3800578,109.8982006]}
#d = {'Week': [time[7],time[14],time[21],time[28],time[35],time[42],time[49],time[56],time[63],time[70],time[77]], 'incidence': [206.1705794,2813.420201,11827.9453,30497.58655,10757.66954,7071.878779,3046.752723,1314.222882,765.9763902,201.3800578,109.8982006]}
#d = {'Week': [1, 2,3,4,5,6,7,8,9,10,11], 'incidence': [206.1705794,2813.420201,11827.9453,30497.58655,10757.66954,7071.878779,3046.752723,1314.222882,765.9763902,201.3800578,109.8982006]}
df = pd.DataFrame(data=d)

def peak_infections(beta, df):

    # Weeks for which the ODE system will be solved
    weeks = df.Week.to_numpy()

    # Total population, N.
    N = 100000
    # Initial number of infected and recovered individuals, I0 and R0.
    I0, R0 = 10, 0
    # Everyone else, S0, is susceptible to infection initially.
    S0 = N - I0 - R0
    J0 = I0
    # Contact rate, beta, and mean recovery rate, gamma, (in 1/days).
    #reproductive no. R zero is beta/gamma
    gamma = 1/7 * 7 #rate should be in weeks now
    # A grid of time points 
    t = np.linspace(0, 77, 77 + 1)

    # The SIR model differential equations.
    def deriv(y, t, N, beta, gamma):
        S, I, R, J = y
        dS = ((-beta * S * I) / N)
        dI = ((beta * S * I) / N) - (gamma * I)
        dR = (gamma * I)
        dJ = ((beta * S * I) / N)
        return dS, dI, dR, dJ

    # Initial conditions are S0, I0, R0
    # Integrate the SIR equations over the time grid, t.
    solve = odeint(deriv, (S0, I0, R0, J0), t, args=(N, beta, gamma))
    S, I, R, J = solve.T

    return I/N

def residual(x, df):

    # Total population, N.
    N = 100000
    incidence = df.incidence.to_numpy()/N
    return np.sum((peak_infections(x, df) - incidence) ** 2)

x0 = 0.5
res = minimize(residual, x0, args=(df), method="Nelder-Mead").x
print(res)

Your problem arises on line 52: solving peak_infections(x, df)[1:] gives you 77 values, while you have 11 incidence values, as you mentioned.

This happens because you are solving your ODE over t (line 29), which has 78 values. To avoid this, build a time vector with 11 values (one per week) inside the peak_infections function, like this:

t = np.linspace(0, 77, 77 + 1)
t = [t[7],t[14],t[21],t[28],t[35],t[42],t[49],t[56],t[63],t[70],t[77]]

or, building it from scratch:

t = np.arange(7,84,7)

and change the residual function as follows (do not slice with peak_infections(x, df)[1:]):

def residual(x, df):

    # Total population, N.
    N = 100000
    incidence = df.incidence.to_numpy()/N
    return np.sum((peak_infections(x, df) - incidence) ** 2)

This will solve your problem, because now you are comparing NumPy arrays of shape (11,) and (11,), which does not produce an error.
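For completeness, a compact sketch of the corrected version (same model and data as above, with the weekly time vector built inside peak_infections and no slicing in residual) could look roughly like this; note that odeint treats the first element of t, here day 7, as the initial time for the initial conditions:

import numpy as np
from scipy.integrate import odeint
from scipy.optimize import minimize

N = 100000
incidence = np.array([206.1705794, 2813.420201, 11827.9453, 30497.58655,
                      10757.66954, 7071.878779, 3046.752723, 1314.222882,
                      765.9763902, 201.3800578, 109.8982006]) / N

def peak_infections(beta):
    I0, R0 = 10, 0
    S0 = N - I0 - R0
    J0 = I0
    gamma = 1/7 * 7               # per week, as in the question; if t is read in
                                  # days you may prefer gamma = 1/7 instead
    t = np.arange(7, 84, 7)       # 11 time points: days 7, 14, ..., 77
                                  # odeint uses t[0] (= 7) as the initial time

    def deriv(y, t, N, beta, gamma):
        S, I, R, J = y
        dS = -beta * S * I / N
        dI = beta * S * I / N - gamma * I
        dR = gamma * I
        dJ = beta * S * I / N
        return dS, dI, dR, dJ

    S, I, R, J = odeint(deriv, (S0, I0, R0, J0), t, args=(N, beta, gamma)).T
    return I / N                  # shape (11,), one value per observed week

def residual(x):
    beta = np.ravel(x)[0]         # Nelder-Mead passes a length-1 array
    return np.sum((peak_infections(beta) - incidence) ** 2)   # (11,) vs (11,)

res = minimize(residual, 0.5, method="Nelder-Mead").x
print(res)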