Baseline correction for spectroscopic data
I am working with Raman spectroscopy, where the spectra typically have a baseline superimposed on the actual information I am interested in, so I would like to estimate the baseline contribution. For this purpose, I implemented the solution from .

I do like the solution described there, and the given code works fine on my data. A typical result with calculated data is shown below; the red and orange lines are the baseline estimates:

Typical result of baseline estimation with calculated data

The problem: I often collect several thousand spectra in a pandas DataFrame, one spectrum per row. My current solution iterates over the data one spectrum at a time in a for loop, which makes the procedure rather slow. Since I am new to Python and, thanks to numpy/pandas/scipy, have become used to hardly ever needing for loops, I am looking for a solution that avoids this loop as well. However, the sparse matrix functions used seem to be limited to two dimensions, while I would probably need three, and I have not come up with another approach yet. Does anyone have an idea?
The current code looks like this:
import numpy as np
import pandas as pd
from scipy.signal import gaussian
import matplotlib.pyplot as plt
from scipy import sparse
from scipy.sparse.linalg import spsolve


def baseline_correction(raman_spectra, lam, p, niter=10):
    # according to "Asymmetric Least Squares Smoothing" by P. Eilers and H. Boelens
    number_of_spectra = raman_spectra.index.size
    baseline_data = pd.DataFrame(np.zeros((len(raman_spectra.index), len(raman_spectra.columns))), columns=raman_spectra.columns)

    for ii in np.arange(number_of_spectra):
        curr_dataset = raman_spectra.iloc[ii, :]

        # this is the code for the fitting procedure
        L = len(curr_dataset)
        w = np.ones(L)
        D = sparse.diags([1, -2, 1], [0, -1, -2], shape=(L, L-2))
        for jj in range(int(niter)):
            W = sparse.spdiags(w, 0, L, L)
            Z = W + lam * D.dot(D.transpose())
            z = spsolve(Z, w*curr_dataset.astype(np.float64))
            w = p * (curr_dataset > z) + (1-p) * (curr_dataset < z)
        # end of fitting procedure

        baseline_data.iloc[ii, :] = z
    return baseline_data


# the following four lines calculate two sample spectra
wavenumbers = np.linspace(500, 2000, 100)
intensities1 = 500*gaussian(100, 2) + 0.0002*wavenumbers**2
intensities2 = 100*gaussian(100, 5) + 0.0001*wavenumbers**2
raman_spectra = pd.DataFrame((intensities1, intensities2), columns=wavenumbers)
# end of sample spectra calculation

baseline_data = baseline_correction(raman_spectra, 200, 0.01)

# the rest is just for plotting the data
plt.figure(1)
plt.plot(wavenumbers, raman_spectra.iloc[0])
plt.plot(wavenumbers, baseline_data.iloc[0])
plt.plot(wavenumbers, raman_spectra.iloc[1])
plt.plot(wavenumbers, baseline_data.iloc[1])
New function
def baseline_correction_new(data: pd.Series, lam: int=200, p: float=0.01, niter: int=10) -> pd.Series:
    # this is the code for the fitting procedure
    L = len(data)
    w = np.ones(L)
    D = sparse.diags([1, -2, 1], [0, -1, -2], shape=(L, L-2))
    for jj in range(int(niter)):
        W = sparse.spdiags(w, 0, L, L)
        Z = W + lam * D.dot(D.transpose())
        z = spsolve(Z, w*data.astype(np.float64))
        w = p * (data > z) + (1-p) * (data < z)
    return pd.Series(z)
Call the new function
baseline_data_new = raman_spectra.apply(baseline_correction_new, axis=1)
Add column names
baseline_data_new.columns = wavenumbers
Compare
baseline_data.equals(baseline_data_new)
>>> True
Plot
plt.figure(1)
plt.plot(wavenumbers,baseline_data.iloc[0], label='Baseline_0')
plt.plot(wavenumbers,baseline_data_new.iloc[0], label='Baseline_new_0')
plt.plot(wavenumbers,baseline_data.iloc[1], label='Baseline_1')
plt.plot(wavenumbers,baseline_data_new.iloc[1], label='Baseline_new_1')
plt.legend()
plt.show()
Original approach with 3000 rows
%%timeit
baseline_data = baseline_correction(df_int,200,0.01)
>>> 60 s ± 608 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
apply with 3000 rows
%%timeit
baseline_3000 = df_int.apply(lambda x: baseline_correction_new(x, 200, 0.01), axis=1)
>>> 58.3 s ± 206 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
The apply approach streamlines the code, but brings only a marginal speed improvement (58.3 s vs. 60 s for 3000 rows). Perhaps a route to further improvement would be to use multiprocessing or 10x Faster Parallel Python Without Python Multiprocessing.
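To illustrate that idea, here is a minimal, unbenchmarked sketch of how the per-spectrum fit could be spread over several worker processes with the standard library's multiprocessing.Pool. It reuses baseline_correction_new from above; the wrapper name parallel_baseline and the default of four processes are my own choices, not part of the code in this post.

import numpy as np
import pandas as pd
from functools import partial
from multiprocessing import Pool

def parallel_baseline(raman_spectra, lam=200, p=0.01, niter=10, processes=4):
    # hypothetical wrapper: fit each spectrum (one DataFrame row) in a separate worker process;
    # baseline_correction_new must be defined at module level so the workers can pickle it
    fit_one = partial(baseline_correction_new, lam=lam, p=p, niter=niter)
    rows = [raman_spectra.iloc[ii, :] for ii in range(raman_spectra.index.size)]
    with Pool(processes=processes) as pool:
        baselines = pool.map(fit_one, rows)
    # reassemble the per-spectrum results into a DataFrame shaped like the input
    return pd.DataFrame(np.vstack([b.values for b in baselines]),
                        index=raman_spectra.index,
                        columns=raman_spectra.columns)

# on Windows the call has to live under an `if __name__ == '__main__':` guard
baseline_parallel = parallel_baseline(raman_spectra, 200, 0.01)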
Following Christian K.'s suggestion, I had a look at the SNIP algorithm for background estimation; details can be found here. This is my Python code:
import numpy as np
import pandas as pd
from scipy.signal import gaussian
import matplotlib.pyplot as plt


def baseline_correction(raman_spectra, niter):
    assert isinstance(raman_spectra, pd.DataFrame), 'Input must be pandas DataFrame'

    spectrum_points = len(raman_spectra.columns)
    raman_spectra_transformed = np.log(np.log(np.sqrt(raman_spectra + 1) + 1) + 1)

    working_spectra = np.zeros(raman_spectra.shape)

    for pp in np.arange(1, niter+1):
        r1 = raman_spectra_transformed.iloc[:, pp:spectrum_points-pp]
        r2 = (np.roll(raman_spectra_transformed, -pp, axis=1)[:, pp:spectrum_points-pp] + np.roll(raman_spectra_transformed, pp, axis=1)[:, pp:spectrum_points-pp])/2
        working_spectra = np.minimum(r1, r2)
        raman_spectra_transformed.iloc[:, pp:spectrum_points-pp] = working_spectra

    baseline = (np.exp(np.exp(raman_spectra_transformed) - 1) - 1)**2 - 1
    return baseline
wavenumbers = np.linspace(500,2000,1000)
intensities1 = gaussian(1000,20) + 0.000002*wavenumbers**2
intensities2 = gaussian(1000,50) + 0.000001*wavenumbers**2
raman_spectra = pd.DataFrame((intensities1,intensities2),columns=np.around(wavenumbers,decimals=1))
iterations = 100
baseline_data = baseline_correction(raman_spectra,iterations)
#the rest is just for plotting the data
plt.figure(1)
plt.plot(wavenumbers,raman_spectra.iloc[0])
plt.plot(wavenumbers,baseline_data.iloc[0])
plt.plot(wavenumbers,raman_spectra.iloc[1])
plt.plot(wavenumbers,baseline_data.iloc[1])
It does work and seems to be just as reliable as the algorithm based on asymmetric least squares smoothing. It is also faster: with 100 iterations, fitting 73 real, measured spectra takes about 1.5 s, compared to about 2.2 s for asymmetric least squares smoothing, so it is an improvement.

Even better: for 3267 real spectra, the required computation time with the SNIP algorithm is only 11.7 s, compared to 1 min 28 s for asymmetric least squares smoothing. That is probably because the SNIP algorithm processes all spectra at once in each iteration, without any for loop over the individual spectra.
A typical result of the SNIP algorithm with the calculated examples is shown here.
I am very happy with this result. Thanks to all contributors for their support!
Update:
Thanks to the hint from sascha, I found a way to use asymmetric least squares smoothing without a for loop over every spectrum. The baseline correction function now looks like this:
def baseline_correction4(raman_spectra, lam, p, niter=10):
    # according to "Asymmetric Least Squares Smoothing" by P. Eilers and H. Boelens
    number_of_spectra = raman_spectra.index.size

    # this is the code for the fitting procedure
    L = len(raman_spectra.columns)
    w = np.ones(raman_spectra.shape[0]*raman_spectra.shape[1])

    D = sparse.block_diag(np.tile(sparse.diags([1, -2, 1], [0, -1, -2], shape=(L, L-2)), number_of_spectra), format='csr')

    raman_spectra_flattened = raman_spectra.values.ravel()

    for jj in range(int(niter)):
        W = sparse.diags(w, format='csr')
        Z = W + lam * D.dot(D.transpose())
        z = spsolve(Z, w*raman_spectra_flattened, permc_spec='NATURAL')
        w = p * (raman_spectra_flattened > z) + (1-p) * (raman_spectra_flattened < z)
    # end of fitting procedure

    baseline_data = pd.DataFrame(z.reshape(number_of_spectra, -1), index=raman_spectra.index, columns=raman_spectra.columns)
    return baseline_data
This approach is based on combining all the sparse matrices into one block-diagonal sparse matrix. That way, spsolve only has to be called once, no matter how many spectra there are. It results in a baseline correction of 73 real spectra in 593 ms (faster than SNIP) and of 3267 real spectra in 32.8 s (slower than SNIP). I hope this will be useful for someone in the future.
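For completeness, a short usage sketch, assuming one of the sample raman_spectra DataFrames and the matching wavenumbers array built earlier in this post; subtracting the returned baseline gives the corrected spectra (the names baseline_block and corrected_spectra are only illustrative):

baseline_block = baseline_correction4(raman_spectra, 200, 0.01)
corrected_spectra = raman_spectra - baseline_block  # remove the estimated baseline

plt.figure(2)
plt.plot(wavenumbers, corrected_spectra.iloc[0], label='corrected_0')
plt.plot(wavenumbers, corrected_spectra.iloc[1], label='corrected_1')
plt.legend()
plt.show()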