跳过 NaN 值以获得距离
Skip NaN values to get distance
我的部分数据集(实际上我的数据集大小(106,1800)
):
df =
1 1.1 2 2.1 3 3.1 4 4.1 5 5.1
0 43.1024 6.7498 NaN NaN NaN NaN NaN NaN NaN NaN
1 46.0595 1.6829 25.0695 3.7463 NaN NaN NaN NaN NaN NaN
2 25.0695 5.5454 44.9727 8.6660 41.9726 2.6666 84.9566 3.8484 44.9566 1.8484
3 35.0281 7.7525 45.0322 3.7465 14.0369 3.7463 NaN NaN NaN NaN
4 35.0292 7.5616 45.0292 4.5616 23.0292 3.5616 45.0292 6.7463 NaN NaN
根据 Tom 的回答,我目前能做到的是:
- 我手动把前两行写成了 p 和 q 的值:
p =
[[45.1024,7.7498],[45.1027,7.7513],[45.1072,7.7568],[45.1076,7.7563]]
q=
[[45.0595,7.6829],[45.0595,7.6829],[45.0564,7.6820],[45.0533,7.6796],[45.0501,7.6775]]
然后:
__all__ = ['frdist']
def _c(ca, i, j, p, q):
    """Return the discrete Fréchet coupling value for ``p[:i+1]``, ``q[:j+1]``.

    ``ca`` is the memo table: an entry greater than -1 counts as already
    computed and is reused; every entry computed here is written back.

    The table is filled iteratively rather than recursively, because the
    recursive formulation needs roughly ``i + j`` nested calls and raises
    ``RecursionError`` for curves with many points.

    Args:
        ca: 2-D float array used as memo; -1 marks "not computed yet".
        i: Index of the last point of ``p`` to take into account.
        j: Index of the last point of ``q`` to take into account.
        p: Array with one point of the first curve per row.
        q: Array with one point of the second curve per row.

    Returns:
        The value stored in ``ca[i, j]``; ``inf`` when ``i`` or ``j`` is
        negative and that cell holds no memoised value.
    """
    if ca[i, j] > -1:
        return ca[i, j]
    if i < 0 or j < 0:
        ca[i, j] = float('inf')
        return ca[i, j]
    # Row-major order guarantees that the three predecessor cells of
    # (a, b) are final before (a, b) itself is computed.
    for a in range(i + 1):
        for b in range(j + 1):
            if ca[a, b] > -1:
                continue  # memoised by an earlier call
            d = np.linalg.norm(p[a] - q[b])
            if a == 0 and b == 0:
                ca[a, b] = d
            elif b == 0:
                ca[a, b] = max(ca[a - 1, 0], d)
            elif a == 0:
                ca[a, b] = max(ca[0, b - 1], d)
            else:
                ca[a, b] = max(
                    min(ca[a - 1, b], ca[a - 1, b - 1], ca[a, b - 1]),
                    d,
                )
    return ca[i, j]
然后:
def frdist(p, q):
    """Compute the discrete Fréchet distance between curves ``p`` and ``q``.

    Points with a missing or non-finite coordinate are dropped from both
    curves first, so the two curves may end up with different lengths.

    Args:
        p: Sequence of points, each a sequence of coordinates.
        q: Sequence of points with the same number of coordinates as ``p``.

    Returns:
        The value computed by ``_c`` for the last point of each curve.

    Raises:
        ValueError: If either curve is empty after filtering, or the first
            points of the two curves differ in number of coordinates.
    """
    # Keep a point only when *all* of its coordinates are finite; a partly
    # missing point would otherwise inject NaN into the distance table.
    p = np.array(
        [pt for pt in p if np.size(pt) and np.all(np.isfinite(pt))],
        np.float64,
    )
    q = np.array(
        [pt for pt in q if np.size(pt) and np.all(np.isfinite(pt))],
        np.float64,
    )
    len_p = len(p)
    len_q = len(q)
    if len_p == 0 or len_q == 0:
        raise ValueError('Input curves are empty.')
    # After filtering, p and q need not have the same number of points,
    # but their points must have the same dimensionality.
    if len(p[0]) != len(q[0]):
        raise ValueError('Input curves do not have the same dimensions.')
    # -1 marks "not computed yet" in the memo table consumed by _c.
    ca = np.full((len_p, len_q), -1.0, dtype=np.float64)
    return _c(ca, len_p - 1, len_q - 1, p, q)
frdist(p, q)
这样可行。但是我怎样才能对整个数据集进行计算,而不是逐行手动选取 p 和 q 呢?
最后我需要得到一个 106×106 的对称矩阵,其对角线为 0。
我认为您唯一需要做的更改是在 frdist 函数内部,首先从 p 中删除 NaN 值。这就需要去掉“p 和 q 长度必须相同”的条件,但我认为这应该没问题,因为您自己说过 p 有 1 个值而 q 有 1800 个值。
def frdist(p, q):
    """Compute the discrete Fréchet distance between curves ``p`` and ``q``.

    Points with a missing or non-finite coordinate are dropped from *both*
    curves (not only from ``p``), so a NaN-padded ``q`` cannot leak NaN
    into the result. The curves may therefore differ in length.

    Args:
        p: Sequence of points, each a sequence of coordinates.
        q: Sequence of points with the same number of coordinates as ``p``.

    Returns:
        The value computed by ``_c`` for the last point of each curve.

    Raises:
        ValueError: If either curve is empty after filtering, or the first
            points of the two curves differ in number of coordinates.
    """
    # Keep a point only when all of its coordinates are finite.
    p = np.array(
        [pt for pt in p if np.size(pt) and np.all(np.isfinite(pt))],
        np.float64,
    )
    q = np.array(
        [pt for pt in q if np.size(pt) and np.all(np.isfinite(pt))],
        np.float64,
    )
    len_p = len(p)
    len_q = len(q)
    if len_p == 0 or len_q == 0:
        raise ValueError('Input curves are empty.')
    # p and q no longer have to contain the same number of points, but
    # their points must have the same dimensionality.
    if len(p[0]) != len(q[0]):
        raise ValueError('Input curves do not have the same dimensions.')
    # -1 marks "not computed yet" in the memo table consumed by _c.
    ca = np.full((len_p, len_q), -1.0, dtype=np.float64)
    return _c(ca, len_p - 1, len_q - 1, p, q)
然后给出:
frdist(p, q)
1.9087938076177846
删除 NaN 值
简单明了:
p = p[~np.isnan(p)]
计算整个数据集的 Fréchet 距离
最简单的方法是使用 SciPy 的成对距离计算函数 pdist。它接受一个 m 个观测 × n 个维度的数组,因此我们需要在 frdist 内部用 reshape(-1,2) 把每一行重塑为两列的点数组。pdist 返回压缩形式(上三角)的距离矩阵,我们再用 squareform 得到对角线为 0 的 m x m 对称矩阵。
import pandas as pd
import numpy as np
import io
from scipy.spatial.distance import pdist, squareform
# Sample rows of the data set; each row is a NaN-padded sequence of
# coordinate pairs, and the first field of every row is the index.
data = """ 1 1.1 2 2.1 3 3.1 4 4.1 5 5.1
0 43.1024 6.7498 NaN NaN NaN NaN NaN NaN NaN NaN
1 46.0595 1.6829 25.0695 3.7463 NaN NaN NaN NaN NaN NaN
2 25.0695 5.5454 44.9727 8.6660 41.9726 2.6666 84.9566 3.8484 44.9566 1.8484
3 35.0281 7.7525 45.0322 3.7465 14.0369 3.7463 NaN NaN NaN NaN
4 35.0292 7.5616 45.0292 4.5616 23.0292 3.5616 45.0292 6.7463 NaN NaN
"""
# Raw string: in a normal string literal "\s" is an invalid escape sequence
# and triggers a warning on current Python versions.
df = pd.read_csv(io.StringIO(data), sep=r'\s+')
def _c(ca, i, j, p, q):
    """Return the discrete Fréchet coupling value for ``p[:i+1]``, ``q[:j+1]``.

    ``ca`` is the memo table: an entry greater than -1 counts as already
    computed and is reused; every entry computed here is written back.

    The table is filled iteratively rather than recursively, because the
    recursive formulation needs roughly ``i + j`` nested calls and raises
    ``RecursionError`` for curves with many points.

    Args:
        ca: 2-D float array used as memo; -1 marks "not computed yet".
        i: Index of the last point of ``p`` to take into account.
        j: Index of the last point of ``q`` to take into account.
        p: Array with one point of the first curve per row.
        q: Array with one point of the second curve per row.

    Returns:
        The value stored in ``ca[i, j]``; ``inf`` when ``i`` or ``j`` is
        negative and that cell holds no memoised value.
    """
    if ca[i, j] > -1:
        return ca[i, j]
    if i < 0 or j < 0:
        ca[i, j] = float('inf')
        return ca[i, j]
    # Row-major order guarantees that the three predecessor cells of
    # (a, b) are final before (a, b) itself is computed.
    for a in range(i + 1):
        for b in range(j + 1):
            if ca[a, b] > -1:
                continue  # memoised by an earlier call
            d = np.linalg.norm(p[a] - q[b])
            if a == 0 and b == 0:
                ca[a, b] = d
            elif b == 0:
                ca[a, b] = max(ca[a - 1, 0], d)
            elif a == 0:
                ca[a, b] = max(ca[0, b - 1], d)
            else:
                ca[a, b] = max(
                    min(ca[a - 1, b], ca[a - 1, b - 1], ca[a, b - 1]),
                    d,
                )
    return ca[i, j]
def frdist(p, q):
    """Compute the discrete Fréchet distance between two flattened curves.

    Each argument is reshaped into rows of two values (``reshape(-1, 2)``);
    rows that contain a NaN are then dropped.

    Reshaping *before* filtering keeps the pairing intact: removing single
    NaN values from the flat array first would shift every later value by
    one position whenever only one value of a pair is missing.

    Args:
        p: Array whose values form consecutive pairs, NaN marking padding.
        q: Array laid out like ``p``.

    Returns:
        The value computed by ``_c`` for the last point of each curve.

    Raises:
        ValueError: If either curve has no complete point left, or (from
            ``reshape``) if an input has an odd number of values.
    """
    p = np.asarray(p).reshape(-1, 2)
    q = np.asarray(q).reshape(-1, 2)
    p = p[~np.isnan(p).any(axis=1)]
    q = q[~np.isnan(q).any(axis=1)]
    len_p = len(p)
    len_q = len(q)
    if len_p == 0 or len_q == 0:
        raise ValueError('Input curves are empty.')
    # Both arrays have exactly two columns after the reshape, so no
    # separate dimensionality check is needed; the curves may still have
    # different numbers of points.
    # -1 marks "not computed yet" in the memo table consumed by _c.
    ca = np.full((len_p, len_q), -1.0, dtype=np.float64)
    return _c(ca, len_p - 1, len_q - 1, p, q)
print(squareform(pdist(df.values, frdist)))
结果:
[[ 0. 18.28131545 41.95464432 29.22027212 20.32481187]
[18.28131545 0. 38.9573328 12.59094238 20.18389517]
[41.95464432 38.9573328 0. 39.92453004 39.93376923]
[29.22027212 12.59094238 39.92453004 0. 31.13715882]
[20.32481187 20.18389517 39.93376923 31.13715882 0. ]]
无需重新发明轮子
Fréchet 距离的计算已由 similaritymeasures 包提供,所以下面的代码会给出与上面相同的结果:
from scipy.spatial.distance import pdist, squareform
import similaritymeasures
def frechet(p, q):
    """Return ``similaritymeasures.frechet_dist`` for two flattened curves.

    Each argument is reshaped into rows of two values and rows containing
    a NaN are dropped. Reshaping before filtering keeps the pairing intact
    even when only one value of a pair is missing.

    Args:
        p: Array whose values form consecutive pairs, NaN marking padding.
        q: Array laid out like ``p``.

    Returns:
        Whatever ``similaritymeasures.frechet_dist`` returns for the two
        cleaned two-column arrays.
    """
    p = np.asarray(p).reshape(-1, 2)
    q = np.asarray(q).reshape(-1, 2)
    p = p[~np.isnan(p).any(axis=1)]
    q = q[~np.isnan(q).any(axis=1)]
    return similaritymeasures.frechet_dist(p, q)
print(squareform(pdist(df.values, frechet)))
我的部分数据集(实际上我的数据集大小(106,1800)
):
df =
1 1.1 2 2.1 3 3.1 4 4.1 5 5.1
0 43.1024 6.7498 NaN NaN NaN NaN NaN NaN NaN NaN
1 46.0595 1.6829 25.0695 3.7463 NaN NaN NaN NaN NaN NaN
2 25.0695 5.5454 44.9727 8.6660 41.9726 2.6666 84.9566 3.8484 44.9566 1.8484
3 35.0281 7.7525 45.0322 3.7465 14.0369 3.7463 NaN NaN NaN NaN
4 35.0292 7.5616 45.0292 4.5616 23.0292 3.5616 45.0292 6.7463 NaN NaN
根据 Tom 的回答,我目前能做到的是:
- 我手动把前两行写成了 p 和 q 的值:
p =
[[45.1024,7.7498],[45.1027,7.7513],[45.1072,7.7568],[45.1076,7.7563]]
q=
[[45.0595,7.6829],[45.0595,7.6829],[45.0564,7.6820],[45.0533,7.6796],[45.0501,7.6775]]
然后:
__all__ = ['frdist']
def _c(ca, i, j, p, q):
    """Return the discrete Fréchet coupling value for ``p[:i+1]``, ``q[:j+1]``.

    ``ca`` is the memo table: an entry greater than -1 counts as already
    computed and is reused; every entry computed here is written back.

    The table is filled iteratively rather than recursively, because the
    recursive formulation needs roughly ``i + j`` nested calls and raises
    ``RecursionError`` for curves with many points.

    Args:
        ca: 2-D float array used as memo; -1 marks "not computed yet".
        i: Index of the last point of ``p`` to take into account.
        j: Index of the last point of ``q`` to take into account.
        p: Array with one point of the first curve per row.
        q: Array with one point of the second curve per row.

    Returns:
        The value stored in ``ca[i, j]``; ``inf`` when ``i`` or ``j`` is
        negative and that cell holds no memoised value.
    """
    if ca[i, j] > -1:
        return ca[i, j]
    if i < 0 or j < 0:
        ca[i, j] = float('inf')
        return ca[i, j]
    # Row-major order guarantees that the three predecessor cells of
    # (a, b) are final before (a, b) itself is computed.
    for a in range(i + 1):
        for b in range(j + 1):
            if ca[a, b] > -1:
                continue  # memoised by an earlier call
            d = np.linalg.norm(p[a] - q[b])
            if a == 0 and b == 0:
                ca[a, b] = d
            elif b == 0:
                ca[a, b] = max(ca[a - 1, 0], d)
            elif a == 0:
                ca[a, b] = max(ca[0, b - 1], d)
            else:
                ca[a, b] = max(
                    min(ca[a - 1, b], ca[a - 1, b - 1], ca[a, b - 1]),
                    d,
                )
    return ca[i, j]
然后:
def frdist(p, q):
    """Compute the discrete Fréchet distance between curves ``p`` and ``q``.

    Points with a missing or non-finite coordinate are dropped from both
    curves first, so the two curves may end up with different lengths.

    Args:
        p: Sequence of points, each a sequence of coordinates.
        q: Sequence of points with the same number of coordinates as ``p``.

    Returns:
        The value computed by ``_c`` for the last point of each curve.

    Raises:
        ValueError: If either curve is empty after filtering, or the first
            points of the two curves differ in number of coordinates.
    """
    # Keep a point only when *all* of its coordinates are finite; a partly
    # missing point would otherwise inject NaN into the distance table.
    p = np.array(
        [pt for pt in p if np.size(pt) and np.all(np.isfinite(pt))],
        np.float64,
    )
    q = np.array(
        [pt for pt in q if np.size(pt) and np.all(np.isfinite(pt))],
        np.float64,
    )
    len_p = len(p)
    len_q = len(q)
    if len_p == 0 or len_q == 0:
        raise ValueError('Input curves are empty.')
    # After filtering, p and q need not have the same number of points,
    # but their points must have the same dimensionality.
    if len(p[0]) != len(q[0]):
        raise ValueError('Input curves do not have the same dimensions.')
    # -1 marks "not computed yet" in the memo table consumed by _c.
    ca = np.full((len_p, len_q), -1.0, dtype=np.float64)
    return _c(ca, len_p - 1, len_q - 1, p, q)
frdist(p, q)
这样可行。但是我怎样才能对整个数据集进行计算,而不是逐行手动选取 p 和 q 呢?
最后我需要得到一个 106×106 的对称矩阵,其对角线为 0。
我认为您唯一需要做的更改是在 frdist 函数内部,首先从 p 中删除 NaN 值。这就需要去掉“p 和 q 长度必须相同”的条件,但我认为这应该没问题,因为您自己说过 p 有 1 个值而 q 有 1800 个值。
def frdist(p, q):
    """Compute the discrete Fréchet distance between curves ``p`` and ``q``.

    Points with a missing or non-finite coordinate are dropped from *both*
    curves (not only from ``p``), so a NaN-padded ``q`` cannot leak NaN
    into the result. The curves may therefore differ in length.

    Args:
        p: Sequence of points, each a sequence of coordinates.
        q: Sequence of points with the same number of coordinates as ``p``.

    Returns:
        The value computed by ``_c`` for the last point of each curve.

    Raises:
        ValueError: If either curve is empty after filtering, or the first
            points of the two curves differ in number of coordinates.
    """
    # Keep a point only when all of its coordinates are finite.
    p = np.array(
        [pt for pt in p if np.size(pt) and np.all(np.isfinite(pt))],
        np.float64,
    )
    q = np.array(
        [pt for pt in q if np.size(pt) and np.all(np.isfinite(pt))],
        np.float64,
    )
    len_p = len(p)
    len_q = len(q)
    if len_p == 0 or len_q == 0:
        raise ValueError('Input curves are empty.')
    # p and q no longer have to contain the same number of points, but
    # their points must have the same dimensionality.
    if len(p[0]) != len(q[0]):
        raise ValueError('Input curves do not have the same dimensions.')
    # -1 marks "not computed yet" in the memo table consumed by _c.
    ca = np.full((len_p, len_q), -1.0, dtype=np.float64)
    return _c(ca, len_p - 1, len_q - 1, p, q)
然后给出:
frdist(p, q)
1.9087938076177846
删除 NaN 值
简单明了:
p = p[~np.isnan(p)]
计算整个数据集的 Fréchet 距离
最简单的方法是使用 SciPy 的成对距离计算函数 pdist。它接受一个 m 个观测 × n 个维度的数组,因此我们需要在 frdist 内部用 reshape(-1,2) 把每一行重塑为两列的点数组。pdist 返回压缩形式(上三角)的距离矩阵,我们再用 squareform 得到对角线为 0 的 m x m 对称矩阵。
import pandas as pd
import numpy as np
import io
from scipy.spatial.distance import pdist, squareform
# Sample rows of the data set; each row is a NaN-padded sequence of
# coordinate pairs, and the first field of every row is the index.
data = """ 1 1.1 2 2.1 3 3.1 4 4.1 5 5.1
0 43.1024 6.7498 NaN NaN NaN NaN NaN NaN NaN NaN
1 46.0595 1.6829 25.0695 3.7463 NaN NaN NaN NaN NaN NaN
2 25.0695 5.5454 44.9727 8.6660 41.9726 2.6666 84.9566 3.8484 44.9566 1.8484
3 35.0281 7.7525 45.0322 3.7465 14.0369 3.7463 NaN NaN NaN NaN
4 35.0292 7.5616 45.0292 4.5616 23.0292 3.5616 45.0292 6.7463 NaN NaN
"""
# Raw string: in a normal string literal "\s" is an invalid escape sequence
# and triggers a warning on current Python versions.
df = pd.read_csv(io.StringIO(data), sep=r'\s+')
def _c(ca, i, j, p, q):
    """Return the discrete Fréchet coupling value for ``p[:i+1]``, ``q[:j+1]``.

    ``ca`` is the memo table: an entry greater than -1 counts as already
    computed and is reused; every entry computed here is written back.

    The table is filled iteratively rather than recursively, because the
    recursive formulation needs roughly ``i + j`` nested calls and raises
    ``RecursionError`` for curves with many points.

    Args:
        ca: 2-D float array used as memo; -1 marks "not computed yet".
        i: Index of the last point of ``p`` to take into account.
        j: Index of the last point of ``q`` to take into account.
        p: Array with one point of the first curve per row.
        q: Array with one point of the second curve per row.

    Returns:
        The value stored in ``ca[i, j]``; ``inf`` when ``i`` or ``j`` is
        negative and that cell holds no memoised value.
    """
    if ca[i, j] > -1:
        return ca[i, j]
    if i < 0 or j < 0:
        ca[i, j] = float('inf')
        return ca[i, j]
    # Row-major order guarantees that the three predecessor cells of
    # (a, b) are final before (a, b) itself is computed.
    for a in range(i + 1):
        for b in range(j + 1):
            if ca[a, b] > -1:
                continue  # memoised by an earlier call
            d = np.linalg.norm(p[a] - q[b])
            if a == 0 and b == 0:
                ca[a, b] = d
            elif b == 0:
                ca[a, b] = max(ca[a - 1, 0], d)
            elif a == 0:
                ca[a, b] = max(ca[0, b - 1], d)
            else:
                ca[a, b] = max(
                    min(ca[a - 1, b], ca[a - 1, b - 1], ca[a, b - 1]),
                    d,
                )
    return ca[i, j]
def frdist(p, q):
    """Compute the discrete Fréchet distance between two flattened curves.

    Each argument is reshaped into rows of two values (``reshape(-1, 2)``);
    rows that contain a NaN are then dropped.

    Reshaping *before* filtering keeps the pairing intact: removing single
    NaN values from the flat array first would shift every later value by
    one position whenever only one value of a pair is missing.

    Args:
        p: Array whose values form consecutive pairs, NaN marking padding.
        q: Array laid out like ``p``.

    Returns:
        The value computed by ``_c`` for the last point of each curve.

    Raises:
        ValueError: If either curve has no complete point left, or (from
            ``reshape``) if an input has an odd number of values.
    """
    p = np.asarray(p).reshape(-1, 2)
    q = np.asarray(q).reshape(-1, 2)
    p = p[~np.isnan(p).any(axis=1)]
    q = q[~np.isnan(q).any(axis=1)]
    len_p = len(p)
    len_q = len(q)
    if len_p == 0 or len_q == 0:
        raise ValueError('Input curves are empty.')
    # Both arrays have exactly two columns after the reshape, so no
    # separate dimensionality check is needed; the curves may still have
    # different numbers of points.
    # -1 marks "not computed yet" in the memo table consumed by _c.
    ca = np.full((len_p, len_q), -1.0, dtype=np.float64)
    return _c(ca, len_p - 1, len_q - 1, p, q)
print(squareform(pdist(df.values, frdist)))
结果:
[[ 0. 18.28131545 41.95464432 29.22027212 20.32481187]
[18.28131545 0. 38.9573328 12.59094238 20.18389517]
[41.95464432 38.9573328 0. 39.92453004 39.93376923]
[29.22027212 12.59094238 39.92453004 0. 31.13715882]
[20.32481187 20.18389517 39.93376923 31.13715882 0. ]]
无需重新发明轮子
Fréchet 距离的计算已由 similaritymeasures 包提供,所以下面的代码会给出与上面相同的结果:
from scipy.spatial.distance import pdist, squareform
import similaritymeasures
def frechet(p, q):
    """Return ``similaritymeasures.frechet_dist`` for two flattened curves.

    Each argument is reshaped into rows of two values and rows containing
    a NaN are dropped. Reshaping before filtering keeps the pairing intact
    even when only one value of a pair is missing.

    Args:
        p: Array whose values form consecutive pairs, NaN marking padding.
        q: Array laid out like ``p``.

    Returns:
        Whatever ``similaritymeasures.frechet_dist`` returns for the two
        cleaned two-column arrays.
    """
    p = np.asarray(p).reshape(-1, 2)
    q = np.asarray(q).reshape(-1, 2)
    p = p[~np.isnan(p).any(axis=1)]
    q = q[~np.isnan(q).any(axis=1)]
    return similaritymeasures.frechet_dist(p, q)
print(squareform(pdist(df.values, frechet)))