R - gower.dist 与 daisy(...,metric="gower") 的结果不同
R - Different results gower.dist and daisy(...,metric="gower")
我想计算两个数据框的行之间的距离(相异度),以便为每个观测找到最近的聚类。因为我同时有因子变量和数值变量,所以我使用 Gower 距离。因为我想比较的是两个数据框(而不是同一个矩阵内各行之间的相异度),gower.dist 正是我需要的函数。然而,当我实现它时,我发现其结果与使用 daisy 的 gower 度量时得到的结果不同——后者是先用 rbind 把两个数据框的行合并,再查看相异矩阵中相关的那一部分。
我在这里只提供了数据的一个样本,但是当我对全部数据计算相异度时,gower.dist 经常给出为零的相异度,尽管对应的行彼此并不相同。为什么?造成结果不同的原因可能是什么?在我看来,daisy 的 gower 工作正常,而 gower.dist 不正常(在这个例子中)。
library(cluster)
library(StatMatch)
# Calculate distance using daisy's gower: stack df and cent so that daisy()
# sees the rows of both data frames in one dissimilarity matrix
daisyDist <- daisy(rbind(df, cent), metric = "gower")
daisyDist <- as.matrix(daisyDist)
# only look at the part where rows of cent (stacked after df) are compared to
# rows of df; seq_len() stays correct for empty inputs and drop = FALSE keeps
# a matrix even when cent has a single row
daisyDist <- daisyDist[
  nrow(df) + seq_len(nrow(cent)),
  seq_len(nrow(df)),
  drop = FALSE
]
# Calculate distance using gower.dist (rows of cent against rows of df,
# rngs left at its default)
gowerDist <- gower.dist(cent, df)
具有以下数据
# df: 5 observations of 11 variables (7 factors, 4 numeric columns). NA occurs
# in searchType, roomMax, priceMin, longitude, latitude and objectType.
df <- structure(list(searchType = structure(c(NA, 1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor"), roomMin = structure(c(4L, 1L, 1L, 6L, 6L), .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60", "70", "Missing[NoInput]"), class = "factor"), roomMax = structure(c(8L, 8L, NA, 10L, 9L), .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40", "50", "60", "70", "80", "90", "Missing[NoInput]"), class = "factor"), priceMin = c(NA, 73, 60, 29, 11), priceMax = c(35, 11, 1, 62, 23), sizeMin = structure(c(5L, 5L, 5L, 6L, 6L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), sizeMax = structure(c(1L, 6L, 5L, 3L, 1L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), longitude = c(6.6306, 7.47195, 8.5562, NA, 8.569), latitude = c(46.52425, 46.9512, 47.37515, NA, 47.3929), specificSearch = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), objectType = structure(c(NA, 2L, 2L, 2L, 2L), .Label = c("1", "2", "3", "Missing[]"), class = "factor")), .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax", "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch", "objectType"), row.names = c(112457L, 94601L, 78273L, 59172L, 117425L), class = "data.frame")
# cent: 3 rows with the same 11 columns and factor levels as df; it contains
# no NA values.
cent <- structure(list(searchType = structure(c(1L, 1L, 1L), .Label = c("1", "2"), class = "factor"), roomMin = structure(c(1L, 4L, 4L), .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60", "70", "Missing[NoInput]"), class = "factor"), roomMax = structure(c(6L, 9L, 8L), .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40", "50", "60", "70", "80", "90", "Missing[NoInput]"), class = "factor"), priceMin = c(60, 33, 73), priceMax = c(103, 46, 23), sizeMin = structure(c(1L, 5L, 5L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), sizeMax = structure(c(1L, 2L, 1L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), longitude = c(8.3015, 7.42765, 7.6104), latitude = c(47.05485, 46.9469, 46.75125), specificSearch = structure(c(1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), objectType = structure(c(2L, 2L, 2L), .Label = c("1", "2", "3", "Missing[]"), class = "factor")), .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax", "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch", "objectType"), row.names = c(60656L, 66897L, 130650L), class = "data.frame")
谢谢!
编辑:这一错误/差异似乎是由数值列中的 NA 引起的,两个函数对 NA 的处理方式似乎不同。我怎样才能让 gower.dist 采用 daisy 对 NA 的处理方式?
这是由于数据框的数字列中的 NA 值所致。考虑以下代码,以了解这两个函数如何在具有 NA 值的数字列中表现完全不同(daisy 比 gower.dist 更健壮):
# stack the rows of df on top of the rows of cent and show the first six rows
df1 <- rbind(df, cent)
head(df1, n = 6L)
searchType roomMin roomMax priceMin priceMax sizeMin sizeMax longitude latitude specificSearch objectType
112457 <NA> 20 30 NA 35 50 100 6.63060 46.52425 0 <NA>
94601 1 10 30 73 11 50 75 7.47195 46.95120 0 2
78273 1 10 <NA> 60 1 50 50 8.55620 47.37515 0 2
59172 1 30 50 29 62 75 150 NA NA 0 2
117425 1 30 40 11 23 75 100 8.56900 47.39290 0 2
60656 1 10 20 60 103 100 100 8.30150 47.05485 0 2
# restrict the comparison to priceMin, the numeric column at position 4
class(df1[[4]])
# [1] "numeric"
df2 <- df1[, 4, drop = FALSE]
# Gower dissimilarities of that single column as computed by daisy()
as.matrix(daisy(x = df2, metric = "gower"))
112457 94601 78273 59172 117425 60656 66897 130650
112457 0 NA NA NA NA NA NA NA
94601 NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
78273 NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
59172 NA 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
117425 NA 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
60656 NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
66897 NA 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
130650 NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
# the same single column through gower.dist(), with rngs left at its default
gower.dist(data.x = df2)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] NaN NaN NaN NaN NaN NaN NaN NaN
[2,] NaN 0 0 0 0 0 0 0
[3,] NaN 0 0 0 0 0 0 0
[4,] NaN 0 0 0 0 0 0 0
[5,] NaN 0 0 0 0 0 0 0
[6,] NaN 0 0 0 0 0 0 0
[7,] NaN 0 0 0 0 0 0 0
[8,] NaN 0 0 0 0 0 0 0
使用 gower.dist 函数中的参数 rngs 修复此问题:
# supply the range of the column explicitly, computed with the NAs removed
gower.dist(data.x = df2, rngs = diff(range(df2, na.rm = TRUE)))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] NaN NaN NaN NaN NaN NaN NaN NaN
[2,] NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
[3,] NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
[4,] NaN 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
[5,] NaN 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
[6,] NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
[7,] NaN 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
[8,] NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
因此,当数值变量中存在 NA 时,要让函数 gower.dist 像 daisy 一样工作,可以采用如下做法:
df1 <- rbind(df, cent)
# compute the ranges of the numeric variables correctly, i.e. with the NAs
# removed; rngs holds one entry per column of df1 and the non-numeric columns
# keep the placeholder value 1
cols <- which(vapply(df1, is.numeric, logical(1)))
rngs <- rep(1, ncol(df1))
# vapply() always returns a numeric vector, even when df1 has no numeric
# columns (sapply() would return an empty list in that case)
rngs[cols] <- vapply(
  df1[cols],
  function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
  numeric(1)
)
daisyDist <- as.matrix(daisy(df1, metric = "gower"))
# pass the NA-safe ranges explicitly: without rngs = rngs the vector computed
# above is never used and gower.dist() falls back to its default ranges
gowerDist <- gower.dist(df1, rngs = rngs)
daisyDist
112457 94601 78273 59172 117425 60656 66897 130650
112457 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
94601 0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
78273 0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
59172 0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
117425 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
60656 0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
66897 0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
130650 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000
gowerDist
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
[2,] 0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
[3,] 0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
[4,] 0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
[5,] 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
[6,] 0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
[7,] 0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
[8,] 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000
我想计算两个数据框的行之间的距离(相异度),以便为每个观测找到最近的聚类。因为我同时有因子变量和数值变量,所以我使用 Gower 距离。因为我想比较的是两个数据框(而不是同一个矩阵内各行之间的相异度),gower.dist 正是我需要的函数。然而,当我实现它时,我发现其结果与使用 daisy 的 gower 度量时得到的结果不同——后者是先用 rbind 把两个数据框的行合并,再查看相异矩阵中相关的那一部分。
我在这里只提供了数据的一个样本,但是当我对全部数据计算相异度时,gower.dist 经常给出为零的相异度,尽管对应的行彼此并不相同。为什么?造成结果不同的原因可能是什么?在我看来,daisy 的 gower 工作正常,而 gower.dist 不正常(在这个例子中)。
library(cluster)
library(StatMatch)
# Calculate distance using daisy's gower: stack df and cent so that daisy()
# sees the rows of both data frames in one dissimilarity matrix
daisyDist <- daisy(rbind(df, cent), metric = "gower")
daisyDist <- as.matrix(daisyDist)
# only look at the part where rows of cent (stacked after df) are compared to
# rows of df; seq_len() stays correct for empty inputs and drop = FALSE keeps
# a matrix even when cent has a single row
daisyDist <- daisyDist[
  nrow(df) + seq_len(nrow(cent)),
  seq_len(nrow(df)),
  drop = FALSE
]
# Calculate distance using gower.dist (rows of cent against rows of df,
# rngs left at its default)
gowerDist <- gower.dist(cent, df)
具有以下数据
# df: 5 observations of 11 variables (7 factors, 4 numeric columns). NA occurs
# in searchType, roomMax, priceMin, longitude, latitude and objectType.
df <- structure(list(searchType = structure(c(NA, 1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor"), roomMin = structure(c(4L, 1L, 1L, 6L, 6L), .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60", "70", "Missing[NoInput]"), class = "factor"), roomMax = structure(c(8L, 8L, NA, 10L, 9L), .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40", "50", "60", "70", "80", "90", "Missing[NoInput]"), class = "factor"), priceMin = c(NA, 73, 60, 29, 11), priceMax = c(35, 11, 1, 62, 23), sizeMin = structure(c(5L, 5L, 5L, 6L, 6L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), sizeMax = structure(c(1L, 6L, 5L, 3L, 1L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), longitude = c(6.6306, 7.47195, 8.5562, NA, 8.569), latitude = c(46.52425, 46.9512, 47.37515, NA, 47.3929), specificSearch = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), objectType = structure(c(NA, 2L, 2L, 2L, 2L), .Label = c("1", "2", "3", "Missing[]"), class = "factor")), .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax", "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch", "objectType"), row.names = c(112457L, 94601L, 78273L, 59172L, 117425L), class = "data.frame")
# cent: 3 rows with the same 11 columns and factor levels as df; it contains
# no NA values.
cent <- structure(list(searchType = structure(c(1L, 1L, 1L), .Label = c("1", "2"), class = "factor"), roomMin = structure(c(1L, 4L, 4L), .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60", "70", "Missing[NoInput]"), class = "factor"), roomMax = structure(c(6L, 9L, 8L), .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40", "50", "60", "70", "80", "90", "Missing[NoInput]"), class = "factor"), priceMin = c(60, 33, 73), priceMax = c(103, 46, 23), sizeMin = structure(c(1L, 5L, 5L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), sizeMax = structure(c(1L, 2L, 1L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), longitude = c(8.3015, 7.42765, 7.6104), latitude = c(47.05485, 46.9469, 46.75125), specificSearch = structure(c(1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), objectType = structure(c(2L, 2L, 2L), .Label = c("1", "2", "3", "Missing[]"), class = "factor")), .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax", "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch", "objectType"), row.names = c(60656L, 66897L, 130650L), class = "data.frame")
谢谢!
编辑:这一错误/差异似乎是由数值列中的 NA 引起的,两个函数对 NA 的处理方式似乎不同。我怎样才能让 gower.dist 采用 daisy 对 NA 的处理方式?
这是由于数据框的数字列中的 NA 值所致。考虑以下代码,以了解这两个函数如何在具有 NA 值的数字列中表现完全不同(daisy 比 gower.dist 更健壮):
# stack the rows of df on top of the rows of cent and show the first six rows
df1 <- rbind(df, cent)
head(df1, n = 6L)
searchType roomMin roomMax priceMin priceMax sizeMin sizeMax longitude latitude specificSearch objectType
112457 <NA> 20 30 NA 35 50 100 6.63060 46.52425 0 <NA>
94601 1 10 30 73 11 50 75 7.47195 46.95120 0 2
78273 1 10 <NA> 60 1 50 50 8.55620 47.37515 0 2
59172 1 30 50 29 62 75 150 NA NA 0 2
117425 1 30 40 11 23 75 100 8.56900 47.39290 0 2
60656 1 10 20 60 103 100 100 8.30150 47.05485 0 2
# restrict the comparison to priceMin, the numeric column at position 4
class(df1[[4]])
# [1] "numeric"
df2 <- df1[, 4, drop = FALSE]
# Gower dissimilarities of that single column as computed by daisy()
as.matrix(daisy(x = df2, metric = "gower"))
112457 94601 78273 59172 117425 60656 66897 130650
112457 0 NA NA NA NA NA NA NA
94601 NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
78273 NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
59172 NA 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
117425 NA 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
60656 NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
66897 NA 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
130650 NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
# the same single column through gower.dist(), with rngs left at its default
gower.dist(data.x = df2)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] NaN NaN NaN NaN NaN NaN NaN NaN
[2,] NaN 0 0 0 0 0 0 0
[3,] NaN 0 0 0 0 0 0 0
[4,] NaN 0 0 0 0 0 0 0
[5,] NaN 0 0 0 0 0 0 0
[6,] NaN 0 0 0 0 0 0 0
[7,] NaN 0 0 0 0 0 0 0
[8,] NaN 0 0 0 0 0 0 0
使用 gower.dist 函数中的参数 rngs 修复此问题:
# supply the range of the column explicitly, computed with the NAs removed
gower.dist(data.x = df2, rngs = diff(range(df2, na.rm = TRUE)))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] NaN NaN NaN NaN NaN NaN NaN NaN
[2,] NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
[3,] NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
[4,] NaN 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
[5,] NaN 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
[6,] NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
[7,] NaN 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
[8,] NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
因此,当数值变量中存在 NA 时,要让函数 gower.dist 像 daisy 一样工作,可以采用如下做法:
df1 <- rbind(df, cent)
# compute the ranges of the numeric variables correctly, i.e. with the NAs
# removed; rngs holds one entry per column of df1 and the non-numeric columns
# keep the placeholder value 1
cols <- which(vapply(df1, is.numeric, logical(1)))
rngs <- rep(1, ncol(df1))
# vapply() always returns a numeric vector, even when df1 has no numeric
# columns (sapply() would return an empty list in that case)
rngs[cols] <- vapply(
  df1[cols],
  function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
  numeric(1)
)
daisyDist <- as.matrix(daisy(df1, metric = "gower"))
# pass the NA-safe ranges explicitly: without rngs = rngs the vector computed
# above is never used and gower.dist() falls back to its default ranges
gowerDist <- gower.dist(df1, rngs = rngs)
daisyDist
112457 94601 78273 59172 117425 60656 66897 130650
112457 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
94601 0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
78273 0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
59172 0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
117425 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
60656 0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
66897 0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
130650 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000
gowerDist
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
[2,] 0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
[3,] 0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
[4,] 0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
[5,] 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
[6,] 0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
[7,] 0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
[8,] 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000