通过二维密度图画线

Draw line through 2d density plot

我有一个来自约 10,000 个患者样本 (TCGA) 的大型基因表达数据集,我正在绘制某个基因特征的预测表达值 (x) 和实际观察值 (y)。对于我的下游分析,我需要通过绘图绘制一条精确的线并计算样本中的不同参数 above/below 该线。
无论我如何在数据中画一条线 (geom_smooth(method = 'lm', 'glm', 'gam', or 'loess')),这条线似乎总是不完美——它没有按照我的喜好切割数据(图中的红线是 lm)。
玩了一会儿后,我意识到 2d 核密度线 (geom_density2d) 实际上很好地显示了我数据的 slope/trends,所以我手动画了一条切线密度线(图中的黑线)。

我的问题:如何自动绘制一条穿过内核密度线的线,如图中的黑线? (而不是手动玩不同的截距和斜率直到看起来不错)。

我能想到的最佳方法是以某种方式计算每条内核线的最长直径的截距和斜率,取所有这些截距和斜率的平均值并绘制该线,但这有点不合时宜我的联赛。也许这里有人对此有经验并可以提供帮助?

一个更 hacky 的方法可能是从 ggplot_build 获取每条内核密度线的 x,y 坐标,然后从那里开始,但感觉太 hacky(而且也不在我的范围内)。

谢谢!

编辑:更改了一些细节以使 figure/analysis 更容易。 (密度线现在更平滑)。 代表:

library(MASS)
set.seed(123)
samples <- 10000
r <- 0.9
data <- mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x <- data[, 1]  # standard normal (mu=0, sd=1)
y <- data[, 2]  # standard normal (mu=0, sd=1)

test.df <- data.frame(x = x, y = y)
lm(y ~ x, test.df)

ggplot(test.df, aes(x, y)) +
  geom_point(color = 'grey') +
  geom_density2d(color = 'red', lwd = 0.5, contour = T, h = c(2,2)) + ### EDIT: h = c(2,2)
  geom_smooth(method = "glm", se = F, lwd = 1, color = 'red') +
  geom_abline(intercept = 0, slope = 0.7, lwd = 1, col = 'black') ## EDIT: slope to 0.7

图:

我大体同意@Hack-R。
然而,这是一个有趣的问题,研究 ggplot_build 并不是什么大问题。

require(dplyr)
require(ggplot2)

p <- ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = c(2,2)) 
#basic version of your plot

p_built <- ggplot_build(p)

p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),] %>%
  select(x,y) # extracts the x/y coordinates of the points on the largest ellipse from your 2d-density contour

现在this answer帮我找到了这个椭圆上相距最远的点。

coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))

p_maxring <- p_maxring %>% 
  mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2)) #extra column specifying the distance of each point to the mean of those points

coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
# gives the coordinates of the point farthest away from the mean point

farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
#now this looks which of the points is the farthest from the point farthest from the mean point :D
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])



 ggplot(test.df, aes(x, y)) +
  geom_density2d(color = 'red', lwd = 0.5, contour = T, h = c(2,2)) +
  # geom_segment using the coordinates of the points farthest apart 
  geom_segment((aes(x = coord_farthest['x'], y = coord_farthest['y'],
                    xend = coord_fff['x'], yend = coord_fff['y']))) +
  geom_smooth(method = "glm", se = F, lwd = 1, color = 'red') +
# as per your request with your geom_smooth line

  coord_equal()

coord_equal 非常重要,因为否则你会得到非常奇怪的结果——它也把我的大脑搞得一团糟。因为如果坐标不设置相等,直线将貌似不会通过距离均值最远的点...

我留给你将它构建到一个函数中以使其自动化。另外,我会留给你计算两点的 y 截距和斜率

Tjebo 的方法一开始还不错,但仔细观察后发现它找到了椭圆上两点之间的最长距离。虽然这接近我想要的,但由于椭圆形状不规则或椭圆中的点稀疏而失败。这是因为它测量了两个之间最长的距离;而我真正想要的是椭圆的最长直径;即:半长轴。 examples/details.

见下图

简要说明:

至find/draw特定density/percentage的密度等高线:

求椭圆的最长直径("semi-major axis"):

对于 returns 截距和斜率(如在 OP 中)的功能,请参阅最后一段代码。
下面的两段代码和图片比较了两种 Tjebo 的方法与我基于上述帖子的新方法。

#### Reprex from OP
require(dplyr)
require(ggplot2)
require(MASS)
set.seed(123)
samples <- 10000
r <- 0.9
data <- mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x <- data[, 1]  # standard normal (mu=0, sd=1)
y <- data[, 2]  # standard normal (mu=0, sd=1)
test.df <- data.frame(x = x, y = y)

#### From Tjebo
p <- ggplot(test.df, aes(x, y)) +
  geom_density2d(color = 'red', lwd = 0.5, contour = T, h = 2) 
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),][,2:3]
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>% 
  mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2)) #extra column specifying the distance of each point to the mean of those points
p_maxring = p_maxring[round(seq(1, nrow(p_maxring), nrow(p_maxring)/23)),] #### Make a small ellipse to illustrate flaws of approach
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
# gives the coordinates of the point farthest away from the mean point
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
#now this looks which of the points is the farthest from the point farthest from the mean point :D
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])
farthest_2_points = data.frame(t(cbind(coord_farthest, coord_fff)))
plot(p_maxring[,1:2], asp=1)
lines(farthest_2_points, col = 'blue', lwd = 2)


#### From answer in another post
d = cbind(p_maxring[,1], p_maxring[,2])
r = ellipsoidhull(d)
exy = predict(r) ## the ellipsoid boundary
lines(exy)
me = colMeans((exy))           
dist2center = sqrt(rowSums((t(t(exy)-me))^2))
max(dist2center)     ## major axis
lines(exy[dist2center == max(dist2center),], col = 'red', lwd = 2)

#### The plot here is made from the data in the reprex in OP, but with h = 0.5
library(MASS)
set.seed(123)
samples <- 10000
r <- 0.9
data <- mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x <- data[, 1]  # standard normal (mu=0, sd=1)
y <- data[, 2]  # standard normal (mu=0, sd=1)
test.df <- data.frame(x = x, y = y)

## MAKE BLUE LINE
p <- ggplot(test.df, aes(x, y)) +
  geom_density2d(color = 'red', lwd = 0.5, contour = T, h = 0.5)  ## NOTE h = 0.5
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),][,2:3]
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>% 
  mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2))
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])

## MAKE RED LINE
## h = 0.5
## Given the highly irregular shape of the contours, I will use only the largest contour line (0.95) for draing the line.
## Thus, average = 1. See function below for details.
ln = long.diam("x", "y", test.df, h = 0.5, average = 1) ## NOTE h = 0.5

## PLOT
ggplot(test.df, aes(x, y)) +
  geom_density2d(color = 'red', lwd = 0.5, contour = T, h = 0.5) + ## NOTE h = 0.5
  geom_segment((aes(x = coord_farthest['x'], y = coord_farthest['y'],
                    xend = coord_fff['x'], yend = coord_fff['y'])), col = 'blue', lwd = 2) +
  geom_abline(intercept = ln[1], slope = ln[2], color = 'red', lwd = 2) +
  coord_equal()

最后,我想出了以下功能来处理这一切。抱歉缺少 comments/clarity

#### This will return the intercept and slope of the longest diameter (semi-major axis).
####If Average = TRUE, it will average the int and slope across different density contours.
long.diam = function(x, y, df, probs = c(0.95, 0.5, 0.1), average = T, h = 2) {
  fun.df = data.frame(cbind(df[,x], df[,y]))
  colnames(fun.df) = c("x", "y")
  dens = kde2d(fun.df$x, fun.df$y, n = 200, h = h)
  dx <- diff(dens$x[1:2])
  dy <- diff(dens$y[1:2])
  sz <- sort(dens$z)
  c1 <- cumsum(sz) * dx * dy 
  levels <- sapply(probs, function(x) { 
    approx(c1, sz, xout = 1 - x)$y
  })
  names(levels) = paste0("L", str_sub(formatC(probs, 2, format = 'f'), -2))
  #plot(fun.df$x,fun.df$y, asp = 1)
  #contour(dens, levels = levels, labels=probs, add=T, col = c('red', 'blue', 'green'), lwd = 2)
  #contour(dens, add = T, col = 'red', lwd = 2)
  #abline(lm(fun.df$y~fun.df$x))

  ls <- contourLines(dens, levels = levels)
  names(ls) = names(levels)

  lines.info = list()
  for (i in 1:length(ls)) {
    d = cbind(ls[[i]]$x, ls[[i]]$y)
    exy = predict(ellipsoidhull(d))## the ellipsoid boundary
    colnames(exy) = c("x", "y")
    me = colMeans((exy))            ## center of the ellipse
    dist2center = sqrt(rowSums((t(t(exy)-me))^2))
    #plot(exy,type='l',asp=1)
    #points(d,col='blue')
    #lines(exy[order(dist2center)[1:2],])
    #lines(exy[rev(order(dist2center))[1:2],])
    max.dist = data.frame(exy[rev(order(dist2center))[1:2],])
    line.fit = lm(max.dist$y ~ max.dist$x)
    lines.info[[i]] = c(as.numeric(line.fit$coefficients[1]), as.numeric(line.fit$coefficients[2]))
  }
  names(lines.info) = names(ls)

  #plot(fun.df$x,fun.df$y, asp = 1)
  #contour(dens, levels = levels, labels=probs, add=T, col = c('red', 'blue', 'green'), lwd = 2)
  #abline(lines.info[[1]], col = 'red', lwd = 2)
  #abline(lines.info[[2]], col = 'blue', lwd = 2)
  #abline(lines.info[[3]], col = 'green', lwd = 2)
  #abline(apply(simplify2array(lines.info), 1, mean), col = 'black', lwd = 4)
  if (isTRUE(average)) {
    apply(simplify2array(lines.info), 1, mean)
  } else {
    lines.info[[average]]
  }
}

最后,这里是不同答案的最终实现:

library(MASS)
set.seed(123)
samples = 10000
r = 0.9
data = mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x = data[, 1]  # standard normal (mu=0, sd=1)
y = data[, 2]  # standard normal (mu=0, sd=1)
#plot(x, y)
test.df = data.frame(x = x, y = y)

#### Find furthest two points of contour
## BLUE
p <- ggplot(test.df, aes(x, y)) +
  geom_density2d(color = 'red', lwd = 2, contour = T, h = 2) 
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),][,2:3]
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>% 
  mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2))
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])

#### Find the average intercept and slope of 3 contour lines (0.95, 0.5, 0.1), as in my long.diam function above.
## RED
ln = long.diam("x", "y", test.df)

#### Plot everything. Black line is GLM
ggplot(test.df, aes(x, y)) +
  geom_point(color = 'grey') +
  geom_density2d(color = 'red', lwd = 1, contour = T, h = 2) + 
  geom_smooth(method = "glm", se = F, lwd = 1, color = 'black') +
  geom_abline(intercept = ln[1], slope = ln[2], col = 'red', lwd = 1) +
  geom_segment((aes(x = coord_farthest['x'], y = coord_farthest['y'],
                    xend = coord_fff['x'], yend = coord_fff['y'])), col = 'blue', lwd = 1) +
  coord_equal()