如何处理 MatchIt 中未被识别（例如）的替代公式？

Question

我收到了关于如何替换公式的很棒的。我需要它来匹配 data.frames 的列表。

与MatchIt::matchit()匹配时，首先，我必须将结果保存为matchit.full / matchit class。其次，使用 match.data() data.frames w/ 仅创建匹配的观察值。

问题出现在第二步，当我像往常一样使用公式时工作正常。对于替换，现在似乎 match.data() 需要以某种方式识别公式，但它不会。

以此为例（警告可以忽略）：

# example list
library(car)
WeightLoss1 <- WeightLoss
WeightLoss1$group <- as.integer(ifelse(WeightLoss1$group == "Control", 0, 1))

WL = list(WeightLoss1, WeightLoss1, WeightLoss1)  # doesn't make much sense, but suffices for example

# substitute formula
wl.cov <- c("wl1", "se1")
WL.FM <- reformulate(wl.cov, response = "group")

# matching w/o substitution
m.match.0 <- lapply(1:length(WL), function(mark) {
  require(MatchIt)
  matchit(group ~ wl1 + se1, data = WL[[mark]])
})  

# matching w/ substitution
m.match.1 <- lapply(1:length(WL), function(mark) {
  require(MatchIt)
  matchit(WL.FM, data = WL[[mark]])
})

# now compare both attempts to create list of data.frames
# w/o
match <- lapply(1:length(m.match.0), function(i){
  require(MatchIt)
  match.data(m.match.0[[i]])
})

# w/
match <- lapply(1:length(m.match.1), function(i){
  require(MatchIt)
  match.data(m.match.1[[i]])
})

可以看出，尝试 w/o 替换工作正常，尝试替换会产生错误 Error in eval(object$call$data, envir = env) : object 'mark' not found。

如何打补丁？

--

备注：

> match.data
function (object, group = "all", distance = "distance", weights = "weights", 
    subclass = "subclass") 
{
    if (!is.null(object$model)) {
        env <- attributes(terms(object$model))$.Environment
    }
    else {
        env <- parent.frame()
    }
    data <- eval(object$call$data, envir = env)
    treat <- object$treat
    wt <- object$weights
    vars <- names(data)
    if (distance %in% vars) 
        stop("invalid input for distance. choose a different name.")
    else if (!is.null(object$distance)) {
        dta <- data.frame(cbind(data, object$distance))
        names(dta) <- c(names(data), distance)
        data <- dta
    }
    if (weights %in% vars) 
        stop("invalid input for weights. choose a different name.")
    else if (!is.null(object$weights)) {
        dta <- data.frame(cbind(data, object$weights))
        names(dta) <- c(names(data), weights)
        data <- dta
    }
    if (subclass %in% vars) 
        stop("invalid input for subclass. choose a different name.")
    else if (!is.null(object$subclass)) {
        dta <- data.frame(cbind(data, object$subclass))
        names(dta) <- c(names(data), subclass)
        data <- dta
    }
    if (group == "all") 
        return(data[wt > 0, ])
    else if (group == "treat") 
        return(data[wt > 0 & treat == 1, ])
    else if (group == "control") 
        return(data[wt > 0 & treat == 0, ])
    else stop("error: invalid input for group.")
}
<bytecode: 0x00000000866125e0>
<environment: namespace:MatchIt>

> matchit
function (formula, data, method = "nearest", distance = "logit", 
    distance.options = list(), discard = "none", reestimate = FALSE, 
    ...) 
{
    mcall <- match.call()
    if (is.null(data)) 
        stop("Dataframe must be specified", call. = FALSE)
    if (!is.data.frame(data)) {
        stop("Data must be a dataframe", call. = FALSE)
    }
    if (sum(is.na(data)) > 0) 
        stop("Missing values exist in the data")
    ischar <- rep(0, dim(data)[2])
    for (i in 1:dim(data)[2]) if (is.character(data[, i])) 
        data[, i] <- as.factor(data[, i])
    if (!is.numeric(distance)) {
        fn1 <- paste("distance2", distance, sep = "")
        if (!exists(fn1)) 
            stop(distance, "not supported.")
    }
    if (is.numeric(distance)) {
        fn1 <- "distance2user"
    }
    fn2 <- paste("matchit2", method, sep = "")
    if (!exists(fn2)) 
        stop(method, "not supported.")
    tryerror <- try(model.frame(formula), TRUE)
    if (distance %in% c("GAMlogit", "GAMprobit", "GAMcloglog", 
        "GAMlog", "GAMcauchit")) {
        requireNamespace("mgcv")
        tt <- terms(mgcv::interpret.gam(formula)$fake.formula)
    }
    else {
        tt <- terms(formula)
    }
    attr(tt, "intercept") <- 0
    mf <- model.frame(tt, data)
    treat <- model.response(mf)
    X <- model.matrix(tt, data = mf)
    if (method == "exact") {
        distance <- out1 <- discarded <- NULL
        if (!is.null(distance)) 
            warning("distance is set to `NULL' when exact matching is used.")
    }
    else if (is.numeric(distance)) {
        out1 <- NULL
        discarded <- discard(treat, distance, discard, X)
    }
    else {
        if (is.null(distance.options$formula)) 
            distance.options$formula <- formula
        if (is.null(distance.options$data)) 
            distance.options$data <- data
        out1 <- do.call(fn1, distance.options)
        discarded <- discard(treat, out1$distance, discard, X)
        if (reestimate) {
            distance.options$data <- data[!discarded, ]
            distance.options$weights <- distance.options$weights[!discarded]
            tmp <- out1
            out1 <- do.call(fn1, distance.options)
            tmp$distance[!discarded] <- out1$distance
            out1$distance <- tmp$distance
        }
        distance <- out1$distance
    }
    if (fn1 == "distance2mahalanobis") {
        is.full.mahalanobis <- TRUE
    }
    else {
        is.full.mahalanobis <- FALSE
    }
    out2 <- do.call(fn2, list(treat, X, data, distance = distance, 
        discarded, is.full.mahalanobis = is.full.mahalanobis, 
        ...))
    if (fn1 == "distance2mahalanobis") {
        distance[1:length(distance)] <- NA
        class(out2) <- c("matchit.mahalanobis", "matchit")
    }
    out2$call <- mcall
    out2$model <- out1$model
    out2$formula <- formula
    out2$treat <- treat
    if (is.null(out2$X)) {
        out2$X <- X
    }
    out2$distance <- distance
    out2$discarded <- discarded
    nn <- matrix(0, ncol = 2, nrow = 4)
    nn[1, ] <- c(sum(out2$treat == 0), sum(out2$treat == 1))
    nn[2, ] <- c(sum(out2$treat == 0 & out2$weights > 0), sum(out2$treat == 
        1 & out2$weights > 0))
    nn[3, ] <- c(sum(out2$treat == 0 & out2$weights == 0 & out2$discarded == 
        0), sum(out2$treat == 1 & out2$weights == 0 & out2$discarded == 
        0))
    nn[4, ] <- c(sum(out2$treat == 0 & out2$weights == 0 & out2$discarded == 
        1), sum(out2$treat == 1 & out2$weights == 0 & out2$discarded == 
        1))
    dimnames(nn) <- list(c("All", "Matched", "Unmatched", "Discarded"), 
        c("Control", "Treated"))
    out2$nn <- nn
    return(out2)
}
<bytecode: 0x0000000086d6e158>
<environment: namespace:MatchIt>

Answer 1

首先请注意，这两个场景之间的主要区别不是替换，而是在非替换情况下，代码在调用 matchit 的函数中定义了公式，而在替换情况下，它定义了该函数之外的公式。在这两种情况下，如果公式是在函数外部定义的，它都会失败；如果公式是在函数内部定义的，那么在这两种情况下它都会起作用。

问题是因为公式是在函数外定义的所以例子中公式的环境是全局环境

environment(WL.FM)
## <environment: R_GlobalEnv>

而我们希望它是使用它的匿名函数中的本地环境。

1)试试这个：

m.match.1 <- lapply(WL, function(x) {
     WL.FM <- reformulate(wl.cov, response = "group")
     matchit(WL.FM, data = x)
})
match <- lapply(m.match.1, match.data)

2) 或者如果您不想在函数中定义公式，请尝试以下替代方法：

WL.FM <- reformulate(wl.cov, response = "group")
m.match.1 <- lapply(WL, function(x) {
     environment(WL.FM) <- environment()
     matchit(WL.FM, data = x)
})
match <- lapply(m.match.1, match.data)

2a) 另一种重置环境的方法是将公式转换为字符，然后再转换回公式：

WL.FM <- reformulate(wl.cov, response = "group")
m.match.1 <- lapply(WL, function(x) {
     WL.FM <- formula(format(WL.FM))
     matchit(WL.FM, data = x)
})
match <- lapply(m.match.1, match.data)

3) 另一种方法是将 WL.FM 定义为字符串而不是公式对象。然后它没有环境。将其转换为匿名函数中的公式，在这种情况下，其环境将默认为：

WL.FM <- format(reformulate(wl.cov, response = "group")) # character   
m.match.1 <- lapply(WL, function(x) matchit(formula(WL.FM), data = x))
match <- lapply(m.match.1, match.data)

注：虽然与关键问题无关，但是从风格上来说，上面我们去掉了require语句。在代码顶部使用单个 library 语句并且不要使用 require 除非它在 if 语句中。 -- if (require(...)) ... 如果要加载的包不可用，您希望代码尽早失败。

我们还更改了 lapply 代码以迭代 WL 和 m.match.1 而不是在每种情况下迭代下标。

如何处理 MatchIt 中未被识别（例如）的替代公式？

How to deal with a substituted formula not being recognized (e. g.) in MatchIt?

r

matching

substitution