在 R 中使用带有连续值(数值范围)的查找表

Using a lookup table in R with continuous values

我在 R 中有一个查找表(lookup table),正在想办法实现按它进行查找。难点在于它涉及连续值,也就是数据范围:如果某个值落在某一行的最小值和最大值之间,我希望它选中这一行对应的结果。

我想用 'GRADE' 和 'SAT' 这两个连续变量,再加上分类变量 'TYPE',来分配一个 'GROUP' 值。下面这一大段代码看起来很吓人,但它们其实只是两张很小的表。

如有任何建议,我们将不胜感激!!!!

    # Lookup table: dput() output that recreates the lookup data frame when
    # evaluated. Each row pairs a Type, a min_grade/max_grade range and a
    # min_sat/max_sat range with the Group to assign.
    # NOTE(review): the ranges look inclusive at both ends (e.g. max_sat 599 in
    # one row and min_sat 600 in another) -- confirm with the author.
     structure(list(Type = structure(c(1L, 2L, 1L, 1L), .Label = c("A", 
"B"), class = "factor"), min_grade = c(93L, 85L, 93L, 80L), max_grade = c(100L, 
93L, 100L, 92L), min_sat = c(600L, 700L, 400L, 600L), max_sat = c(800L, 
800L, 599L, 800L), Group = structure(c(1L, 1L, 2L, 3L), .Label = c("A", 
"B", "C"), class = "factor")), .Names = c("Type", "min_grade", 
"max_grade", "min_sat", "max_sat", "Group"), class = "data.frame", row.names = c(NA, 
-4L))


# Example data: dput() output that recreates the example data frame with the
# columns Name, Grade, Sat, Type and Group. The Group column holds the DESIRED
# result; per the author it would be NULL before the lookup table is applied.


           structure(list(Name = structure(c(3L, 1L, 2L, 4L), .Label = c("Jack", 
    "James", "John", "Jordan"), class = "factor"), Grade = c(95L, 
 95L, 92L, 93L), Sat = c(701L, 500L, 800L, 800L), Type = structure(c(1L, 
1L, 1L, 2L), .Label = c("A", "B"), class = "factor"), Group = structure(c(1L, 
2L, 3L, 1L), .Label = c("A", "B", "C"), class = "factor")), .Names = c("Name", 
"Grade", "Sat", "Type", "Group"), class = "data.frame", row.names = c(NA, 
-4L))

这个怎么样?

# Lookup table: one rule per row. A rule assigns `Group` to records of the
# given `Type` whose Grade lies in [min_grade, max_grade] and whose Sat lies
# in [min_sat, max_sat]. Both ends are inclusive: the integer ranges in the
# table are contiguous (e.g. Sat 400-599 and 600-800), so an exclusive bound
# would leave boundary values such as Sat == 600 without any group.
ltab <- data.frame(
  Type = factor(c("A", "B", "A", "A")),
  min_grade = c(93L, 85L, 93L, 80L),
  max_grade = c(100L, 93L, 100L, 92L),
  min_sat = c(600L, 700L, 400L, 600L),
  max_sat = c(800L, 800L, 599L, 800L),
  Group = factor(c("A", "A", "B", "C"))
)

# Records to classify (same data as the question's example, without Group).
dat <- data.frame(
  Name = factor(c("John", "Jack", "James", "Jordan")),
  Grade = c(95L, 95L, 92L, 93L),
  Sat = c(701L, 500L, 800L, 800L),
  Type = factor(c("A", "A", "A", "B"))
)

# Pair every record with every rule of the same Type and flag matching pairs.
#
# records: data frame with columns Type, Grade and Sat.
# rules:   data frame with columns Type, min_grade, max_grade, min_sat,
#          max_sat (plus whatever should be looked up, e.g. Group).
# Returns the merged data frame with an extra logical column `FallsIn`.
# `FallsIn` is NA for rows that exist only because of `all = TRUE`
# (a record without any rule of its Type, or a rule without any record).
match_lookup <- function(records, rules) {
  paired <- merge(records, rules, by = "Type", all = TRUE)
  # Vectorised range test over all pairs at once; no row-wise apply needed,
  # so base R is sufficient and plyr is no longer required.
  paired$FallsIn <- paired$Grade >= paired$min_grade &
    paired$Grade <= paired$max_grade &
    paired$Sat >= paired$min_sat &
    paired$Sat <= paired$max_sat
  paired
}

mdat <- match_lookup(dat, ltab)
# which() drops NA flags; plain logical indexing would return all-NA rows.
mdat[which(mdat$FallsIn), ]

考虑泛化,是否需要检查更多连续变量?


编辑:我无法编辑 OP 的帖子,所以结合 OP 的评论,下面给出一个示例,说明我会如何处理“对多维连续随机变量进行分类”(categorizing multidimensional continuous random variables);保留这些英文关键字,是为了方便以后被搜索到。

# Cut points for each variable; consecutive values delimit one interval,
# so k cut points define k - 1 bins.
breaks <- setNames(
    list(
        c(0, 0.25, 1),
        c(0, 0.5, 1),
        c(0, 0.25, 0.75, 1)
    ),
    paste0("Var", 1:3)
)

# Generate interval labels on the fly from a vector of cut points.
#
# x: vector of cut points, in the order they should be paired up.
# Returns a character vector with one "(lower, upper]" label per pair of
# consecutive cut points, named "1", "2", ... by bin index. Fewer than two
# cut points define no interval, so an empty character vector is returned
# (the former 1:(length(x)-1) indexing produced bogus labels or an error).
genIntv <- function(x) {
    n_intervals <- length(x) - 1L
    if (n_intervals < 1L) {
        return(setNames(character(0), character(0)))
    }
    # head(x, -1) are the lower ends, tail(x, -1) the matching upper ends.
    ret <- paste0("(", head(x, -1L), ", ", tail(x, -1L), "]")
    names(ret) <- seq_len(n_intervals)
    ret
}
# One Group label per combination of bins. The count is derived from `breaks`
# instead of being hard-coded, so the tables stay valid when cut points are
# added or removed. LETTERS only has 26 entries; use another labelling scheme
# if there are more combinations than that.
groupLabels <- LETTERS[seq_len(prod(lengths(breaks) - 1L))]

# Human-readable lookup table: one row per combination of interval labels.
lookupTbl <- data.frame(
    expand.grid(lapply(breaks, genIntv), stringsAsFactors = FALSE),
    Group = groupLabels
)
# Same table keyed by integer bin index; this is the one used for the merge.
lookupTbl2 <- data.frame(
    expand.grid(lapply(breaks, function(x) seq_len(length(x) - 1L)),
        stringsAsFactors = FALSE),
    Group = groupLabels
)

#data set
dat <- data.frame(Var1 = c(0.1, 0.76), Var2 = c(0.5, 0.75), Var3 = c(0.25, 0.9))
# Bin every column with the cut points of the SAME NAME (not the same
# position), using right-closed intervals with the lowest break included.
binDat <- do.call(cbind, Map(function(values, cuts) {
    .bincode(values, cuts, right = TRUE, include.lowest = TRUE)
}, dat[names(breaks)], breaks))
# Left join: keep every observation, attach the Group of its bin combination.
merge(binDat, lookupTbl2, all.x = TRUE, all.y = FALSE)

如果其他人有更好的方法,欢迎分享,我也很乐意学习。

如果你有小数据,完全连接应该没问题。

library(dplyr)
# Pair each example row with the lookup rules of the same Type, then keep the
# pairs whose Grade and Sat fall inside the rule's ranges (inclusive at both
# ends, matching the contiguous integer ranges in the lookup table).
# Joining on Type is required: dropping it leaves the join without a key and
# would let a record match rules that belong to a different Type.
# NOTE(review): `example` and `look_up` are not defined in this snippet; they
# are presumably the question's example and lookup data frames. If `example`
# already carries a Group column, the join will suffix the two Group columns.
result <-
  example %>%
  full_join(look_up, by = "Type") %>%
  filter(min_grade <= Grade & Grade <= max_grade &
           min_sat <= Sat & Sat <= max_sat)