在大 data.table 或 data.frame 中查找每行仅出现一次的值

Question

我正在尝试查找 data.table 中每行仅出现一次的值和值的位置。我发现这段代码可以细化每行的值：

How to find all values which only appear less than X times in a vector

我在下面的代码中使用了它。我想知道如何让它运行得更快。目前超过 1000 行需要这么长时间

一个申请：

system.time(apply((singletons),1, function(x) Filter(function (elem) length(which((x) == elem)) <= 1, (x))))
user  system elapsed 
18.528   0.000  18.543 

Rprof("asdas")
(apply((singletons),1, function(x) Filter(function (elem) length(which((x) == elem)) <= 1, (x))))
summaryRprof()


    $by.self
                           self.time self.pct total.time total.pct
    "=="                        0.08    23.53       0.08     23.53
    "as.character.default"      0.06    17.65       0.10     29.41
    "ls"                        0.06    17.65       0.06     17.65
    "which"                     0.04    11.76       0.26     76.47
    "as.character"              0.04    11.76       0.14     41.18
    "as.vector"                 0.04    11.76       0.04     11.76
    "lapply"                    0.02     5.88       0.28     82.35

    $by.total
                           total.time total.pct self.time self.pct
    "lapply"                     0.28     82.35      0.02     5.88
    "[.data.table"               0.28     82.35      0.00     0.00
    "["                          0.28     82.35      0.00     0.00
    "Filter"                     0.28     82.35      0.00     0.00
    "unlist"                     0.28     82.35      0.00     0.00
    "which"                      0.26     76.47      0.04    11.76
    "FUN"                        0.26     76.47      0.00     0.00
    "as.character"               0.14     41.18      0.04    11.76
    "as.character.default"       0.10     29.41      0.06    17.65
    "=="                         0.08     23.53      0.08    23.53
    "ls"                         0.06     17.65      0.06    17.65
    ".completeToken"             0.06     17.65      0.00     0.00
    "apropos"                    0.06     17.65      0.00     0.00
    "normalCompletions"          0.06     17.65      0.00     0.00
    "as.vector"                  0.04     11.76      0.04    11.76

    $sample.interval
    [1] 0.02

    $sampling.time
    [1] 0.34

data.table

中的一个

system.time(singletons[, Filter(function (elem) length(which(as.character(.SD) == elem)) <= 1, as.character(.SD)) , by = ID  ])
user  system elapsed 
25.064   0.000  25.085 

Rprof("asdas")
singletons[, Filter(function (elem) length(which(as.character(.SD) == elem)) <= 1, as.character(.SD)) , by = ID  ]
summaryRprof()

$by.self
                           self.time self.pct total.time total.pct
    "=="                        0.08    23.53       0.08     23.53
    "as.character.default"      0.06    17.65       0.10     29.41
    "ls"                        0.06    17.65       0.06     17.65
    "which"                     0.04    11.76       0.26     76.47
    "as.character"              0.04    11.76       0.14     41.18
    "as.vector"                 0.04    11.76       0.04     11.76
    "lapply"                    0.02     5.88       0.28     82.35

    $by.total
                           total.time total.pct self.time self.pct
    "lapply"                     0.28     82.35      0.02     5.88
    "[.data.table"               0.28     82.35      0.00     0.00
    "["                          0.28     82.35      0.00     0.00
    "Filter"                     0.28     82.35      0.00     0.00
    "unlist"                     0.28     82.35      0.00     0.00
    "which"                      0.26     76.47      0.04    11.76
    "FUN"                        0.26     76.47      0.00     0.00
    "as.character"               0.14     41.18      0.04    11.76
    "as.character.default"       0.10     29.41      0.06    17.65
    "=="                         0.08     23.53      0.08    23.53
    "ls"                         0.06     17.65      0.06    17.65
    ".completeToken"             0.06     17.65      0.00     0.00
    "apropos"                    0.06     17.65      0.00     0.00
    "normalCompletions"          0.06     17.65      0.00     0.00
    "as.vector"                  0.04     11.76      0.04    11.76

    $sample.interval
    [1] 0.02

    $sampling.time
    [1] 0.34

如果您能帮助我们加快速度，我们将不胜感激。

此外，我正在寻找那些只在一行中出现一次的东西的位置，所以如果有人对此有好的想法，请告诉我。

编辑：数据关于数据的注释，每一行只有一个值出现一次它并不总是在第二列

我去掉了前三列：

V1 V2 V3 V4 V5 V6 V7 V8
./    T/G    T/T     ./    T/T    T/T    T/T     ./
./    G/T    G/G     ./    G/G    G/G    G/G     ./
./    C/A    C/C    C/C    C/C    C/C    C/C     ./
./    G/T    G/G    G/G    G/G    G/G    G/G     ./
./    G/C    G/G    G/G    G/G    G/G    G/G     ./
A/A    A/T    A/A    A/A    A/A    A/A    A/A    A/A

期望的输出：

包含每行只出现一次的值的字符向量。

所以：

("T/G", "G/T", ...)

或者如果有人找出索引部分而不是 data.frame（不需要行列）：

singleton row column
"T/G" 1 2
"G/T" 2 2
.......
.......
.......

Answer 1

我建议不要按行操作并使用 apply 将数据集转换为矩阵，只需将其转换为长格式并在单列上操作

melt(singletons[, Row := .I], "Row")[, 
                                     if(.N == 1L) .(Column = variable), 
                                     by = .(Row, value)]
#    Row value Column
# 1:   1   T/G     V2
# 2:   2   G/T     V2
# 3:   3   C/A     V2
# 4:   4   G/T     V2
# 5:   5   G/C     V2
# 6:   6   A/T     V2

一些基准测试- 所以除了它是唯一提供可读输出的函数外，这是迄今为止最快的

set.seed(123)
N <- 1e4
BIDsingletons <- as.data.table(matrix(sample(unlist(singletons), N, replace = TRUE), ncol = N/1e2))
BIDsingletons2 <- copy(BIDsingletons)
DT <- function(BIDsingletons2) melt(BIDsingletons2[, Row := .I], "Row")[, 
                                     if(.N == 1L) .(Column = variable), 
                                     by = .(Row, value)]
OP <- function(BIDsingletons) apply(BIDsingletons, 1, function(x) Filter(function (elem) length(which((x) == elem)) <= 1, (x)))
Alexis_Laz <- function(BIDsingletons) apply(BIDsingletons, 1, function(x) which(!(duplicated(x) | duplicated(x, fromLast = TRUE))))

library(microbenchmark)
microbenchmark(DT(BIDsingletons2),
               #OP(BIDsingletons),
               Alexis_Laz(BIDsingletons))

# Unit: milliseconds
#                      expr       min        lq      mean    median        uq        max neval cld
#        DT(BIDsingletons2)  1.660324  1.911583  2.373655  2.093168  2.407389   8.150031   100  a 
#         OP(BIDsingletons) 57.763136 65.187614 72.071544 69.557509 76.446112 150.318052   100   b
# Alexis_Laz(BIDsingletons)  2.617990  2.847735  3.489971  3.052611  3.529667   8.605180   100  a

比较大一点的数据集

N <- 1e6
BIDsingletons <- as.data.table(matrix(sample(unlist(singletons), N, replace = TRUE), ncol = N/1e2))
BIDsingletons2 <- copy(BIDsingletons)
microbenchmark(DT(BIDsingletons2),
               # OP(BIDsingletons),
               Alexis_Laz(BIDsingletons))

# Unit: milliseconds
#                      expr       min        lq      mean   median        uq      max neval cld
#        DT(BIDsingletons2)  30.26517  33.79918  44.56996  36.2648  42.76773 128.8803   100  a 
# Alexis_Laz(BIDsingletons) 148.89655 213.85403 231.91895 232.7776 249.27168 325.6523   100   b

在大 data.table 或 data.frame 中查找每行仅出现一次的值

Find values that only occur once per row over large data.table or data.frame

profiling

r

apply

data.table