如何从 data.table 中删除 NaN 和 Inf 值,其中所有列都是 R 中的字符类型

How to remove NaN and Inf values from data.table where all columns are character types in R

我有一个data.table如下-

# Example quotes table. The four price columns hold identical values and are
# stored as character strings, so missing prices appear as the text "NaN".
data <- local({
  price_text <- c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", rep("NaN", 5L))
  structure(
    list(
      date = c(
        "2021-11-24", "2021-11-24", "2021-11-26", "2021-11-24",
        "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
        "2021-11-26", "2021-11-26", "2021-11-26"
      ),
      open = price_text,
      high = price_text,
      low = price_text,
      close = price_text,
      volume = c(0L, 0L, 2L, 10L, 75L, 1L, rep(0L, 5L))
    ),
    row.names = c(NA, -11L),
    class = c("data.table", "data.frame")
  )
})

我想从此 data.table 中删除所有 NaN 和 Inf。
     date      open high  low close volume
 1: 2021-11-24  NaN  NaN  NaN   NaN      0
 2: 2021-11-24  NaN  NaN  NaN   NaN      0
 3: 2021-11-26 0.43 0.43 0.43  0.43      2
 4: 2021-11-24 0.17 0.17 0.17  0.17     10
 5: 2021-11-26 0.19 0.19 0.19  0.19     75
 6: 2021-11-24 0.15 0.15 0.15  0.15      1
 7: 2021-11-24  NaN  NaN  NaN   NaN      0
 8: 2021-11-26  NaN  NaN  NaN   NaN      0
 9: 2021-11-26  NaN  NaN  NaN   NaN      0
10: 2021-11-26  NaN  NaN  NaN   NaN      0
11: 2021-11-26  NaN  NaN  NaN   NaN      0

由于存在 NaN 值,open、high、low、close 这几列都是字符类型。

有没有直接在data.table中删除NaN的快速方法?

下面是每个解决方案的性能 -

p_load(dtplyr, dplyr)
# Benchmark every proposed solution against the same, unmodified `data`.
# microbenchmark evaluates each expression in the calling environment, so an
# expression that needs a converted or extended table works on its own copy
# (`converted`, `with_row`) instead of overwriting `data`. Overwriting it would
# make every later iteration run against an already-modified input.
microbenchmark::microbenchmark(

  user438383 = data[
    !unique(which(data == "NaN" | data == "Inf", arr.ind = TRUE)[, 1])
  ],

  langtang = na.omit(cbind(
    data[, .(date, volume)],
    data[, lapply(.SD, as.numeric), .SDcols = 2:5]
  )),

  akrun = {
    converted <- type.convert(data, as.is = TRUE)
    converted[converted[, Reduce(`&`, lapply(.SD, function(x)
      !is.nan(x) & is.finite(x))), .SDcols = -1]]
  },

  paul = {
    converted <- type.convert(data, as.is = TRUE)
    converted[converted[, is.finite(rowSums(.SD)), .SDcols = -1]]
  },

  Macosso = {
    # Explicit copy so the helper `Row` column never ends up in `data`.
    with_row <- data.table::copy(data)
    with_row$Row <- row.names(with_row)
    rm_rw <- with_row[apply(with_row, 1,
                            function(X) any(X == "NaN" | X == "Inf")), ] %>%
      dplyr::pull(Row)
    with_row[!row.names(with_row) %in% rm_rw, ] %>% dplyr::select(-Row)
  }
)

Unit: microseconds
       expr      min       lq      mean   median       uq       max neval  cld
 user438383  893.843  931.243  976.4554  974.011 1005.673  1093.929   100 a   
   langtang 2694.987 2779.411 2904.5124 2877.927 3003.832  3420.539   100   c 
      akrun 1664.476 1694.780 2253.8962 1731.392 1838.755 26035.268   100  b  
       paul 1663.552 1718.956 1792.2313 1770.511 1843.051  2151.975   100  b  
    Macosso 5899.961 6140.244 6429.9634 6368.072 6604.615  8180.782   100    d

您可以使用 as.numeric 进行转换吗?

# Convert the price columns to numeric, then keep only the rows in which every
# price is finite. is.finite() is FALSE for NaN, NA, Inf and -Inf, whereas
# na.omit() alone would drop the NaN/NA rows but keep rows containing Inf.
price_cols <- c("open", "high", "low", "close")
converted <- cbind(
  data[, .(date, volume)],
  data[, lapply(.SD, as.numeric), .SDcols = price_cols]
)
result <- converted[
  converted[, Reduce(`&`, lapply(.SD, is.finite)), .SDcols = price_cols]
]

输出:

         date volume open high  low close
1: 2021-11-26      2 0.43 0.43 0.43  0.43
2: 2021-11-24     10 0.17 0.17 0.17  0.17
3: 2021-11-26     75 0.19 0.19 0.19  0.19
4: 2021-11-24      1 0.15 0.15 0.15  0.15

一种方法是查找包含 NaN 的行的索引:
# Row numbers of every cell equal to the string "NaN" or "Inf", deduplicated.
# arr.ind = TRUE makes which() return (row, col) pairs; column 1 is the row.
unique(which(data == "NaN" | data == "Inf", arr.ind = TRUE)[, 1])
[1]  1  2  7  8  9 10 11

然后设置一个逻辑条件来删除这些行:

# Collect the row numbers that contain the string "NaN" or "Inf" in any column,
# then drop them; a leading `!` in data.table's `i` means "all rows except".
bad_rows <- unique(which(data == "NaN" | data == "Inf", arr.ind = TRUE)[, 1])
data[!bad_rows]
         date open high  low close volume
1: 2021-11-26 0.43 0.43 0.43  0.43      2
2: 2021-11-24 0.17 0.17 0.17  0.17     10
3: 2021-11-26 0.19 0.19 0.19  0.19     75
4: 2021-11-24 0.15 0.15 0.15  0.15      1

一些基准测试

Unit: milliseconds
     expr        min         lq       mean     median         uq       max neval  cld
       me   4.513141   5.545293   7.068744   6.707279   8.356170  31.30188   100 a   
 langtang   3.535727   3.646819   8.718629   6.318445   6.983275  59.76049   100 a   
    akrun  51.169168 195.102026 208.889413 204.564707 216.545022 274.02575   100   c 
     paul  11.235627 145.195062 146.721146 146.670909 148.432261 200.56718   100  b  
  Macosso 370.269687 448.143027 468.074160 457.499264 497.636319 553.70491   100    d
# Build the 11-row example table (the four price columns share the same
# character values), then stack 1000 copies of it for benchmarking.
data <- local({
  price_text <- c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", rep("NaN", 5L))
  structure(
    list(
      date = c(
        "2021-11-24", "2021-11-24", "2021-11-26", "2021-11-24",
        "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
        "2021-11-26", "2021-11-26", "2021-11-26"
      ),
      open = price_text,
      high = price_text,
      low = price_text,
      close = price_text,
      volume = c(0L, 0L, 2L, 10L, 75L, 1L, rep(0L, 5L))
    ),
    row.names = c(NA, -11L),
    class = c("data.table", "data.frame")
  )
})
data <- do.call("rbind", rep(list(data), 1000))

library(dtplyr)
library(dplyr)  # supplies %>%, filter(), across(), pull() and select() below

# Benchmark every proposed solution against the same, unmodified `data`.
# microbenchmark evaluates each expression in the calling environment, so an
# expression that needs a converted or extended table works on its own copy
# (`converted`, `with_row`) rather than overwriting `data`.
res <- microbenchmark::microbenchmark(
  # Compare against the strings "NaN"/"Inf": the columns are character, and
  # `data == NaN` (numeric NaN) is NA everywhere, so it would match no row.
  me = data[
    !unique(which(data == "NaN" | data == "Inf", arr.ind = TRUE)[, 1])
  ],

  langtang = na.omit(cbind(
    data[, .(date, volume)],
    data[, lapply(.SD, as.numeric), .SDcols = 2:5]
  )),

  akrun = {
    converted <- type.convert(data, as.is = TRUE)
    converted[converted[, Reduce(`&`, lapply(.SD, function(x)
      !is.nan(x) & is.finite(x))), .SDcols = -1]]
  },

  # Same predicate as the stand-alone dtplyr answer, so "Inf" is removed too.
  paul = data %>%
    lazy_dt() %>%
    filter(across(2:5, ~ !.x %in% c("NaN", "Inf"))) %>%
    as.data.table(),

  Macosso = {
    # Explicit copy so the helper `Row` column never ends up in `data`.
    with_row <- data.table::copy(data)
    with_row$Row <- row.names(with_row)
    rm_rw <- with_row[apply(with_row, 1,
                            function(X) any(X == "NaN" | X == "Inf")), ] %>%
      pull(Row)
    with_row[!row.names(with_row) %in% rm_rw, ] %>% select(-Row)
  }
)

基于dtplyr的解决方案:

library(dtplyr)
library(dplyr)
library(data.table)

# Example quotes table. The four price columns hold identical values and are
# stored as character strings, so missing prices appear as the text "NaN".
data <- local({
  price_text <- c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", rep("NaN", 5L))
  structure(
    list(
      date = c(
        "2021-11-24", "2021-11-24", "2021-11-26", "2021-11-24",
        "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
        "2021-11-26", "2021-11-26", "2021-11-26"
      ),
      open = price_text,
      high = price_text,
      low = price_text,
      close = price_text,
      volume = c(0L, 0L, 2L, 10L, 75L, 1L, rep(0L, 5L))
    ),
    row.names = c(NA, -11L),
    class = c("data.table", "data.frame")
  )
})

# Wrap the table for dtplyr, keep the rows whose columns 2 to 5 hold neither
# the string "NaN" nor "Inf", then materialise the result as a data.table.
data %>%
  lazy_dt() %>%
  filter(across(2:5, ~ !.x %in% c("NaN", "Inf"))) %>%
  as.data.table()

#>          date open high  low close volume
#> 1: 2021-11-26 0.43 0.43 0.43  0.43      2
#> 2: 2021-11-24 0.17 0.17 0.17  0.17     10
#> 3: 2021-11-26 0.19 0.19 0.19  0.19     75
#> 4: 2021-11-24 0.15 0.15 0.15  0.15      1

如果 NaN 和 Inf 不是字符类型(正如 @akrun 在他的回答中提到的),那么下面也是一个解决方案:

library(data.table)

# Parse the character columns into their natural types. This step can be
# skipped when NaN and Inf are already numeric values rather than strings.
data <- type.convert(data, as.is = TRUE)

# Over every column except the first, a row's sum is finite only when all of
# its values are finite, so one rowSums() call flags the NaN/Inf rows.
keep <- data[, is.finite(rowSums(.SD)), .SDcols = -1]
data[keep]

#>          date open high  low close volume
#> 1: 2021-11-26 0.43 0.43 0.43  0.43      2
#> 2: 2021-11-24 0.17 0.17 0.17  0.17     10
#> 3: 2021-11-26 0.19 0.19 0.19  0.19     75
#> 4: 2021-11-24 0.15 0.15 0.15  0.15      1

最佳策略是,获取具有 NaN 的行的索引,然后过滤掉这些索引。

library(dplyr)

# Flag every row that holds the string "NaN" or "Inf" in any column, then keep
# the remaining rows. The mask lives in its own variable, so no helper `Row`
# column is left behind in `data`. %in% is used instead of == so that an NA
# cell counts as "no match" rather than turning the whole row's flag into NA.
has_bad_value <- apply(data, 1, function(row) any(row %in% c("NaN", "Inf")))
data[!has_bad_value, ]

         date open high  low close volume
1: 2021-11-26 0.43 0.43 0.43  0.43      2
2: 2021-11-24 0.17 0.17 0.17  0.17     10
3: 2021-11-26 0.19 0.19 0.19  0.19     75
4: 2021-11-24 0.15 0.15 0.15  0.15      1

更新 1
将 any(X== "NaN") 更改为 any(X== "NaN"|X== "Inf"),这样 Inf 也可以被过滤掉

创建数据时 NaN 被加上了引号(写成了字符串 "NaN"),因此这些列的类型被不必要地变成了 character

> str(data)
Classes ‘data.table’ and 'data.frame':  11 obs. of  6 variables:
 $ date  : chr  "2021-11-24" "2021-11-24" "2021-11-26" "2021-11-24" ...
 $ open  : chr  "NaN" "NaN" "0.43" "0.17" ...
 $ high  : chr  "NaN" "NaN" "0.43" "0.17" ...
 $ low   : chr  "NaN" "NaN" "0.43" "0.17" ...
 $ close : chr  "NaN" "NaN" "0.43" "0.17" ...
 $ volume: int  0 0 2 10 75 1 0 0 0 0 ...

我们可能需要先自动转换列类型,然后使用 data.table 的方法:通过指定 .SDcols 遍历 'date' 以外的列,为每一列创建逻辑表达式,即列值不是 NaN(!is.nan)并且(&)是有限的(is.finite);再用 Reduce 和 & 将这些逻辑向量合并为单个逻辑向量,并用它对行取子集

library(data.table)
# Let R infer each column's type from its text (character stays character).
data <- type.convert(data, as.is = TRUE)

# For every column except the first, test that each value is finite and not
# NaN, then AND the per-column results into a single keep/drop flag per row.
row_is_clean <- data[
  ,
  Reduce(`&`, lapply(.SD, function(col) is.finite(col) & !is.nan(col))),
  .SDcols = -1
]
out <- data[row_is_clean]
out
         date open high  low close volume
1: 2021-11-26 0.43 0.43 0.43  0.43      2
2: 2021-11-24 0.17 0.17 0.17  0.17     10
3: 2021-11-26 0.19 0.19 0.19  0.19     75
4: 2021-11-24 0.15 0.15 0.15  0.15      1