使用 apply 和 data.table 在 R 中自动进行 num-to-char 转换
Automatic num-to-char conversion in R using apply and data.table
我想计算我的 data.frame
中两列之差的平均值,并按第三列分组。
apply
甚至不允许我做任何算术运算,除非显式转换那些本来就是数值的列。
data.table
能完成运算和分组,但返回的是一个字符向量。
dplyr
的语法则能正确返回数值。
为什么 apply() 将数值向量转换为字符?为什么 data.table
将结果转换为 char?
library(dplyr); library(data.table)
a <- letters[c(1,1:9)]
b <- (1:10)/10
c <- sin(1:10)
dat <- data.frame(a,b,c)
table(dat$a)
typeof(dat$b) #double
dat$bb <- apply(dat, 1,function(x) x["b"])
typeof(dat$bb) #character
dat$bb <- apply(dat, 1,function(x) x["b"]-x["c"])
# Error in x["b"] - x["c"] : non-numeric argument to binary operator
tidydat <- dat %>% group_by(a) %>% summarise(diffr = mean(b-c))
typeof(tidydat$diffr) #double
dt <- data.table(dat)
dt[,bb:=mean(b-c), by=a]
typeof(dt$bb) #character
> dt$bb
[1] "-0.725384205816789" "-0.725384205816789" "0.158879991940133" "1.15680249530793" "1.45892427466314"
[6] "0.879415498198926" "0.0430134012812109" "-0.189358246623382" "0.487881514758243" "1.54402111088937"
> tidydat$diffr
[1] -0.7253842 0.1588800 1.1568025 1.4589243 0.8794155 0.0430134 -0.1893582 0.4878815 1.5440211
编辑:关于 data.table
的那部分并不成立,我只是通过引用修改了一个已经存在的 char
列(感谢 @Akrun 指出)。
使用 apply
时,数据集会从 data.frame
被转换为 matrix
> is.matrix(apply(dat, 1, I))
[1] TRUE
矩阵只能有一个 class
,也就是说,只要其中有一个字符元素,整个数据就会被转换为字符。应改用 lapply
(如果是按列操作),或者也可以在执行 apply
之前先取出 numeric
列的子集
out <- apply(dat[-1], 1,function(x) x["b"]-x["c"])
-输出
> out
[1] -0.7414710 -0.7092974 0.1588800 1.1568025 1.4589243 0.8794155 0.0430134 -0.1893582 0.4878815 1.5440211
> str(out)
num [1:10] -0.741 -0.709 0.159 1.157 1.459 ...
行为不同的原因是:vector
的所有元素只能有一个 class;而在 data.frame/data.table/tibble 等结构中,列(而不是行)才是 list
的元素,即 class 是针对列而不是针对行的
关于 data.table
案例
> library(data.table)
> dt <- as.data.table(dat)
> dt$bb <- NULL # in case if the character column was already created
> dt[,bb:=mean(b-c), by=a]
> str(dt)
Classes ‘data.table’ and 'data.frame': 10 obs. of 4 variables:
$ a : chr "A" "A" "B" "C" ...
$ b : num 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
$ c : num 0.841 0.909 0.141 -0.757 -0.959 ...
$ bb: num -0.725 -0.725 0.159 1.157 0.704 ...
我认为上面的回答已经提供了足够的信息来理解背后的原因。其实你可以试试下面的代码,看看当你按行使用 apply
(by rows)
时发生了什么
> apply(dat, 1, str)
Named chr [1:3] "a" "0.1" " 0.8414710"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "a" "0.2" " 0.9092974"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "b" "0.3" " 0.1411200"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "c" "0.4" "-0.7568025"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "d" "0.5" "-0.9589243"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "e" "0.6" "-0.2794155"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "f" "0.7" " 0.6569866"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "g" "0.8" " 0.9893582"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "h" "0.9" " 0.4121185"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "i" "1.0" "-0.5440211"
- attr(*, "names")= chr [1:3] "a" "b" "c"
NULL
如您所见,当您运行 apply(dat,1,FUN = ...)
时,传递给 FUN
的数据被强制转换为一个字符向量,而不是 data.frame。
我想计算我的 data.frame
中两列之差的平均值,并按第三列分组。
apply
甚至不允许我做任何算术运算,除非显式转换那些本来就是数值的列。data.table
能完成运算和分组,但返回的是一个字符向量。dplyr
的语法则能正确返回数值。
为什么 apply() 将数值向量转换为字符?为什么 data.table
将结果转换为 char?
library(dplyr); library(data.table)
a <- letters[c(1,1:9)]
b <- (1:10)/10
c <- sin(1:10)
dat <- data.frame(a,b,c)
table(dat$a)
typeof(dat$b) #double
dat$bb <- apply(dat, 1,function(x) x["b"])
typeof(dat$bb) #character
dat$bb <- apply(dat, 1,function(x) x["b"]-x["c"])
# Error in x["b"] - x["c"] : non-numeric argument to binary operator
tidydat <- dat %>% group_by(a) %>% summarise(diffr = mean(b-c))
typeof(tidydat$diffr) #double
dt <- data.table(dat)
dt[,bb:=mean(b-c), by=a]
typeof(dt$bb) #character
> dt$bb
[1] "-0.725384205816789" "-0.725384205816789" "0.158879991940133" "1.15680249530793" "1.45892427466314"
[6] "0.879415498198926" "0.0430134012812109" "-0.189358246623382" "0.487881514758243" "1.54402111088937"
> tidydat$diffr
[1] -0.7253842 0.1588800 1.1568025 1.4589243 0.8794155 0.0430134 -0.1893582 0.4878815 1.5440211
编辑:关于 data.table
的那部分并不成立,我只是通过引用修改了一个已经存在的 char
列(感谢 @Akrun 指出)。
使用 apply
时,数据集会从 data.frame
被转换为 matrix
> is.matrix(apply(dat, 1, I))
[1] TRUE
矩阵只能有一个 class
,也就是说,只要其中有一个字符元素,整个数据就会被转换为字符。应改用 lapply
(如果是按列操作),或者也可以在执行 apply
之前先取出 numeric
列的子集
out <- apply(dat[-1], 1,function(x) x["b"]-x["c"])
-输出
> out
[1] -0.7414710 -0.7092974 0.1588800 1.1568025 1.4589243 0.8794155 0.0430134 -0.1893582 0.4878815 1.5440211
> str(out)
num [1:10] -0.741 -0.709 0.159 1.157 1.459 ...
行为不同的原因是:vector
的所有元素只能有一个 class;而在 data.frame/data.table/tibble 等结构中,列(而不是行)才是 list
的元素,即 class 是针对列而不是针对行的
关于 data.table
案例
> library(data.table)
> dt <- as.data.table(dat)
> dt$bb <- NULL # in case if the character column was already created
> dt[,bb:=mean(b-c), by=a]
> str(dt)
Classes ‘data.table’ and 'data.frame': 10 obs. of 4 variables:
$ a : chr "A" "A" "B" "C" ...
$ b : num 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
$ c : num 0.841 0.909 0.141 -0.757 -0.959 ...
$ bb: num -0.725 -0.725 0.159 1.157 0.704 ...
我认为上面的回答已经提供了足够的信息来理解背后的原因。其实你可以试试下面的代码,看看当你按行使用 apply
(by rows)时发生了什么
> apply(dat, 1, str)
Named chr [1:3] "a" "0.1" " 0.8414710"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "a" "0.2" " 0.9092974"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "b" "0.3" " 0.1411200"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "c" "0.4" "-0.7568025"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "d" "0.5" "-0.9589243"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "e" "0.6" "-0.2794155"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "f" "0.7" " 0.6569866"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "g" "0.8" " 0.9893582"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "h" "0.9" " 0.4121185"
- attr(*, "names")= chr [1:3] "a" "b" "c"
Named chr [1:3] "i" "1.0" "-0.5440211"
- attr(*, "names")= chr [1:3] "a" "b" "c"
NULL
如您所见,当您运行 apply(dat,1,FUN = ...)
时,传递给 FUN
的数据被强制转换为一个字符向量,而不是 data.frame。