sample 函数在 ifelse 中重复相同的值
Sample function repeats same value within ifelse
我有以下数据框:
structure(list(Store = c("vpm", "vpm",
"vpm"), Date = structure(c(18042, 18042, 18042), class = "Date"),
UniqueImageId = c("vp3_523", "vp3_668", "vp3_523"), EntryTime = structure(c(1558835514,
1558834942, 1558835523), class = c("POSIXct", "POSIXt")),
ExitTime = structure(c(1558838793, 1558838793, 1558839824
), class = c("POSIXct", "POSIXt")), Duration = c(3279, 3851,
4301), Age = c(35L, 35L, 35L), EntryPoint = c("Entry2Side",
"Entry2Side", "Entry2Side"), ExitPoint = c("Exit2Side", "Exit2Side",
"Exit2Side"), AgeNew = c("15_20", "25_32", "15_20"), GenderNew = c("Female",
"Male", "Female")), row.names = 4:6, class = c("data.table",
"data.frame"))
我正在尝试为列 AgeNew
填充一个随机数,并且我正在使用带有 ifelse 条件的 sample
函数。
我尝试了以下方法
# Attempt from the question: map each AgeNew label to a random age with nested ifelse().
# Each sample(..., 1) call is evaluated once and returns a single value, which ifelse()
# recycles to every row matching that label, so all rows in a group get the same number.
# NOTE(review): the "25_32" branch samples 25:36 and the "38_43" branch samples 36:43,
# which do not match their labels - confirm whether this is intended.
d$AgeNew <- ifelse(d$AgeNew == "0_2", sample(0:2, 1,replace = TRUE),
ifelse(d$AgeNew == "15_20", sample(15:20,1,replace = TRUE),
ifelse(d$AgeNew == "25_32", sample(25:36,1,replace = TRUE),
ifelse(d$AgeNew == "38_43", sample(36:43,1,replace = TRUE),
ifelse(d$AgeNew == "4_6", sample(4:6, 1,replace = TRUE),
ifelse(d$AgeNew == "48_53", sample(48:53,1,replace = TRUE),
ifelse(d$AgeNew == "60_Inf",sample(60:65,1,replace = TRUE),
sample(8:13, 1,replace = TRUE))))))))
但是我得到的都是重复的相同值。例如,对于 0_2 年龄组,填充的全部是 2。我尝试过使用 set.seed
set.seed(123)
然后再运行 ifelse,但仍然重复相同的值。
一个更简单的选择是将 _ 替换为 :,然后用 eval 对其求值,再用 sample 从该范围内抽取一个元素
library(data.table)
# Turn each label such as "15_20" into the expression "15:20" (capping "Inf"
# at 65), evaluate it to obtain the range, and draw one value from it.
# The anonymous function runs once per row, so every row gets its own draw.
d[, AgeNew := sapply(sub("_", ":", sub('Inf', '65', AgeNew)),
function(x) sample(eval(parse(text = x)), 1))]
# Rows that are still NA fall back to the 8-13 range. Draw .N values (one per
# NA row) with replacement; a single sample(8:13, 1) would be recycled and
# give every NA row the same number.
d[is.na(AgeNew), AgeNew := sample(8:13, .N, replace = TRUE)]
d
# Store Date UniqueImageId EntryTime ExitTime Duration Age EntryPoint ExitPoint AgeNew GenderNew
#1: vpm 2019-05-26 vp3_523 2019-05-25 21:51:54 2019-05-25 22:46:33 3279 35 Entry2Side Exit2Side 15 Female
#2: vpm 2019-05-26 vp3_668 2019-05-25 21:42:22 2019-05-25 22:46:33 3851 35 Entry2Side Exit2Side 30 Male
#3: vpm 2019-05-26 vp3_523 2019-05-25 21:52:03 2019-05-25 23:03:44 4301 35 Entry2Side Exit2Side 17 Female
或 tidyverse
的另一个选项
library(tidyverse)
# Replace the open-ended "Inf" bound with 65, split each label into integer
# `start`/`end` columns, then draw one value per row from start:end.
d %>%
mutate(AgeNew = str_replace(AgeNew, "Inf", "65")) %>%
separate(AgeNew, into = c('start', 'end'), convert = TRUE) %>%
# separate() removes the original AgeNew column by default, so the sampled
# value is written back under the name AgeNew (not a misspelled new column).
mutate(AgeNew = map2_int(start, end, ~ sample(.x:.y, 1)))
或者另一种选择是按 _
拆分,然后采样
# Split each label on "_" (after capping "Inf" at 65) and draw one value per
# row from the resulting range. vapply() always returns an integer vector,
# even for zero rows, whereas unlist(lapply(...)) returns NULL there and
# `AgeNew := NULL` would delete the column.
d[, AgeNew := vapply(strsplit(sub('Inf', '65', AgeNew), "_"), function(x)
sample(as.numeric(x[1]):as.numeric(x[2]), 1), integer(1))]
请注意,这里不需要任何嵌套的 ifelse。不使用 ifelse 反而更容易做到这一点
注意 2:OP 以 data.table
为例,这里我们展示了 data.table
方法
注意3:使用嵌套ifelse是非常低效的
注意 4:基于 strsplit
的方法首次发布于此
关于为什么ifelse
的工作方式不同,在?ifelse
的文档中已经提到了
If yes or no are too short, their elements are recycled. yes will be evaluated if and only if any element of test is true, and analogously for no.
这已经在某处讨论过(目前找不到来源)。它之所以这样,是因为 ifelse
在一个条件下只运行一次,因此该值被回收。考虑这个例子,
# Demonstration: each sample(..., 1) yields a single value per call, and
# ifelse() recycles that value across every position of the matching branch.
x <- c(1, 2, 1, 2, 1, 2)
ifelse(x == 1, sample(1:10, 1), sample(20:30, 1))
#[1] 1 26 1 26 1 26
ifelse(x == 1, sample(1:10, 1), sample(20:30, 1))
#[1] 10 28 10 28 10 28
ifelse(x == 1, sample(1:10, 1), sample(20:30, 1))
#[1] 9 24 9 24 9 24
正如我们所见,每个条件下只生成了一个数字,并且该数字被回收使用。为避免这种情况,我们需要将 sample 的 size 参数指定为 ifelse 中 test 条件的长度
# Draw one candidate per element of x for each branch so nothing is recycled.
# replace = TRUE keeps the draws independent and avoids an error when
# length(x) exceeds the size of the range being sampled.
ifelse(x == 1,
       sample(1:10, length(x), replace = TRUE),
       sample(20:30, length(x), replace = TRUE))
#[1] 7 23 1 26 10 24
ifelse(x == 1,
       sample(1:10, length(x), replace = TRUE),
       sample(20:30, length(x), replace = TRUE))
#[1] 3 23 5 26 6 22
ifelse(x == 1,
       sample(1:10, length(x), replace = TRUE),
       sample(20:30, length(x), replace = TRUE))
#[1] 2 30 9 27 1 29
您将需要处理 Inf
。根据您的示例,我假设您要在出现 Inf
时添加 +5
。所以基于这个假设,我们可以做,
# For each "lo_hi" label, draw one integer from lo:hi. An open-ended "Inf"
# upper bound is replaced by lo + 5. Bounds are converted to numbers
# explicitly, and vapply() guarantees an integer vector for any input length.
vapply(strsplit(d$AgeNew, '_'), function(i) {
  lo <- as.numeric(i[1])
  hi <- if (i[2] == 'Inf') lo + 5 else as.numeric(i[2])
  sample(lo:hi, 1)
}, integer(1))
#[1] 60 32 19
注意: 我将 AgeNew
的第一个条目更改为 60_Inf
以测试
我有以下数据框:
structure(list(Store = c("vpm", "vpm",
"vpm"), Date = structure(c(18042, 18042, 18042), class = "Date"),
UniqueImageId = c("vp3_523", "vp3_668", "vp3_523"), EntryTime = structure(c(1558835514,
1558834942, 1558835523), class = c("POSIXct", "POSIXt")),
ExitTime = structure(c(1558838793, 1558838793, 1558839824
), class = c("POSIXct", "POSIXt")), Duration = c(3279, 3851,
4301), Age = c(35L, 35L, 35L), EntryPoint = c("Entry2Side",
"Entry2Side", "Entry2Side"), ExitPoint = c("Exit2Side", "Exit2Side",
"Exit2Side"), AgeNew = c("15_20", "25_32", "15_20"), GenderNew = c("Female",
"Male", "Female")), row.names = 4:6, class = c("data.table",
"data.frame"))
我正在尝试为列 AgeNew
填充一个随机数,并且我正在使用带有 ifelse 条件的 sample
函数。
我尝试了以下方法
# Attempt from the question: map each AgeNew label to a random age with nested ifelse().
# Each sample(..., 1) call is evaluated once and returns a single value, which ifelse()
# recycles to every row matching that label, so all rows in a group get the same number.
# NOTE(review): the "25_32" branch samples 25:36 and the "38_43" branch samples 36:43,
# which do not match their labels - confirm whether this is intended.
d$AgeNew <- ifelse(d$AgeNew == "0_2", sample(0:2, 1,replace = TRUE),
ifelse(d$AgeNew == "15_20", sample(15:20,1,replace = TRUE),
ifelse(d$AgeNew == "25_32", sample(25:36,1,replace = TRUE),
ifelse(d$AgeNew == "38_43", sample(36:43,1,replace = TRUE),
ifelse(d$AgeNew == "4_6", sample(4:6, 1,replace = TRUE),
ifelse(d$AgeNew == "48_53", sample(48:53,1,replace = TRUE),
ifelse(d$AgeNew == "60_Inf",sample(60:65,1,replace = TRUE),
sample(8:13, 1,replace = TRUE))))))))
但是我得到的都是重复的相同值。例如,对于 0_2 年龄组,填充的全部是 2。我尝试过使用 set.seed
set.seed(123)
然后再运行 ifelse,但仍然重复相同的值。
一个更简单的选择是将 _ 替换为 :,然后用 eval 对其求值,再用 sample 从该范围内抽取一个元素
library(data.table)
# Turn each label such as "15_20" into the expression "15:20" (capping "Inf"
# at 65), evaluate it to obtain the range, and draw one value from it.
# The anonymous function runs once per row, so every row gets its own draw.
d[, AgeNew := sapply(sub("_", ":", sub('Inf', '65', AgeNew)),
function(x) sample(eval(parse(text = x)), 1))]
# Rows that are still NA fall back to the 8-13 range. Draw .N values (one per
# NA row) with replacement; a single sample(8:13, 1) would be recycled and
# give every NA row the same number.
d[is.na(AgeNew), AgeNew := sample(8:13, .N, replace = TRUE)]
d
# Store Date UniqueImageId EntryTime ExitTime Duration Age EntryPoint ExitPoint AgeNew GenderNew
#1: vpm 2019-05-26 vp3_523 2019-05-25 21:51:54 2019-05-25 22:46:33 3279 35 Entry2Side Exit2Side 15 Female
#2: vpm 2019-05-26 vp3_668 2019-05-25 21:42:22 2019-05-25 22:46:33 3851 35 Entry2Side Exit2Side 30 Male
#3: vpm 2019-05-26 vp3_523 2019-05-25 21:52:03 2019-05-25 23:03:44 4301 35 Entry2Side Exit2Side 17 Female
或者使用 tidyverse 的另一个选项
library(tidyverse)
# Replace the open-ended "Inf" bound with 65, split each label into integer
# `start`/`end` columns, then draw one value per row from start:end.
d %>%
mutate(AgeNew = str_replace(AgeNew, "Inf", "65")) %>%
separate(AgeNew, into = c('start', 'end'), convert = TRUE) %>%
# separate() removes the original AgeNew column by default, so the sampled
# value is written back under the name AgeNew (not a misspelled new column).
mutate(AgeNew = map2_int(start, end, ~ sample(.x:.y, 1)))
或者另一种选择是按 _
拆分,然后采样
# Split each label on "_" (after capping "Inf" at 65) and draw one value per
# row from the resulting range. vapply() always returns an integer vector,
# even for zero rows, whereas unlist(lapply(...)) returns NULL there and
# `AgeNew := NULL` would delete the column.
d[, AgeNew := vapply(strsplit(sub('Inf', '65', AgeNew), "_"), function(x)
sample(as.numeric(x[1]):as.numeric(x[2]), 1), integer(1))]
请注意,这里不需要任何嵌套的 ifelse。不使用 ifelse 反而更容易做到这一点
注意 2:OP 以 data.table
为例,这里我们展示了 data.table
方法
注意3:使用嵌套ifelse是非常低效的
注意 4:基于 strsplit
的方法首次发布于此
关于为什么 ifelse 的工作方式不同,在 ?ifelse 的文档中已经提到了
If yes or no are too short, their elements are recycled. yes will be evaluated if and only if any element of test is true, and analogously for no.
这已经在某处讨论过(目前找不到来源)。它之所以这样,是因为 ifelse
在一个条件下只运行一次,因此该值被回收。考虑这个例子,
# Demonstration: each sample(..., 1) yields a single value per call, and
# ifelse() recycles that value across every position of the matching branch.
x <- c(1, 2, 1, 2, 1, 2)
ifelse(x == 1, sample(1:10, 1), sample(20:30, 1))
#[1] 1 26 1 26 1 26
ifelse(x == 1, sample(1:10, 1), sample(20:30, 1))
#[1] 10 28 10 28 10 28
ifelse(x == 1, sample(1:10, 1), sample(20:30, 1))
#[1] 9 24 9 24 9 24
正如我们所见,每个条件下只生成了一个数字,并且该数字被回收使用。为避免这种情况,我们需要将 sample 的 size 参数指定为 ifelse 中 test 条件的长度
# Draw one candidate per element of x for each branch so nothing is recycled.
# replace = TRUE keeps the draws independent and avoids an error when
# length(x) exceeds the size of the range being sampled.
ifelse(x == 1,
       sample(1:10, length(x), replace = TRUE),
       sample(20:30, length(x), replace = TRUE))
#[1] 7 23 1 26 10 24
ifelse(x == 1,
       sample(1:10, length(x), replace = TRUE),
       sample(20:30, length(x), replace = TRUE))
#[1] 3 23 5 26 6 22
ifelse(x == 1,
       sample(1:10, length(x), replace = TRUE),
       sample(20:30, length(x), replace = TRUE))
#[1] 2 30 9 27 1 29
您将需要处理 Inf
。根据您的示例,我假设您要在出现 Inf
时添加 +5
。所以基于这个假设,我们可以做,
# For each "lo_hi" label, draw one integer from lo:hi. An open-ended "Inf"
# upper bound is replaced by lo + 5. Bounds are converted to numbers
# explicitly, and vapply() guarantees an integer vector for any input length.
vapply(strsplit(d$AgeNew, '_'), function(i) {
  lo <- as.numeric(i[1])
  hi <- if (i[2] == 'Inf') lo + 5 else as.numeric(i[2])
  sample(lo:hi, 1)
}, integer(1))
#[1] 60 32 19
注意: 我将 AgeNew
的第一个条目更改为 60_Inf
以测试