使用条件变异 returns NA 为行的子集创建新变量
creating new variable using conditional mutating returns NAs for a subset of rows
我正在尝试根据日期变量的逻辑条件创建一个新的因子变量:
'data.frame': 364458 obs. of 2 variables:
$ first_order_date: Date, format: "2015-11-24" "2015-12-15" "2015-06-10" "2015-12-22" ...
$ order_date : Date, format: "2016-02-09" "2016-03-15" "2015-12-22" "2015-12-28" ...
library(moisaic)
bb =
df %>%
mutate(days_since_first = as.integer(order_date - first_order_date),
time_after_first = derivedFactor(
"<3months" = order_date <= first_order_date +months(3),
"3-6months" = (order_date <= first_order_date +months(6) & order_date > first_order_date +months(3)),
...
"15-18months" = (order_date <= first_order_date +months(18) & order_date > first_order_date +months(15)),
"18-21months" = (order_date <= first_order_date +months(21) & order_date > first_order_date +months(18)),
.default = "21month+"))
在 运行 之后,我收到了警告:
Warning messages:
1: In base::max(x, ..., na.rm = na.rm) :
no non-missing arguments to max; returning -Inf
在大多数情况下它仍然运行良好,
> head(bb[!is.na(bb$time_after_first), ])
first_order_date order_date days_since_first time_after_first
1 2015-11-24 2016-02-09 77 <3months
2 2015-12-15 2016-03-15 91 <3months
3 2015-06-10 2015-12-22 195 6-9months
4 2015-12-22 2015-12-28 6 <3months
5 2016-01-21 2016-05-29 129 3-6months
6 2016-03-12 2016-05-30 79 <3months
但不是全部
sum(is.na(bb$time_after_first))
[1] 7174
我看不出这些特定条目为何不起作用的任何模式
summary(bb[is.na(bb$time_after_first), ])
first_order_date order_date days_since_first
Min. :2015-01-31 Min. :2015-01-31 Min. : 0.0
1st Qu.:2015-08-31 1st Qu.:2016-02-07 1st Qu.: 41.5
Median :2015-11-30 Median :2016-03-31 Median :106.0
Mean :2015-11-05 Mean :2016-03-12 Mean :128.3
3rd Qu.:2016-01-31 3rd Qu.:2016-05-13 3rd Qu.:175.0
Max. :2016-05-31 Max. :2016-09-04 Max. :546.0
time_after_first
<3months : 0
3-6months : 0
6-9months : 0
9-12months : 0
12-15months: 0
(Other) : 0
NA's :6455
此外,我尝试使用普通的 ifelse() 语句来实现此目的,
bb2 =
all_orders3 %>% select(user_id, order_id, first_order_date, order_date2, category) %>%
mutate(days_since_first = as.integer(order_date - first_order_date),
time_after_first= as.factor(ifelse(order_date <= first_order_date +months(3), "<3months",
ifelse(order_date <= first_order_date +months(6) & order_date > first_order_date +months(3), "3-6months",
....
ifelse(order_date <= first_order_date +months(24) & order_date > first_order_date +months(21), "21-24months",
"24months+"))))))))))
没有收到警告,但生成了更多 NA,但仍不清楚为什么会发生这种情况
summary(bb2[is.na(bb2$time_after_first), ])
first_order_date order_date days_since_first
Min. :2015-01-31 Min. :2015-01-31 Min. : 0.0
1st Qu.:2015-08-31 1st Qu.:2016-02-10 1st Qu.: 52.0
Median :2015-11-30 Median :2016-04-08 Median :123.0
Mean :2015-10-27 Mean :2016-03-25 Mean :150.2
3rd Qu.:2016-01-31 3rd Qu.:2016-05-29 3rd Qu.:211.0
Max. :2016-05-31 Max. :2016-09-04 Max. :582.0
time_after_first
<3months : 0
12-15months: 0
15-18months: 0
18-21months: 0
3-6months : 0
(Other) : 0
NA's :7407
这是 link 我的数据样本
欢迎提出如何解决这个问题的任何有用建议,谢谢!
样本 v2 已更新:
#Read csv...
dat = read.csv("data/07092016-sample_V2.csv")
library(dplyr)
dat = dat %>%
mutate(t1 = as.Date(first_order_date, "%d/%m/%Y"),
t2 = as.Date(order_date, "%d/%m/%Y"),
days = as.numeric(difftime(t2, t1, units = "days"))) %>%
select(t1:days)
head(dat)
# t1 t2 days
# 1 2016-03-02 2016-05-29 88
# 2 2015-04-25 2015-05-01 6
# 3 2015-06-29 2015-07-04 5
# 4 2015-09-09 2016-02-05 149
# 5 2016-01-08 2016-02-15 38
# 6 2016-04-17 2016-04-21 4
breaks = c(-Inf, 90, 180, Inf)
labels = c("<3 months", "3-6 months", "6+ months")
days = cut(dat$days, breaks, labels)
sum(is.na(days))
# [1] 0
我正在尝试根据日期变量的逻辑条件创建一个新的因子变量:
'data.frame': 364458 obs. of 2 variables:
$ first_order_date: Date, format: "2015-11-24" "2015-12-15" "2015-06-10" "2015-12-22" ...
$ order_date : Date, format: "2016-02-09" "2016-03-15" "2015-12-22" "2015-12-28" ...
library(moisaic)
bb =
df %>%
mutate(days_since_first = as.integer(order_date - first_order_date),
time_after_first = derivedFactor(
"<3months" = order_date <= first_order_date +months(3),
"3-6months" = (order_date <= first_order_date +months(6) & order_date > first_order_date +months(3)),
...
"15-18months" = (order_date <= first_order_date +months(18) & order_date > first_order_date +months(15)),
"18-21months" = (order_date <= first_order_date +months(21) & order_date > first_order_date +months(18)),
.default = "21month+"))
在 运行 之后,我收到了警告:
Warning messages:
1: In base::max(x, ..., na.rm = na.rm) :
no non-missing arguments to max; returning -Inf
在大多数情况下它仍然运行良好,
> head(bb[!is.na(bb$time_after_first), ])
first_order_date order_date days_since_first time_after_first
1 2015-11-24 2016-02-09 77 <3months
2 2015-12-15 2016-03-15 91 <3months
3 2015-06-10 2015-12-22 195 6-9months
4 2015-12-22 2015-12-28 6 <3months
5 2016-01-21 2016-05-29 129 3-6months
6 2016-03-12 2016-05-30 79 <3months
但不是全部
sum(is.na(bb$time_after_first))
[1] 7174
我看不出这些特定条目为何不起作用的任何模式
summary(bb[is.na(bb$time_after_first), ])
first_order_date order_date days_since_first
Min. :2015-01-31 Min. :2015-01-31 Min. : 0.0
1st Qu.:2015-08-31 1st Qu.:2016-02-07 1st Qu.: 41.5
Median :2015-11-30 Median :2016-03-31 Median :106.0
Mean :2015-11-05 Mean :2016-03-12 Mean :128.3
3rd Qu.:2016-01-31 3rd Qu.:2016-05-13 3rd Qu.:175.0
Max. :2016-05-31 Max. :2016-09-04 Max. :546.0
time_after_first
<3months : 0
3-6months : 0
6-9months : 0
9-12months : 0
12-15months: 0
(Other) : 0
NA's :6455
此外,我尝试使用普通的 ifelse() 语句来实现此目的,
bb2 =
all_orders3 %>% select(user_id, order_id, first_order_date, order_date2, category) %>%
mutate(days_since_first = as.integer(order_date - first_order_date),
time_after_first= as.factor(ifelse(order_date <= first_order_date +months(3), "<3months",
ifelse(order_date <= first_order_date +months(6) & order_date > first_order_date +months(3), "3-6months",
....
ifelse(order_date <= first_order_date +months(24) & order_date > first_order_date +months(21), "21-24months",
"24months+"))))))))))
没有收到警告,但生成了更多 NA,但仍不清楚为什么会发生这种情况
summary(bb2[is.na(bb2$time_after_first), ])
first_order_date order_date days_since_first
Min. :2015-01-31 Min. :2015-01-31 Min. : 0.0
1st Qu.:2015-08-31 1st Qu.:2016-02-10 1st Qu.: 52.0
Median :2015-11-30 Median :2016-04-08 Median :123.0
Mean :2015-10-27 Mean :2016-03-25 Mean :150.2
3rd Qu.:2016-01-31 3rd Qu.:2016-05-29 3rd Qu.:211.0
Max. :2016-05-31 Max. :2016-09-04 Max. :582.0
time_after_first
<3months : 0
12-15months: 0
15-18months: 0
18-21months: 0
3-6months : 0
(Other) : 0
NA's :7407
这是 link 我的数据样本
欢迎提出如何解决这个问题的任何有用建议,谢谢!
样本 v2 已更新:
#Read csv...
dat = read.csv("data/07092016-sample_V2.csv")
library(dplyr)
dat = dat %>%
mutate(t1 = as.Date(first_order_date, "%d/%m/%Y"),
t2 = as.Date(order_date, "%d/%m/%Y"),
days = as.numeric(difftime(t2, t1, units = "days"))) %>%
select(t1:days)
head(dat)
# t1 t2 days
# 1 2016-03-02 2016-05-29 88
# 2 2015-04-25 2015-05-01 6
# 3 2015-06-29 2015-07-04 5
# 4 2015-09-09 2016-02-05 149
# 5 2016-01-08 2016-02-15 38
# 6 2016-04-17 2016-04-21 4
breaks = c(-Inf, 90, 180, Inf)
labels = c("<3 months", "3-6 months", "6+ months")
days = cut(dat$days, breaks, labels)
sum(is.na(days))
# [1] 0