按 id 在两个值/日期区间之间填充最大值
Fill in max by id between two values/date intervals
我需要一些关于 R 脚本的帮助。
我有一个 table 看起来像这样:
> dput(first)
structure(list(Date = structure(c(1438387200, 1441065600, 1456790400,
1459468800, 1462060800, 1464739200, 1467331200, 1470009600, 1472688000,
1475280000, 1477958400, 1480550400, 1483228800, 1485907200, 1488326400,
1491004800, 1493596800, 1464739200, 1467331200, 1470009600, 1472688000,
1475280000, 1477958400, 1480550400, 1483228800, 1517443200, 1519862400,
1522540800, 1525132800, 1527811200, 1530403200, 1533081600, 1535760000,
1538352000, 1541030400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
ID = c("A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1",
"A1", "A1", "A1", "A1", "A1", "A1", "A1", "A2", "A2", "A2",
"A2", "A2", "A2", "A2", "A2", "A2", "A3", "A3", "A3", "A3",
"A3", "A3", "A3", "A3", "A3", "A3"), flag = c(0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0)), row.names = c(NA, -35L
), class = c("tbl_df", "tbl", "data.frame"))
我需要修改这个 table:对每笔贷款(即每个 ID),如果从上一次 flag == 1 到下一次 flag == 1 之间的时间少于 5 个月,则应把这两次之间的 0 替换为 1,否则保持原样。
输出看起来像这样:
> dput(second)
structure(list(Date = structure(c(1438387200, 1441065600, 1456790400,
1459468800, 1462060800, 1464739200, 1467331200, 1470009600, 1472688000,
1475280000, 1477958400, 1480550400, 1483228800, 1485907200, 1488326400,
1491004800, 1493596800, 1464739200, 1467331200, 1470009600, 1472688000,
1475280000, 1477958400, 1480550400, 1483228800, 1517443200, 1519862400,
1522540800, 1525132800, 1527811200, 1530403200, 1533081600, 1535760000,
1538352000, 1541030400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
ID = c("A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1",
"A1", "A1", "A1", "A1", "A1", "A1", "A1", "A2", "A2", "A2",
"A2", "A2", "A2", "A2", "A2", "A2", "A3", "A3", "A3", "A3",
"A3", "A3", "A3", "A3", "A3", "A3"), flag = c(0, 0, 0, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0)), row.names = c(NA, -35L
), class = c("tbl_df", "tbl", "data.frame"))
坦率地说,我不知道从哪里开始,我是 R 的新手。
我们可以先用 filter 筛选出 'first' 数据中 'flag' 为 1 的行,然后按 'ID' 分组,并根据相邻 'Date' 之间相差的月数是否大于 5 来创建第二个分组('grp'):对这个条件执行 cumsum(每当 'Date' 之差大于 5 个月时,组计数加 1)。接着用 complete 扩展数据集,即在每个组内生成从 first 'Date' 到 last 'Date'、by 为 '1 month' 的 'Date' 序列;最后与原始数据集连接,并用 coalesce 合并得到新的 'flag' 列。
library(dplyr)
library(tidyr)
library(lubridate)
# Fill the zeros that lie between flagged months of the same ID.
# Flagged rows (flag != 0) of one ID are chained into runs: a new run starts
# whenever more than 5 whole months separate a flagged date from the previous
# flagged date of that ID. Every existing row whose month falls inside a run
# gets flag = 1; all other rows of `first` keep their original flag.
first %>%
  # as.logical() keeps every row with a non-zero flag.
  filter(as.logical(flag)) %>%
  # Sort so that lag() below sees the previous flagged date of the same ID.
  arrange(ID, Date) %>%
  group_by(ID) %>%
  # group_by() evaluates computed expressions on the ungrouped data, so the
  # run id is built in a grouped mutate() to keep lag()/first() per ID.
  # The first flagged row of an ID is compared with itself (gap of 0 months).
  mutate(
    grp = cumsum(
      interval(lag(Date, default = first(Date)), Date) %/% months(1) > 5
    )
  ) %>%
  group_by(grp, .add = TRUE) %>%
  # Insert the missing months between the first and last flagged date of each
  # run; the inserted rows are marked as flagged.
  complete(
    Date = seq(first(Date), last(Date), by = "1 month"),
    fill = list(flag = 1)
  ) %>%
  ungroup() %>%
  select(-grp) %>%
  # Restore every original row: flag.x is the filled flag (NA outside a run),
  # flag.y is the original flag.
  right_join(first, by = c("ID", "Date")) %>%
  arrange(ID, Date) %>%
  transmute(Date, ID, flag = coalesce(flag.x, flag.y))
我需要一些关于 R 脚本的帮助。
我有一个 table 看起来像这样:
> dput(first)
structure(list(Date = structure(c(1438387200, 1441065600, 1456790400,
1459468800, 1462060800, 1464739200, 1467331200, 1470009600, 1472688000,
1475280000, 1477958400, 1480550400, 1483228800, 1485907200, 1488326400,
1491004800, 1493596800, 1464739200, 1467331200, 1470009600, 1472688000,
1475280000, 1477958400, 1480550400, 1483228800, 1517443200, 1519862400,
1522540800, 1525132800, 1527811200, 1530403200, 1533081600, 1535760000,
1538352000, 1541030400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
ID = c("A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1",
"A1", "A1", "A1", "A1", "A1", "A1", "A1", "A2", "A2", "A2",
"A2", "A2", "A2", "A2", "A2", "A2", "A3", "A3", "A3", "A3",
"A3", "A3", "A3", "A3", "A3", "A3"), flag = c(0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0)), row.names = c(NA, -35L
), class = c("tbl_df", "tbl", "data.frame"))
我需要修改这个 table:对每笔贷款(即每个 ID),如果从上一次 flag == 1 到下一次 flag == 1 之间的时间少于 5 个月,则应把这两次之间的 0 替换为 1,否则保持原样。
输出看起来像这样:
> dput(second)
structure(list(Date = structure(c(1438387200, 1441065600, 1456790400,
1459468800, 1462060800, 1464739200, 1467331200, 1470009600, 1472688000,
1475280000, 1477958400, 1480550400, 1483228800, 1485907200, 1488326400,
1491004800, 1493596800, 1464739200, 1467331200, 1470009600, 1472688000,
1475280000, 1477958400, 1480550400, 1483228800, 1517443200, 1519862400,
1522540800, 1525132800, 1527811200, 1530403200, 1533081600, 1535760000,
1538352000, 1541030400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
ID = c("A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1",
"A1", "A1", "A1", "A1", "A1", "A1", "A1", "A2", "A2", "A2",
"A2", "A2", "A2", "A2", "A2", "A2", "A3", "A3", "A3", "A3",
"A3", "A3", "A3", "A3", "A3", "A3"), flag = c(0, 0, 0, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0)), row.names = c(NA, -35L
), class = c("tbl_df", "tbl", "data.frame"))
坦率地说,我不知道从哪里开始,我是 R 的新手。
我们可以先用 filter 筛选出 'first' 数据中 'flag' 为 1 的行,然后按 'ID' 分组,并根据相邻 'Date' 之间相差的月数是否大于 5 来创建第二个分组('grp'):对这个条件执行 cumsum(每当 'Date' 之差大于 5 个月时,组计数加 1)。接着用 complete 扩展数据集,即在每个组内生成从 first 'Date' 到 last 'Date'、by 为 '1 month' 的 'Date' 序列;最后与原始数据集连接,并用 coalesce 合并得到新的 'flag' 列。
library(dplyr)
library(tidyr)
library(lubridate)
# Fill the zeros that lie between flagged months of the same ID.
# Flagged rows (flag != 0) of one ID are chained into runs: a new run starts
# whenever more than 5 whole months separate a flagged date from the previous
# flagged date of that ID. Every existing row whose month falls inside a run
# gets flag = 1; all other rows of `first` keep their original flag.
first %>%
  # as.logical() keeps every row with a non-zero flag.
  filter(as.logical(flag)) %>%
  # Sort so that lag() below sees the previous flagged date of the same ID.
  arrange(ID, Date) %>%
  group_by(ID) %>%
  # group_by() evaluates computed expressions on the ungrouped data, so the
  # run id is built in a grouped mutate() to keep lag()/first() per ID.
  # The first flagged row of an ID is compared with itself (gap of 0 months).
  mutate(
    grp = cumsum(
      interval(lag(Date, default = first(Date)), Date) %/% months(1) > 5
    )
  ) %>%
  group_by(grp, .add = TRUE) %>%
  # Insert the missing months between the first and last flagged date of each
  # run; the inserted rows are marked as flagged.
  complete(
    Date = seq(first(Date), last(Date), by = "1 month"),
    fill = list(flag = 1)
  ) %>%
  ungroup() %>%
  select(-grp) %>%
  # Restore every original row: flag.x is the filled flag (NA outside a run),
  # flag.y is the original flag.
  right_join(first, by = c("ID", "Date")) %>%
  arrange(ID, Date) %>%
  transmute(Date, ID, flag = coalesce(flag.x, flag.y))