使用 summarise(dplyr)从列中获取第一个非空值对应的日期
Get first non-null date from a column using summarise (dplyr)
数据框的dput。
structure(list(entity = c("A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A"), REPORT_DATE = structure(c(1622073600,
1621987200, 1621900800, 1621814400, 1621728000, 1621641600, 1621555200,
1621468800, 1621382400, 1621296000, 1621209600, 1621123200, 1621036800,
1620950400, 1620864000, 1620777600, 1620691200, 1620604800, 1620518400,
1620432000, 1620345600, 1620259200, 1620172800, 1620086400, 1.62e+09,
1619913600, 1619827200, 1619740800, 1619654400, 1619568000, 1619481600,
1619395200, 1619308800, 1619222400, 1619136000, 1619049600, 1618963200,
1618876800, 1618790400, 1618704000, 1618617600, 1618531200, 1618444800,
1618358400, 1618272000, 1618185600, 1618099200, 1618012800, 1617926400,
1617840000, 1617753600, 1617667200, 1617580800, 1617494400, 1617408000,
1617321600, 1617235200, 1617148800, 1617062400, 1616976000, 1616889600,
1616803200, 1616716800, 1616630400, 1616544000, 1616457600, 1616371200,
1616284800, 1616198400, 1616112000, 1616025600, 1615939200, 1615852800,
1615766400, 1615680000, 1615593600, 1615507200, 1615420800, 1615334400,
1615248000, 1615161600, 1615075200, 1614988800, 1614902400, 1614816000,
1614729600, 1614643200, 1614556800, 1614470400, 1614384000, 1614297600,
1614211200, 1614124800, 1614038400, 1613952000, 1613865600, 1613779200,
1613692800, 1613606400, 1613520000, 1613433600, 1613347200, 1613260800,
1613174400, 1613088000, 1613001600, 1612915200, 1612828800, 1612742400,
1612656000, 1612569600, 1612483200, 1612396800, 1612310400, 1612224000,
1612137600, 1612051200, 1611964800, 1611878400, 1611792000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), value = c(NA, NA, NA, 109, NA, 111,
94, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 113, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 115, 113, 108, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 116, 118, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 133, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 131, 129, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 132,
141, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -120L), class = c("tbl_df",
"tbl", "data.frame"))
任务是求:
a. “value”列中前 3 个非空值的“平均值”;
b. “REPORT_DATE”列中与“value”列第一个非空值相对应的日期。
我可以做第一部分,但日期答案总是 NA。
这是我的代码:
# Attempt from the question, kept verbatim: sort each entity by descending
# REPORT_DATE, then summarise a first report date and an average per entity.
Final <- dt%>%
arrange(entity,desc(REPORT_DATE))%>%
group_by(entity) %>%
select(entity,REPORT_DATE,value)%>%
# `value>0` is NA wherever value is NA, and indexing with an NA logical yields
# an NA element, so first() returns NA whenever the first row's value is NA.
# `dt['value']` is the whole ungrouped one-column data frame, so lapply()
# returns a list holding one mean computed over dt rather than over the group.
summarise(FirstReportDate = first(REPORT_DATE[value>0]),Avg = lapply(head(na.omit(dt['value']),3),mean,na.rm=TRUE)) %>%
ungroup()
解决此问题的一种方法可能是:
# Per entity: the REPORT_DATE of the first row whose value is not missing, and
# the mean of the first (up to) three non-missing values. "First" follows the
# current row order of df, so sort beforehand if the data is not yet ordered.
df %>%
  group_by(entity) %>%
  summarise(
    # Drop rows with a missing value before taking the first date; an entity
    # with no non-missing value gets NA here.
    REPORT_DATE = first(REPORT_DATE[!is.na(value)]),
    # head() stops at the available length, whereas `[1:3]` pads with NA and
    # turns the mean into NA for entities with fewer than three non-missing
    # values. An entity with no non-missing value yields NaN.
    value = mean(head(na.omit(value), 3))
  )
entity REPORT_DATE value
<chr> <dttm> <dbl>
1 A 2021-05-24 00:00:00 105.
数据框的dput。
structure(list(entity = c("A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A"), REPORT_DATE = structure(c(1622073600,
1621987200, 1621900800, 1621814400, 1621728000, 1621641600, 1621555200,
1621468800, 1621382400, 1621296000, 1621209600, 1621123200, 1621036800,
1620950400, 1620864000, 1620777600, 1620691200, 1620604800, 1620518400,
1620432000, 1620345600, 1620259200, 1620172800, 1620086400, 1.62e+09,
1619913600, 1619827200, 1619740800, 1619654400, 1619568000, 1619481600,
1619395200, 1619308800, 1619222400, 1619136000, 1619049600, 1618963200,
1618876800, 1618790400, 1618704000, 1618617600, 1618531200, 1618444800,
1618358400, 1618272000, 1618185600, 1618099200, 1618012800, 1617926400,
1617840000, 1617753600, 1617667200, 1617580800, 1617494400, 1617408000,
1617321600, 1617235200, 1617148800, 1617062400, 1616976000, 1616889600,
1616803200, 1616716800, 1616630400, 1616544000, 1616457600, 1616371200,
1616284800, 1616198400, 1616112000, 1616025600, 1615939200, 1615852800,
1615766400, 1615680000, 1615593600, 1615507200, 1615420800, 1615334400,
1615248000, 1615161600, 1615075200, 1614988800, 1614902400, 1614816000,
1614729600, 1614643200, 1614556800, 1614470400, 1614384000, 1614297600,
1614211200, 1614124800, 1614038400, 1613952000, 1613865600, 1613779200,
1613692800, 1613606400, 1613520000, 1613433600, 1613347200, 1613260800,
1613174400, 1613088000, 1613001600, 1612915200, 1612828800, 1612742400,
1612656000, 1612569600, 1612483200, 1612396800, 1612310400, 1612224000,
1612137600, 1612051200, 1611964800, 1611878400, 1611792000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), value = c(NA, NA, NA, 109, NA, 111,
94, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 113, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 115, 113, 108, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 116, 118, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 133, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 131, 129, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 132,
141, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -120L), class = c("tbl_df",
"tbl", "data.frame"))
任务是求:
a. “value”列中前 3 个非空值的“平均值”;
b. “REPORT_DATE”列中与“value”列第一个非空值相对应的日期。
我可以做第一部分,但日期答案总是 NA。
这是我的代码:
# Attempt from the question, kept verbatim: sort each entity by descending
# REPORT_DATE, then summarise a first report date and an average per entity.
Final <- dt%>%
arrange(entity,desc(REPORT_DATE))%>%
group_by(entity) %>%
select(entity,REPORT_DATE,value)%>%
# `value>0` is NA wherever value is NA, and indexing with an NA logical yields
# an NA element, so first() returns NA whenever the first row's value is NA.
# `dt['value']` is the whole ungrouped one-column data frame, so lapply()
# returns a list holding one mean computed over dt rather than over the group.
summarise(FirstReportDate = first(REPORT_DATE[value>0]),Avg = lapply(head(na.omit(dt['value']),3),mean,na.rm=TRUE)) %>%
ungroup()
解决此问题的一种方法可能是:
# Per entity: the REPORT_DATE of the first row whose value is not missing, and
# the mean of the first (up to) three non-missing values. "First" follows the
# current row order of df, so sort beforehand if the data is not yet ordered.
df %>%
  group_by(entity) %>%
  summarise(
    # Drop rows with a missing value before taking the first date; an entity
    # with no non-missing value gets NA here.
    REPORT_DATE = first(REPORT_DATE[!is.na(value)]),
    # head() stops at the available length, whereas `[1:3]` pads with NA and
    # turns the mean into NA for entities with fewer than three non-missing
    # values. An entity with no non-missing value yields NaN.
    value = mean(head(na.omit(value), 3))
  )
entity REPORT_DATE value
<chr> <dttm> <dbl>
1 A 2021-05-24 00:00:00 105.