在 R 中补全分组里的所有项目
Populate all items in grouping R
我收到一个错误,我认为根本原因是在我的分组中没有跨所有组的值。
数据可以在这里下载:https://opendata.miamidade.gov/311/311-Service-Requests-Miami-Dade-County/dj6j-qg5t
我想要一个函数:它接收嵌套分组的结果,找出所有缺失的组合(空缺),并用 0 填充。请看下面的代码示例:
d <- rDSamp %>%
FilterDateRange("Ticket.Created.Date...Time", "1/1/2013", "12/31/2013") %>%
group_by(Ticket.Created.Date...Time, Case.Owner) %>%
summarise(
count = n()
) %>%
arrange(Ticket.Created.Date...Time)
总结之后,我需要添加一个遍历每个日期的函数,如果该日期的案例所有者不存在,则创建案例所有者,并添加计数 0。
这是达到这一点的代码:
library("ggvis")
library("magrittr")
library("dplyr")
library("tidyr")
library("shiny")
library("checkpoint")
checkpoint("2016-03-29")
# Load the Miami-Dade 311 service-request export.
# The path uses forward slashes because "\d" is not a valid escape sequence
# inside an R string literal and makes the script fail to parse.
rData <- read.csv("C:/data/Miami_311.csv",
                  header = TRUE,
                  sep = ",")
# Draw 1000 random rows. Note that this sample is replaced by the filter
# below, which starts again from the full rData.
rDSamp <- rData[sample(seq_along(rData$Case.Owner), 1000), ]

# Keep only the tickets owned by the departments of interest.
rDSamp <- subset(
  rData,
  Case.Owner %in% c(
    "Animal_Services",
    "Waste_Management",
    "Community_Information_and_Outreach"
  )
)

# Rebuild the factor so its levels are exactly the owners left after the filter.
rDSamp$Case.Owner <- factor(rDSamp$Case.Owner)

# Convert to known date time: parse month/day/year, then store it as text.
rDSamp$Ticket.Created.Date...Time <- as.character(
  as.POSIXct(rDSamp$Ticket.Created.Date...Time, format = "%m/%d/%Y")
)
#' Keep the rows whose date column falls inside an inclusive date range.
#'
#' The bounds are given as month/day/year text and converted to "YYYY-MM-DD"
#' strings. The column is then compared against those strings, so it is
#' expected to hold "YYYY-MM-DD" text as well.
#'
#' @param data A data frame.
#' @param feature Name (a single string) of the column of `data` to filter on.
#' @param minDate Lower bound, inclusive, written as "%m/%d/%Y".
#' @param maxDate Upper bound, inclusive, written as "%m/%d/%Y".
#' @return The rows of `data` whose `feature` value lies between the two
#'   bounds. Rows whose value is `NA` are dropped.
FilterDateRange <- function(data, feature, minDate, maxDate) {
  if (!is.character(feature) || length(feature) != 1L ||
      !(feature %in% names(data))) {
    stop("`feature` must name exactly one column of `data`", call. = FALSE)
  }

  # Convert a month/day/year bound to "YYYY-MM-DD". Fail loudly on bad input
  # instead of letting an NA bound silently filter out every row.
  to_iso <- function(bound, label) {
    parsed <- as.Date(as.character(bound), format = "%m/%d/%Y")
    if (length(parsed) == 0L || anyNA(parsed)) {
      stop("`", label, "` must be a date written as month/day/year",
           call. = FALSE)
    }
    format(parsed)
  }
  lower <- to_iso(minDate, "minDate")
  upper <- to_iso(maxDate, "maxDate")

  # `[[` extracts the column as a vector; `[` would return a one-column
  # data frame and turn the comparison into a matrix.
  values <- data[[feature]]
  if (is.factor(values)) {
    values <- as.character(values)
  }

  # One mask for both bounds; NA values never match.
  keep <- !is.na(values) & values >= lower & values <= upper
  data[keep, , drop = FALSE]
}
# Count the 2013 tickets per creation date and case owner, ordered by date.
# local() keeps the intermediate steps out of the global environment.
d <- local({
  in_range <- FilterDateRange(
    rDSamp, "Ticket.Created.Date...Time", "1/1/2013", "12/31/2013"
  )
  per_date_owner <- group_by(in_range, Ticket.Created.Date...Time, Case.Owner)
  tallied <- summarise(per_date_owner, count = n())
  arrange(tallied, Ticket.Created.Date...Time)
})
补充一点:我正在尝试使用 ggvis 的 layer_smooths,它报告了"NAs introduced by coercion"(强制转换时引入了 NA)的警告,我猜测是数据中的空缺导致的。
找到一个解决方案,正在寻找更通用的解决方案...
#' Add zero-count rows for every date / case-owner pair missing from a summary.
#'
#' @param input A data frame with the columns `Ticket.Created.Date...Time`,
#'   `Case.Owner` and `count`, one row per observed date/owner combination.
#'   When `Case.Owner` is a factor, all of its levels are treated as expected
#'   owners; otherwise the distinct values found in the column are used.
#' @return A plain data frame: the rows of `input` followed by one row with
#'   `count = 0` for each date/owner combination that was absent. The added
#'   rows are ordered by date, then by owner. When nothing is missing, the
#'   rows of `input` are returned unchanged.
FillDataHolesWithZeros <- function(input) {
  date_col <- "Ticket.Created.Date...Time"
  owner_col <- "Case.Owner"

  # Work on a plain data frame so grouping metadata cannot leak into rbind().
  observed <- as.data.frame(input, stringsAsFactors = FALSE)

  owners <- levels(observed[[owner_col]])
  if (is.null(owners)) {
    owners <- sort(unique(as.character(observed[[owner_col]])))
  }
  dates <- sort(unique(observed[[date_col]]))
  if (length(owners) == 0L || length(dates) == 0L) {
    return(observed)
  }

  # Every date/owner combination. Owners vary fastest, so the fillers come
  # out date by date.
  grid <- expand.grid(
    owner = owners,
    date = dates,
    KEEP.OUT.ATTRS = FALSE,
    stringsAsFactors = FALSE
  )

  # Compare combinations through a composite text key.
  make_key <- function(date, owner) paste(date, owner, sep = "\r")
  absent <- !(make_key(grid$date, grid$owner) %in%
    make_key(observed[[date_col]], observed[[owner_col]]))

  # Nothing to fill: return early rather than appending empty rows.
  if (!any(absent)) {
    return(observed)
  }

  filler <- data.frame(
    grid$date[absent],
    grid$owner[absent],
    0,
    stringsAsFactors = FALSE
  )
  names(filler) <- c(date_col, owner_col, "count")
  if (is.factor(observed[[owner_col]])) {
    filler[[owner_col]] <- factor(filler[[owner_col]], levels = owners)
  }

  # Single append instead of growing the data frame inside a loop.
  rbind(observed, filler)
}
尝试
例如
数据
(今后提问时,请尽量提供可重现的数据并说明具体的问题)
# Toy data: three dates with 2, 3 and 4 case owners respectively, so the
# first two dates are missing some owners.
Date <- rep(c("2016-01-01", "2016-01-02", "2016-01-03"), times = 2:4)
CaseOwner <- letters[c(1:2, 1:3, 1:4)]
CallCount <- seq_len(9L)
dat1 <- data.frame(Date, CaseOwner, CallCount)
分组 + 补全缺失的行
library(dplyr)
library(tidyr)
# Take the largest CallCount per (Date, CaseOwner), then add a cnt = 0 row for
# every Date/CaseOwner pair that never occurs. ungroup() comes first because
# complete() works within each group of a grouped data frame, which would
# hide owners that are absent from a given date unless CaseOwner is a factor.
dat1 %>%
  group_by(Date, CaseOwner) %>%
  summarize(cnt = max(CallCount)) %>%
  ungroup() %>%
  complete(Date, CaseOwner, fill = list(cnt = 0))
结果
Source: local data frame [12 x 3]
Date CaseOwner cnt
(fctr) (fctr) (dbl)
1 2016-01-01 a 1
2 2016-01-01 b 2
3 2016-01-01 c 0
4 2016-01-01 d 0
5 2016-01-02 a 3
6 2016-01-02 b 4
7 2016-01-02 c 5
8 2016-01-02 d 0
9 2016-01-03 a 6
10 2016-01-03 b 7
11 2016-01-03 c 8
12 2016-01-03 d 9
补充
1) 使用 %in%
- 比一连串的 | 更简洁易读
rDSamp = rData %>%
subset(
Case.Owner == "Animal_Services" |
Case.Owner == "Waste_Management" |
Case.Owner == "Community_Information_and_Outreach" |
Case.Owner == "Waste_Management")
可以改写为:
rDSamp = rData[rData$Case.Owner %in%
c("Animal_Services","Waste_Management","Community_Information_and_Outreach","Waste_Management"),]
2) 如果你想比较日期,你不需要将它转换为 char
maxDate = maxDate %>%
as.POSIXct(format="%m/%d/%Y") %>%
as.character()
和
data[feature] <= maxDate
这样会按字符串(而不是按日期)进行比较。
我收到一个错误,我认为根本原因是在我的分组中没有跨所有组的值。
数据可以在这里下载:https://opendata.miamidade.gov/311/311-Service-Requests-Miami-Dade-County/dj6j-qg5t
我想要一个函数:它接收嵌套分组的结果,找出所有缺失的组合(空缺),并用 0 填充。请看下面的代码示例:
d <- rDSamp %>%
FilterDateRange("Ticket.Created.Date...Time", "1/1/2013", "12/31/2013") %>%
group_by(Ticket.Created.Date...Time, Case.Owner) %>%
summarise(
count = n()
) %>%
arrange(Ticket.Created.Date...Time)
总结之后,我需要添加一个遍历每个日期的函数,如果该日期的案例所有者不存在,则创建案例所有者,并添加计数 0。
这是达到这一点的代码:
library("ggvis")
library("magrittr")
library("dplyr")
library("tidyr")
library("shiny")
library("checkpoint")
checkpoint("2016-03-29")
# Load the Miami-Dade 311 service-request export.
# The path uses forward slashes because "\d" is not a valid escape sequence
# inside an R string literal and makes the script fail to parse.
rData <- read.csv("C:/data/Miami_311.csv",
                  header = TRUE,
                  sep = ",")
# Draw 1000 random rows. Note that this sample is replaced by the filter
# below, which starts again from the full rData.
rDSamp <- rData[sample(seq_along(rData$Case.Owner), 1000), ]

# Keep only the tickets owned by the departments of interest.
rDSamp <- subset(
  rData,
  Case.Owner %in% c(
    "Animal_Services",
    "Waste_Management",
    "Community_Information_and_Outreach"
  )
)

# Rebuild the factor so its levels are exactly the owners left after the filter.
rDSamp$Case.Owner <- factor(rDSamp$Case.Owner)

# Convert to known date time: parse month/day/year, then store it as text.
rDSamp$Ticket.Created.Date...Time <- as.character(
  as.POSIXct(rDSamp$Ticket.Created.Date...Time, format = "%m/%d/%Y")
)
#' Keep the rows whose date column falls inside an inclusive date range.
#'
#' The bounds are given as month/day/year text and converted to "YYYY-MM-DD"
#' strings. The column is then compared against those strings, so it is
#' expected to hold "YYYY-MM-DD" text as well.
#'
#' @param data A data frame.
#' @param feature Name (a single string) of the column of `data` to filter on.
#' @param minDate Lower bound, inclusive, written as "%m/%d/%Y".
#' @param maxDate Upper bound, inclusive, written as "%m/%d/%Y".
#' @return The rows of `data` whose `feature` value lies between the two
#'   bounds. Rows whose value is `NA` are dropped.
FilterDateRange <- function(data, feature, minDate, maxDate) {
  if (!is.character(feature) || length(feature) != 1L ||
      !(feature %in% names(data))) {
    stop("`feature` must name exactly one column of `data`", call. = FALSE)
  }

  # Convert a month/day/year bound to "YYYY-MM-DD". Fail loudly on bad input
  # instead of letting an NA bound silently filter out every row.
  to_iso <- function(bound, label) {
    parsed <- as.Date(as.character(bound), format = "%m/%d/%Y")
    if (length(parsed) == 0L || anyNA(parsed)) {
      stop("`", label, "` must be a date written as month/day/year",
           call. = FALSE)
    }
    format(parsed)
  }
  lower <- to_iso(minDate, "minDate")
  upper <- to_iso(maxDate, "maxDate")

  # `[[` extracts the column as a vector; `[` would return a one-column
  # data frame and turn the comparison into a matrix.
  values <- data[[feature]]
  if (is.factor(values)) {
    values <- as.character(values)
  }

  # One mask for both bounds; NA values never match.
  keep <- !is.na(values) & values >= lower & values <= upper
  data[keep, , drop = FALSE]
}
# Count the 2013 tickets per creation date and case owner, ordered by date.
# local() keeps the intermediate steps out of the global environment.
d <- local({
  in_range <- FilterDateRange(
    rDSamp, "Ticket.Created.Date...Time", "1/1/2013", "12/31/2013"
  )
  per_date_owner <- group_by(in_range, Ticket.Created.Date...Time, Case.Owner)
  tallied <- summarise(per_date_owner, count = n())
  arrange(tallied, Ticket.Created.Date...Time)
})
补充一点:我正在尝试使用 ggvis 的 layer_smooths,它报告了"NAs introduced by coercion"(强制转换时引入了 NA)的警告,我猜测是数据中的空缺导致的。
找到一个解决方案,正在寻找更通用的解决方案...
#' Add zero-count rows for every date / case-owner pair missing from a summary.
#'
#' @param input A data frame with the columns `Ticket.Created.Date...Time`,
#'   `Case.Owner` and `count`, one row per observed date/owner combination.
#'   When `Case.Owner` is a factor, all of its levels are treated as expected
#'   owners; otherwise the distinct values found in the column are used.
#' @return A plain data frame: the rows of `input` followed by one row with
#'   `count = 0` for each date/owner combination that was absent. The added
#'   rows are ordered by date, then by owner. When nothing is missing, the
#'   rows of `input` are returned unchanged.
FillDataHolesWithZeros <- function(input) {
  date_col <- "Ticket.Created.Date...Time"
  owner_col <- "Case.Owner"

  # Work on a plain data frame so grouping metadata cannot leak into rbind().
  observed <- as.data.frame(input, stringsAsFactors = FALSE)

  owners <- levels(observed[[owner_col]])
  if (is.null(owners)) {
    owners <- sort(unique(as.character(observed[[owner_col]])))
  }
  dates <- sort(unique(observed[[date_col]]))
  if (length(owners) == 0L || length(dates) == 0L) {
    return(observed)
  }

  # Every date/owner combination. Owners vary fastest, so the fillers come
  # out date by date.
  grid <- expand.grid(
    owner = owners,
    date = dates,
    KEEP.OUT.ATTRS = FALSE,
    stringsAsFactors = FALSE
  )

  # Compare combinations through a composite text key.
  make_key <- function(date, owner) paste(date, owner, sep = "\r")
  absent <- !(make_key(grid$date, grid$owner) %in%
    make_key(observed[[date_col]], observed[[owner_col]]))

  # Nothing to fill: return early rather than appending empty rows.
  if (!any(absent)) {
    return(observed)
  }

  filler <- data.frame(
    grid$date[absent],
    grid$owner[absent],
    0,
    stringsAsFactors = FALSE
  )
  names(filler) <- c(date_col, owner_col, "count")
  if (is.factor(observed[[owner_col]])) {
    filler[[owner_col]] <- factor(filler[[owner_col]], levels = owners)
  }

  # Single append instead of growing the data frame inside a loop.
  rbind(observed, filler)
}
尝试
例如
数据
(今后提问时,请尽量提供可重现的数据并说明具体的问题)
# Toy data: three dates with 2, 3 and 4 case owners respectively, so the
# first two dates are missing some owners.
Date <- rep(c("2016-01-01", "2016-01-02", "2016-01-03"), times = 2:4)
CaseOwner <- letters[c(1:2, 1:3, 1:4)]
CallCount <- seq_len(9L)
dat1 <- data.frame(Date, CaseOwner, CallCount)
分组 + 补全缺失的行
library(dplyr)
library(tidyr)
# Take the largest CallCount per (Date, CaseOwner), then add a cnt = 0 row for
# every Date/CaseOwner pair that never occurs. ungroup() comes first because
# complete() works within each group of a grouped data frame, which would
# hide owners that are absent from a given date unless CaseOwner is a factor.
dat1 %>%
  group_by(Date, CaseOwner) %>%
  summarize(cnt = max(CallCount)) %>%
  ungroup() %>%
  complete(Date, CaseOwner, fill = list(cnt = 0))
结果
Source: local data frame [12 x 3]
Date CaseOwner cnt
(fctr) (fctr) (dbl)
1 2016-01-01 a 1
2 2016-01-01 b 2
3 2016-01-01 c 0
4 2016-01-01 d 0
5 2016-01-02 a 3
6 2016-01-02 b 4
7 2016-01-02 c 5
8 2016-01-02 d 0
9 2016-01-03 a 6
10 2016-01-03 b 7
11 2016-01-03 c 8
12 2016-01-03 d 9
补充
1) 使用 %in%
- 比一连串的 | 更简洁易读
rDSamp = rData %>%
subset(
Case.Owner == "Animal_Services" |
Case.Owner == "Waste_Management" |
Case.Owner == "Community_Information_and_Outreach" |
Case.Owner == "Waste_Management")
可以改写为:
rDSamp = rData[rData$Case.Owner %in%
c("Animal_Services","Waste_Management","Community_Information_and_Outreach","Waste_Management"),]
2) 如果你想比较日期,你不需要将它转换为 char
maxDate = maxDate %>%
as.POSIXct(format="%m/%d/%Y") %>%
as.character()
和
data[feature] <= maxDate
这样会按字符串(而不是按日期)进行比较。