用几行 R 代码按提供者和州计算平均金额
Get the averages of amount by provider and state with a few lines of code in R
我有一个这样的数据框
# Sample data: 15 rows covering two providers (599 and 699), all in state CA.
prov_id <- rep(c(599, 699), times = c(7, 8))
mbr_id <- c(
  100, 101, 102, 103, 103, 104, 105,
  200, 201, 201, 202, 203, 203, 204, 205
)
prov_state <- rep("CA", times = 15)
amount <- c(
  3, 5, 2, 28, 12, 17, 10,
  6, 33, 31, 161, 24, 22, 12, 17
)

# Keep prov_state as character rather than factor (matters on R < 4.0).
df.sample <- data.frame(
  prov_id = prov_id,
  mbr_id = mbr_id,
  prov_state = prov_state,
  amount = amount,
  stringsAsFactors = FALSE
)
我正在尝试用下面的方式按提供者和州计算平均金额:
library(tidyverse)
# Build per-provider and per-state summaries of `amount` from df.sample and
# combine them into df.final (one row per provider, with the figures of the
# provider's state attached). Requires dplyr (loaded via tidyverse).

# Number of distinct members for each provider within each state.
# distinct() keeps only the listed columns, so a preceding select() is not
# needed; sort = TRUE orders the result by descending count.
df.sample.memcnt <-
  df.sample %>%
  distinct(prov_id, prov_state, mbr_id) %>%
  group_by(prov_id, prov_state) %>%
  tally(sort = TRUE) %>%
  ungroup() %>%
  rename(mem_cnt_pvdr = n)

# Number of distinct providers in each state.
df.sample.pvdrcnt <-
  df.sample %>%
  distinct(prov_id, prov_state) %>%
  group_by(prov_state) %>%
  tally(sort = TRUE) %>%
  ungroup() %>%
  rename(pvdr_cnt_state = n)

# Per-provider mean: total amount divided by the distinct member count,
# rounded to 2 decimals. summarise() keeps only the grouping columns and
# total_amt, so no select() is needed beforehand.
# NOTE(review): as.numeric() presumably guards against integer overflow when
# `amount` is stored as integer -- confirm before removing.
df.sample.pvdr <-
  df.sample %>%
  group_by(prov_id, prov_state) %>%
  summarise(total_amt = sum(as.numeric(amount))) %>%
  ungroup() %>%
  inner_join(df.sample.memcnt, by = c("prov_id", "prov_state")) %>%
  mutate(mean_total_amt_pvdr = round(total_amt / mem_cnt_pvdr, 2)) %>%
  select(-total_amt)

# Per-state figures: total member count and the mean of the (already rounded)
# provider means, i.e. their sum divided by the provider count.
# mean_total_amt_pvdr is already double (output of round()), so it is summed
# directly.
df.sample.state <-
  df.sample.pvdr %>%
  group_by(prov_state) %>%
  summarise(
    total_amt_state = sum(mean_total_amt_pvdr),
    mem_cnt_state = sum(mem_cnt_pvdr)
  ) %>%
  ungroup() %>%
  inner_join(df.sample.pvdrcnt, by = "prov_state") %>%
  mutate(mean_total_amt_state = round(total_amt_state / pvdr_cnt_state, 2)) %>%
  select(-total_amt_state)

# Attach the state-level columns to every provider row. The join key is
# spelled out so the join does not silently change if the two tables ever
# gain another column name in common.
df.final <- df.sample.pvdr %>%
  inner_join(df.sample.state, by = "prov_state")
虽然我得到了我需要的输出,但我觉得这是非常低效的。
期望输出
prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
599 CA 6 12.8 12 2 31.9
699 CA 6 51 12 2 31.9
有没有办法用几行代码得到想要的输出?非常感谢。
使用 data.table 和 uniqueN 可以写得稍微短一点:
library(data.table)
# Convert df.sample to a data.table by reference (no copy is made).
setDT(df.sample)

# Per provider within each state: distinct member count and the total amount
# divided by that count, rounded to 2 decimals.
df.sample.state.prov <- df.sample[
  ,
  {
    n_members <- uniqueN(mbr_id)
    .(
      mem_cnt_pvdr = n_members,
      mean_total_amt_pvdr = round(sum(as.numeric(amount)) / n_members, 2)
    )
  },
  by = c("prov_state", "prov_id")
]

# Per state: provider count, sum of the provider means, and total member count.
df.sample.state <- df.sample.state.prov[
  ,
  .(
    pvdr_cnt_state = uniqueN(prov_id),
    total_amt_state = sum(as.numeric(mean_total_amt_pvdr)),
    mem_cnt_state = sum(mem_cnt_pvdr)
  ),
  by = "prov_state"
]

# Join the state figures onto every provider row and derive the state mean
# (left unrounded here). The result is printed, not assigned.
df.sample.state[
  df.sample.state.prov,
  .(
    prov_id,
    prov_state,
    mem_cnt_pvdr,
    mean_total_amt_pvdr,
    mem_cnt_state,
    pvdr_cnt_state,
    mean_total_amt_state = total_amt_state / pvdr_cnt_state
  ),
  on = "prov_state"
]
prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
1: 599 CA 6 12.83 12 2 31.915
2: 699 CA 6 51.00 12 2 31.915
我有一个这样的数据框
# Sample data: 15 rows covering two providers (599 and 699), all in state CA.
prov_id <- rep(c(599, 699), times = c(7, 8))
mbr_id <- c(
  100, 101, 102, 103, 103, 104, 105,
  200, 201, 201, 202, 203, 203, 204, 205
)
prov_state <- rep("CA", times = 15)
amount <- c(
  3, 5, 2, 28, 12, 17, 10,
  6, 33, 31, 161, 24, 22, 12, 17
)

# Keep prov_state as character rather than factor (matters on R < 4.0).
df.sample <- data.frame(
  prov_id = prov_id,
  mbr_id = mbr_id,
  prov_state = prov_state,
  amount = amount,
  stringsAsFactors = FALSE
)
我正在尝试用下面的方式按提供者和州计算平均金额:
library(tidyverse)
# Build per-provider and per-state summaries of `amount` from df.sample and
# combine them into df.final (one row per provider, with the figures of the
# provider's state attached). Requires dplyr (loaded via tidyverse).

# Number of distinct members for each provider within each state.
# distinct() keeps only the listed columns, so a preceding select() is not
# needed; sort = TRUE orders the result by descending count.
df.sample.memcnt <-
  df.sample %>%
  distinct(prov_id, prov_state, mbr_id) %>%
  group_by(prov_id, prov_state) %>%
  tally(sort = TRUE) %>%
  ungroup() %>%
  rename(mem_cnt_pvdr = n)

# Number of distinct providers in each state.
df.sample.pvdrcnt <-
  df.sample %>%
  distinct(prov_id, prov_state) %>%
  group_by(prov_state) %>%
  tally(sort = TRUE) %>%
  ungroup() %>%
  rename(pvdr_cnt_state = n)

# Per-provider mean: total amount divided by the distinct member count,
# rounded to 2 decimals. summarise() keeps only the grouping columns and
# total_amt, so no select() is needed beforehand.
# NOTE(review): as.numeric() presumably guards against integer overflow when
# `amount` is stored as integer -- confirm before removing.
df.sample.pvdr <-
  df.sample %>%
  group_by(prov_id, prov_state) %>%
  summarise(total_amt = sum(as.numeric(amount))) %>%
  ungroup() %>%
  inner_join(df.sample.memcnt, by = c("prov_id", "prov_state")) %>%
  mutate(mean_total_amt_pvdr = round(total_amt / mem_cnt_pvdr, 2)) %>%
  select(-total_amt)

# Per-state figures: total member count and the mean of the (already rounded)
# provider means, i.e. their sum divided by the provider count.
# mean_total_amt_pvdr is already double (output of round()), so it is summed
# directly.
df.sample.state <-
  df.sample.pvdr %>%
  group_by(prov_state) %>%
  summarise(
    total_amt_state = sum(mean_total_amt_pvdr),
    mem_cnt_state = sum(mem_cnt_pvdr)
  ) %>%
  ungroup() %>%
  inner_join(df.sample.pvdrcnt, by = "prov_state") %>%
  mutate(mean_total_amt_state = round(total_amt_state / pvdr_cnt_state, 2)) %>%
  select(-total_amt_state)

# Attach the state-level columns to every provider row. The join key is
# spelled out so the join does not silently change if the two tables ever
# gain another column name in common.
df.final <- df.sample.pvdr %>%
  inner_join(df.sample.state, by = "prov_state")
虽然我得到了我需要的输出,但我觉得这是非常低效的。
期望输出
prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
599 CA 6 12.8 12 2 31.9
699 CA 6 51 12 2 31.9
有没有办法用几行代码得到想要的输出?非常感谢。
使用 data.table 和 uniqueN 可以写得稍微短一点:
library(data.table)
# Convert df.sample to a data.table by reference (no copy is made).
setDT(df.sample)

# Per provider within each state: distinct member count and the total amount
# divided by that count, rounded to 2 decimals.
df.sample.state.prov <- df.sample[
  ,
  {
    n_members <- uniqueN(mbr_id)
    .(
      mem_cnt_pvdr = n_members,
      mean_total_amt_pvdr = round(sum(as.numeric(amount)) / n_members, 2)
    )
  },
  by = c("prov_state", "prov_id")
]

# Per state: provider count, sum of the provider means, and total member count.
df.sample.state <- df.sample.state.prov[
  ,
  .(
    pvdr_cnt_state = uniqueN(prov_id),
    total_amt_state = sum(as.numeric(mean_total_amt_pvdr)),
    mem_cnt_state = sum(mem_cnt_pvdr)
  ),
  by = "prov_state"
]

# Join the state figures onto every provider row and derive the state mean
# (left unrounded here). The result is printed, not assigned.
df.sample.state[
  df.sample.state.prov,
  .(
    prov_id,
    prov_state,
    mem_cnt_pvdr,
    mean_total_amt_pvdr,
    mem_cnt_state,
    pvdr_cnt_state,
    mean_total_amt_state = total_amt_state / pvdr_cnt_state
  ),
  on = "prov_state"
]
prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
1: 599 CA 6 12.83 12 2 31.915
2: 699 CA 6 51.00 12 2 31.915