在 R 中根据条件按列分组并聚合特定列
Aggregating specific column grouping by columns based on condition in R
我遇到了一个问题:按 $IMS_ID 和 $IMS_PLAN_ID 分组后,
如果组内 sum($Lives) != 0,我需要得到 $ACCESS_SCORE 的加权平均值(以 Lives 为权重);
如果 sum($Lives) == 0,我需要得到 $ACCESS_SCORE 的简单平均值;
同时 'Trx' 作为每个分组的唯一值保留。
# Asker's original attempt, kept verbatim. It does not parse as written:
# `if` requires a parenthesised condition, and `=` is assignment rather than
# the `==` comparison operator.
sc2=aggregate(nonsc1, by=list(nonsc1$PRSC_CID,nonsc1$IMSPayerPlanId),
FUN=function(x) if x$Lives=0 {colMeans(x$Breo_Access_score)} else {0} )
# Sample input: eight rows, two per (IMS_ID, IMS_PLAN_ID) combination.
# Lives is all zero for IMS_PLAN_ID 234 and non-zero for IMS_PLAN_ID 235.
dt <- data.frame(
  IMS_ID = rep(c(222, 333), each = 4L),
  IMS_PLAN_ID = rep(c(234, 235), each = 2L, times = 2L),
  PLAN_ID = rep(c(1234, 678), times = 4L),
  IMS_STATE = rep(c('CA', 'TX'), each = 4L),
  PLAN_STATE = rep(c('CA', 'TX'), each = 4L),
  ACCESS_SCORE = rep(c(2, 4), times = 4L),
  Lives = rep(c(0, 0, 1000, 200), times = 2L),
  Trx = rep(c(10, 20, 30, 40), each = 2L)
)
我的输出应该是:
IMS_ID IMS_PLAN_ID TRX ACCESS_SCORE
222 234 10 3
222 235 20 2.3333
333 234 30 3
333 235 40 2.3333
根据您的新样本输入和预期输出,我们可以这样做:
library(data.table)
# Convert dt to a data.table by reference, then reduce every
# (IMS_ID, IMS_PLAN_ID) group to one row: the Trx of the group's first row
# plus an ACCESS_SCORE that is weighted by Lives when the group's Lives sum
# is non-zero and a plain mean when it is zero.
setDT(dt)[, {
  total_lives <- sum(Lives)
  score <- if (total_lives == 0) {
    mean(ACCESS_SCORE)
  } else {
    sum(ACCESS_SCORE * Lives) / total_lives
  }
  cbind(.SD[1L, .(Trx)], ACCESS_SCORE = score)
}, by = .(IMS_ID, IMS_PLAN_ID)]
## IMS_ID IMS_PLAN_ID Trx ACCESS_SCORE
## 1: 222 234 10 3.000000
## 2: 222 235 20 2.333333
## 3: 333 234 30 3.000000
## 4: 333 235 40 2.333333
这是使用 by() 的基础 R 解决方案:
# Base R version: split the data frame on the two key columns, reduce each
# piece to a single row, and stack the resulting rows.
df <- as.data.frame(dt)
keys <- c('IMS_ID', 'IMS_PLAN_ID')
do.call(
  rbind,
  by(df, df[keys], function(grp) {
    total_lives <- sum(grp$Lives)
    # Weighted by Lives when the weights sum to non-zero, plain mean otherwise.
    score <- if (total_lives == 0) {
      mean(grp$ACCESS_SCORE)
    } else {
      sum(grp$ACCESS_SCORE * grp$Lives) / total_lives
    }
    # The group's first row supplies the key columns and Trx.
    cbind(grp[1L, c(keys, 'Trx')], ACCESS_SCORE = score)
  })
)
## IMS_ID IMS_PLAN_ID Trx ACCESS_SCORE
## 1 222 234 10 3.000000
## 5 333 234 30 3.000000
## 3 222 235 20 2.333333
## 7 333 235 40 2.333333
我遇到了一个问题:按 $IMS_ID 和 $IMS_PLAN_ID 分组后,
如果组内 sum($Lives) != 0,我需要得到 $ACCESS_SCORE 的加权平均值(以 Lives 为权重);
如果 sum($Lives) == 0,我需要得到 $ACCESS_SCORE 的简单平均值;
同时 'Trx' 作为每个分组的唯一值保留。
# Asker's original attempt, kept verbatim. It does not parse as written:
# `if` requires a parenthesised condition, and `=` is assignment rather than
# the `==` comparison operator.
sc2=aggregate(nonsc1, by=list(nonsc1$PRSC_CID,nonsc1$IMSPayerPlanId),
FUN=function(x) if x$Lives=0 {colMeans(x$Breo_Access_score)} else {0} )
# Sample input: eight rows, two per (IMS_ID, IMS_PLAN_ID) combination.
# Lives is all zero for IMS_PLAN_ID 234 and non-zero for IMS_PLAN_ID 235.
dt <- data.frame(
  IMS_ID = rep(c(222, 333), each = 4L),
  IMS_PLAN_ID = rep(c(234, 235), each = 2L, times = 2L),
  PLAN_ID = rep(c(1234, 678), times = 4L),
  IMS_STATE = rep(c('CA', 'TX'), each = 4L),
  PLAN_STATE = rep(c('CA', 'TX'), each = 4L),
  ACCESS_SCORE = rep(c(2, 4), times = 4L),
  Lives = rep(c(0, 0, 1000, 200), times = 2L),
  Trx = rep(c(10, 20, 30, 40), each = 2L)
)
我的输出应该是:
IMS_ID IMS_PLAN_ID TRX ACCESS_SCORE
222 234 10 3
222 235 20 2.3333
333 234 30 3
333 235 40 2.3333
根据您的新样本输入和预期输出,我们可以这样做:
library(data.table)
# Convert dt to a data.table by reference, then reduce every
# (IMS_ID, IMS_PLAN_ID) group to one row: the Trx of the group's first row
# plus an ACCESS_SCORE that is weighted by Lives when the group's Lives sum
# is non-zero and a plain mean when it is zero.
setDT(dt)[, {
  total_lives <- sum(Lives)
  score <- if (total_lives == 0) {
    mean(ACCESS_SCORE)
  } else {
    sum(ACCESS_SCORE * Lives) / total_lives
  }
  cbind(.SD[1L, .(Trx)], ACCESS_SCORE = score)
}, by = .(IMS_ID, IMS_PLAN_ID)]
## IMS_ID IMS_PLAN_ID Trx ACCESS_SCORE
## 1: 222 234 10 3.000000
## 2: 222 235 20 2.333333
## 3: 333 234 30 3.000000
## 4: 333 235 40 2.333333
这是使用 by() 的基础 R 解决方案:
# Base R version: split the data frame on the two key columns, reduce each
# piece to a single row, and stack the resulting rows.
df <- as.data.frame(dt)
keys <- c('IMS_ID', 'IMS_PLAN_ID')
do.call(
  rbind,
  by(df, df[keys], function(grp) {
    total_lives <- sum(grp$Lives)
    # Weighted by Lives when the weights sum to non-zero, plain mean otherwise.
    score <- if (total_lives == 0) {
      mean(grp$ACCESS_SCORE)
    } else {
      sum(grp$ACCESS_SCORE * grp$Lives) / total_lives
    }
    # The group's first row supplies the key columns and Trx.
    cbind(grp[1L, c(keys, 'Trx')], ACCESS_SCORE = score)
  })
)
## IMS_ID IMS_PLAN_ID Trx ACCESS_SCORE
## 1 222 234 10 3.000000
## 5 333 234 30 3.000000
## 3 222 235 20 2.333333
## 7 333 235 40 2.333333