在 R 中使用 dplyr 对所有变量组合求和值

Sum values using dplyr in R for all combinations of variables

我是 R 编程的新手,如果这个问题太基础,我深表歉意。我的交易数据显示了从六种不同类型的产品中获得的收入,共有三年的交易记录。我的目标是找出每年所有不同产品组合的销售总和,组合数为 2^6 - 1 = 64 - 1 = 63 种。也就是说,三年总共会有 63*3 = 189 个组合。

为了简单起见,我只用三个变量创建了测试数据,因为我用 while 循环写的程序只处理了一年,而且写得很糟糕。我的目的只是展示我想要完成的事情。尽管如此,我还是在下面贴出了取自原始文件的随机样本。

这是只包含三个变量(Car、Tire、Services)的测试数据和 while 循环,用来向您展示我想要的结果:

    dput(Sample_File)
structure(list(Order.ID = c(171, 173, 132, 174, 132, 174, 132, 
174, 174), Fiscal.Year = c(2017, 2016, 2016, 2016, 2016, 2016, 
2016, 2016, 2018), Car = c(2, 2, 3, 1, 0, 0, 0, 0, 1), Tire = c(0, 
0, 0, 1, 0, 1, 0, 1, 1), Services = c(3, 1, 4, 0, 4, 1, 4, 0, 
0)), .Names = c("Order.ID", "Fiscal.Year", "Car", "Tire", "Services"
), row.names = c(NA, 9L), class = "data.frame")

这是我的代码:

  # Accumulate, for fiscal year 2016 only, the Car / Tire / Services values of
  # every row of Sample_File into Csum, one row of Csum per basket pattern.
  # A product is "ON" for a row when its value is non-zero.
  i<-1
    # Csum: 7 patterns (rows) x 3 products (columns: 1 = Car, 2 = Tire,
    # 3 = Services), initialised to zero.
    Csum <- matrix(rep(0,21),nrow = 7,ncol = 3) 
    # Row 1 is used when C is ON; T is ON ; S is ON
    # Row 2 is used when C is ON; T is ON ; S is OFF
    # Row 3 is used when C is ON; T is OFF ; S is ON
    # Row 4 is used when C is OFF; T is ON ; S is ON
    # Row 5 is used when C is ON; T is OFF ; S is OFF
    # Row 6 is used when C is OFF; T is ON ; S is OFF
    # Row 7 is used when C is OFF; T is OFF ; S is ON

    # Walk over every row of Sample_File by index.
    while (i <= length(Sample_File$Order.ID))
    {
      # Rows from any fiscal year other than 2016 are skipped; the index must
      # be advanced here because `next` jumps over the increment at the bottom.
      if (Sample_File$Fiscal.Year[i]!=2016)
        {
        i<-i+1
        next
      }
      # Exactly one branch below runs per row: the first whose ON/OFF pattern
      # matches. Products that are OFF contribute an explicit 0.
      if (Sample_File$Car[i]!=0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]!=0)#1 
      {
        Csum[1,1] <- Csum[1,1] + Sample_File$Car[i]
        Csum[1,2] <- Csum[1,2] + Sample_File$Tire[i]
        Csum[1,3] <- Csum[1,3] + Sample_File$Services[i]

      }
      else if (Sample_File$Car[i]!=0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]==0) #2
      {
        Csum[2,1] <- Csum[2,1] + Sample_File$Car[i]
        Csum[2,2] <- Csum[2,2] + Sample_File$Tire[i]
        Csum[2,3] <- Csum[2,3] + 0
      }
      else if(Sample_File$Car[i]!=0 & Sample_File$Tire[i]==0 & Sample_File$Services[i]!=0) #3
        {

        Csum[3,1] <- Csum[3,1] + Sample_File$Car[i]
        Csum[3,2] <- Csum[3,2] + 0
        Csum[3,3] <- Csum[3,3] + Sample_File$Services[i]
      }
      else if(Sample_File$Car[i]==0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]!=0) #4
      {
        Csum[4,1] <- Csum[4,1] + 0
        Csum[4,2] <- Csum[4,2] + Sample_File$Tire[i]
        Csum[4,3] <- Csum[4,3] + Sample_File$Services[i]
      }
      else if(Sample_File$Car[i]!=0 & Sample_File$Tire[i]==0 & Sample_File$Services[i]==0) #5
      {
        Csum[5,1] <- Csum[5,1] + Sample_File$Car[i]
        Csum[5,2] <- Csum[5,2] + 0
        Csum[5,3] <- Csum[5,3] + 0
      }
      else if(Sample_File$Car[i]==0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]==0)#6 
      {
        Csum[6,1] <- Csum[6,1] + 0
        Csum[6,2] <- Csum[6,2] + Sample_File$Tire[i]
        Csum[6,3] <- Csum[6,3] + 0
      }
      # Catch-all: the rows left over have Car == 0 and Tire == 0. That is the
      # "only Services ON" pattern, but also a row where all three are zero;
      # the latter adds 0 to row 7, so the totals are unaffected.
      else #7
        {
          Csum[7,1] <- Csum[7,1] + 0
          Csum[7,2] <- Csum[7,2] + 0
          Csum[7,3] <- Csum[7,3] + Sample_File$Services[i]
        }
      # Advance to the next row.
      i<-i+1
    }  

我只写了处理一年的代码,因为把这段代码复制三遍来处理三年非常痛苦。我正在寻找一种解决方案,它能创建一个包含 3 个数据框的列表,三年中每年对应一个数据框。

这是一个大小为 10 的随机样本,其中包含原始文件中的六个变量。

dput(Sample_File_Random)
structure(list(Order.ID = c(171, 173, 132, 174, 169, 175, 163, 
186, 178, 121), Fiscal.Year = c(2016, 2016, 2017, 2016, 2015, 
2016, 2015, 2015, 2015, 2017), Car = c(2, 0, 3, 0, 0, 0, 0, 5346.25, 
0, 0), Tire = c(0, 0, 0, 8691.55800460666, 3198, 5, 2, 0, 2, 
3282.18), Services = c(3, 0, 4, 0, 0, 0, 0, 0, 0, 0), Insurance = c(4, 
0, 0, 4, 0, 4, 0, 0, 0, 0), Accessories = c(94.3, 3749.8, 9308.65, 
0, 2, 0, 1, 633.75, 51.44, 0), Finance = c(0, 0, 0, 4, 0, 14800, 
0, 0, 0, 0)), .Names = c("Order.ID", "Fiscal.Year", "Car", "Tire", 
"Services", "Insurance", "Accessories", "Finance"), row.names = c(NA, 
10L), class = "data.frame")

我真的被卡住了,所以我真诚地感谢任何有助于将这段代码矢量化的帮助。


应 @Ronak shah 的请求:这是 Sample_File_Random 的预期输出
Output_File
  Fiscal.Year     Car     Tire Services Insurance Accessories Finance
1        2015    0.00 3202.000        0         0       54.44       0
2        2015 5346.25    0.000        0         0      633.75       0
3        2016    2.00    0.000        3         4       94.30       0
4        2016    0.00    0.000        0         0     3749.80       0
5        2016    0.00 8696.558        0         8        0.00   14804
6        2017    3.00    0.000        4         0     9308.65       0
7        2017    0.00 3282.180        0         0        0.00       0

多么好的挑战啊....

我使用了你的数据集,并将其命名为 test。我选择用矩阵来处理这个问题。
# Product columns of `test` (columns 3 to 8) whose combinations are enumerated.
names <- colnames(test[3:8])
library(combinat)
# Enumerate the combinations for every subset size from 1 to 6; each result is
# transposed so that one row holds one combination.
combos_by_size <- lapply(1:6, function(k) t(combn(names, k)))
one   <- combos_by_size[[1]]
two   <- combos_by_size[[2]]
three <- combos_by_size[[3]]
four  <- combos_by_size[[4]]
five  <- combos_by_size[[5]]
six   <- combos_by_size[[6]]
library(plyr)
# Stack the six matrices into one table (shorter combinations are padded with
# NA on the right) and strip the dimnames.
myset <- unname(do.call(rbind.fill.matrix, combos_by_size))
head(myset, 3)
tail(myset, 3)

给出以下内容:

     [,1]       [,2] [,3] [,4] [,5] [,6]
[1,] "Car"      NA   NA   NA   NA   NA  
[2,] "Tire"     NA   NA   NA   NA   NA  
[3,] "Services" NA   NA   NA   NA   NA  
      [,1]   [,2]       [,3]        [,4]          [,5]          [,6]     
[61,] "Car"  "Services" "Insurance" "Accessories" "Finance"     NA       
[62,] "Tire" "Services" "Insurance" "Accessories" "Finance"     NA       
[63,] "Car"  "Tire"     "Services"  "Insurance"   "Accessories" "Finance"

使用 dplyr 按年份计算总和:

library(dplyr)
# Per-year totals of every product column. sum() is used (not mean()) because
# the following steps treat these values as the yearly sums of each product.
# Order.ID is dropped first so that it is not aggregated.
testsums <- test %>%
  select(-Order.ID) %>%
  group_by(Fiscal.Year) %>%
  summarise(across(everything(), sum))
testsums
A tibble: 3 × 7
  Fiscal.Year     Car     Tire Services Insurance Accessories Finance
        <dbl>   <dbl>    <dbl>    <dbl>     <dbl>       <dbl>   <dbl>
1        2015 5346.25 3202.000        0         0      688.19       0
2        2016    2.00 8696.558        3        12     3844.10   14804
3        2017    3.00 3282.180        4         0     9308.65       0

创建一个由 1 和 0 组成的矩阵,以乘以相同六个变量的年度总和向量。

# Indicator matrix: entry [i, j] is 1 when the j-th product column of `test`
# (columns 3 to 8, the same order as the yearly sums) is part of combination i
# in `myset`, and 0 otherwise. The indicators are matched by product NAME:
# marking the non-NA cells of `myset` by position would always flag the
# leftmost columns, so e.g. the row for "Tire" would select the Car column.
mult.matrix <- t(apply(myset, 1, function(combo) {
  as.numeric(colnames(test)[3:8] %in% combo)
}))
head(mult.matrix,3);tail(mult.matrix,3)
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    0    0    0    0    0
[2,]    0    1    0    0    0    0
[3,]    0    0    1    0    0    0
      [,1] [,2] [,3] [,4] [,5] [,6]
[61,]    1    0    1    1    1    1
[62,]    0    1    1    1    1    1
[63,]    1    1    1    1    1    1

将年度总和转换为矩阵表示法。将它乘以 mult.matrix。将 3 个新列绑定到原始组合数据集。

# Yearly values of the six product columns as a bare numeric matrix
# (rows 1 to 3 and columns 2 to 7 of testsums, dimnames removed).
year_sums <- as.matrix(testsums[1:3, 2:7])
dimnames(year_sums) <- NULL
# One row per combination, one column per row of testsums.
all_sums <- mult.matrix %*% t(year_sums)
# Rebuild the combination table and append the three result columns. The table
# is a character matrix, so cbind() stores the appended numbers as strings.
myset <- cbind(
  unname(rbind.fill.matrix(one, two, three, four, five, six)),
  all_sums
)
head(myset, 5)
tail(myset, 5)
     [,1]          [,2] [,3] [,4] [,5] [,6] [,7]        [,8]  [,9] 
[1,] "Car"         NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
[2,] "Tire"        NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
[3,] "Services"    NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
[4,] "Insurance"   NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
[5,] "Accessories" NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
      [,1]   [,2]       [,3]        [,4]          [,5]          [,6]      [,7]      [,8]               [,9]      
[59,] "Car"  "Tire"     "Services"  "Accessories" "Finance"     NA        "2309.11" "3139.41450115167" "6298.915"
[60,] "Car"  "Tire"     "Insurance" "Accessories" "Finance"     NA        "2309.11" "3139.41450115167" "6298.915"
[61,] "Car"  "Services" "Insurance" "Accessories" "Finance"     NA        "2309.11" "3139.41450115167" "6298.915"
[62,] "Tire" "Services" "Insurance" "Accessories" "Finance"     NA        "2309.11" "3139.41450115167" "6298.915"
[63,] "Car"  "Tire"     "Services"  "Insurance"   "Accessories" "Finance" "2309.11" "6840.41450115166" "6298.915"

这段代码还可以大幅精简。我选择把我的思路一步步展示出来。您现在可以把最终的矩阵转换为数据框、重命名列标题等等……

这是一个简洁且富有表现力的 dplyr 解决方案,分三步进行:

  1. 为每项服务是否在购物篮中创建指标
  2. 按年份和各指标的组合进行分组
  3. 按分组变量对服务值求和

这是执行此操作的代码:

# Sum the product values per fiscal year and per basket composition.
# Written with across(): mutate_each(), summarise_each(), funs() and
# group_by_() are deprecated in current dplyr releases.
df_foo %>% 
  # 1. create the combinations of whether each of the 
  #   products is in the basket or not; adds one logical column per
  #   product, named <product>_In_Basket
  mutate(
    across(Car:Services, function(x) x > 0, .names = "{.col}_In_Basket")
  ) %>% 
  # 2. group by the year and the basket service indicators
  group_by(Fiscal.Year, across(ends_with("_In_Basket"))) %>% 
  # 3. sum the service values (missing values are ignored)
  summarise(
    across(Car:Services, function(x) sum(x, na.rm = TRUE))
  )

这给出了输出:

Source: local data frame [7 x 7]
Groups: Fiscal.Year, Car_In_Basket, Tire_In_Basket [?]

  Fiscal.Year Car_In_Basket Tire_In_Basket Services_In_Basket   Car  Tire Services
        <dbl>         <lgl>          <lgl>              <lgl> <dbl> <dbl>    <dbl>
1        2016         FALSE          FALSE               TRUE     0     0        8
2        2016         FALSE           TRUE              FALSE     0     1        0
3        2016         FALSE           TRUE               TRUE     0     1        1
4        2016          TRUE          FALSE               TRUE     5     0        5
5        2016          TRUE           TRUE              FALSE     1     1        0
6        2017          TRUE          FALSE               TRUE     2     0        3
7        2018          TRUE           TRUE              FALSE     1     1        0