如何总结所有可能的变量组合?

How to summarize all possible combinations of variables?

我正在尝试根据所有可能的变量组合来汇总计数。这是一个示例数据:

朴素的做法(SQL Server 版本;这里假设始终只有 3 列,因此结果共有 2^3-1 = 7 行):

-- Naive approach: one SELECT per non-empty combination of a, b, c
-- (2^3 - 1 = 7 branches), glued together with UNION ALL.
-- Each branch counts the DISTINCT positive values of its column(s); for
-- multi-column branches the values are joined into one 'x,y' string.
-- The CASE yields NULL for non-positive values and the + concatenation
-- relies on NULL propagating, so a row contributes only when every column
-- of the combination is > 0.
-- NOTE(review): this counts distinct VALUES, not rows; two ids sharing the
-- same values are counted once - confirm that is the intended metric.
SELECT 'A' AS combination, COUNT(DISTINCT CASE WHEN a > 0 THEN a ELSE NULL END) AS cnt FROM t
UNION ALL
-- Fixed: the B and C branches previously counted column a by mistake.
SELECT 'B', COUNT(DISTINCT CASE WHEN b > 0 THEN b ELSE NULL END) FROM t
UNION ALL
SELECT 'C', COUNT(DISTINCT CASE WHEN c > 0 THEN c ELSE NULL END) FROM t
UNION ALL
SELECT 'A,B', COUNT(DISTINCT CASE WHEN a > 0 THEN CAST(a AS VARCHAR(10)) ELSE NULL END 
                     + ',' + CASE WHEN b > 0 THEN CAST(b AS VARCHAR(10)) ELSE NULL END) FROM t
UNION ALL
SELECT 'A,C', COUNT(DISTINCT CASE WHEN a > 0 THEN CAST(a AS VARCHAR(10)) ELSE NULL END 
                     + ',' + CASE WHEN c > 0 THEN CAST(c AS VARCHAR(10)) ELSE NULL END) FROM t
UNION ALL
SELECT 'B,C', COUNT(DISTINCT CASE WHEN b > 0 THEN CAST(b AS VARCHAR(10)) ELSE NULL END 
                     + ',' + CASE WHEN c > 0 THEN CAST(c AS VARCHAR(10)) ELSE NULL END) FROM t
UNION ALL
SELECT 'A,B,C', COUNT(DISTINCT CASE WHEN a > 0 THEN CAST(a AS VARCHAR(10)) ELSE NULL END 
                     + ',' + CASE WHEN b > 0 THEN CAST(b AS VARCHAR(10)) ELSE NULL END
                     + ',' + CASE WHEN c > 0 THEN CAST(c AS VARCHAR(10)) ELSE NULL END ) FROM t
ORDER BY combination 

 

Rextester Demo


编辑:

同上但更简洁:

-- Compact form of the per-combination count: zeros are turned into NULL
-- once, up front, and every value is cast to text so that combinations can
-- be built by plain + concatenation (a NULL operand makes the whole
-- concatenation NULL, which COUNT(DISTINCT ...) then ignores).
WITH nonzero AS (
    SELECT
        ID,
        CAST(CASE WHEN a = 0 THEN NULL ELSE a END AS VARCHAR(10)) AS a,
        CAST(CASE WHEN b = 0 THEN NULL ELSE b END AS VARCHAR(10)) AS b,
        CAST(CASE WHEN c = 0 THEN NULL ELSE c END AS VARCHAR(10)) AS c
    FROM t
)
SELECT 'A' AS combination, COUNT(DISTINCT a) AS cnt
FROM nonzero
UNION ALL
SELECT 'B', COUNT(DISTINCT b)
FROM nonzero
UNION ALL
SELECT 'C', COUNT(DISTINCT c)
FROM nonzero
UNION ALL
SELECT 'A,B', COUNT(DISTINCT a + ',' + b)
FROM nonzero
UNION ALL
SELECT 'A,C', COUNT(DISTINCT a + ',' + c)
FROM nonzero
UNION ALL
SELECT 'B,C', COUNT(DISTINCT b + ',' + c)
FROM nonzero
UNION ALL
SELECT 'A,B,C', COUNT(DISTINCT a + ',' + b + ',' + c)
FROM nonzero;

Rextester Demo


编辑 2

使用UNPIVOT:

-- Row-count variant: each column is reduced to the flag '1' (non-zero) or
-- NULL, so a concatenation such as a+b is non-NULL only when every column
-- in it is non-zero. COUNT(expr) then counts qualifying rows. All seven
-- counts are computed in a single pass and UNPIVOT turns the one wide row
-- into one (combination, count) row per column.
WITH flags AS (
    SELECT
        ID,
        CAST(CASE WHEN a <> 0 THEN 1 END AS VARCHAR(10)) AS a,
        CAST(CASE WHEN b <> 0 THEN 1 END AS VARCHAR(10)) AS b,
        CAST(CASE WHEN c <> 0 THEN 1 END AS VARCHAR(10)) AS c
    FROM t
),
counts AS (
    SELECT
        COUNT(a)         AS a,
        COUNT(b)         AS b,
        COUNT(c)         AS c,
        COUNT(a + b)     AS ab,
        COUNT(a + c)     AS ac,
        COUNT(b + c)     AS bc,
        COUNT(a + b + c) AS abc
    FROM flags
)
SELECT combination, [count]
FROM counts
UNPIVOT ([count] FOR combination IN (a, b, c, ab, ac, bc, abc)) AS unpvt

Rextester Demo


编辑最终方法

I appreciate your approach. I have more than 3 variables in my actual dataset; do you think we can generate all possible combinations programmatically rather than hard-coding them? Maybe your second approach will cover that:

SQL做这种操作有点笨拙,但我想证明这是可能的。

-- Sample data: one row per id with three numeric attributes; 0 means the
-- attribute is absent for that id.
CREATE TABLE t(id INT, a INT, b INT, c INT);

-- Explicit column list plus a VALUES row constructor: the previous
-- SELECT ... UNION chain would silently de-duplicate identical sample rows
-- and depended on the physical column order of t.
INSERT INTO t (id, a, b, c)
VALUES
    (10001, 1, 3, 3),
    (10002, 0, 0, 0),
    (10003, 3, 6, 0),
    (10004, 7, 0, 0),
    (10005, 0, 0, 0);

-- Generates and runs the "COUNT per combination + UNPIVOT" query for every
-- non-empty subset of the non-id columns of table t.
--
-- How it works:
--   1. @Sample lists the value columns and gives each one a distinct bit.
--   2. Every integer 1 .. 2^n - 1 is a bitmask that selects one combination.
--   3. For each mask the column names are concatenated twice: once as the
--      label ("a,b") and once as the expression to count ([a] + ',' + [b]).
--   4. The pieces are spliced into a query template and executed.
--
-- Limits: the Tally CTE yields 65,536 rows, so at most 16 value columns are
-- covered; a combination label is also used as a column alias and so must
-- fit in an identifier (QUOTENAME returns NULL for input over 128 chars).
DECLARE @Sample AS TABLE 
(
    item_id     tinyint IDENTITY(1,1) PRIMARY KEY NONCLUSTERED,
    item        nvarchar(500) NOT NULL,
    -- One bit per column: 1, 2, 4, ...
    bit_value   AS  CONVERT ( integer, POWER(2, item_id - 1) )
                PERSISTED UNIQUE CLUSTERED
);    

-- ORDER BY makes the identity (and therefore the bit) assignment follow the
-- table's column order instead of whatever order the catalog scan returns.
INSERT INTO @Sample (item)
SELECT name
FROM sys.columns
WHERE object_id = OBJECT_ID('t')
  AND name != 'id'
ORDER BY column_id;

-- Highest bitmask = number of non-empty combinations.
DECLARE @max integer = POWER(2, ( SELECT COUNT(*) FROM @Sample AS s)) - 1;
DECLARE @cols NVARCHAR(MAX);         -- [a],[b],[a,b],...   (UNPIVOT IN list)
DECLARE @cols_casted NVARCHAR(MAX);  -- CAST(NULLIF([a],0) ...) [a], ...
DECLARE @cols_count NVARCHAR(MAX);   -- [a,b]=COUNT(DISTINCT [a] + ',' + [b]), ...


;WITH
  Pass0 as (select 1 as C union all select 1), --2 rows
  Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
  Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
  Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
  Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
  Tally as (select row_number() over(order by C) as n from Pass4)
, cte AS (SELECT
    -- Human-readable label of the combination, e.g. "a,b".
    combination =
        STUFF
        (
            (
                SELECT ',' + s.item 
                FROM @Sample AS s
                WHERE
                    n.n & s.bit_value = s.bit_value
                ORDER BY
                    s.bit_value
                FOR XML 
                    PATH (''),
                    TYPE                    
            ).value('(./text())[1]', 'nvarchar(max)'), 1, 1, ''
        )
    -- Expression counted for the combination, e.g. [a] + ',' + [b].
    -- The separator " + ',' + " is 9 characters long, hence STUFF(.., 1, 9, '').
  , count_expr =
        STUFF
        (
            (
                SELECT ' + '','' + ' + QUOTENAME(s.item)
                FROM @Sample AS s
                WHERE
                    n.n & s.bit_value = s.bit_value
                ORDER BY
                    s.bit_value
                FOR XML 
                    PATH (''),
                    TYPE                    
            ).value('(./text())[1]', 'nvarchar(max)'), 1, 9, ''
        )
FROM Tally AS n
WHERE n.n BETWEEN 1 AND @max
)
-- Inputs are cast to NVARCHAR(MAX): STRING_AGG over non-MAX input raises an
-- error once the result exceeds 8000 bytes, which happens quickly because
-- the number of combinations doubles with every extra column.
SELECT @cols = STRING_AGG(CAST(QUOTENAME(combination) AS NVARCHAR(MAX)), ',')
      ,@cols_count = STRING_AGG(
            CAST(QUOTENAME(combination) + N'=COUNT(DISTINCT ' + count_expr + N')'
                 AS NVARCHAR(MAX)), ',')
FROM cte;

-- Projection that turns 0 into NULL and casts each value column to text so
-- the values can be concatenated inside COUNT(DISTINCT ...).
SELECT 
  @cols_casted = STRING_AGG(
        CAST(N'CAST(NULLIF(' + QUOTENAME(name) + N',0) AS VARCHAR(10)) ' + QUOTENAME(name)
             AS NVARCHAR(MAX)), ',')
FROM sys.columns
WHERE object_id = OBJECT_ID('t')
  AND name != 'id';
  
DECLARE @sql NVARCHAR(MAX);

-- Query template; the <...> placeholders are filled in below.
SET @sql =
'SELECT combination, [count]
FROM (SELECT  <cols_count>
      FROM (SELECT ID, <cols_casted> FROM t )cte) s
UNPIVOT ([count] FOR combination IN (<cols>))AS unpvt';

SET @sql = REPLACE(@sql, '<cols_casted>', @cols_casted);
SET @sql = REPLACE(@sql, '<cols_count>', @cols_count);
SET @sql = REPLACE(@sql, '<cols>', @cols);

-- Show the generated statement, then run it.
SELECT @sql;
EXEC (@sql);

DBFiddle Demo

DBFiddle Demo with 4 variables

对于这种查询,使用一些内置的聚合工具非常简单。

首先根据您的样本图像设置一些样本数据:

-- Sample data held in a table variable: one row per id, three numeric
-- attributes, 0 meaning "attribute not present".
DECLARE @Table1 TABLE (
    [id] INT,
    [a]  INT,
    [b]  INT,
    [c]  INT
);

INSERT INTO @Table1 ([id], [a], [b], [c])
VALUES (10001, 1, 3, 3),
       (10002, 0, 0, 0),
       (10003, 3, 6, 0),
       (10004, 7, 0, 0),
       (10005, 0, 0, 0);

由于您需要统计属性 A、B、C 的每种非零组合所对应的 ID 数量,第一步是消除零值,并把非零值统一转换成一个可用于分组汇总的值——这里我直接使用属性名称。之后的聚合就很简单了:在 GROUP BY 中使用 CUBE 子句生成所有组合。最后,在 HAVING 子句中剔除不需要的汇总行:主要是去掉那些属性本身为 NULL(而不是因分组产生的 NULL)的行,并可选择去掉总计行(所有行的计数)。

-- Counts ids per combination of non-zero attributes using GROUP BY CUBE.
-- Each attribute is first replaced by its own name (or NULL when it is 0),
-- so the CUBE groups are exactly the attribute combinations.
WITH flagged AS (
    SELECT
        id,
        CASE WHEN a = 0 THEN NULL ELSE 'a' END AS a,
        CASE WHEN b = 0 THEN NULL ELSE 'b' END AS b,
        CASE WHEN c = 0 THEN NULL ELSE 'c' END AS c
    FROM @Table1
)
SELECT a, b, c, COUNT(id) AS cnt
FROM flagged
GROUP BY CUBE (a, b, c)
-- Drop groups where an attribute is NULL because the data was NULL (zero)
-- rather than because CUBE rolled that attribute up.
HAVING NOT (a IS NULL AND GROUPING(a) = 0)
   AND NOT (b IS NULL AND GROUPING(b) = 0)
   AND NOT (c IS NULL AND GROUPING(c) = 0)
   -- GROUPING_ID = 7 means all three attributes are rolled up: the grand total.
   AND GROUPING_ID(a, b, c) <> 7
ORDER BY GROUPING_ID(a, b, c);

结果如下:

    a       b       c       cnt
1   a       b       c       1
2   a       b       NULL    2
3   a       NULL    c       1
4   a       NULL    NULL    3
5   NULL    b       c       1
6   NULL    b       NULL    2
7   NULL    NULL    c       1

最后是我原来的 rextester link:http://rextester.com/YRJ10544

@lad2025 这是一个动态版本(抱歉,我的 SQL 服务器技能不如我的 Oracle 技能,但它可以工作)。只需为 @Table 和 @col 设置正确的值,只要所有其他列都是数字属性,它就应该可以工作:

-- Dynamic version of the CUBE query: builds the statement for every column
-- of @table except the key column @col, then runs it.
-- @table must name a table visible to OBJECT_ID, and every column other than
-- @col is assumed to be a numeric attribute where 0 means "absent".
declare @sql varchar(max), @table varchar(30), @col varchar(30);
set @table = 'Table1';
set @col = 'id';

-- cols: the attribute columns, densely numbered in table order. Numbering
-- them here (rather than walking column_id + 1) keeps the recursion correct
-- when column_id has gaps or when @col is not the first column, and it
-- excludes @col itself instead of a hard-coded 'ID'.
with cols(name, rn, total) as (
    select name
         , row_number() over (order by column_id)
         , count(*) over ()
      from sys.columns
     where object_id = OBJECT_ID(@table)
       and name <> @col
)
-- x: accumulates, one column per recursion step,
--   names = "a, b, c"                         (select list / cube / grouping_id)
--   proj  = "case a when 0 then null ..."     (projection of the inner CTE)
--   pred  = "(a is not null or grouping(a)=1) and ..."  (having clause)
-- varchar(max) avoids silent truncation of the generated SQL.
, x(rn, names, proj, pred, total) as (
    select rn
         , cast(name as varchar(max))
         , cast('case '+name+' when 0 then null else '''+name+''' end '+name as varchar(max))
         , cast('('+name+' is not null or grouping('+name+') = 1)' as varchar(max))
         , total
      from cols
     where rn = 1
    union all
    select c.rn
         , cast(x.names+', '+c.name as varchar(max))
         , cast(x.proj+char(13)+char(10)+'     , case '+c.name+' when 0 then null else '''+c.name+''' end '+c.name as varchar(max))
         , cast(x.pred+char(13)+char(10)+'   and ('+c.name+' is not null or grouping('+c.name+') = 1)' as varchar(max))
         , x.total
      from x
      join cols c on c.rn = x.rn + 1
)
-- Only the last recursion row (rn = total) holds the complete lists.
-- power(2, total) - 1 is the grouping_id of the grand-total row to exclude.
select @sql='with t1 as (
select '+proj+'
     , '+@col+'
  from '+@table+'
)
select '+names+'
     , count('+@col+') cnt 
  from t1
 group by cube('+names+')
having '+pred+'
   and grouping_id('+names+') <> '+cast(power(2,total)-1 as varchar(10))+'
 order by grouping_id('+names+');'
  from x where rn = total;

-- Show the generated statement, then run it.
select @sql sql;
exec (@sql);

Rextester

SAS 解法:

正如 Robert 所说,可以用 PROC SUMMARY 计算各个组合的计数;再用第二个 PROC SUMMARY 统计每种组合类型(_TYPE_)的数量。一个难点是要忽略包含零值的组合;如果能先把零转换为缺失值,处理过程会清晰得多。假设零已转换为缺失值,下面的代码将统计不同的组合:

/* Step 1: summarize HAVE by the class variables v2-v4 and s1 and write the */
/* result to counts_eachCombo; NOPRINT suppresses the printed report.       */
proc summary noprint data=have;
  class v2-v4 s1;
  output out=counts_eachCombo;
run;

/* Step 2: summarize the step-1 output again, this time classed by its      */
/* _type_ variable (renamed to combo_type on input so it can be used as a   */
/* class variable), and write the result to counts_eachClassType.           */
proc summary noprint data=counts_eachCombo(rename=_type_=combo_type);
  class combo_type;
  output out=counts_eachClassType;
run;

您可以看到在组合中使用 CLASS 变量如何确定 TYPE,并且 class 变量可以是混合类型(数字, 字符)

另一种不使用 SUMMARY 的“自制”方法是:先在 DATA 步中用 LEXCOMB 枚举每一种变量组合,再用 PROC SQL 的 INTO ... SEPARATED BY 拼出一条 SQL 语句,由这条语句统计每种组合中不同取值的个数。

注意:以下代码包含用于将 SAS variable list 解析为单个变量名称的宏 varListEval。

%macro makeHave(n=,m=,maxval=&m*4,prob0=0.25);
  /* Build test data set HAVE with &n rows: id, numeric v1-v&m and a      */
  /* one-character s1. Each v is 0 with probability &prob0, otherwise a   */
  /* random integer in 1..&maxval; s1 is a random letter starting at 'A'. */

  data have;
    do id = 1 to &n;
      array v[*] v1-v&m;
      do _idx = 1 to dim(v);
        if ranuni(123) < &prob0 then v[_idx] = 0;
        else v[_idx] = ceil(&maxval*ranuni(123));
      end;
      s1 = byte(65+5*ranuni(123));
      output;
    end;
    drop _idx;  /* loop index is bookkeeping only, keep it out of HAVE */
  run;

%mend;

/* 100 rows, 5 value variables, values up to 15 */
%makeHave(n=100, m=5, maxval=15)

%macro varListEval (data=, var=);
  %* resolve a SAS variable list to individual variable names;
  /* Function-style macro: emits each selected variable name as a        */
  /* double-quoted token, in the order the variables appear in &data.    */
  /*   data= data set to inspect                                         */
  /*   var=  SAS variable list (e.g. v2-v4 s1) to resolve against &data  */
  %local dsid dsid2 i name num;
  /* First handle: the full data set, used to walk all of its variables. */
  %let dsid = %sysfunc(open(&data));
  %if &dsid %then %do;
    /* Second handle: the same data set restricted by keep=&var, so a    */
    /* variable is in the list exactly when varnum finds it here.        */
    %let dsid2 = %sysfunc(open(&data(keep=&var)));
    %if &dsid2 %then %do;
      %do i = 1 %to %sysfunc(attrn(&dsid,nvar));
        %let name = %sysfunc(varname(&dsid,&i));
        %let num = %sysfunc(varnum(&dsid2,&name));
        /* Non-zero varnum: the variable is part of the keep= list. */
        %if &num %then "&NAME";
      %end;
      %let dsid2 = %sysfunc(close(&dsid2));
    %end;
    %let dsid = %sysfunc(close(&dsid));
  %end;
  %else
    /* &data could not be opened: write the system message to the log. */
    %put %sysfunc(sysmsg());
%mend;

%macro combosUCounts(data=, var=);
  /* For every non-empty combination of the variables in var=, count the   */
  /* distinct value tuples found in data= among rows where each variable   */
  /* of the combination is neither missing nor zero (nor blank, for        */
  /* character variables).                                                 */
  /*   data= input data set                                                */
  /*   var=  SAS variable list, resolved by the varListEval macro          */
  /* Creates work data sets COMBOS (one row per combination, with the      */
  /* generated select list and where criteria) and COMBOCOUNTS (columns    */
  /* combo and uCount).                                                    */
  %local vars n;
  %let vars = %varListEval(data=&data, var=&var);

  /* Number of variables = quoted names in the list = separators + 1.      */
  /* Fixed: the closing parenthesis of the eval call was missing.          */
  %let n = %eval(1 + %sysfunc(count(&vars,%str(" "))));

  * compute combination selectors and criteria;
  data combos;
    /* Fixed: these arrays and variables hold text and need character      */
    /* lengths. 32 covers a variable name, 200 one criterion, and 32000    */
    /* the concatenated lists.                                             */
    array _names (&n) $32 (&vars);
    array _combos (&n) $32;
    array _comboCriterias (&n) $200;

    length _selector $32000;
    length _criteria $32000;

    if 0 then set &data; %* prep PDV for vname;

    /* _k = size of the combination, _j = index of the combination of     */
    /* that size. LEXCOMB rearranges _names so that its first _k elements */
    /* are the _j-th combination.                                         */
    do _k = 1 to &n;
      do _j = 1 to comb(&n,_k);
        _rc = lexcomb(_j,_k, of _names[*]);
        do _p = 1 to _k;
          _combos(_p) = _names(_p);
          /* Character variables are tested against blank, numeric against 0. */
          if vtypex(_names(_p)) = 'C' 
            then _comboCriterias(_p) = trim(_names(_p)) || " is not null and " || trim(_names(_p)) || " ne ''";
            else _comboCriterias(_p) = trim(_names(_p)) || " is not null and " || trim(_names(_p)) || " ne 0";
        end;
        /* catx skips the still-blank elements beyond position _k. */
        _selector = catx(",", of _combos:);
        _criteria = catx(" and ", of _comboCriterias:);
        output;
      end;
    end;

    stop;
  run;

  %local union;

  proc sql noprint;
    * generate SQL statement that uses combination selectors and criteria;
    select "select "
    || quote(trim(_selector))
    || " as combo" 
    || ", "
    || "count(*) as uCount from (select distinct "
    || trim(_selector)
    || " from &data where "
    || trim(_criteria)
    || ")"
    into :union separated by " UNION "
    from combos
    ;

    * perform the generated SQL statement;
    create table comboCounts as
    &union;

    /* %put union=%superq(union); */
  quit;
%mend;

/* Log the SAS code generated by the macros, but not each macro variable resolution. */
options mprint nosymbolgen;
/* Numeric variables only. */
%combosUCounts(data=have, var=v2-v4);
/* Numeric plus character variable; this call re-creates COMBOS and COMBOCOUNTS. */
%combosUCounts(data=have, var=v2-v4 s1);

%put NOTE: Done;
/*
data _null_;
put %varListEval(data=have, var=v2-v4) ;
run;
*/