遍历两个数据集以创建不同的结果数据集

Iterate through two datasets to create distinct results dataset

在 SAS 中,我有以下两个数据集:

数据集 #1:关于人们饮食偏好的数据

   ID |  Meal   | Meal_rank
    1   Lobster       1
    1   Cake          2
    1   Hot Dog       3
    1   Salad         4
    1   Fries         5
    2   Burger        1
    2   Hot Dog       2
    2   Pizza         3
    2   Fries         4
    3   Hot Dog       1
    3   Salad         2
    3   Soup          3
    4   Lobster       1
    4   Hot Dog       2
    4   Burger        3

数据集 #2:膳食可用性数据

  Meal   | Units_available
  Hot Dog     2
  Burger      1
  Pizza       2

在 SAS 中,我想找到一种方法来导出如下所示的结果数据集(不更改数据集 #1 或 #2 中的任何内容):

   ID |  Assigned_Meal
    1   Hot Dog
    2   Burger
    3   Hot Dog
    4   Meal cannot be assigned (out of stock/unavailable)

结果由一个循环遍历每个人的膳食(由他们的 'ID' 值标识)的过程驱动,直到:

  1. 找到一份仍有库存(可用数量大于 0)的餐食;或者
  2. 该人列出的所有餐食都已对照可用性数据检查完毕。

值得注意的是:

  1. 在某些情况下,某人列出的餐食并不在菜单上(即根本无法提供)。

我正在使用的数据集比本例中的数据集大得多(数千行)。

这是创建两个样本数据集的 SAS 代码:

    /* Sample data: one row per (person, preferred meal); Meal_rank orders
       each person's preferences. */
    proc sql;
        create table work.ppl_meal_pref (
            ID        char(4),
            Meal      char(20),
            Meal_rank num
        );

        insert into work.ppl_meal_pref
            values ('1', 'Lobster', 1)
            values ('1', 'Cake',    2)
            values ('1', 'Hot Dog', 3)
            values ('1', 'Salad',   4)
            values ('1', 'Fries',   5)
            values ('2', 'Burger',  1)
            values ('2', 'Hot Dog', 2)
            values ('2', 'Pizza',   3)
            values ('2', 'Fries',   4)
            values ('3', 'Hot Dog', 1)
            values ('3', 'Salad',   2)
            values ('3', 'Soup',    3)
            values ('4', 'Lobster', 1)
            values ('4', 'Hot Dog', 2)
            values ('4', 'Burger',  3)
        ;
    quit;

    /* Sample data: units currently available for each meal on the menu. */
    proc sql;
        create table work.lunch_menu (
            FoodName        char(14),
            Units_available num
        );

        insert into work.lunch_menu
            values ('Hot Dog', 2)
            values ('Burger',  1)
            values ('Pizza',   1)
        ;
    quit;

我已经尝试用循环来实现这项任务,但没有成功(见下文)。

        /* Asker's attempt (reported as not working): copies every row of
           work.ppl_meal_pref to work.assign_meals after a hash lookup against
           work.lunch_menu. It neither decrements stock nor restricts the
           output to one row per ID. */
        data work.assign_meals;

    /* Create the host variables used as hash key and hash data. */
    length FoodName $ 14 Units_available 8;
    /* Load work.lunch_menu into the hash object once, on the first iteration. */
    if (_n_ = 1) then do;
        declare hash lookup(dataset:'work.lunch_menu', duplicate: 'error', ordered: 'ascending', multidata: 'NO');
            lookup.defineKey('FoodName');
            lookup.defineData('Units_available');
            lookup.defineDone();
    end;

    /* Read all preference rows inside a single DATA step iteration. */
    do until (eof_pref);
        set work.ppl_meal_pref END = eof_pref;
        /* FIND() without arguments searches on the current value of FoodName.
           NOTE(review): the SET above supplies ID, Meal and Meal_rank only, so
           FoodName appears never to be populated and the lookup presumably
           never matches - confirm. */
        rc = lookup.FIND();
        /* No match: report zero units for this row. */
        IF rc ne 0 THEN DO;
            Units_available = 0;
        end;
    /* Every preference row is written, whether or not the lookup matched. */
    output;
    end;
    /* All input was consumed in the loop above; end the step explicitly. */
    stop;
    run;

我以前从未使用过哈希表的替换功能,也没有测试这段代码,但据我了解,这应该可以完成工作:

/* Build work.assign_meals with exactly one row per ID: the first meal in the
   person's preference list (input order) that still has stock, or a fixed
   message when none of the listed meals can be served.

   Inputs (neither is modified):
     work.ppl_meal_pref - ID, Meal, Meal_rank; must be sorted by ID (BY ID
                          raises an error otherwise) and, within ID, in the
                          order in which preferences should be tried.
     work.lunch_menu    - FoodName, Units_available.

   Stock is decremented only in the in-memory hash copy of the menu. */
data work.assign_meals (keep=ID Assigned_Meal);

    /* Wide enough for the longest value written below (the 50-character
       "cannot be assigned" message); otherwise the length would be taken
       from the first assignment and the message would be truncated. */
    length Assigned_Meal $ 50;

    /* Host variables required by the hash key/data definitions. */
    length FoodName $ 14 Units_available 8;

    /* Read lunch_menu into memory once. */
    if (_n_ = 1) then do;
        declare hash lookup(dataset:'work.lunch_menu',
            duplicate: 'error',
            ordered: 'ascending',
            multidata: 'NO');

        lookup.defineKey('FoodName');
        lookup.defineData('Units_available');
        lookup.defineDone();

        /* Avoid "variable is uninitialized" notes for the host variables. */
        call missing(FoodName, Units_available);
    end;

    set work.ppl_meal_pref;
    /* BY gives us first.ID / last.ID to detect the start and end of a person. */
    by ID;

    /* Remember across observations whether the current ID has been served
       (without RETAIN the flag would be reset on every observation). */
    retain served;
    if first.ID then served = 0;

    if not served then do;
        /* The preference data calls the key Meal, not FoodName, so pass the
           key value explicitly instead of relying on the host variable. */
        rc = lookup.find(key: Meal);
        if rc eq 0 and Units_available gt 0 then do;
            /* Assign BEFORE output so the written row carries the meal. */
            Assigned_Meal = Meal;
            output;
            served = 1;

            /* Record that one unit of this meal has been used. */
            Units_available = Units_available - 1;
            rc = lookup.replace(key: Meal, data: Units_available);
        end;
    end;

    /* Nothing on this person's list was available: still emit a row. */
    if last.ID and not served then do;
        Assigned_Meal = 'Meal cannot be assigned (out of stock/unavailable)';
        output;
    end;
run;

我目前没有时间测试它。如果不行,告诉我,我稍后再做。

另一种方法:使用 MODIFY 语句直接修改餐食可用性数据集。这比哈希方法稍微简洁一些,但性能可能不如哈希方法。另一方面,即使您的 lunch_menu 数据集太大、无法方便地放入内存,它仍然可以工作,并且处理结束后数据集中会留下各餐食的剩余数量记录。为了让两个输入数据集的变量名保持一致,我重命名了变量:

/* Sample data: one row per (person, preferred food); Meal_rank orders each
   person's preferences. The food column is named Food to match lunch_menu. */
proc sql;
    create table work.ppl_meal_pref (
        ID        char(4),
        Food      char(20),
        Meal_rank num
    );

    insert into work.ppl_meal_pref
        values ('1', 'Lobster', 1)
        values ('1', 'Cake',    2)
        values ('1', 'Hot Dog', 3)
        values ('1', 'Salad',   4)
        values ('1', 'Fries',   5)
        values ('2', 'Burger',  1)
        values ('2', 'Hot Dog', 2)
        values ('2', 'Pizza',   3)
        values ('2', 'Fries',   4)
        values ('3', 'Hot Dog', 1)
        values ('3', 'Salad',   2)
        values ('3', 'Soup',    3)
        values ('4', 'Lobster', 1)
        values ('4', 'Hot Dog', 2)
        values ('4', 'Burger',  3)
    ;
quit;

/* Sample data: units currently available for each food on the menu. */
proc sql;
    create table work.lunch_menu (
        Food            char(20),
        Units_available num
    );

    insert into work.lunch_menu
        values ('Hot Dog', 2)
        values ('Burger',  1)
        values ('Pizza',   1)
    ;
quit;

/* Create a unique simple index on Food so that lunch_menu can be read by key. */
proc sql;
    create unique index Food on work.lunch_menu (Food);
quit;

/* Output one row per ID to assigned_meals and update lunch_menu in place.

   For each ID, the preference rows are read in input order; the first Food
   that is found in lunch_menu with units_available > 0 is assigned and its
   stock is reduced by one directly in lunch_menu. IDs for which nothing is
   available are still written, with AssignedFood left missing.

   Requires ppl_meal_pref sorted by ID and an index on lunch_menu.Food. */
data assigned_meals(keep = id AssignedFood AssignedFoodRank) lunch_menu;
    length AssignedFood $ 20;
    /* One pass of this loop (and of the DATA step) handles one ID. */
    do until(last.ID);
        set ppl_meal_pref;
        by ID;
        /* Stop looking once a food has been assigned to this ID. */
        if missing(AssignedFood) then do;
            /* UNIQUE makes every keyed search start at the top of the index.
               Without it, two consecutive lookups of the same Food value
               (e.g. one ID's last lookup and the next ID's first) make the
               second lookup fail with _DSENOM, and the food would wrongly be
               treated as unavailable. */
            modify lunch_menu key = Food / unique;
            /* Not on the menu: clear the error flag the failed lookup set. */
            if _iorc_ then _error_ = 0;
            else if units_available > 0 then do;
                AssignedFood = Food;
                AssignedFoodRank = Meal_Rank;
                units_available + -1;
                /* Write the reduced stock back to lunch_menu. */
                replace lunch_menu;
            end;
        end;
    end;
    output assigned_meals;
run;

下面是一段可以运行的基于哈希的代码,使用了 ealfons1 提供的示例数据。由于两个数据集中键变量的名称不同(Meal 与 FoodName),您必须在 FIND() 中使用额外的语法(或者也可以在 SET 语句或 DATASET 参数中重命名变量)。

它还会输出更新后的库存水平数据集。如果要跟踪未能分配的具体情况,即对于每个没有分配到餐食的 ID,其哪些偏好已售罄、哪些根本不在菜单上,则需要额外的代码和输出数据集。

/* Assign each ID the first preferred meal that is still in stock, using an
   in-memory hash copy of the stock table; IDs that get nothing are written
   with Meal = 'Ran out' or 'Not stocked'. The final stock levels are written
   to meals_stock_after_assignments.
   NOTE(review): meals_stock is not created in the code shown here; it is
   presumably the stock table with FoodName and Units_available - confirm. */
data meal_assignments;
  if 0 then set meals_stock; * prep PDV;
  declare hash stock (dataset:'meals_stock');
  stock.defineKey('FoodName');
  /* FoodName is also stored as data, so find() loads it into the PDV and the
     argument-less replace() below addresses the same hash entry. */
  stock.defineData('FoodName', 'Units_available');
  stock.defineDone();

  /* Outer loop: one pass per ID, until the last preference row is read. */
  do until (lastrow_flag);
    assigned = 0;
    stocked = 0;
    /* Inner loop: read all preference rows of the current ID. */
    do until (last.ID);
      set ppl_meal_pref end=lastrow_flag;
      by ID Meal_rank; * error will happen if meal_rank is not monotonic;
      if assigned then continue; * already assigned;
      if stock.find(key:Meal) ne 0 then continue; * off the menu;
      /* At least one preferred meal of this ID is on the menu. */
      stocked = 1;
      if Units_available <  1 then continue; * out of stock or missing count;
      Units_available + (-1);
      if stock.replace() = 0 then do; * hash replace worked;
        assigned = 1;
        OUTPUT;
      end;
      else put 'WARNING: Problem with stock hash ' Meal=;
    end;
    /* Nothing assigned: emit one row explaining why. */
    if not assigned then do;
      if stocked then Meal = 'Ran out'; else Meal = 'Not stocked';
      OUTPUT;
    end;
  end;

  keep ID Meal;

  /* Save the stock levels remaining in the hash after all assignments. */
  stock.output(dataset:'meals_stock_after_assignments');

  /* All input was read inside the loops; end the step explicitly. */
  stop;
run;

/* Three-part listing: the assignments, the stock levels after assignment,
   and a per-meal comparison of stock before and after. */
options nocenter;
title "Meals report";

title2 "Assignments";
proc print data=meal_assignments noobs;
run;

title2 "New stock levels";
proc print data=meals_stock_after_assignments noobs;
run;

title2 "Usage summary";
proc sql;
  select cur.Meal,
         cur.have_count,
         prev.had_count,
         prev.had_count - cur.have_count as use_count
    from (select FoodName as Meal, Units_available as have_count
            from meals_stock_after_assignments) as cur
         inner join
         (select FoodName as Meal, Units_available as had_count
            from meals_stock) as prev
      on cur.Meal = prev.Meal
  ;
quit;

这里的'want'是基于队列的:

  • 先到先得,优先排序解决方案。
    • 基于 ID 的随机队列顺序可以提供少量感知 'fairness'

更困难的解决方案将基于全局规划,例如:

  • 在偏好等级尽可能高的前提下,让尽可能多的人得到餐食
  • 在成本尽可能低的前提下,让尽可能多的人得到餐食
  • 等等……