遍历两个数据集以创建不同的结果数据集
Iterate through two datasets to create distinct results dataset
在 SAS 中,我有以下两个数据集:
数据集 #1:关于人们饮食偏好的数据
ID | Meal | Meal_rank
1 Lobster 1
1 Cake 2
1 Hot Dog 3
1 Salad 4
1 Fries 5
2 Burger 1
2 Hot Dog 2
2 Pizza 3
2 Fries 4
3 Hot Dog 1
3 Salad 2
3 Soup 3
4 Lobster 1
4 Hot Dog 2
4 Burger 3
数据集 #2:膳食可用性数据
Meal | Units_available
Hot Dog 2
Burger 1
Pizza 1
在 SAS 中,我想找到一种方法来导出如下所示的结果数据集(不更改数据集 #1 或 #2 中的任何内容):
ID | Assigned_Meal
1 Hot Dog
2 Burger
3 Hot Dog
4 Meal cannot be assigned (out of stock/unavailable)
结果由一个循环遍历每个人的膳食(由他们的 'ID' 值标识)的过程驱动,直到:
- 在有足够单位可用的地方找到一顿饭。
- 所有膳食均已根据可用性数据进行检查。
值得注意的是:
- 在某些情况下,此人会列出无法提供的餐点。
我正在使用的数据集比本例中的数据集大得多(数千行)。
这是创建两个样本数据集的 SAS 代码:
/* Sample data: one row per person (ID) and meal, with that person's ranking of the meal in Meal_rank */
proc sql;
    create table work.ppl_meal_pref (
        ID        char(4),
        Meal      char(20),
        Meal_rank num
    );

    insert into work.ppl_meal_pref (ID, Meal, Meal_rank)
        values ('1', 'Lobster', 1)
        values ('1', 'Cake',    2)
        values ('1', 'Hot Dog', 3)
        values ('1', 'Salad',   4)
        values ('1', 'Fries',   5)
        values ('2', 'Burger',  1)
        values ('2', 'Hot Dog', 2)
        values ('2', 'Pizza',   3)
        values ('2', 'Fries',   4)
        values ('3', 'Hot Dog', 1)
        values ('3', 'Salad',   2)
        values ('3', 'Soup',    3)
        values ('4', 'Lobster', 1)
        values ('4', 'Hot Dog', 2)
        values ('4', 'Burger',  3)
    ;
quit;
/* Sample data: one row per meal on the menu with the number of units in stock */
proc sql;
    create table work.lunch_menu (
        FoodName        char(14),
        Units_available num
    );

    insert into work.lunch_menu (FoodName, Units_available)
        values ('Hot Dog', 2)
        values ('Burger',  1)
        values ('Pizza',   1)
    ;
quit;
我已经尝试实现循环来执行此任务,但无济于事(见下文)。
/* Attempt: attach the available units to every preference row via a hash lookup on the menu.
   NOTE(review): FIND() with no arguments searches for the current value of FoodName, and
   nothing in this step assigns FoodName (the preference rows carry the meal name in Meal),
   so presumably every lookup misses and Units_available ends up 0 on every row. */
data work.assign_meals;
    length FoodName $ 14 Units_available 8;

    /* Load the menu into memory once */
    if _n_ eq 1 then do;
        declare hash lookup(dataset: 'work.lunch_menu',
                            duplicate: 'error',
                            ordered: 'ascending',
                            multidata: 'NO');
        lookup.defineKey('FoodName');
        lookup.defineData('Units_available');
        lookup.defineDone();
    end;

    /* Read every preference row inside a single iteration of the DATA step */
    do until (eof_pref);
        set work.ppl_meal_pref end = eof_pref;
        rc = lookup.find();
        /* Lookup failed: report zero units for this row */
        if rc ne 0 then Units_available = 0;
        output;
    end;
    stop;
run;
我以前从未使用过哈希表的替换功能,也没有测试这段代码,但据我了解,这应该可以完成工作:
/* Build work.assign_meals with exactly one row per ID:
   - Assigned_Meal = the first meal in that person's rows that is on the menu and still in stock, or
   - a fixed message when none of the person's meals can be served.
   work.ppl_meal_pref must be sorted by ID, with each person's rows in the order they should be tried.
   work.lunch_menu is only read; stock levels are decremented in an in-memory hash. */
data work.assign_meals (keep=ID Assigned_Meal);
    /* Do that while reading ppl_meal_pref */
    set work.ppl_meal_pref;
    /* BY gives us first.ID / last.ID to detect where a person's rows start and end */
    by ID;
    /* Remember if someone is served (without retain, SAS forgets the value when reading a new observation) */
    retain served;
    if first.ID then served = 0;
    /* Host variables for the hash; Assigned_Meal is sized for the 50-character message used below */
    length FoodName $ 14 Units_available 8 Assigned_Meal $ 50;
    /* Read lunch_menu into memory once */
    if (_n_ = 1) then do;
        declare hash lookup(dataset:'work.lunch_menu',
                            duplicate: 'error',
                            ordered: 'ascending',
                            multidata: 'NO');
        lookup.defineKey('FoodName');
        lookup.defineData('Units_available');
        lookup.defineDone();
    end;
    if not served then do;
        /* The preference rows carry the meal name in Meal, not FoodName,
           so the key value has to be passed explicitly */
        rc = lookup.find(key: Meal);
        if rc eq 0 then do;
            if Units_available gt 0 then do;
                /* Serve this customer: set the value first, then write the row */
                Assigned_Meal = Meal;
                output;
                served = 1;
                /* Remember that a unit of this meal has been used */
                Units_available = Units_available - 1;
                rc = lookup.replace(key: Meal, data: Units_available);
            end;
        end;
    end;
    /* Nothing on this person's list could be served: still emit one row for the ID */
    if last.ID and not served then do;
        Assigned_Meal = 'Meal cannot be assigned (out of stock/unavailable)';
        output;
    end;
run;
我目前没有时间测试它。如果不行,告诉我,我稍后再做。
另一种方法:对膳食可用性数据集使用 modify 语句(即就地修改)。这比哈希方法稍微简洁一些,但性能可能不尽如人意。另一方面,即使您的 lunch_menu 数据集太大而无法方便地放入内存,它仍然可以工作,并且您可以记录之后剩下的饭菜。为了使两个输入数据集保持一致,我重命名了变量:
/* Sample preference data; the meal name is stored in Food so it matches the menu table's column name */
proc sql;
    create table work.ppl_meal_pref (
        ID        char(4),
        Food      char(20),
        Meal_rank num
    );

    insert into work.ppl_meal_pref (ID, Food, Meal_rank)
        values ('1', 'Lobster', 1)
        values ('1', 'Cake',    2)
        values ('1', 'Hot Dog', 3)
        values ('1', 'Salad',   4)
        values ('1', 'Fries',   5)
        values ('2', 'Burger',  1)
        values ('2', 'Hot Dog', 2)
        values ('2', 'Pizza',   3)
        values ('2', 'Fries',   4)
        values ('3', 'Hot Dog', 1)
        values ('3', 'Salad',   2)
        values ('3', 'Soup',    3)
        values ('4', 'Lobster', 1)
        values ('4', 'Hot Dog', 2)
        values ('4', 'Burger',  3)
    ;
quit;
/* Sample menu data: one row per Food with the number of units in stock */
proc sql;
    create table work.lunch_menu (
        Food            char(20),
        Units_available num
    );

    insert into work.lunch_menu (Food, Units_available)
        values ('Hot Dog', 2)
        values ('Burger',  1)
        values ('Pizza',   1)
    ;
quit;
/* Create a unique simple index on Food in work.lunch_menu so rows can be located by key value */
proc datasets library=work nolist nowarn nodetails;
    modify lunch_menu;
        index create Food / unique;
    run;
quit;
/* Assign each ID the first Food in its rows that is on the menu and still in stock.
   Output: one row per ID in assigned_meals (AssignedFood / AssignedFoodRank stay missing
   when nothing could be assigned); Units_available in lunch_menu is decremented in place.
   Requires ppl_meal_pref sorted by ID and an index on Food in lunch_menu. */
data assigned_meals(keep = id AssignedFood AssignedFoodRank) lunch_menu;
    length AssignedFood $ 20;
    /* One pass of this loop consumes all rows of one ID; AssignedFood starts out
       missing again when the DATA step iterates for the next ID */
    do until(last.ID);
        set ppl_meal_pref;
        by ID;
        /* Keep reading the remaining rows of the ID, but stop looking once a meal is assigned */
        if missing(AssignedFood) then do;
            /* /unique makes every lookup start at the top of the index. Without it, asking
               for the same Food on two consecutive lookups (e.g. two people in a row wanting
               Hot Dog) continues the search after the previous hit and fails to find the row. */
            modify lunch_menu key = Food / unique;
            /* Non-zero _iorc_: Food was not found on the menu; clear the error flag and go on */
            if _iorc_ then _error_ = 0;
            else if units_available > 0 then do;
                AssignedFood = Food;
                AssignedFoodRank = Meal_Rank;
                /* Take one unit out of stock and write the menu row back */
                units_available + -1;
                replace lunch_menu;
            end;
        end;
    end;
    output assigned_meals;
run;
这是一段基于哈希的可运行代码,使用了 ealfons1 提供的示例数据。由于键的变量名不同(Meal 与 FoodName),必须在 FIND() 中使用额外的语法(或者也可以在 SET 或 DATASET 说明符中重命名)。
它还会输出更新后的库存水平数据集。若要跟踪未分配的具体情况,即对于每个没有分到膳食的 ID,其哪些偏好已售罄、哪些不在菜单上,则需要额外的代码和输出数据。
/* Assign meals in row order using an in-memory hash of stock levels.
   Reads ppl_meal_pref (in ID, Meal_rank order) and meals_stock.
   Writes meal_assignments (ID, Meal) with one row per ID, where Meal is the assigned meal,
   'Ran out' or 'Not stocked', and meals_stock_after_assignments with the remaining stock. */
data meal_assignments;
    /* Never executed; only adds the meals_stock variables to the PDV at compile time */
    if 0 then set meals_stock;

    declare hash stock (dataset: 'meals_stock');
    stock.defineKey('FoodName');
    /* FoodName is also a data item so that find() fills it in for the later replace() */
    stock.defineData('FoodName', 'Units_available');
    stock.defineDone();

    do until (lastrow_flag);
        /* Per-ID state: has a meal been assigned, and was any preference on the menu at all */
        assigned = 0;
        stocked = 0;

        do until (last.ID);
            set ppl_meal_pref end=lastrow_flag;
            by ID Meal_rank; /* the step raises an error if rows are not in ID, Meal_rank order */

            /* Rows after the assignment are read but otherwise ignored */
            if not assigned then do;
                /* A failed find means the meal is off the menu */
                if stock.find(key: Meal) = 0 then do;
                    stocked = 1;
                    /* Skip when out of stock or when the count is missing */
                    if not (Units_available < 1) then do;
                        Units_available + (-1);
                        if stock.replace() = 0 then do;
                            /* The hash accepted the new stock level */
                            assigned = 1;
                            output;
                        end;
                        else put 'WARNING: Problem with stock hash ' Meal=;
                    end;
                end;
            end;
        end;

        /* Nothing assigned for this ID: write a placeholder row explaining why */
        if not assigned then do;
            if stocked then Meal = 'Ran out';
            else Meal = 'Not stocked';
            output;
        end;
    end;

    keep ID Meal;
    stock.output(dataset: 'meals_stock_after_assignments');
    stop;
run;
/* Report: the assignments, the stock left afterwards, and units used per meal (before minus after) */
options nocenter;
title "Meals report";

proc print data=meal_assignments noobs;
    title2 "Assignments";
run;

proc print data=meals_stock_after_assignments noobs;
    title2 "New stock levels";
run;

proc sql;
    title2 "Usage summary";
    select
        A.Meal,
        A.have_count,
        B.had_count,
        B.had_count - A.have_count as use_count
    from
        (select FoodName as Meal, Units_available as have_count
            from meals_stock_after_assignments) as A
    inner join
        (select FoodName as Meal, Units_available as had_count
            from meals_stock) as B
        on A.Meal = B.Meal
    ;
quit;
这里的'want'是基于队列的:
- 先到先得,优先排序解决方案。
- 基于 ID 的随机队列顺序可以提供少量感知 'fairness'
更困难的解决方案将基于全局规划,例如:
- 服务最多的人,偏好等级最高
- 服务最多的人,成本最低
- 等...
在 SAS 中,我有以下两个数据集:
数据集 #1:关于人们饮食偏好的数据
ID | Meal | Meal_rank
1 Lobster 1
1 Cake 2
1 Hot Dog 3
1 Salad 4
1 Fries 5
2 Burger 1
2 Hot Dog 2
2 Pizza 3
2 Fries 4
3 Hot Dog 1
3 Salad 2
3 Soup 3
4 Lobster 1
4 Hot Dog 2
4 Burger 3
数据集 #2:膳食可用性数据
Meal | Units_available
Hot Dog 2
Burger 1
Pizza 1
在 SAS 中,我想找到一种方法来导出如下所示的结果数据集(不更改数据集 #1 或 #2 中的任何内容):
ID | Assigned_Meal
1 Hot Dog
2 Burger
3 Hot Dog
4 Meal cannot be assigned (out of stock/unavailable)
结果由一个循环遍历每个人的膳食(由他们的 'ID' 值标识)的过程驱动,直到:
- 在有足够单位可用的地方找到一顿饭。
- 所有膳食均已根据可用性数据进行检查。
值得注意的是:
- 在某些情况下,此人会列出无法提供的餐点。
我正在使用的数据集比本例中的数据集大得多(数千行)。
这是创建两个样本数据集的 SAS 代码:
/* Sample data: one row per person (ID) and meal, with that person's ranking of the meal in Meal_rank */
proc sql;
    create table work.ppl_meal_pref (
        ID        char(4),
        Meal      char(20),
        Meal_rank num
    );

    insert into work.ppl_meal_pref (ID, Meal, Meal_rank)
        values ('1', 'Lobster', 1)
        values ('1', 'Cake',    2)
        values ('1', 'Hot Dog', 3)
        values ('1', 'Salad',   4)
        values ('1', 'Fries',   5)
        values ('2', 'Burger',  1)
        values ('2', 'Hot Dog', 2)
        values ('2', 'Pizza',   3)
        values ('2', 'Fries',   4)
        values ('3', 'Hot Dog', 1)
        values ('3', 'Salad',   2)
        values ('3', 'Soup',    3)
        values ('4', 'Lobster', 1)
        values ('4', 'Hot Dog', 2)
        values ('4', 'Burger',  3)
    ;
quit;
/* Sample data: one row per meal on the menu with the number of units in stock */
proc sql;
    create table work.lunch_menu (
        FoodName        char(14),
        Units_available num
    );

    insert into work.lunch_menu (FoodName, Units_available)
        values ('Hot Dog', 2)
        values ('Burger',  1)
        values ('Pizza',   1)
    ;
quit;
我已经尝试实现循环来执行此任务,但无济于事(见下文)。
/* Attempt: attach the available units to every preference row via a hash lookup on the menu.
   NOTE(review): FIND() with no arguments searches for the current value of FoodName, and
   nothing in this step assigns FoodName (the preference rows carry the meal name in Meal),
   so presumably every lookup misses and Units_available ends up 0 on every row. */
data work.assign_meals;
    length FoodName $ 14 Units_available 8;

    /* Load the menu into memory once */
    if _n_ eq 1 then do;
        declare hash lookup(dataset: 'work.lunch_menu',
                            duplicate: 'error',
                            ordered: 'ascending',
                            multidata: 'NO');
        lookup.defineKey('FoodName');
        lookup.defineData('Units_available');
        lookup.defineDone();
    end;

    /* Read every preference row inside a single iteration of the DATA step */
    do until (eof_pref);
        set work.ppl_meal_pref end = eof_pref;
        rc = lookup.find();
        /* Lookup failed: report zero units for this row */
        if rc ne 0 then Units_available = 0;
        output;
    end;
    stop;
run;
我以前从未使用过哈希表的替换功能,也没有测试这段代码,但据我了解,这应该可以完成工作:
/* Build work.assign_meals with exactly one row per ID:
   - Assigned_Meal = the first meal in that person's rows that is on the menu and still in stock, or
   - a fixed message when none of the person's meals can be served.
   work.ppl_meal_pref must be sorted by ID, with each person's rows in the order they should be tried.
   work.lunch_menu is only read; stock levels are decremented in an in-memory hash. */
data work.assign_meals (keep=ID Assigned_Meal);
    /* Do that while reading ppl_meal_pref */
    set work.ppl_meal_pref;
    /* BY gives us first.ID / last.ID to detect where a person's rows start and end */
    by ID;
    /* Remember if someone is served (without retain, SAS forgets the value when reading a new observation) */
    retain served;
    if first.ID then served = 0;
    /* Host variables for the hash; Assigned_Meal is sized for the 50-character message used below */
    length FoodName $ 14 Units_available 8 Assigned_Meal $ 50;
    /* Read lunch_menu into memory once */
    if (_n_ = 1) then do;
        declare hash lookup(dataset:'work.lunch_menu',
                            duplicate: 'error',
                            ordered: 'ascending',
                            multidata: 'NO');
        lookup.defineKey('FoodName');
        lookup.defineData('Units_available');
        lookup.defineDone();
    end;
    if not served then do;
        /* The preference rows carry the meal name in Meal, not FoodName,
           so the key value has to be passed explicitly */
        rc = lookup.find(key: Meal);
        if rc eq 0 then do;
            if Units_available gt 0 then do;
                /* Serve this customer: set the value first, then write the row */
                Assigned_Meal = Meal;
                output;
                served = 1;
                /* Remember that a unit of this meal has been used */
                Units_available = Units_available - 1;
                rc = lookup.replace(key: Meal, data: Units_available);
            end;
        end;
    end;
    /* Nothing on this person's list could be served: still emit one row for the ID */
    if last.ID and not served then do;
        Assigned_Meal = 'Meal cannot be assigned (out of stock/unavailable)';
        output;
    end;
run;
我目前没有时间测试它。如果不行,告诉我,我稍后再做。
另一种方法:对膳食可用性数据集使用 modify 语句(即就地修改)。这比哈希方法稍微简洁一些,但性能可能不尽如人意。另一方面,即使您的 lunch_menu 数据集太大而无法方便地放入内存,它仍然可以工作,并且您可以记录之后剩下的饭菜。为了使两个输入数据集保持一致,我重命名了变量:
/* Sample preference data; the meal name is stored in Food so it matches the menu table's column name */
proc sql;
    create table work.ppl_meal_pref (
        ID        char(4),
        Food      char(20),
        Meal_rank num
    );

    insert into work.ppl_meal_pref (ID, Food, Meal_rank)
        values ('1', 'Lobster', 1)
        values ('1', 'Cake',    2)
        values ('1', 'Hot Dog', 3)
        values ('1', 'Salad',   4)
        values ('1', 'Fries',   5)
        values ('2', 'Burger',  1)
        values ('2', 'Hot Dog', 2)
        values ('2', 'Pizza',   3)
        values ('2', 'Fries',   4)
        values ('3', 'Hot Dog', 1)
        values ('3', 'Salad',   2)
        values ('3', 'Soup',    3)
        values ('4', 'Lobster', 1)
        values ('4', 'Hot Dog', 2)
        values ('4', 'Burger',  3)
    ;
quit;
/* Sample menu data: one row per Food with the number of units in stock */
proc sql;
    create table work.lunch_menu (
        Food            char(20),
        Units_available num
    );

    insert into work.lunch_menu (Food, Units_available)
        values ('Hot Dog', 2)
        values ('Burger',  1)
        values ('Pizza',   1)
    ;
quit;
/* Create a unique simple index on Food in work.lunch_menu so rows can be located by key value */
proc datasets library=work nolist nowarn nodetails;
    modify lunch_menu;
        index create Food / unique;
    run;
quit;
/* Assign each ID the first Food in its rows that is on the menu and still in stock.
   Output: one row per ID in assigned_meals (AssignedFood / AssignedFoodRank stay missing
   when nothing could be assigned); Units_available in lunch_menu is decremented in place.
   Requires ppl_meal_pref sorted by ID and an index on Food in lunch_menu. */
data assigned_meals(keep = id AssignedFood AssignedFoodRank) lunch_menu;
    length AssignedFood $ 20;
    /* One pass of this loop consumes all rows of one ID; AssignedFood starts out
       missing again when the DATA step iterates for the next ID */
    do until(last.ID);
        set ppl_meal_pref;
        by ID;
        /* Keep reading the remaining rows of the ID, but stop looking once a meal is assigned */
        if missing(AssignedFood) then do;
            /* /unique makes every lookup start at the top of the index. Without it, asking
               for the same Food on two consecutive lookups (e.g. two people in a row wanting
               Hot Dog) continues the search after the previous hit and fails to find the row. */
            modify lunch_menu key = Food / unique;
            /* Non-zero _iorc_: Food was not found on the menu; clear the error flag and go on */
            if _iorc_ then _error_ = 0;
            else if units_available > 0 then do;
                AssignedFood = Food;
                AssignedFoodRank = Meal_Rank;
                /* Take one unit out of stock and write the menu row back */
                units_available + -1;
                replace lunch_menu;
            end;
        end;
    end;
    output assigned_meals;
run;
这是一段基于哈希的可运行代码,使用了 ealfons1 提供的示例数据。由于键的变量名不同(Meal 与 FoodName),必须在 FIND() 中使用额外的语法(或者也可以在 SET 或 DATASET 说明符中重命名)。
它还会输出更新后的库存水平数据集。若要跟踪未分配的具体情况,即对于每个没有分到膳食的 ID,其哪些偏好已售罄、哪些不在菜单上,则需要额外的代码和输出数据。
/* Assign meals in row order using an in-memory hash of stock levels.
   Reads ppl_meal_pref (in ID, Meal_rank order) and meals_stock.
   Writes meal_assignments (ID, Meal) with one row per ID, where Meal is the assigned meal,
   'Ran out' or 'Not stocked', and meals_stock_after_assignments with the remaining stock. */
data meal_assignments;
    /* Never executed; only adds the meals_stock variables to the PDV at compile time */
    if 0 then set meals_stock;

    declare hash stock (dataset: 'meals_stock');
    stock.defineKey('FoodName');
    /* FoodName is also a data item so that find() fills it in for the later replace() */
    stock.defineData('FoodName', 'Units_available');
    stock.defineDone();

    do until (lastrow_flag);
        /* Per-ID state: has a meal been assigned, and was any preference on the menu at all */
        assigned = 0;
        stocked = 0;

        do until (last.ID);
            set ppl_meal_pref end=lastrow_flag;
            by ID Meal_rank; /* the step raises an error if rows are not in ID, Meal_rank order */

            /* Rows after the assignment are read but otherwise ignored */
            if not assigned then do;
                /* A failed find means the meal is off the menu */
                if stock.find(key: Meal) = 0 then do;
                    stocked = 1;
                    /* Skip when out of stock or when the count is missing */
                    if not (Units_available < 1) then do;
                        Units_available + (-1);
                        if stock.replace() = 0 then do;
                            /* The hash accepted the new stock level */
                            assigned = 1;
                            output;
                        end;
                        else put 'WARNING: Problem with stock hash ' Meal=;
                    end;
                end;
            end;
        end;

        /* Nothing assigned for this ID: write a placeholder row explaining why */
        if not assigned then do;
            if stocked then Meal = 'Ran out';
            else Meal = 'Not stocked';
            output;
        end;
    end;

    keep ID Meal;
    stock.output(dataset: 'meals_stock_after_assignments');
    stop;
run;
/* Report: the assignments, the stock left afterwards, and units used per meal (before minus after) */
options nocenter;
title "Meals report";

proc print data=meal_assignments noobs;
    title2 "Assignments";
run;

proc print data=meals_stock_after_assignments noobs;
    title2 "New stock levels";
run;

proc sql;
    title2 "Usage summary";
    select
        A.Meal,
        A.have_count,
        B.had_count,
        B.had_count - A.have_count as use_count
    from
        (select FoodName as Meal, Units_available as have_count
            from meals_stock_after_assignments) as A
    inner join
        (select FoodName as Meal, Units_available as had_count
            from meals_stock) as B
        on A.Meal = B.Meal
    ;
quit;
这里的'want'是基于队列的:
- 先到先得,优先排序解决方案。
- 基于 ID 的随机队列顺序可以提供少量感知 'fairness'
更困难的解决方案将基于全局规划,例如:
- 服务最多的人,偏好等级最高
- 服务最多的人,成本最低
- 等...