如何在 SAS 中用数据步(data step)替换连接(join)
How to replace a join with a datastep in SAS
我正在尝试根据客户端 ID 和时间范围计算给定变量的一些统计量。我当前的解决方案如下所示;不过,我想知道能否把代码改写为数据步(data step)而不是 SQL 连接,因为在我的真实数据集上,这个连接需要很长时间才能执行完。
* Test data for client 1 - ten consecutive daily rows, 01JAN2021 through 10JAN2021;
data have1(drop=t);
    id = 1;
    * Declaring the format here keeps dt as the second variable of the output;
    format dt ddmmyyp10.;
    do t = 1 to 10;
        * Offset from 31DEC2020 by the loop index, one day per iteration;
        dt = '31dec2020'd + t;
        * Unseeded uniform random value, so var differs between runs;
        var = rand('uniform');
        output;
    end;
run;
* Test data for client 2 - ten consecutive daily rows, 01JAN2021 through 10JAN2021;
data have2(drop=t);
    id = 2;
    * Declaring the format here keeps dt as the second variable of the output;
    format dt ddmmyyp10.;
    do t = 1 to 10;
        * Offset from 31DEC2020 by the loop index, one day per iteration;
        dt = '31dec2020'd + t;
        * Unseeded uniform random value, so var differs between runs;
        var = rand('uniform');
        output;
    end;
run;
* Stack both client datasets - all have1 rows first, then all have2 rows;
data have_fin;
    set have1
        have2;
run;
* Trailing 3-day mean of var per client, computed with a self-join on have_fin;
* For each row (cur) the join collects the rows (win) of the same id whose date
  lies after cur.dt minus 3 days and on or before cur.dt, then averages win.var;
proc sql;
    create table want1 as
    select cur.id,
           cur.dt,
           cur.var,
           avg(win.var) as mean_var_3d
    from have_fin as cur
         left join have_fin as win
           on  win.id = cur.id
           and win.dt > intnx('day', cur.dt, -3, 'S')
           and win.dt <= cur.dt
    group by cur.id, cur.dt, cur.var;
quit;
* Trailing 6-day mean of var per client, computed with a self-join on have_fin;
* For each row (a) the join collects the rows (b) of the same id whose date
  lies after a.dt minus 6 days and on or before a.dt, then averages b.var;
* The result column is named mean_var_6d to match the 6-day window
  (it was mislabelled mean_var_3d, copied from the 3-day query);
proc sql;
    create table want2 as
    select a.id, a.dt, a.var, mean(b.var) as mean_var_6d
    from have_fin as a
    left join have_fin as b
        on a.id = b.id and intnx('day', a.dt, -6, 'S') < b.dt <= a.dt
    group by 1, 2, 3;
quit;
改用临时数组(temporary arrays)和单个数据步(data step)。
这一步完成同样的事情。
- 对数据进行排序以确保顺序正确
- 为要计算的每个移动平均窗口声明一个临时数组。
- 确保数组在每个 ID 的开头为空
- 在正确的索引中为数组赋值。 MOD() 允许您动态索引数据,而无需包含单独的计数器变量。
- 取数组的平均值。每个 ID 的前两条记录只有 1 或 2 个数据点;如果希望忽略它们,也可以按条件计算平均值。
* Step 1 - order have_fin by client and date so the BY-group step below sees each
  client as one contiguous, chronologically ordered run of rows;
proc sort data=have_fin;
    by id
       dt;
run;
* Rolling means of var over the last 3 and last 6 rows of each id, in one pass;
* The windows count rows, not calendar days - they match a date-based window
  only when each id has exactly one row per day;
data want;
    set have_fin;
    by id;

    * Step 2 - one circular buffer per window length, temporary arrays keep
      their values across iterations and add no variables to the output;
    array last3{0:2} _temporary_;
    array last6{0:5} _temporary_;

    * Step 3 - empty both buffers when a new id starts;
    if first.id then do;
        call missing(of last3{*});
        call missing(of last6{*});
    end;

    * Step 4 - mod of the iteration counter cycles through the slots, so each
      new value overwrites the oldest one in its buffer;
    last3{mod(_n_, 3)} = var;
    last6{mod(_n_, 6)} = var;

    * Step 5 - mean skips missing slots, so the first rows of an id average
      over however many values are present so far;
    mean3d = mean(of last3{*});
    mean6d = mean(of last6{*});
run;
如果您有 SAS/ETS 许可,这就非常简单了。
* Writes product information to the log - use it to check whether SAS/ETS is licensed;
proc product_status;
run;
* Order have_fin by client and date, as required by the BY and ID statements
  of the PROC EXPAND step that follows;
proc sort data=have_fin;
    by id
       dt;
run;
* Moving averages of var per client, written to want_expand;
* by id restarts the series for each client and id dt names the time variable;
* method=none asks for no interpolation, so only the transformout operation is applied;
proc expand data=have_fin
            out=want_expand;
    by id;
    id dt;
    convert var = mean_3d / method=none transformout=(movave 3);
    convert var = mean_6d / method=none transformout=(movave 6);
run;
我正在尝试根据客户端 ID 和时间范围计算给定变量的一些统计量。我当前的解决方案如下所示;不过,我想知道能否把代码改写为数据步(data step)而不是 SQL 连接,因为在我的真实数据集上,这个连接需要很长时间才能执行完。
* Test data for client 1 - ten consecutive daily rows, 01JAN2021 through 10JAN2021;
data have1(drop=t);
    id = 1;
    * Declaring the format here keeps dt as the second variable of the output;
    format dt ddmmyyp10.;
    do t = 1 to 10;
        * Offset from 31DEC2020 by the loop index, one day per iteration;
        dt = '31dec2020'd + t;
        * Unseeded uniform random value, so var differs between runs;
        var = rand('uniform');
        output;
    end;
run;
* Test data for client 2 - ten consecutive daily rows, 01JAN2021 through 10JAN2021;
data have2(drop=t);
    id = 2;
    * Declaring the format here keeps dt as the second variable of the output;
    format dt ddmmyyp10.;
    do t = 1 to 10;
        * Offset from 31DEC2020 by the loop index, one day per iteration;
        dt = '31dec2020'd + t;
        * Unseeded uniform random value, so var differs between runs;
        var = rand('uniform');
        output;
    end;
run;
* Stack both client datasets - all have1 rows first, then all have2 rows;
data have_fin;
    set have1
        have2;
run;
* Trailing 3-day mean of var per client, computed with a self-join on have_fin;
* For each row (cur) the join collects the rows (win) of the same id whose date
  lies after cur.dt minus 3 days and on or before cur.dt, then averages win.var;
proc sql;
    create table want1 as
    select cur.id,
           cur.dt,
           cur.var,
           avg(win.var) as mean_var_3d
    from have_fin as cur
         left join have_fin as win
           on  win.id = cur.id
           and win.dt > intnx('day', cur.dt, -3, 'S')
           and win.dt <= cur.dt
    group by cur.id, cur.dt, cur.var;
quit;
* Trailing 6-day mean of var per client, computed with a self-join on have_fin;
* For each row (a) the join collects the rows (b) of the same id whose date
  lies after a.dt minus 6 days and on or before a.dt, then averages b.var;
* The result column is named mean_var_6d to match the 6-day window
  (it was mislabelled mean_var_3d, copied from the 3-day query);
proc sql;
    create table want2 as
    select a.id, a.dt, a.var, mean(b.var) as mean_var_6d
    from have_fin as a
    left join have_fin as b
        on a.id = b.id and intnx('day', a.dt, -6, 'S') < b.dt <= a.dt
    group by 1, 2, 3;
quit;
改用临时数组(temporary arrays)和单个数据步(data step)。
这一步完成同样的事情。
- 对数据进行排序以确保顺序正确
- 为要计算的每个移动平均窗口声明一个临时数组。
- 确保数组在每个 ID 的开头为空
- 在正确的索引中为数组赋值。 MOD() 允许您动态索引数据,而无需包含单独的计数器变量。
- 取数组的平均值。每个 ID 的前两条记录只有 1 或 2 个数据点;如果希望忽略它们,也可以按条件计算平均值。
* Step 1 - order have_fin by client and date so the BY-group step below sees each
  client as one contiguous, chronologically ordered run of rows;
proc sort data=have_fin;
    by id
       dt;
run;
* Rolling means of var over the last 3 and last 6 rows of each id, in one pass;
* The windows count rows, not calendar days - they match a date-based window
  only when each id has exactly one row per day;
data want;
    set have_fin;
    by id;

    * Step 2 - one circular buffer per window length, temporary arrays keep
      their values across iterations and add no variables to the output;
    array last3{0:2} _temporary_;
    array last6{0:5} _temporary_;

    * Step 3 - empty both buffers when a new id starts;
    if first.id then do;
        call missing(of last3{*});
        call missing(of last6{*});
    end;

    * Step 4 - mod of the iteration counter cycles through the slots, so each
      new value overwrites the oldest one in its buffer;
    last3{mod(_n_, 3)} = var;
    last6{mod(_n_, 6)} = var;

    * Step 5 - mean skips missing slots, so the first rows of an id average
      over however many values are present so far;
    mean3d = mean(of last3{*});
    mean6d = mean(of last6{*});
run;
如果您有 SAS/ETS 许可,这就非常简单了。
* Writes product information to the log - use it to check whether SAS/ETS is licensed;
proc product_status;
run;
* Order have_fin by client and date, as required by the BY and ID statements
  of the PROC EXPAND step that follows;
proc sort data=have_fin;
    by id
       dt;
run;
* Moving averages of var per client, written to want_expand;
* by id restarts the series for each client and id dt names the time variable;
* method=none asks for no interpolation, so only the transformout operation is applied;
proc expand data=have_fin
            out=want_expand;
    by id;
    id dt;
    convert var = mean_3d / method=none transformout=(movave 3);
    convert var = mean_6d / method=none transformout=(movave 6);
run;