使用 PRXMATCH 匹配来自另一个 sas 数据集的字符串

Using PRXMATCH to match strings from another sas dataset

需要您的帮助和指导。请看下面

/* Step 1: copy the distinct nm values from Teradata into mylib.DWH_table. */
*rsubmit;
proc sql;
    connect to teradata (user=&user_id. password=&user_pwd.);
    create table mylib.DWH_table as
        select *
        from connection to teradata
            ( select distinct nm from DWH_table );
quit;
*endrsubmit;

/* Step 2: keep only the rows whose nm contains "studio", ignoring case. */
*rsubmit;
data mylib.out_sas1;
    set mylib.DWH_table;
    /* prxmatch returns the match position, or 0 when there is no match */
    if not (prxmatch("m/studio/i", nm) > 0) then delete;
run;
*endrsubmit;

上面的代码检查列 nm 中是否包含单词 "studio" 并返回匹配的结果。但这是一个手动过程,需要实现自动化。我还有另一个数据集,其中只包含一个名为 "KEYWORDS" 的列。下面给出一些样本数据:

KEYWORDS:
apple
mango
banana
grapes

目标是 SAS 应获取列中的单词并将其与数据库中的值进行比较并创建单独的输出 table。 例如:

/* Hand-written filter for the keyword "apple" (case-insensitive). */
*rsubmit;
data mylib.out_sas2;
    set mylib.DWH_table;
    if not (prxmatch("m/apple/i", nm) > 0) then delete;
run;
*endrsubmit;

/* Same filter again, this time for the keyword "mango". */
*rsubmit;
data mylib.out_sas3;
    set mylib.DWH_table;
    if not (prxmatch("m/mango/i", nm) > 0) then delete;
run;
*endrsubmit;

这可以在 SAS 中完成吗?

考虑使用 CALL EXECUTE 通过数据步骤调用宏:

/*
 * subset_data: copy the rows of mylib.DWH_table whose nm matches one keyword
 * (case-insensitive regex) into mylib.out_<keyword>.
 *
 *   key - the keyword as a double-quoted SAS string literal, e.g. "apple".
 *
 * The keyword is used verbatim both as a regular expression and as part of
 * the output data set name, so it has to be a valid SAS name fragment.
 */
%macro subset_data(key);
    %local name_unquoted;

    /* %sysfunc instead of %qsysfunc: a macro-quoted value glued onto the
       out_ prefix can be tokenized as two separate names. %let also drops
       any blank padding that came in with the keyword. */
    %let name_unquoted = %sysfunc(compress(&key., %str(%")));

    data mylib.out_&name_unquoted.;
        set mylib.DWH_table;
        /* strip removes leading as well as trailing blanks from the keyword */
        if prxmatch("m/" || strip(&key.) || "/i", nm) > 0;
    run;
%mend;

/* Queue one %subset_data call per row of mydata. %nrstr keeps the macro
   from executing while this step is still running. */
data _null_;
    set mydata;
    macro_call = '%nrstr(%subset_data("' || KEYWORDS || '"))';
    call execute(macro_call);
run;

或者,不使用 call execute,而是生成一个包含宏调用的 SAS 脚本文件,然后通过 %include 运行它:

/* Write one %subset_data call per keyword into Temp.sas, then run that
   generated program. */
filename gencode "Temp.sas";

data _null_;
    set mydata;
    file gencode;
    put '%subset_data("' KEYWORDS '") ;';
run;

%include gencode;

但是如果关键字很多(即几十、几百甚至几千个),请考虑 @Richard 上面的评论:借助一个辅助的临时数据集 temp,在一个拼接而成的数据集中建立一个指示列(记录匹配到的关键字):

/*
 * subset_data: append the rows of mylib.DWH_table whose nm matches one keyword
 * (case-insensitive regex) to mylib.subset_data, tagged with that keyword.
 *
 *   key - the keyword as a double-quoted SAS string literal, e.g. "apple".
 *
 * mylib.subset_data accumulates across calls; delete it before a fresh run.
 */
%macro subset_data(key);
    /* Build temp: matching rows plus the keyword indicator column. */
    data temp;
        set mylib.DWH_table;
        /* Fixed length so every call yields the same column definition;
           otherwise the first keyword would decide the length and longer
           keywords would be truncated. */
        length keyword $ 200;
        if prxmatch("m/" || trim(&key.) || "/i", nm) > 0;
        keyword = &key.;
    run;

    /* PROC APPEND creates the base table on the first call, where
       "set mylib.subset_data temp" would fail because the table does not
       exist yet. It also avoids re-reading the accumulated rows each time. */
    proc append base=mylib.subset_data data=temp force;
    run;
%mend;

可重现示例(使用sashelp.class数据集)

/* Reproducible example on sashelp.class: one output data set per keyword. */
proc contents data=sashelp.class;
run;

/* Keywords to search for in the Name column. */
data keywords;
    input id keyword $;
    datalines;
1     w
2     u
3     y
;

/* Create a data set named after the keyword, holding the sashelp.class rows
   whose Name matches it (case-insensitive).
   key - the keyword as a double-quoted string literal. */
%macro subset_data(key);
    %let name_unquoted = %qsysfunc(compress(&key.,%str(%")));

    data &name_unquoted.;
        set sashelp.class;
        if not (prxmatch("m/" || trim(&key.) || "/i", Name) > 0) then delete;
    run;
%mend;

/* Queue one macro call per keyword row. */
data _null_;
    set keywords;
    macro_call = '%nrstr(%subset_data("' || keyword || '"))';
    call execute(macro_call);
run;

PROC SQL 版本

/*
 * subset_data (PROC SQL version): create a table named after the keyword that
 * holds the rows of mylib.DWH_table whose nm contains the keyword.
 *
 *   key - the keyword as a double-quoted SAS string literal, e.g. "apple".
 */
%macro subset_data(key);
    %let name_unquoted = %qsysfunc(compress(&key., %str(%")));

    proc sql;
        create table &name_unquoted. as
        select *
        from mylib.DWH_table
        /* FIND with 'i' (ignore case) and 't' (trim trailing blanks) matches
           the case-insensitive regex used by the DATA step versions. Unlike
           LIKE, it does not treat % or _ inside a keyword as wildcards. */
        where find(nm, &key., 'it') > 0;
    quit;
%mend;

proc sql (使用 SAS## 数据集)

/* Add a generated output table name (sas1, sas2, ...) to every keyword row. */
data keywords;
    set keywords;
    /* CATS strips blanks. The original cat("", ...) kept a leading blank,
       because "" is a one-blank string in SAS. */
    dname = cats("sas", _n_);
run;

/*
 * subset_data (PROC SQL, explicit table name): create table <dname> holding
 * the rows of mylib.DWH_table whose nm contains the keyword.
 *
 *   key   - the keyword as a double-quoted SAS string literal, e.g. "apple".
 *   dname - the output table name as a double-quoted string, e.g. "sas1".
 */
%macro subset_data(key, dname);
    %let name_unquoted = %qsysfunc(compress(&dname.,%str(%")));

    proc sql;
        create table &name_unquoted. as
        select *
        from mylib.DWH_table
        /* FIND with 'i' (ignore case) and 't' (trim trailing blanks) keeps the
           match case-insensitive and does not treat % or _ inside a keyword
           as wildcards, as LIKE would. */
        where find(nm, &key., 'it') > 0;
    quit;
%mend;

/* Queue one %subset_data(key, dname) call per keyword row. %nrstr defers
   macro execution until this step has finished. */
data _null_;
    set keywords;
    macro_call = '%nrstr(%subset_data("' || keyword || '", "' || dname || '"))';
    call execute(macro_call);
run;

将您的关键字放在宏变量中

/* Load the distinct keywords into macro variables key_1 ... key_N and their
   count into no_keys. NOPRINT: the queries only feed macro variables. */
proc sql noprint;
    /* TRIMMED removes the leading blanks that INTO otherwise stores with a
       numeric result; with them, key_&no_keys would split into two tokens. */
    select count(distinct KEYWORDS)
    into :no_keys trimmed
    from mylib.MY_KEYWORDS
    where KEYWORDS is not missing;

    /* Blank keywords are excluded here as well, so the number of rows equals
       no_keys and a blank value cannot push the last keyword off the list. */
    select distinct KEYWORDS
    into :key_1 - :key_&no_keys
    from mylib.MY_KEYWORDS
    where KEYWORDS is not missing;
quit;

现在使用那些宏变量

/*
 * find_keywords: read mylib.DWH_table once and route every row to
 * mylib.out_sas<i> for each keyword i whose text matches nm (case-insensitive).
 * A row that matches several keywords is written to several tables.
 *
 * Reads the macro variables no_keys and key_1 ... key_<no_keys>.
 * Keywords are inserted verbatim into a regex and a string literal, so they
 * must not contain "/" or double quotes.
 */
%macro find_keywords;
    %local key_nr;

    data
        %do key_nr = 1 %to &no_keys;
            mylib.out_sas&key_nr
        %end;
    ;
        set mylib.DWH_table;
        /* Fixed length: otherwise the first keyword assigned decides the
           length and longer keywords are truncated. */
        length keyword $ 200;
        %do key_nr = 1 %to &no_keys;
            /* The /i flag already ignores case, so nm is matched directly. */
            if prxmatch("m/&&key_&key_nr/i", nm) > 0 then do;
                keyword = "&&key_&key_nr";
                /* OUTPUT must name the table exactly as it appears on the
                   DATA statement, including the libref. */
                output mylib.out_sas&key_nr;
            end;
        %end;
    run;
%mend;
%find_keywords;

您需要将其嵌入到宏中,因为您不能在“打开”代码中使用 %do ... %end;&& 解析为 &,这使其成为延迟的 &,在解析 &key_nr 之后解析。

免责声明:此代码未经测试。如果您无法让它运行,请回复。

一个想法是按"是否匹配"的条件执行交叉连接。结果表中每个匹配的"名称-名词"组合占一行。

示例数据和代码:

/* Sample names, one per line; a value may contain blanks. */
data names;
    /* The original LENGTH statement had no length. $50 is an assumed width
       that comfortably holds the longest sample value. */
    length name $ 50;
    infile cards length=L;
    /* $varying reads L characters, so embedded blanks stay in the value */
    input name $varying. L;
datalines;
Bob
Bob's Burgers
Angel
Angle iron city
Chad
Chadwicks town council
Dutch
Edward
;
run;

/* Sample nouns (the keywords) to look for inside the names. */
data nouns;
    /* The original LENGTH statement had no length. $50 is an assumed width
       that comfortably holds the sample values. */
    length noun $ 50;
    infile cards length=L;
    input noun $varying. L;
datalines;
chad
own
ward
burger
;
run;

/*
* might want to pre-lowercase the data being matched up
data lower_names;
  set names;
  lower_name = lowcase(name);
run;
data lower_nouns;
  set nouns;
  lower_noun = lowcase(noun);
run;
*/

/* Pair every name with every noun and keep the pairs in which the noun
   occurs somewhere inside the name, ignoring case. */
proc sql;
    create table want as
        select name, noun
        from names as NAME, nouns as NOUN
        /* INDEX returns the 1-based position of the noun, or 0 if absent */
        where index(lowcase(NAME), lowcase(trim(NOUN))) > 0
    ;
quit;

无论您采用何种方法,计算量都会很大。假设要针对所有名称检查 100 个名词,即 26M 个名称 x 100 个名词 = 2.6B 次匹配评估。性能最强、可用资源最充足的系统通常会为您提供最快的答案。

案例一:SAS安装比较好

  • 将名称下载到 SAS
  • 将名称交叉连接到 SAS 中的名词

案例二:Teradata安装比较好

  • 将名词上传到 Teradata
  • 将名称交叉连接到 Teradata 中的名词(通过直通 SQL)

案例一代码:

/* Case 1: download the names from Teradata, then cross join them with the
   nouns inside SAS. */
proc sql;
    /* CONNECT TO needs the DBMS name; the original statement omitted it. */
    connect to teradata (user=&user_id. password=&user_pwd.);

    /* download names */
    create table mylib.DWH_names as
    select * from connection to teradata (
        select distinct nm from DWH_table
    );

    /* The remaining query runs on SAS tables only. */
    disconnect from teradata;

    /* One row per (name, noun) pair where the noun occurs in the name,
       ignoring case. */
    create table work.NameNounMatches as
    select
        NAMES.nm,
        NOUNS.noun
    from
        mylib.dwh_names as NAMES
    cross join
        mylib.nouns as NOUNS
    where
        index(lowcase(NAMES.nm), lowcase(trim(NOUNS.noun))) >= 1
    ;
quit;

案例2代码:

Teradata temp table -- 从 Tom https://communities.sas.com/t5/SAS-Enterprise-Guide/SAS-Access-to-Teradata-How-to-create-Temporary-tables-in/td-p/228852

上传 (connection=global)
/* Teradata libref on a shared (global) connection with DBMSTEMP=YES, so the
   uploaded table is a temporary table on the Teradata side. */
libname tdwork teradata
    username=&username
    password=&password
    server=&server
    connection=global
    dbmstemp=yes
;

/* Upload the nouns through that libref. */
data tdwork.NOUNS_UPLOADED;
    set mylib.nouns;
run;

/*
 * Case 2: cross join in Teradata via pass-through, reusing the tdwork
 * connection so the uploaded NOUNS_UPLOADED table is visible to the query.
 *
 * Changes from the original query:
 *   - the select list used aliases (Cust, IP) that are not in the FROM clause;
 *     it now returns the name and noun columns of the joined tables;
 *   - Teradata POSITION takes "substring IN string", not two comma-separated
 *     arguments, and the noun is the substring searched for inside nm;
 *   - TRIM removes blank padding from the uploaded noun and LOWER makes the
 *     match case-insensitive, mirroring the SAS-side query;
 *   - the source table is named DWH_table, as in the other pass-through
 *     queries of this example (confirm against your database).
 */
proc sql;
    connect using tdwork;
    create table work.NameNounMatches as
    select * from connection to tdwork
    ( select NAMES_LIST.nm, NOUNS_LIST.noun
      from DWH_table as NAMES_LIST
      cross join NOUNS_UPLOADED as NOUNS_LIST
      where POSITION(LOWER(TRIM(NOUNS_LIST.noun)) IN LOWER(NAMES_LIST.nm)) >= 1
    );
quit;