选择所有缺失值的字符变量
我有一个包含大约3000个变量的SAS数据集,想删除其中所有取值全部缺失的字符变量。数字变量我知道该怎么处理——我想问的是字符变量。我需要用基础SAS(Base SAS)来完成这项工作,不过可以使用proc SQL,这也是我给这个问题加上“SQL”标签的原因。
谢谢!选择所有缺失值的字符变量
编辑:
背景信息:这是一个很“长”(行数很多)的数据集,包含来自7轮访谈的调查数据。部分(但不是全部)调查项目(变量)在各轮中重复出现。我想得到每一轮实际使用的项目列表,做法是:提取该轮的所有记录,删除所有只含SAS默认缺失值的列,然后运行proc contents。
回答:
我创建了一个宏来检查空的字符列,它要么把这些列从原始数据集中删除,要么创建一个已去除空列的新数据集。它有两个可选参数:数据集的名称(默认是最近创建的数据集),以及用于命名新副本的后缀(将后缀设为空即可直接修改原始数据集)。
它使用带levels选项的proc freq和一个自定义格式来找出空的字符列,然后用proc sql生成要删除的列的列表,并将其存入宏变量。
下面是宏:
%macro delemptycol(ds=_last_, suffix=_noempty);
  /*
   * DELEMPTYCOL: drop character columns that are blank in every row.
   *
   * Parameters:
   *   ds     - input data set. _last_ (the default, any letter case) is
   *            replaced by the value of SYSLAST.
   *   suffix - text appended to the input name to form the output name.
   *            Pass an empty suffix to overwrite the input data set.
   *
   * Side effects: defines the $charmiss format, uses WORK.NLEV as a
   * scratch data set (any existing WORK.NLEV is deleted first), and
   * writes DELEMPTYCOL messages to the log. The NOTES option is switched
   * off while the macro runs and restored to its previous setting on exit.
   * When no empty character column is found, no output data set is
   * written.
   */
  %local emptycols notesopt;

  /* remember the caller's NOTES setting so it can be restored at the end */
  %let notesopt = %sysfunc(getoption(notes));
  options nonotes;

  /* collapse every non-blank character value to a single level '1' */
  proc format;
    value $charmiss
      ' '   = ' '
      other = '1';
  run;

  %if "%upcase(&ds.)" = "_LAST_" %then %let ds = &syslast.;

  /* fail early with a clear message instead of a cascade of step errors */
  %if %sysfunc(exist(&ds.)) = 0 and %sysfunc(exist(&ds., view)) = 0 %then %do;
    %put ERROR: DELEMPTYCOL: data set &ds. does not exist.;
    %goto finish;
  %end;

  /* remove a stale NLEV so a failed PROC FREQ cannot reuse old results */
  proc datasets library=work nolist nowarn;
    delete nlev;
  quit;

  ods select nlevels;
  ods output nlevels=nlev;
  proc freq data=&ds.(keep=_character_) nlevels;
    format _character_ $charmiss.;
  run;
  ods output close;

  /* no NLEV means PROC FREQ had nothing to analyse (no character columns) */
  %if %sysfunc(exist(work.nlev)) = 0 %then %do;
    %put DELEMPTYCOL: No empty character columns were found in data set &ds.;
    %goto finish;
  %end;

  /*
   * The NLevels table only carries NNonMissLevels when at least one
   * variable has a missing level. Declaring it here guarantees the column
   * exists (as missing) so the query below does not raise an error.
   */
  data nlev;
    length NNonMissLevels 8;
    set nlev;
  run;

  /* create macro var with list of cols to remove */
  proc sql noprint;
    select tablevar into :emptycols separated by ' '
      from nlev
      where NNonMissLevels = 0;
  quit;

  /* %length is safe even when a column is named like an operator (GE, OR) */
  %if %length(&emptycols.) = 0 %then %do;
    %put DELEMPTYCOL: No empty character columns were found in data set &ds.;
  %end;
  %else %do;
    %put DELEMPTYCOL: The following empty character columns were found in data set &ds. : &emptycols.;
    %put DELEMPTYCOL: Data set &ds.&suffix created with empty columns removed;
    data &ds.&suffix.;
      set &ds(drop=&emptycols);
    run;
  %end;

  %finish:
  options &notesopt.;
%mend delemptycol;
例子用法:
/* Demo data: 100 rows, five character columns. Only the first four   */
/* columns can ever receive a value, so char5 stays blank in all rows. */
data chardata(drop=k draw);
  length char1-char5 $8.;
  array slots(5) char1-char5;
  do i = 1 to 100;
    /* start every row with all five columns blank */
    call missing(of char:);
    draw = floor(10 * ranuni(i));
    /* column k is filled when the draw lies in the interval (k, k+2] */
    do k = 1 to 4;
      if k < draw <= (k + 2) then slots(k) = "FOO";
    end;
    output;
  end;
run;

/* defaults: most recently created data set, output suffix "_noempty" */
%delemptycol();

/* empty suffix: the empty columns are removed from chardata itself */
%delemptycol(ds=chardata, suffix=);
回答:
可能还有更简单的方法,但这是我想到的做法。
干杯 罗布
编辑:注意,此方法对字符变量和数字变量都适用。
/* Test data set: three one-character columns, col2 is blank in every row */
data x;
  length col1-col3 $1;
  col2 = "";
  col1 = "a"; col3 = "c"; output;
  col1 = "";              output;
  col1 = "a"; col3 = "";  output;
run;

/* Collect the column names of WORK.X into the macro variable VARLIST */
proc sql noprint;
  select name
    into :varlist separated by " "
    from dictionary.columns
    where upcase(memname) eq "X"
      and upcase(libname) eq "WORK";
quit;
%put &varlist;
/*
 * FIND_UNUSED_COLS(iDs=)
 *
 * Generates a data step that reads &iDs and writes WORK.VARS_TO_DELETE
 * with a single observation. For every name in the macro variable VARLIST
 * (blank-separated, resolved from the calling scope) the step creates a
 * column with the same name prefixed by "delete_". That column ends up 0
 * when the source column has at least one non-missing value and 1 when
 * every value is missing. The source columns themselves are dropped.
 *
 * Parameters:
 *   iDs - input data set to scan.
 *
 * Notes:
 *   - missing() accepts both character and numeric arguments, so both
 *     column types are handled.
 *   - The "delete_" prefix adds 7 characters to each name.
 *     NOTE(review): names longer than 25 characters would exceed the
 *     32-character name limit - confirm that the inputs stay below it.
 */
%macro find_unused_cols(iDs=);
  /* keep the loop variables local so caller-side macro variables with */
  /* the same names are not overwritten                                */
  %local cnt varname;

  %let cnt = 1;
  /* VARLIST is blank-separated, so split on blanks only */
  %let varname = %scan(&varlist, &cnt, %str( ));

  data vars_to_delete;
    set &iDs end=eof;

    %do %while ("&varname" ne "");
      /* min() ignores the initial missing value of the retained flag, */
      /* so the flag drops to 0 at the first non-missing source value  */
      retain delete_&varname;
      delete_&varname = min(delete_&varname, missing(&varname));
      drop &varname;

      %let cnt = %eval(&cnt + 1);
      %let varname = %scan(&varlist, &cnt, %str( ));
    %end;

    /* only the last observation carries the final flag values */
    if eof then do;
      output;
    end;
  run;
%mend find_unused_cols;
%find_unused_cols(iDs=x);

/* Turn the single row of delete_ flags into one row per flag:        */
/* _NAME_ holds the flag column name, COL1 holds its 0/1 value.       */
proc transpose data=vars_to_delete out=vars_to_delete;
run;

/* Store the names of the flagged columns in the macro variable       */
/* VARS_TO_DELETE. Position 8 is the first character after "delete_". */
proc sql noprint;
  select substr(_name_, 8)
    into :vars_to_delete separated by " "
    from vars_to_delete
    where col1;
quit;
%put &vars_to_delete;

/* Copy x into new_x without the flagged columns */
data new_x;
  set x(drop=&vars_to_delete);
run;
回答:
Rob和cmjohns,非常感谢你们的帮助。我在周末参考了你们的解决方案和思路,下面是我最终写出来的宏:
%macro removeEmptyCols(origDset, outDset);
  /*
   * REMOVEEMPTYCOLS: copy origDset to outDset, keeping only the columns
   * that have at least one non-missing value.
   *
   * Parameters:
   *   origDset - input data set.
   *   outDset  - output data set.
   *
   * Side effects: creates the scratch data set TRANSPDSET and writes
   * an ERROR or WARNING line to the log when nothing can be produced.
   *
   * Caveat: after the transpose every value is character, and any value
   * that is exactly "." is treated as missing. This also applies to
   * columns that were character in the input.
   */
  %local dsid origN rc varsToKeep;

  /* get the number of obs in the original dset */
  %let dsid = %sysfunc(open(&origDset));
  %if &dsid = 0 %then %do;
    %put ERROR: removeEmptyCols could not open &origDset;
    %return;
  %end;
  %let origN = %sysfunc(attrn(&dsid, nlobs));
  %let rc = %sysfunc(close(&dsid));

  proc transpose data=&origDset out=transpDset;
    var _all_;
  run;

  data transpDset;
    set transpDset;

    /* proc transpose converted all old vars to character, so the "." */
    /* from old numeric vars no longer counts as a missing value      */
    array oldVar_ _character_;
    do over oldVar_;
      if strip(oldVar_) = "." then oldVar_ = "";
    end;

    /* each row from the old dset is now a column whose name starts */
    /* with "col"                                                   */
    numMiss = cmiss(of col:);
    numCols = &origN;
  run;

  /* keep a variable when fewer values are missing than there are rows */
  proc sql noprint;
    select _NAME_ into :varsToKeep separated by ' '
      from transpDset
      where numMiss < numCols;
  quit;

  /* an empty keep= list is a syntax error, so stop with a clear message */
  %if %length(&varsToKeep) = 0 %then %do;
    %put WARNING: removeEmptyCols found no populated column in &origDset - &outDset was not created;
    %return;
  %end;

  data &outDset;
    set &origDset (keep = &varsToKeep);
  run;
%mend removeEmptyCols;
我会尝试所有3种方式,并报告哪一个是最快的...
附:添加2010年12月23日作为未来参考:SGF Paper 048-2010: Dropping Automatically Variables with Only Missing Values
以上是 选择所有缺失值的字符变量 的全部内容, 来源链接: utcz.com/qa/260630.html