SAS Macros for Merging GID Map and Data Files.

A SAS program and macros code printed below may be used to make a wide data file. The program has two sections: a) user input section and b) execution section. The program requires input to three macro variables and the user must provide valid values to these macro variables in the user input section (see SAS program below): Input the value to the right of "=" sign on a macro definition row and leave ";" to mark the end of the definition. Use SAS help on how to create a macro variable.

The following macro variables are used in the program:

At the end of the run, users must check SAS log for errors and take appropriate action.

From this process, two output files --"long" and "wide"-- named as "map filename" + "_long" and "map filename" + "_wide" are copied to the input data files folder. The number of variables in the "long" file = mv + (n-2)*g, where mv = total variable in the map file, n = total number of variables in the data file and g = total number of generations.

The maximum number of columns in the "wide" file can be calculated by the formula:

where g = total number of generations, and n = total number of variables from the data file. Therefore, it's recommended that users standardize data columns and drop unwanted variables from the data file before using the wide utility (SAS program).
The output (wide) file is the intersection of individuals from map and data files, therefore, the wide file can have fewer columns than calculated by the above formula. Researchers can also subset rows from the map file as to fit the type individuals, parents and grandparents they want to include into the final analysis. Such sub setting can reduce the number of columns and rows from the wide file. It's recommended to avoid carrying extra variables in the wide file.

The output (wide) file is the intersection of individuals from map and data files, therefore, the wide file can have fewer columns than calculated by the above formula.

Researchers can also subset rows from the map file as to fit the type individuals, parents and grandparents they want to include into the final analysis. Such sub setting can reduce the number of columns and rows from the wide file. It's recommended to avoid carrying extra variables in the wide file.

SAS Program:

--- copy SAS code below this line to the end ---

/**	===USER INPUT MACRO VARIABLES SECTION BEGINS===	**/
/**	---Enter values for macro variables below---	**/

%let path= ; 		/*	Data files folder */
%let mapfile= ;		/*	Map filename 	  */
%let datafile= ;	/*	Data filename 	  */

/**	===USER INPUT MACRO VARIABLES SECTION ENDS=== 	**/

/**	===DO NOT EDIT BELOW THIS LINE=== 	**/
/**	===     EXECUTION SECTION     === 	**/

LIBNAME TMP "&path" ;
* options mprint ;

**	copy files to work library ** ;
DATA &datafile ;
	SET TMP.&datafile ;
RUN ;

**	check for ID68 and PN as first and second variables ** ;
%let ikey = 0 ;
proc sql noprint ;
select trim(varname) into :ikey separated by '-'
from (
select varnum, upcase(name) as varname
from dictionary.columns
where libname = 'WORK' and upcase(memname) = %upcase("&datafile")
and varnum <= 2
) order by varnum ;
quit ;
%put &ikey ;
%macro ikey ;
	%if "&ikey" ne "ID68-PN" %then %do ;
		%PUT ERROR: ID68 and PN are not the first and second variables ;
		%PUT ERROR- on the input data set (%UPCASE(&datafile)). ;
	%end ;
%mend ikey;
%ikey ;

**	check for duplicate key values ** ;
%let idup = 0 ;
proc sql noprint ;
select trim(left(put(count(*), best.))) into :idup
from (
select id68, pn
from &datafile
group by id68, pn
having count(*) > 1 )
;
quit ;

%macro idup ;
	%if &idup > 0 %then %do ;
		%PUT ERROR: The input data set (%UPCASE(&datafile)) has duplicate rows. ;
	%end ;
%mend idup;
%idup ;

DATA &mapfile (drop=gid) gid (keep=mapid gid) ;
	FORMAT MAPID F6. LINKID Z8.3 ;
	SET TMP.&mapfile ;
	MAPID = _N_ ;
	LINKID = ID68+PN/1000 ;
RUN ;


proc sql ;
create table var as
select varnum, upcase(name) as varname label=' '
from dictionary.columns
where libname='WORK' and memname=%upcase("&datafile")
order by varnum
;
quit ;

data _null_ ;
	set var ;
	call symput('var'||trim(left(_n_)), trim(left(varname)) ) ;
	call symput('nvar', trim(left(_n_)) ) ;
run ;
%put &nvar ;


proc sql ;
create table mvar as
select varnum
	, upcase(name) as varname label=' '
	, format as vFormat label=' '
	, case substr(name, 1, 4)
		when 'ID68' then substr(name, 5)
		else ' ' end as psfix label=' '
from dictionary.columns
where libname='WORK' and memname=%upcase("&mapfile")
order by varnum
;
quit ;


data _null_ ;
	set mvar ;
	call symput('mvar'||trim(left(_n_)), trim(left(varname)) ) ;
	call symput('mFmt'||trim(left(_n_)), trim(left(vFormat)) ) ;
	call symput('nmvar', trim(left(_n_)) ) ;
	if substr(varname, 1, 4) = 'ID68' then do ;
		if psfix = ' ' then psfix = 'I' ;
		n + 1 ;
		call symput('psfix'||trim(left(n)), trim(left(psfix)) ) ;
		call symput('npsfix', trim(left(n)) ) ;
	end ;
run ;
%put # of map vars = &nmvar ;
%put Map prefix types = &npsfix ;


%macro iad ;
%do i=1 %to &npsfix ;
	proc sql ;
		create table &&psfix&i as
		select
		  b.MAPID
		, b.LINKID
		%if &i = 1 %then %do ;
	  		, a.ID68
			, a.PN
			%do j=3 %to &nvar ;
				, a.&&var&j
			%end ;
		%end ;
		%else %if &i > 1 %then %do ;
			, a.ID68 as ID68&&psfix&i
			, a.PN as PN&&psfix&i
			, b.&&psfix&i..TYPE
			%do j=3 %to &nvar ;
				, a.&&var&j as &&var&j.._&&psfix&i
			%end ;
		%end ;
		from &datafile a inner join &mapfile b
		%if &i = 1 %then %do ;
			on (a.ID68 = b.ID68 and a.PN = b.PN)
		%end ;
		%if &i > 1 %then %do ;
			on (a.ID68 = b.ID68&&psfix&i and a.PN = b.PN&&psfix&i)
			where &&psfix&i..TYPE ne ' '
		%end ;
		order by b.mapid
		;
	quit ;
%end ;

data &mapfile._long ;
	format
	%do k=1 %to &nmvar ;
	&&mvar&k &&mFmt&k
	%end ; ;
	merge
	%do i=1 %to &npsfix ;
	&&psfix&i (in=&&psfix&i)
	%end ; ;
		by mapid ;
	if I ;
run ;
%mend iad ;
%iad ;


proc sql ;
create table mvar2 as
select
  a.varnum
, a.varname
, case (a.varname = b.varname)
	when 1 then 'I  ' else scan(a.varname, 2, '_') end as WTYPE
from (
	select varnum label=' '
		, upcase(name) as varname label=' '
		, format as vFormat label=' '
	from dictionary.columns
	where libname='WORK' and memname=%upcase("&mapfile._long")
	)a
	left join (
	select varname from var where varnum >= 3) b
on (scan(a.varname, 1, '_') = b.varname)
order by varnum
;
quit ;


%macro iad2 ;
data mvar3 ;
	set mvar2 ;
	%do i=1 %to &npsfix ;
	if 2 <= varnum <= %eval(&i*3+1) or wtype = "&&psfix&i" then  &&psfix&i = 1 ;
	%end ;
run ;

%do i=1 %to &npsfix ;
	proc sql noprint ;
	select trim(varname) into :svar separated by ' '
	from mvar3
	where &&psfix&i = 1
	;
	quit ;
	data &&psfix&i..2 ;
		set &mapfile._long (keep = &svar) ;
		%if &i > 1 %then %do ;
			length wtype $10 ;
			wtype = ptype ;
			%if &i = 2 %then %do ;
				if PTYPE ne ' ' ;
				wtype = ptype ;
			%end ;
			%if &i = 3 %then %do ;
				if PTYPE ne ' ' ; if GPTYPE ne ' ' ;
				wtype = trim(wtype)||trim(gptype) ;
			%end ;
			%else %if &i = 4 %then %do ;
				if PTYPE ne ' ' ; if GPTYPE ne ' ' ; if GGPTYPE ne ' ' ;
				wtype = trim(wtype)||trim(gptype)||trim(ggptype) ;
			%end ;
		%end ;
	run ;
%end ;
data wtype ;
	length id 8 wdata $10 ;
	set
	%do i=2 %to &npsfix ;
		&&psfix&i..2 (in=&&psfix&i keep=wtype)
	%end ; end = end ;
	%do i=2 %to &npsfix ;
		if &&psfix&i then do ;
			id = &i ;
			wdata = "&&psfix&i" ;
		end ;
	%end ; ;
	output ;
	if end then do ;
		id=1 ; wdata='I'; wtype=' ' ; output ;
	end ;
run ;
proc sort data = wtype nodupkey ;
	by id wdata wtype ;
run ;
%mend iad2 ;
%iad2 ;


data _null_ ;
	set wtype ;
		by id wdata wtype ;
	call symput('wdata'||trim(left(_n_)), trim(left(wdata)) ) ;
	call symput('wtype'||trim(left(_n_)), trim(left(wtype)) ) ;
	call symput('nw', trim(left(_n_)) ) ;
run ;
%put &wdata2 &wtype2 &nw ;


%macro iadw ;
%do i=1 %to &nw;
	proc sql noprint ;
	select trim(varname)||' as '||trim(varname)||"_&&wtype&i"
		into :wide separated by ', '
	from mvar3
	where wtype = "&&wdata&i"
	;
	quit ;
	/*%put &wide ;*/
	proc sql;
	create table &&wdata&i..2_&&wtype&i as
	select linkid, id68, pn, &wide
	from &&wdata&i..2
	%if &i > 1 %then %do ;
		where wtype = "&&wtype&i"
	%end ;
	;
	quit ;
	proc sort data = &&wdata&i..2_&&wtype&i nodupkey ;
		by linkid ;
	run ;
%end ;
data &mapfile._wide (drop=linkid) ;
	merge
	%do i=1 %to &nw;
		&&wdata&i..2_&&wtype&i
	%end ; ;
		by linkid ;
run ;
%mend iadw ;
%iadw ;


**	# of lines per individual type ;
proc sql ;
create table wtype2 as
select a.*, b.d2_ format=$10., b.nvar, b.nobs
from wtype a, (
	select memname as d2_ label=' ', nvar label=' ', nobs label=' '
	from dictionary.tables
	where libname = 'WORK') b
where trim(a.wdata)||'2_'||trim(a.wtype) = b.d2_
order by a.id
;
quit ;


**	Add GID to long file ;
proc sort data = &mapfile._long ;
	by mapid ;
run ;
proc sort data = gid ;
	by mapid ;
run ;
data &mapfile._long (drop = mapid linkid) ;
	format ID68 z4. PN z3. GID f2. ;
	merge &mapfile._long (in=long) gid ;
		by mapid ;
	if long ;
run ;

**	Save/copy output data files ;
proc datasets lib=work nolist ;
	save &mapfile._long &mapfile._wide ;
	copy in=work out=tmp ;
		select &mapfile._long &mapfile._wide;
quit ;