*DATA MINING SAS NOTES
*
*Linear Regression Misc Topics (simulating data, categorical variables, ANOVA, higher-order terms, and interaction terms);


*Loops - build a 10-row table holding the integers 1 to 10 and their squares;
data loop_example;
    do i = 1 to 10;
        x = i ** 2;    *x is the square of the loop counter;
        output;        *write one observation per iteration;
    end;
run;

*List every observation of loop_example;
proc print data = loop_example;
run;


*Generating Random Normal Data - 100 draws from the standard normal distribution;
data normal_example;
    call streaminit(5366);     *fix the seed so the draws are reproducible;
    do i = 1 to 100;
        x = rand('NORMAL');    *no parameters given, so mean 0 and standard deviation 1;
        output;                *one observation per draw;
    end;
run;


*Descriptive statistics for x plus a histogram with a fitted normal density overlaid;
proc univariate data = normal_example;
    histogram x / normal;
run;


*Generating Random Uniform Data - 100 draws from the uniform distribution on (0,1);
*The sample is stored in its own data set (uniform_example) so that it does
 not overwrite the normal sample kept in normal_example;
data uniform_example;
    call streaminit(5366);     *Sets random seed;
    do i = 1 to 100;
        x = rand('UNIFORM');   *Uniformly distributed;
        output;
    end;
run;


*Histogram of the uniform sample - the fitted normal curve is kept as an overlay
 so the two shapes can be compared;
proc univariate data = uniform_example;
    histogram x / normal;
run;


*Generating Example Linear Regression Data;
*1000 observations from the model Y = 200 + 3*X1 - 4*X2 + 0.7*X3 + epsilon;
data regression_example;
    call streaminit(5366);                   *reproducible random stream;
    do i = 1 to 1000;
        X1 = 10 + 20 * rand('UNIFORM');      *uniform between 10 and 30;
        X2 = 40 + 10 * rand('UNIFORM');      *uniform between 40 and 50;
        X3 = 500 + 100 * rand('UNIFORM');    *uniform between 500 and 600;
        epsilon = 5 * rand("NORMAL");        *normal noise with standard deviation 5;
        Y = 200 + 3*X1 - 4*X2 + 0.7*X3 + epsilon;
        output;
    end;
run;


*Regression with PROC REG - least squares fit of Y on X1, X2 and X3;
proc reg data=regression_example;
    model Y = X1 X2 X3;
run;
quit;   *PROC REG is interactive - end it explicitly rather than leaving it running;


*Regression with PROC GLM - same model as the PROC REG fit of Y on X1, X2 and X3;
proc glm data=regression_example;
    model Y = X1 X2 X3;
run;
quit;   *PROC GLM is interactive - end it explicitly rather than leaving it running;


*Categorical Variable - randomly assign each of 1000 observations to one of three farms;
data milk_example;
    call streaminit(329);
    do i = 1 to 1000;
        u = rand('UNIFORM');
        *The uniform draw u is cut at 0.3 and 0.8 to pick the farm;
        select;
            when (u <= 0.3) farm = "FARM_A";
            when (u <= 0.8) farm = "FARM_B";
            otherwise       farm = "FARM_C";
        end;
        output;
    end;
run;

*List every observation of milk_example;
proc print data = milk_example;
run;

*Frequency table of the farm assignments;
*Without a TABLES statement PROC FREQ builds a one-way table for every variable,
 including the loop counter i and the uniform draw u, so the request is limited to farm;
proc freq data=milk_example;
    tables farm;
run;


*Data for regression with a categorical variable;
*Milk production is a farm-specific level (7000, 7500 or 8000) plus normal noise;
data milk_example;
    call streaminit(329);
    do i = 1 to 1000;
        u = rand('UNIFORM');
        epsilon = 100 * rand('NORMAL');    *noise with standard deviation 100;
        select;
            when (u <= 0.3) do;
                farm = "FARM_A";
                milk = 7000 + epsilon;
            end;
            when (u <= 0.8) do;
                farm = "FARM_B";
                milk = 7500 + epsilon;
            end;
            otherwise do;
                farm = "FARM_C";
                milk = 8000 + epsilon;
            end;
        end;
        output;
    end;
run;


*Calculating average milk production for each farm;
*The VAR statement limits the summary to milk - without it PROC MEANS would also
 summarize every other numeric variable (i, u and epsilon);
proc means data=milk_example;
    class farm;
    var milk;
run;


*Analysis of Variance (ANOVA) in SAS - one-way ANOVA of milk production by farm;
proc anova data=milk_example;
    class farm;
    model milk = farm;
run;
quit;   *PROC ANOVA is interactive - end it explicitly rather than leaving it running;



*Regression with categorical variable - milk production modeled by farm;
*SOLUTION is requested because PROC GLM does not print parameter estimates
 by default when the model contains CLASS effects;
proc glm data=milk_example;
    class farm;
    model milk = farm / solution;
run;
quit;   *PROC GLM is interactive - end it explicitly rather than leaving it running;




*Regression data set with quantitative and categorical variables,
 higher-order terms, and interaction terms;
*indicator is the 0/1 coding of mathgrade (A = 0, B = 1) used to build Y;
data regression_example;
    call streaminit(5366);
    do i = 1 to 1000;
        u = rand('UNIFORM');
        select;
            when (u < 0.7) do;
                indicator = 0;
                mathgrade = "A";
            end;
            otherwise do;
                indicator = 1;
                mathgrade = "B";
            end;
        end;
        X1 = 10 + 20 * rand('UNIFORM');
        X2 = 40 + 10 * rand('UNIFORM');
        X3 = 500 + 100 * rand('UNIFORM');
        epsilon = 5 * rand("NORMAL");
        *Quadratic term in X1, shift for grade B, and an X2 by grade interaction;
        Y = 200 + 3*X1 + 0.1*X1**2 - 4*X2 + 0.7*X3 + 50*indicator + 20*X2*indicator + epsilon;
        output;
    end;
run;


*Frequency table of the simulated math grades;
proc freq data = regression_example;
    tables mathgrade;
run;


*PROC GLMSELECT on the linear, quadratic, categorical and interaction effects
 that were used to simulate Y;
proc glmselect data = regression_example;
    class mathgrade;
    model Y = X1  X1*X1  X2  X3  mathgrade  mathgrade*X2;
run;