/*----------------------------------------------------------------------
  DATA MINING SAS NOTES
  Linear Regression Misc Topics: simulating data, categorical variables,
  ANOVA, higher-order terms, and interaction terms.

  Every DATA step below seeds the RAND stream with CALL STREAMINIT, so
  the simulated data sets are reproducible from run to run.
----------------------------------------------------------------------*/

*Loops - build a 10-row table of squares with an iterative DO loop;
data loop_example;
    do i = 1 to 10;
        x = i**2;
        output;            /* one observation per loop iteration */
    end;
run;

proc print data=loop_example;
run;

*Generating Random Normal Data;
data normal_example;
    call streaminit(5366);     /* sets random seed */
    do i = 1 to 100;
        x = rand('NORMAL');    /* standard normal draw */
        output;
    end;
run;

proc univariate data=normal_example;
    histogram x / normal;      /* overlay a fitted normal density */
run;

*Generating Random Uniform Data;
/* Written to its own data set so the normal sample above is not
   overwritten under a misleading name. */
data uniform_example;
    call streaminit(5366);     /* sets random seed */
    do i = 1 to 100;
        x = rand('UNIFORM');   /* uniform draw on (0, 1) */
        output;
    end;
run;

proc univariate data=uniform_example;
    /* The normal overlay is kept from the original notes; on uniform
       data it shows how the sample departs from a normal fit. */
    histogram x / normal;
run;

*Generating Example Linear Regression Data;
/* True model: Y = 200 + 3*X1 - 4*X2 + 0.7*X3 + epsilon
   X1 is uniform on (10, 30), X2 on (40, 50), X3 on (500, 600),
   and epsilon is normal with mean 0 and standard deviation 5. */
data regression_example;
    call streaminit(5366);
    do i = 1 to 1000;
        X1 = 10 + 20*rand('UNIFORM');
        X2 = 40 + 10*rand('UNIFORM');
        X3 = 500 + 100*rand('UNIFORM');
        epsilon = 5*rand("NORMAL");
        Y = 200 + 3*X1 - 4*X2 + 0.7*X3 + epsilon;
        output;
    end;
run;

*Regression with PROC REG;
proc reg data=regression_example;
    model Y = X1 X2 X3;
run;
quit;   /* PROC REG is interactive - QUIT ends the step */

*Regression with PROC GLM;
proc glm data=regression_example;
    model Y = X1 X2 X3;
run;
quit;   /* PROC GLM is interactive - QUIT ends the step */

*Categorical Variable;
/* Assign each observation to a farm from one uniform draw:
   u <= 0.3 gives FARM_A, 0.3 < u <= 0.8 gives FARM_B, otherwise FARM_C. */
data milk_example;
    call streaminit(329);
    do i = 1 to 1000;
        u = rand('UNIFORM');
        if u <= 0.3 then farm = "FARM_A";
        else if u <= 0.8 then farm = "FARM_B";
        else farm = "FARM_C";
        output;
    end;
run;

proc print data=milk_example;
run;

/* Restrict the table to FARM. Without a TABLES statement PROC FREQ
   tabulates every variable, including the 1000-level i and u. */
proc freq data=milk_example;
    tables farm;
run;

*Regression with categorical variable - simulate the data;
/* Same farm assignment as above, now with a milk response whose mean
   depends on the farm (7000, 7500, 8000) plus normal noise with
   standard deviation 100. This replaces the earlier milk_example. */
data milk_example;
    call streaminit(329);
    do i = 1 to 1000;
        u = rand('UNIFORM');
        epsilon = 100*rand('NORMAL');
        if u <= 0.3 then do;
            farm = "FARM_A";
            milk = 7000 + epsilon;
        end;
        else if u <= 0.8 then do;
            farm = "FARM_B";
            milk = 7500 + epsilon;
        end;
        else do;
            farm = "FARM_C";
            milk = 8000 + epsilon;
        end;
        output;
    end;
run;

*Calculating average milk production for each farm;
proc means data=milk_example;
    class farm;
    var milk;   /* summarize only the response, not i, u, or epsilon */
run;

*Analysis of Variance (ANOVA) in SAS;
proc anova data=milk_example;
    class farm;
    model milk = farm;
run;
quit;   /* PROC ANOVA is interactive - QUIT ends the step */

*Regression with categorical variable - fit with PROC GLM;
proc glm data=milk_example;
    class farm;
    model milk = farm;
run;
quit;

*Regression data set with quantitative and categorical variables,
 higher-order terms, and interaction terms;
/* True model:
     Y = 200 + 3*X1 + 0.1*X1**2 - 4*X2 + 0.7*X3
         + 50*indicator + 20*X2*indicator + epsilon
   indicator is 0 (mathgrade "A") when u < 0.7 and 1 (mathgrade "B")
   otherwise. This replaces the earlier regression_example. */
data regression_example;
    call streaminit(5366);
    do i = 1 to 1000;
        u = rand('UNIFORM');
        if u < 0.7 then do;
            indicator = 0;
            mathgrade = "A";
        end;
        else do;
            indicator = 1;
            mathgrade = "B";
        end;
        X1 = 10 + 20*rand('UNIFORM');
        X2 = 40 + 10*rand('UNIFORM');
        X3 = 500 + 100*rand('UNIFORM');
        epsilon = 5*rand("NORMAL");
        Y = 200 + 3*X1 + 0.1*X1**2 - 4*X2 + 0.7*X3
            + 50*indicator + 20*X2*indicator + epsilon;
        output;
    end;
run;

proc freq data=regression_example;
    tables mathgrade;
run;

/* Candidate effects mirror the terms of the simulated model: the
   quadratic X1*X1, the class effect mathgrade, and the mathgrade*X2
   interaction. No SELECTION= option is given, so the procedure's
   default selection settings apply. */
proc glmselect data=regression_example;
    class mathgrade;
    model Y = X1 X1*X1 X2 X3 mathgrade mathgrade*X2;
run;