*MATH 5366 Data Mining II
*Brief Introduction to SAS;

******************************************************;
*SAS documentation can be found here;
*https://support.sas.com/;
*https://support.sas.com/documentation/cdl/en/allprodsproc/68038/HTML/default/viewer.htm#procedures.htm;
*SAS code for the examples from the packet is here;
*http://www.prenhall.com/cody/;
******************************************************;


**********************
*Data and Proc Blocks*
*********************;

*This is a DATA block, where we define our data set.
*Each datalines row supplies one observation (one x value and one y value);
data mydata;
    input x y;
    datalines;
1 4
3 8
5 9
;

*This is a PROC statement, where we get SAS to do something with our data;
proc print data=mydata;
run;

*Summary statistics (default set of statistics for every numeric variable);
proc means data=mydata;
run;

*Adding optional arguments.
*MEAN STD VAR restrict the printed statistics and OUTPUT writes a summary data set named mydata2;
proc means data=mydata MEAN STD VAR;
    TITLE "Summary Statistics for Example Data";
    OUTPUT out=mydata2;
run;

*TITLE is a global statement, so clear it here.
*Otherwise every later procedure in this file would be printed under the summary statistics title;
title;

*Doing calculations in a DATA step (z is computed for each observation as it is read);
data mydata;
    input x y;
    z=x+y;
    datalines;
1 4
3 8
5 9
;

proc print data=mydata;
run;


************
*Regression*
***********;

*Establishing a filename for an online data set;
filename statslab url 'http://faculty.tarleton.edu/crawford/documents/Math5364/math5305Lab2Data.txt';

*Importing data (comma delimited file with four numeric columns);
data math5305lab2;
    infile statslab dlm=',';
    input Y X1 X2 X3;
run;

proc print data=math5305lab2;
run;

*Multiple regression, storing residuals in a variable called myresiduals;
proc reg data=math5305lab2;
    model Y=X1 X2 X3;
    output out=myoutput r=myresiduals;
run;
quit;

proc print data=myoutput;
run;

*Histogram and qqplot for residuals;
proc univariate data=myoutput;
    histogram myresiduals/normal;
    qqplot myresiduals;
run;

*Testing normality of residuals;
proc univariate data=myoutput normal;
    var myresiduals;
run;

*Testing homoscedasticity of residuals (constancy of error variance);
proc reg data=math5305lab2;
    model Y=X1 X2 X3/SPEC;
run;
quit;

*Plotting with built in residuals and predicted values (yhat values).
*Inside PROC REG the keywords residual. and predicted. (with trailing period) refer to the fitted model;
proc reg data=math5305lab2;
    model Y=X1 X2 X3;
    output out=myoutput r=myresiduals;
    plot Y*X1 Y*X2 Y*X3
         residual.*X1 residual.*X2 residual.*X3
         Y*predicted. residual.*predicted.;
run;
quit;


*****************************************
*Proc IML (Interactive Matrix Language) *
****************************************;

proc iml;
    *Defining two matrices (commas separate rows);
    A={1 2, 3 4};
    B={5 6, 7 8};
    C=A+B; *Sum;
    D=A*B; *Product;
    E=inv(A); *Inverse;
    F=A`; *Transpose;
    print A B C D E F; *Printing the matrices;
run;
quit;

*Using the repeat function, here repeat(value, nrow, ncol) tiles a scalar into a matrix;
proc iml;
    A=repeat(0,4,3);
    x=repeat(1,10,1);
    print A x;
run;
quit;

*Editing components of a matrix/vector;
proc iml;
    A=repeat(0,4,3);
    x=repeat(1,10,1);
    A[1,2]=7;
    x[3]=7;
    print A x;
run;
quit;

*MATH 5305 Lab 2 with PROC IML;
proc iml;
    use math5305lab2; *Opens the math5305lab2 data set for reading in this procedure;
    read all; *Reads every variable into a vector of the same name;
    close math5305lab2; *The data are now in memory, so release the data set;
    X0=repeat(1,nrow(Y),1); *Intercept column of ones, sized from the data instead of a fixed row count;
    X=X0 || X1 || X2 || X3; *Combines vectors into a design matrix;
    betahat=solve(X`*X,X`*Y); *Calculates OLS estimator betahat from the normal equations;
    Yhat=X*betahat; *Calculates fitted values Yhat;
    e=Y-Yhat; *Calculates residuals e;
    print betahat; *Prints betahat;
    call pgraf(Yhat || Y); *Scatterplot of Y vs. Yhat;
    call pgraf(Yhat || e); *Scatterplot of e vs. Yhat;
run;
quit;

*Writing to a data set with proc iml;
proc iml;
    x={1,2,3};
    y={51,52,53};
    create mydata1;
    append var {x,y};
    close mydata1;
run;
quit;

proc print data=mydata1;
run;

*Appending data to an existing data set with PROC IML (EDIT opens it for update);
proc iml;
    x={4,5,6};
    y={54,55,56};
    edit mydata1;
    append var {x,y};
    close mydata1;
run;
quit;

proc print data=mydata1;
run;

*Appending data sets with PROC APPEND;

*Data set 1 (CREATE makes a new mydata1 for this example);
proc iml;
    x={1,2,3};
    y={51,52,53};
    create mydata1;
    append var {x,y};
    close mydata1;
run;
quit;

proc print data=mydata1;
run;

*Data set 2;
proc iml;
    x={4,5,6};
    y={54,55,56};
    create mydata2;
    append var {x,y};
    close mydata2;
run;
quit;

proc print data=mydata2;
run;

*Appending mydata2 to mydata1;
proc append base=mydata1 data=mydata2;
run;

proc print data=mydata1;
run;


********************
*PROC IML Functions*
*******************;

*Creating a simple function to add two numbers (it also works elementwise on vectors);
proc iml;
    start mysum(x,y);
        return(x+y);
    finish;

    s=mysum(5,7);
    print s;
    s2=mysum({1,2,3,4},{5,6,7,8});
    print s2;
run;
quit;

*Storing the first n Fibonacci numbers in a data set;
proc iml;
    *fib(n) returns an n x 1 column vector holding the first n Fibonacci numbers.
    *The vector starts as all ones, which is already correct for n=1 and n=2,
    *and the loop body is skipped when n is less than 3, so no special case is needed.
    *(The previous special case returned a single 1 for n=2 instead of two ones.);
    start fib(n);
        fibvector=repeat(1,n,1);
        do i=3 to n;
            fibvector[i]=fibvector[i-1]+fibvector[i-2];
        end;
        return(fibvector);
    finish;

    f=fib(30);
    print(f);
    create fibonacci var {f}; *Name the variable explicitly rather than relying on whatever is in scope;
    append;
    close fibonacci;
run;
quit;

proc print data=fibonacci;
run;

*Sequences and random vectors.
*(1:20) is a row vector, so it is transposed into a column, and u holds one uniform random value per element of x;
proc IML;
    x=(1:20)`;
    u=uniform(x);
    print x u;
run;
quit;

*Creating a vector y.
*y[i] is set to 1 when u[i] is below 0.2 for the first ten rows and when u[i] is above 0.2 for the last ten rows;
proc IML;
    x=(1:20)`;
    u=uniform(x);
    y=repeat(0,20,1);
    do i = 1 to 10;
        if u[i]<0.2 then y[i]=1;
    end;
    do i=11 to 20;
        if u[i]>0.2 then y[i]=1;
    end;
    print x u y;
run;
quit;

*The Loc command.
*loc(condition) returns the indices where the condition is true, used here to split y into two groups by x;
proc IML;
    x=(1:20)`;
    u=uniform(x);
    y=repeat(0,20,1);
    do i = 1 to 10;
        if u[i]<0.2 then y[i]=1;
    end;
    do i=11 to 20;
        if u[i]>0.2 then y[i]=1;
    end;
    y1=y[loc(x<=10)];
    y2=y[loc(x>10)];
    mu1=mean(y1);
    mu2=mean(y2);
    print x y y1 y2 mu1 mu2;
run;
quit;