*Data Mining SAS Notes
*
*Principal Components Analysis and Factor Analysis;

/* ------------------------------------------------------------------ */
/* Layout note: every DATALINES / CARDS / LINES statement below is    */
/* followed by its data starting on the NEXT line, one record per     */
/* line, and is closed by a line holding only a semicolon. In-stream  */
/* data cannot share a line with program statements.                  */
/* ------------------------------------------------------------------ */


/* ================= Principal components: simulated data =========== */

*Creating example data for PCA;
data prin_comp_example;
   /* Fixed seed so the 100 simulated rows are reproducible. */
   call streaminit(22415);
   do i=1 to 100;
      z1=rand("NORMAL");
      z2=rand("NORMAL");
      /* x1 depends on z1 only (the z2 coefficient is 0). */
      /* x2 mixes z1 and z2.                              */
      x1=5*z1+0*z2;
      x2=2*z1+0.4*z2;
      output;
   end;
run;

*Scatterplot of data;
proc sgplot data=prin_comp_example;
   scatter x=x1 y=x2;
run;

*Computing the covariance matrix;
/* The COV option requests covariances and the result is saved to S. */
proc corr data=prin_comp_example cov out=S;
   var x1 x2;
run;

proc print data=S;
run;

*Principal Components Analysis;
/* Analysis driven by the saved matrix S, on the covariance scale. */
proc princomp data=S cov;
run;

*Inputting a covariance matrix manually;
/* NOTE(review): the row label is read into Name, not _NAME_, and no  */
/* _TYPE_ variable is created, so the rows are presumably taken in    */
/* variable order - confirm against the PROC PRINCOMP output.         */
data prin_comp_example2 (type=cov);
   input Name $ x1 x2;
   datalines;
x1 27.655 10.957
x2 10.957 4.515
;

proc print data=prin_comp_example2;
run;

proc princomp data=prin_comp_example2 cov;
run;

*Creating example data for PCA;
/* Same step and same seed as above. It rebuilds prin_comp_example so */
/* this section can be submitted on its own.                          */
data prin_comp_example;
   call streaminit(22415);
   do i=1 to 100;
      z1=rand("NORMAL");
      z2=rand("NORMAL");
      x1=5*z1+0*z2;
      x2=2*z1+0.4*z2;
      output;
   end;
run;

*PCA Using Original Data;
proc princomp data=prin_comp_example cov;
   var x1 x2;
run;


/* ================= Principal components: correlation input ======== */

*Math/reading exam scores from Hotelling (1933);
/* Full 4 x 4 correlation matrix, one labelled row per record. */
data score (type=corr);
   *_type_='corr';
   input Name $ readsp readpow mathsp mathpow;
   datalines;
readsp 1. .698 .264 .081
readpow .698 1. -.061 .092
mathsp .264 -.061 1. .594
mathpow .081 .092 .594 1.
;

proc print data=score;
run;

proc princomp data=score;
run;

*Adelges Data;
/* Lower triangle of a 19 x 19 correlation matrix (row k has k        */
/* values). Each observation is read by TWO input statements, so it   */
/* must occupy exactly two data records:                              */
/*   record 1 - up to 11 values (length ... antspin)                  */
/*   record 2 - the remaining values (tarsus3 ... hooks)              */
/* Rows with 11 or fewer values carry a single missing value (.) as   */
/* their second record so the second input statement always has a     */
/* record to read. MISSOVER leaves the unread variables missing       */
/* instead of pulling values from the next record.                    */
data adelges(type=corr);
   infile cards missover;
   input length width forwing hinwing spirac antseg1-antseg5 antspin;
   input tarsus3 tibia3 femur3 rostrum ovipos ovspin fold hooks;
   cards;
1.0
.
.934 1.0
.
.927 .941 1.0
.
.909 .944 .933 1.0
.
.524 .487 .543 .499 1.0
.
.799 .821 .856 .833 .703 1.0
.
.854 .865 .886 .889 .719 .923 1.0
.
.789 .834 .846 .885 .253 .699 .751 1.0
.
.835 .863 .862 .850 .462 .752 .793 .745 1.0
.
.845 .878 .863 .881 .567 .836 .913 .787 .805 1.0
.
-.458 -.496 -.522 -.488 -.174 -.317 -.383 -.497 -.356 -.371 1.0
.
.917 .942 .940 .945 .516 .846 .907 .861 .848 .902 -.465
1.0
.939 .961 .956 .952 .494 .849 .914 .876 .877 .901 -.447
.981 1.0
.953 .954 .946 .949 .452 .823 .886 .878 .883 .891 -.439
.971 .991 1.0
.895 .899 .882 .908 .551 .831 .891 .794 .818 .848 -.405
.908 .920 .921 1.0
.691 .652 .694 .623 .815 .812 .855 .410 .620 .712 -.198
.725 .714 .676 .720 1.0
.327 .305 .356 .272 .746 .553 .567 .067 .300 .384 -.032
.396 .360 .298 .378 .781 1.0
-.676 -.712 -.667 -.736 -.233 -.504 -.502 -.758 -.666 -.629 .492
-.657 -.655 -.687 -.633 -.186 .169 1.0
.702 .729 .746 .777 .285 .499 .592 .793 .671 .668 -.425
.696 .724 .731 .694 .287 -.026 -.775 1.0
;

proc print data=adelges;
run;

/* n=5 limits the analysis to five components. */
proc princomp data=adelges n=5;
run;


/* ================= Factor analysis ================================ */

*Scores for Classics (C), French (F),
*English (E), Math (M),
*Discrimination of Pitch (D), and Music (MU)
*for 33 students considered by Spearman (1904);
/* Lower triangle only, one labelled row per record. MISSOVER leaves  */
/* the unsupplied upper-triangle entries missing.                     */
data spearman (type=corr);
   infile cards missover;
   input _name_ $ Classics French English Math Pitch Music;
   lines;
Classics 1.0
French .83 1.0
English .78 .67 1.0
Math .70 .67 .64 1.0
Pitch .66 .65 .54 .45 1.0
Music .63 .57 .51 .51 .40 1.0
;

proc factor data=spearman;
run;

*Factor analysis with residual matrix;
proc factor data=spearman res;
run;

*Factor analysis with four factors;
proc factor data=spearman nfact=4 res;
run;

*World bank data;
/* Raw data: one country per record, a label followed by eight        */
/* numeric measures. The label is read with a plain $ informat, so at */
/* most eight characters are kept (hence the spelling Austrlia).      */
data economy;
   input country $ pop area gnp life radio tourist food school;
   lines;
Canada 25.6 9976 14120 76 758 12854 3404 98
USA 241.6 9363 17480 75 2133 20441 3632 99
Haiti 6.1 28 330 54 21 167 1906 48
Brazil 138.4 8512 1810 65 355 1420 2575 78
Austria 7.6 84 9990 74 475 14482 3479 80
Iceland .24 103 13410 77 593 78 3122 100
Spain 38.7 505 4860 76 274 25583 3325 97
UK 56.7 245 8870 75 986 12499 3210 96
Gambia 0.77 11 230 43 120 37 2217 34
India 781.4 3288 290 57 56 1305 2031 54
Malaysia 16.1 330 1830 69 415 1050 2569 77
Austrlia 16.0 7687 11920 78 1159 944 3044 89
;

/* Four-factor solution with the residual matrix. The OUT= data set   */
/* fscore is printed by the next step.                                */
proc factor data=economy nfact=4 res out=fscore;
   var pop area gnp life radio tourist food school;
run;

proc print data=fscore;
run;

*Varimax Rotation;
/* Same model with a varimax rotation. This run writes to fscore      */
/* again, replacing the data set created by the unrotated run.        */
proc factor data=economy nfact=4 res out=fscore rotate=varimax;
   var pop area gnp life radio tourist food school;
run;