#CHAPTER 5 EXAMPLES IN R

#Euclidean distance function
euclidean=function(x1,x2){
  return(sqrt((x1-x2)%*%(x1-x2)))
}

#Test example
euclidean(c(1,0),c(0,1))

#More examples
x1=c(90,1300)
x2=c(85,1200)
euclidean(x1,x2)

x1=c(70,950)
x2=c(40,880)
euclidean(x1,x2)

#Distances after standardizing with given means and st devs
xbar=c(67.04,978.21)
s=c(18.61,132.35)
x1=c(90,1300)
x2=c(85,1200)
z1=(x1-xbar)/s
z2=(x2-xbar)/s
euclidean(z1,z2)

x1=c(70,950)
x2=c(40,880)
z1=(x1-xbar)/s
z2=(x2-xbar)/s
euclidean(z1,z2)

#Standardizing iris
x=iris[,1:4]
xbar=apply(x,2,mean)
xbarMatrix=cbind(rep(1,150))%*%xbar  #150 x 4 matrix whose rows are all xbar
s=apply(x,2,sd)
sMatrix=cbind(rep(1,150))%*%s        #150 x 4 matrix whose rows are all s
z=(x-xbarMatrix)/sMatrix
apply(z,2,mean)
apply(z,2,sd)
plot(z[,3:4],col=iris$Species)

#Split iris into 70% training and 30% test data.
set.seed(5364)
train=sample(nrow(z),nrow(z)*.7)
z[train,]  #This is the training data
z[-train,] #This is the test data

#confmatrix
#confmatrix accepts the actual and predicted classes and returns
#the confusion matrix together with the accuracy and error rates.
confmatrix=function(y,predy){
  matrix=table(y,predy)
  accuracy=sum(diag(matrix))/sum(matrix)
  return(list(matrix=matrix,accuracy=accuracy,error=1-accuracy))
}

#The class package and knn function
library(class)
Species=iris$Species
predSpecies=knn(train=z[train,],test=z[-train,],cl=Species[train],k=3)
confmatrix(Species[-train],predSpecies)

#Leave-one-out CV for knn
predSpecies=knn.cv(train=z,cl=Species,k=3)
confmatrix(Species,predSpecies)

#Optimizing k with cross validation
accvect=1:10
for(k in 1:10){
  predSpecies=knn.cv(train=z,cl=Species,k=k)
  accvect[k]=confmatrix(Species,predSpecies)$accuracy
}
which.max(accvect)

#knn with k = 7
Species=iris$Species
predSpecies=knn(train=z[train,],test=z[-train,],cl=Species[train],k=7)
confmatrix(Species[-train],predSpecies)

#fgl Data
library(MASS)  ## a library of example data sets
data(fgl)      ## loads the data into R; see help(fgl)
par(mfrow=c(3,3),mai=c(.3,.7,.7,.36))
plot(RI ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Al ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Na ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Mg ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Ba ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Si ~ type, data=fgl, col=c(grey(.2),2:6))
plot(K ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Ca ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Fe ~ type, data=fgl, col=c(grey(.2),2:6))
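#The nine plot() calls above could equivalently be written as a loop.
#A sketch, not part of the original script; it assumes the same 3x3
#layout and color scheme and adds each variable name as a title.
par(mfrow=c(3,3),mai=c(.3,.7,.7,.36))
for(v in c("RI","Al","Na","Mg","Ba","Si","K","Ca","Fe")){
  plot(as.formula(paste(v,"~ type")),data=fgl,col=c(grey(.2),2:6),main=v)
}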
#standardize function
#standardize accepts a matrix of quantitative variables x
#and returns the corresponding standardized matrix z,
#whose columns have mean 0 and st dev 1.
standardize=function(x){
  xbar=apply(x,2,mean)
  xbarMatrix=cbind(rep(1,nrow(x)))%*%xbar
  s=apply(x,2,sd)
  sMatrix=cbind(rep(1,nrow(x)))%*%s
  z=(x-xbarMatrix)/sMatrix
  return(z)
}

#knn method from last time
#knn.cv
x=fgl[,1:9]
z=standardize(x)
y=fgl[,10]
set.seed(1019)
accvect=1:15
library(class)
for(k in 1:15){
  predy=knn.cv(train=z,cl=y,k=k)
  accvect[k]=confmatrix(y,predy)$accuracy
}
which.max(accvect)

#knn with training and test data
train=sample(nrow(z),round(nrow(z)*.7,0))
predy=knn(train=z[train,],test=z[-train,],cl=y[train],k=3)
confmatrix(y[-train],predy)

#kknn approach
library(kknn)
fit.glass=train.kknn(type ~ ., data=fgl, kmax=15,
                     kernel=c("rectangular","triangular","epanechnikov","biweight",
                              "triweight","cos","inv","gaussian","optimal"),
                     distance=2)
plot(fit.glass)
fit.glass$MISCLASS
predy2=(kknn(type~.,train=fgl[train,],test=fgl[-train,],
             k=3,kernel="rectangular",distance=2))$fitted.values
confmatrix(y[-train],predy2)
fit.glass$best.parameters
predy3=(kknn(type~.,train=fgl[train,],test=fgl[-train,],
             k=8,kernel="biweight",distance=2))$fitted.values
confmatrix(y[-train],predy3)

##############################################################
####################NAIVE BAYES CLASSIFICATION################
##############################################################

#HouseVotes84
install.packages("mlbench")
library(mlbench)
data(HouseVotes84)

#Finding prior distribution
install.packages("e1071")
library(e1071)
model=naiveBayes(Class ~ ., data=HouseVotes84)
model
model$apriori
table(HouseVotes84$Class)
model$apriori/(sum(model$apriori))

#Conditional probabilities
model$tables
(model$tables)$V8
table(HouseVotes84$Class,HouseVotes84$V8)
#By-hand versions of the conditional probabilities for V8
45/(45+218)    #P(V8=n | democrat)
218/(45+218)   #P(V8=y | democrat)
133/(133+24)   #P(V8=n | republican)
24/(133+24)    #P(V8=y | republican)

#Posterior probabilities
predict(model,newdata=HouseVotes84[1,],type="raw")
predict(model,newdata=HouseVotes84[1,],type="class")
predict(model,newdata=HouseVotes84[1,])

#Training and test data
train=sample(nrow(HouseVotes84),round(.7*nrow(HouseVotes84),0))
model=naiveBayes(Class ~ ., data=HouseVotes84[train,])
predClass=predict(model,newdata=HouseVotes84[-train,])
confmatrix(HouseVotes84$Class[-train],predClass)

#Matrix of posterior probabilities
predict(model,newdata=HouseVotes84[-train,],type="raw")

#Quantitative Predictors
attach(iris)
model=naiveBayes(Species~.,data=iris)
model
mean(Petal.Length[Species=="setosa"])
sd(Petal.Length[Species=="setosa"])
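#To see what naiveBayes does with these Gaussian estimates, the posterior
#for a single flower can be reproduced by hand. A sketch, not part of the
#original script; it multiplies the prior by the class-conditional normal
#densities for each predictor and then normalizes.
x0=as.numeric(iris[1,1:4])
priors=table(iris$Species)/nrow(iris)
post=sapply(levels(iris$Species),function(spec){
  dens=priors[spec]
  for(j in 1:4){
    v=iris[iris$Species==spec,j]
    dens=dens*dnorm(x0[j],mean=mean(v),sd=sd(v))
  }
  dens
})
post/sum(post)  #compare with predict(model,newdata=iris[1,],type="raw")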
#Normality for iris data
hist(Petal.Length)
shapiro.test(Petal.Length)
qqnorm(Petal.Length)

#stratify by species
hist(Petal.Length[Species=="setosa"])
shapiro.test(Petal.Length[Species=="setosa"])
qqnorm(Petal.Length[Species=="setosa"])

hist(Petal.Length[Species=="versicolor"])
shapiro.test(Petal.Length[Species=="versicolor"])
qqnorm(Petal.Length[Species=="versicolor"])

hist(Petal.Length[Species=="virginica"])
shapiro.test(Petal.Length[Species=="virginica"])
qqnorm(Petal.Length[Species=="virginica"])

#Shapiro-Wilk tests, histograms, and normal QQ plots
#for every predictor within every species
for(j in 1:4){
  for(spec in levels(Species)){
    vect=iris[,j]
    print(shapiro.test(vect[Species==spec]))
  }
}

for(j in 1:4){
  for(spec in levels(Species)){
    vect=iris[,j]
    hist(vect[Species==spec],xlab=spec)
  }
}

for(j in 1:4){
  for(spec in levels(Species)){
    vect=iris[,j]
    qqnorm(vect[Species==spec])
  }
}

#test accuracy for iris data
train=sample(nrow(iris),round(.7*nrow(iris),0))
model=naiveBayes(Species~.,data=iris[train,])
predSpecies=predict(model,newdata=iris[-train,])
confmatrix(Species[-train],predSpecies)

#Discretizing iris
x=cut(1:100,breaks=4)
is.factor(x)
disciris=iris
for(j in 1:4){
  disciris[,j]=as.factor(cut(iris[,j],4))
}
model=naiveBayes(Species~.,data=disciris[train,])
predSpecies=predict(model,newdata=disciris[-train,])
confmatrix(Species[-train],predSpecies)

#Modifying the cut function
#mycut cuts a vector at its quantiles so the bins have roughly equal
#counts; include.lowest=TRUE keeps the minimum value from becoming NA.
mycut=function(vector,numbreaks){
  probs=seq(from=0,to=1,by=1/numbreaks)
  breaks=quantile(vector,probs=probs)
  return(cut(vector,breaks=breaks,include.lowest=TRUE))
}

#Flight data
#flight is assumed to have been read in previously as a data frame with
#schedtime, carrier, dest, origin, weather, dayweek, and delay columns.
library(e1071)
cattime=cut(flight$schedtime,breaks=seq(from=600,to=2200,by=100),right=FALSE)
flight=cbind(flight,cattime)
set.seed(5364)
train=sample(nrow(flight),round(.7*nrow(flight),0))
model=naiveBayes(delay~carrier+dest+origin+weather+dayweek+cattime,data=flight[train,])
preddelay=predict(model,newdata=flight[-train,])
confmatrix(flight$delay[-train],preddelay)
cmatrix=confmatrix(flight$delay[-train],preddelay)$matrix

#Performance metrics
#Row/column 1 of cmatrix is "delayed", treated as the positive class.
TP=cmatrix[1,1]
TN=cmatrix[2,2]
FP=cmatrix[2,1]
FN=cmatrix[1,2]
acc=(TP+TN)/(TP+TN+FP+FN)
acc
TPR=TP/(TP+FN)
TNR=TN/(TN+FP)
FPR=FP/(FP+TN)
FNR=FN/(TP+FN)
TPR
TNR
FPR
FNR
p=TP/(TP+FP)
r=TP/(TP+FN)
p
r
F1=2*r*p/(r+p)
F1

#Examples with F1
F1=function(r,p){
  return(2*r*p/(r+p))
}
F1(0.8,0.7)
F1(0.2,0.85)
F1(0.75,0.1)
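#The block of metrics above can be wrapped in a reusable function.
#A sketch, not part of the original script; like the code above, it
#assumes row/column 1 of the confusion matrix is the positive class.
metrics=function(cmatrix){
  TP=cmatrix[1,1]; TN=cmatrix[2,2]
  FP=cmatrix[2,1]; FN=cmatrix[1,2]
  p=TP/(TP+FP)  #precision
  r=TP/(TP+FN)  #recall (= TPR)
  return(list(accuracy=(TP+TN)/sum(cmatrix),
              TPR=r, TNR=TN/(TN+FP),
              FPR=FP/(FP+TN), FNR=FN/(TP+FN),
              precision=p, recall=r,
              F1=2*r*p/(r+p)))
}
metrics(cmatrix)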
#Probability threshold
#HouseVotes84 naiveBayes
library(mlbench)
data(HouseVotes84)
library(e1071)
housetrain=sample(nrow(HouseVotes84),round(0.7*nrow(HouseVotes84),0))
housemodel=naiveBayes(Class ~ ., data=HouseVotes84[housetrain,])
predClass=predict(housemodel,newdata=HouseVotes84[-housetrain,])
confmatrix(HouseVotes84$Class[-housetrain],predClass)

#phat for naiveBayes
table(HouseVotes84$Class)
phat=predict(housemodel,newdata=HouseVotes84[-housetrain,],type="raw")[,2]
hist(phat)
table(predClass,(phat>=0.5))

#phat for decision tree
library(rpart)
library(rattle)
mytree=rpart(Class~.,data=HouseVotes84[housetrain,])
fancyRpartPlot(mytree)
treepredClass=predict(mytree,newdata=HouseVotes84[-housetrain,],type="class")
confmatrix(HouseVotes84$Class[-housetrain],treepredClass)
treephat=predict(mytree,newdata=HouseVotes84[-housetrain,])[,2]
table(treepredClass,(treephat>=0.5))

#phat for kknn
#kknn does not accept NAs, so recode missing votes as "abstain"
library(kknn)
modHouse=HouseVotes84
for(j in 2:17){
  levels(modHouse[,j])=c("n","y","abstain")
}
modHouse[is.na(modHouse)]="abstain"
modHouse
kknnfit=train.kknn(Class~.,data=modHouse)
kknnfit
kknnmodel=kknn(Class~.,train=modHouse[housetrain,],
               test=modHouse[-housetrain,],kernel="optimal",k=7)
kknnpredClass=kknnmodel$fitted.values
confmatrix(HouseVotes84$Class[-housetrain],kknnpredClass)
kknnphat=(kknnmodel$prob)[,2]
table(kknnpredClass,(kknnphat>=0.5))

#phat for flight data
phat=predict(model,newdata=flight[-train,],type="raw")[,1]
table(preddelay,(phat>=0.5))

#Tuning p0 with training data to optimize F1 for flight data
trainphat=predict(model,newdata=flight[train,],type="raw")[,1]
trainprec=1:100
trainrecall=1:100
trainF1=1:100
range(trainphat)
p0vect=10^(-5+(1:100)/100*5)
for(i in 1:100){
  p0=p0vect[i]
  trainpreddelay=(trainphat>=p0)*1
  TP=sum((trainpreddelay==1)&(flight$delay[train]=="delayed"))
  FP=sum((trainpreddelay==1)&(flight$delay[train]!="delayed"))
  FN=sum((trainpreddelay!=1)&(flight$delay[train]=="delayed"))
  trainprec[i]=TP/(TP+FP)
  trainrecall[i]=TP/(TP+FN)
  trainF1[i]=2*TP/(2*TP+FP+FN)
}
plot(p0vect,trainrecall,type='l',xlab="Probability Threshold",ylab="")
lines(p0vect,trainprec,col="blue")
lines(p0vect,trainF1,col='red')
plot(log(p0vect,base=10),trainrecall,type='l',xlab="log10(Probability Threshold)",ylab="")
lines(log(p0vect,base=10),trainprec,col="blue")
lines(log(p0vect,base=10),trainF1,col='red')
which.max(trainF1)
trainF1[which.max(trainF1)]
trainprec[which.max(trainF1)]
trainrecall[which.max(trainF1)]

#Validating on Test Data
p0=p0vect[which.max(trainF1)]
testphat=predict(model,newdata=flight[-train,],type="raw")[,1]
newpreddelay=testphat
newpreddelay[testphat>=p0]="delayed"
newpreddelay[testphat<p0]="ontime"
confmatrix(flight$delay[-train],newpreddelay)

#Applying the same threshold rule to kknn probabilities
#(kknnphat is assumed here to hold the estimated probability of "delayed")
newpreddelay=kknnphat
newpreddelay[kknnphat>=p0]="delayed"
newpreddelay[kknnphat<p0]="ontime"
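#To quantify the validation, test-set precision, recall, and F1 at the
#tuned threshold can be computed directly. A sketch, not part of the
#original script, using the naive Bayes testphat and p0 from above.
testpred=(testphat>=p0)*1
TP=sum((testpred==1)&(flight$delay[-train]=="delayed"))
FP=sum((testpred==1)&(flight$delay[-train]!="delayed"))
FN=sum((testpred!=1)&(flight$delay[-train]=="delayed"))
c(precision=TP/(TP+FP),
  recall=TP/(TP+FN),
  F1=2*TP/(2*TP+FP+FN))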