#################################################################### #Creating a data set for a hypothetical mathematics education study# ###################################################################################################################### clean=function(x,floor,ceiling,precision){ upper.trunc.fun=function(u){ return(min(u,ceiling)) } lower.trunc.fun=function(u){ return(max(u,floor)) } upper.truncate=sapply(x,upper.trunc.fun) lower.truncate=sapply(upper.truncate,lower.trunc.fun) result=round(lower.truncate,precision) return(result) } #Pretest variable pretest=clean(rnorm(1000,50,10), 0, 100, 0) hist(pretest) #hsrank variable hsrank=clean(rnorm(1000,60,10), 0, 100, 0) hist(hsrank) #mathsat variable mathsat=500+5*(hsrank-60)+rnorm(1000,0,50) mathsat=clean(mathsat,200,800,-1) hist(mathsat) #verbalsat variable verbalsat=500+5*(hsrank-60)+rnorm(1000,0,50) verbalsat=clean(verbalsat,200,800,-1) hist(verbalsat) #numabsence variable numabsence=rpois(1000,4) hist(numabsence) #groupwork variable group.u=runif(1000) groupwork=(group.u<0.5)*1 groupwork=as.factor(groupwork) groupwork=c("No","Yes")[groupwork] groupwork=as.factor(groupwork) table(groupwork) #clickers variable click.u=runif(1000) clickers=(click.u<0.5)*1 clickers=as.factor(clickers) clickers=c("No","Yes")[clickers] clickers=as.factor(clickers) table(clickers) #highschool variable highschool.u=runif(1000) highschool=(highschool.u>0.5)+(highschool.u>0.8)+(highschool.u>0.95) highschool=as.factor(highschool) highschool=c("Abrams","Baldwin","Campbell","Daniels")[highschool] highschool=as.factor(highschool) table(highschool) #posttest variable temp=runif(1000) temp.model=lm(temp~pretest+hsrank+mathsat+verbalsat+numabsence+groupwork+clickers+highschool) X=model.matrix(temp.model) beta=c(3,0.5,0.8,0,0,-2,5,0,8,-6,8) posttest=X%*%beta+rnorm(1000,0,7) posttest=clean(posttest,0,100,0) mathed.data=data.frame(posttest,pretest,hsrank,mathsat,verbalsat,numabsence,groupwork,clickers,highschool) remove(posttest,pretest,hsrank,mathsat,verbalsat,numabsence,groupwork,clickers,highschool) remove(beta,clean,click.u,group.u,highschool.u,temp,temp.model,X) ##################################################################################################################### ########################### #Exploratory Data Analysis# ##################################################################################################################### #What's in our workspace? ls() head(mathed.data) hist(posttest) #Attaching the data attach(mathed.data) hist(posttest) mean(posttest) sort(posttest) #Quick exploratory data analysis for each variable #pretest hist(pretest) sort(pretest) plot(pretest,posttest) cor(pretest,posttest) summary(lm(posttest~pretest)) #hsrank hist(hsrank) sort(hsrank) plot(hsrank,posttest) cor(hsrank,posttest) summary(lm(posttest~hsrank)) #mathsat hist(mathsat) sort(mathsat) plot(mathsat,posttest) cor(mathsat,posttest) summary(lm(posttest~mathsat)) #verbalsat hist(verbalsat) sort(verbalsat) plot(verbalsat,posttest) cor(verbalsat,posttest) summary(lm(posttest~verbalsat)) #numabsence hist(numabsence) sort(numabsence) plot(numabsence,posttest) cor(numabsence,posttest) summary(lm(posttest~numabsence)) #groupwork table(groupwork) plot(groupwork,posttest) summary(lm(posttest~groupwork)) #clickers table(clickers) plot(clickers,posttest) summary(lm(posttest~clickers)) #highschool table(highschool) plot(highschool,posttest) summary(lm(posttest~highschool)) ####################################################################################################################### ########################### #Multiple Regression Model# ##################################################################################################################### model=lm(posttest~pretest+hsrank+mathsat+verbalsat+numabsence+groupwork+clickers+highschool,data=mathed.data) model=lm(posttest~.,data=mathed.data) #Same thing. Writing "~." means that all explanatory variables are included. summary(model) X=model.matrix(model) head(X) #f-test function f.test=function(model0,model){ e02=sum((model0$residuals)^2) #Residual sum of squares for small model. e2=sum((model$residuals)^2) #Residual sum of squares for big model. df0=model0$df.residual #Residual degrees of freedom for small model. df=model$df.residual #Residual degrees of freedom for big model. f.stat=((e02-e2)/(df0-df))/(e2/df) p.value=pf(f.stat, df1=df0-df, df2=df, lower.tail=FALSE) return(list(f.stat=f.stat,p.value=p.value)) } #Testing the null hypothesis that highschool is statistically insignificant model0=lm(posttest~pretest+hsrank+mathsat+verbalsat+numabsence+groupwork+clickers,data=mathed.data) f.test(model0,model) #Testing the null hypothesis that none of the variables are statistically significant. model0=lm(posttest~1,data=mathed.data) f.test(model0,model) summary(model) #Same as the f-statistic reported here. #Testing the null hypothesis that mathsat, verbalsat, and clickers are statistically insignificant model0=lm(posttest~pretest+hsrank+numabsence+groupwork+highschool,data=mathed.data) f.test(model0,model) #Testing the null hypothesis that the effect size for the Baldwin and Daniels highschools are the same. new.highschool=c("Alexander","Baldwin/Daniels","Campbell","Baldwin/Daniels")[highschool] table(highschool) table(new.highschool) mathed.data=cbind(mathed.data,new.highscool) head(mathed.data) model0=lm(posttest~pretest+hsrank+mathsat+verbalsat+numabsence+groupwork+clickers+new.highschool,data=mathed.data) f.test(model0,model)