####################################################################
#Creating a data set for a hypothetical mathematics education study#
######################################################################################################################


clean=function(x,floor,ceiling,precision){
  upper.trunc.fun=function(u){
    return(min(u,ceiling))
  }
  lower.trunc.fun=function(u){
    return(max(u,floor))
  }
  upper.truncate=sapply(x,upper.trunc.fun)
  lower.truncate=sapply(upper.truncate,lower.trunc.fun)
  result=round(lower.truncate,precision)
  return(result)
}



#Pretest variable
pretest=clean(rnorm(1000,50,10),
              0,
              100,
              0)

hist(pretest)



#hsrank variable
hsrank=clean(rnorm(1000,60,10),
             0,
             100,
             0)

hist(hsrank)




#mathsat variable

mathsat=500+5*(hsrank-60)+rnorm(1000,0,50)
mathsat=clean(mathsat,200,800,-1)

hist(mathsat)




#verbalsat variable

verbalsat=500+5*(hsrank-60)+rnorm(1000,0,50)
verbalsat=clean(verbalsat,200,800,-1)

hist(verbalsat)



#numabsence variable

numabsence=rpois(1000,4)
hist(numabsence)



#groupwork variable
group.u=runif(1000)

groupwork=(group.u<0.5)*1
groupwork=as.factor(groupwork)

groupwork=c("No","Yes")[groupwork]
groupwork=as.factor(groupwork)

table(groupwork)



#clickers variable
click.u=runif(1000)

clickers=(click.u<0.5)*1
clickers=as.factor(clickers)

clickers=c("No","Yes")[clickers]
clickers=as.factor(clickers)

table(clickers)



#highschool variable

highschool.u=runif(1000)

highschool=(highschool.u>0.5)+(highschool.u>0.8)+(highschool.u>0.95)
highschool=as.factor(highschool)

highschool=c("Abrams","Baldwin","Campbell","Daniels")[highschool]
highschool=as.factor(highschool)

table(highschool)



#posttest variable
temp=runif(1000)

temp.model=lm(temp~pretest+hsrank+mathsat+verbalsat+numabsence+groupwork+clickers+highschool)
X=model.matrix(temp.model)


beta=c(3,0.5,0.8,0,0,-2,5,0,8,-6,8)

posttest=X%*%beta+rnorm(1000,0,7)

posttest=clean(posttest,0,100,0)


mathed.data=data.frame(posttest,pretest,hsrank,mathsat,verbalsat,numabsence,groupwork,clickers,highschool)
remove(posttest,pretest,hsrank,mathsat,verbalsat,numabsence,groupwork,clickers,highschool)
remove(beta,clean,click.u,group.u,highschool.u,temp,temp.model,X)
#####################################################################################################################







###########################
#Exploratory Data Analysis#
#####################################################################################################################


#What's in our workspace?
ls()
head(mathed.data)

hist(posttest)


#Attaching the data
attach(mathed.data)

hist(posttest)
mean(posttest)
sort(posttest)




#Quick exploratory data analysis for each variable


#pretest
hist(pretest)
sort(pretest)

plot(pretest,posttest)
cor(pretest,posttest)

summary(lm(posttest~pretest))



#hsrank
hist(hsrank)
sort(hsrank)

plot(hsrank,posttest)
cor(hsrank,posttest)

summary(lm(posttest~hsrank))



#mathsat
hist(mathsat)
sort(mathsat)

plot(mathsat,posttest)
cor(mathsat,posttest)

summary(lm(posttest~mathsat))



#verbalsat
hist(verbalsat)
sort(verbalsat)

plot(verbalsat,posttest)
cor(verbalsat,posttest)

summary(lm(posttest~verbalsat))


#numabsence
hist(numabsence)
sort(numabsence)

plot(numabsence,posttest)
cor(numabsence,posttest)

summary(lm(posttest~numabsence))



#groupwork
table(groupwork)

plot(groupwork,posttest)

summary(lm(posttest~groupwork))



#clickers
table(clickers)

plot(clickers,posttest)

summary(lm(posttest~clickers))



#highschool
table(highschool)

plot(highschool,posttest)

summary(lm(posttest~highschool))
#######################################################################################################################




###########################
#Multiple Regression Model#
#####################################################################################################################



model=lm(posttest~pretest+hsrank+mathsat+verbalsat+numabsence+groupwork+clickers+highschool,data=mathed.data)
model=lm(posttest~.,data=mathed.data)   #Same thing.  Writing "~." means that all explanatory variables are included.

summary(model)


X=model.matrix(model)
head(X)




#f-test function
f.test=function(model0,model){
  e02=sum((model0$residuals)^2)   #Residual sum of squares for small model.
  e2=sum((model$residuals)^2)     #Residual sum of squares for big model.
  df0=model0$df.residual          #Residual degrees of freedom for small model.
  df=model$df.residual            #Residual degrees of freedom for big model.
  
  f.stat=((e02-e2)/(df0-df))/(e2/df)
  
  p.value=pf(f.stat,
             df1=df0-df,
             df2=df,
             lower.tail=FALSE)
  
  return(list(f.stat=f.stat,p.value=p.value))
}




#Testing the null hypothesis that highschool is statistically insignificant

model0=lm(posttest~pretest+hsrank+mathsat+verbalsat+numabsence+groupwork+clickers,data=mathed.data)

f.test(model0,model)



#Testing the null hypothesis that none of the variables are statistically significant.

model0=lm(posttest~1,data=mathed.data)

f.test(model0,model)

summary(model)   #Same as the f-statistic reported here.



#Testing the null hypothesis that mathsat, verbalsat, and clickers are statistically insignificant

model0=lm(posttest~pretest+hsrank+numabsence+groupwork+highschool,data=mathed.data)

f.test(model0,model)



#Testing the null hypothesis that the effect size for the Baldwin and Daniels highschools are the same.


new.highschool=c("Alexander","Baldwin/Daniels","Campbell","Baldwin/Daniels")[highschool]

table(highschool)
table(new.highschool)


mathed.data=cbind(mathed.data,new.highscool)
head(mathed.data)

model0=lm(posttest~pretest+hsrank+mathsat+verbalsat+numabsence+groupwork+clickers+new.highschool,data=mathed.data)

f.test(model0,model)