#Importing Data for Wells, CAFOs, and Gradient Map
Wells <- read.csv("http://faculty.tarleton.edu/crawford/documents/Research/Wells.txt")
cafos <- read.csv("http://faculty.tarleton.edu/crawford/documents/Research/CAFOs.txt")
gradientmap <- read.table("http://faculty.tarleton.edu/crawford/documents/Research/gradientmap.txt", 
                          quote="\"")


#############################
#GEODESIC DISTANCE FUNCTIONS#
#############################

# Calculates the geodesic distance between two points specified by radian latitude/longitude using the
# Haversine formula (hf)
gcd.hf <- function(long1, lat1, long2, lat2) {
  R <- 6371 # Earth mean radius [km]
  delta.long <- (long2 - long1)
  delta.lat <- (lat2 - lat1)
  a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2
  c <- 2 * asin(min(1,sqrt(a)))
  d = R * c
  return(d) # Distance in km
}


#Uses the haversine formula to calculate GCD, but assumes lat/long are in degrees.  Returns distance in km.
gcd.hfd=function(long1, lat1, long2, lat2){
  return(gcd.hf(long1*pi/180, lat1*pi/180, long2*pi/180, lat2*pi/180))
}


####################
#FLOWPATH FUNCTIONS#
####################

#createflowpaths accepts gradientmap, a matrix obtained from GIS, and related info, such as ncols, nrows, etc.
#It also accepts vectors of lat/longs for CAFO's, and a maximum possible length for a flow path.
#It returns the flowpatharray, which contains information about the flowpaths, and flowpathlengths, a vector
#containing the length of each flowpath.


createflowpaths=function(gradientmap,ncols,nrows,xllcorner,yllcorner,cellsize,nodatavalue,
                         cafolong,cafolat,maxflowpathlength){
  xurcorner=xllcorner+(ncols-1)*cellsize
  yurcorner=yllcorner+(nrows-1)*cellsize   #Calculates long/lat for the upper right corner.
  
  deltax=gcd.hfd(xllcorner,yllcorner,xurcorner,yllcorner)/(ncols-1)  #cellwidth in km
  deltay=gcd.hfd(xllcorner,yllcorner,xllcorner,yurcorner)/(nrows-1)  #cellheight in km
  
  theta=atan(deltay/deltax)*180/pi   #Important angles for dealing with gradient in degrees.
  phi=90-theta
  
  numcafos=length(cafolat)  #number of cafos
  flowpathlengths=rep(maxflowpathlength,numcafos)   #Vector that stores the length of each flowpath.
  
  flowpatharray=array(dim=c(maxflowpathlength,numcafos,5))  #5 corresponds to (rowindex,colindex,long,lat,arclength)
  
  
  #This loop initializes the top "row" of flowpath array.
  for(j in 1:numcafos){
    long=cafolong[j]
    lat=cafolat[j]
    rowindex=1+round((yurcorner-lat)/cellsize)
    colindex=1+round((long-xllcorner)/cellsize)
    flowpatharray[1,j,]=c(rowindex,colindex,long,lat,0)  #last component is arclength
  }
  for(j in 1:numcafos){
    print(j)
    for(i in 1:(maxflowpathlength-1)){
      rowindex=flowpatharray[i,j,1]
      colindex=flowpatharray[i,j,2]
      long=flowpatharray[i,j,3]
      lat=flowpatharray[i,j,4]
      arclength=flowpatharray[i,j,5]
      gradient=gradientmap[rowindex,colindex]
      if(gradient==nodatavalue){
        flowpathlengths[j]=i
        break      
      }
      if((0<=gradient)&(gradient<=phi/2)){shift=c(-1,0)}
      if((phi/2<gradient)&(gradient<=phi+theta/2)){shift=c(-1,1)}
      if((phi+theta/2<gradient)&(gradient<=phi+3/2*theta)){shift=c(0,1)}
      if((phi+3/2*theta<gradient)&(gradient<=3/2*phi+2*theta)){shift=c(1,1)}
      if((3/2*phi+2*theta<gradient)&(gradient<=5/2*phi+2*theta)){shift=c(1,0)}
      if((5/2*phi+2*theta<gradient)&(gradient<=3*phi+5/2*theta)){shift=c(1,-1)}
      if((3*phi+5/2*theta<gradient)&(gradient<=3*phi+7/2*theta)){shift=c(0,-1)}
      if((3*phi+7/2*theta<gradient)&(gradient<=7/2*phi+4*theta)){shift=c(-1,-1)}
      if((7/2*phi+4*theta<gradient)&(gradient<=360)){shift=c(-1,0)}
      newrowindex=rowindex+shift[1]
      newcolindex=colindex+shift[2]
      if((newrowindex<1)|(newcolindex<1)|(newrowindex>nrows)|(newcolindex>ncols)){
        flowpathlengths[j]=i
        break
      }
      newlong=long+shift[2]*cellsize
      newlat=lat-shift[1]*cellsize
      newarclength=arclength+sqrt((shift[2]*deltax)^2+(shift[1]*deltay)^2)
      flowpatharray[i+1,j,]=c(newrowindex,newcolindex,newlong,newlat,newarclength)
    }
  }
  return(list(flowpatharray=flowpatharray,flowpathlengths=flowpathlengths))
}


#Epanechnikov Kernel

K=function(u){
  return((3/4*(1-u^2))*(abs(u)<=1))
}

uindex=seq(from=0,to=1.2*100,by=.001)
plot(uindex,sapply(uindex/100,K),type="l",ylim=c(0,1),xlab="d",ylab="K(d)")


#flowpathdistance accepts m-vectors of well long/lat and a flowpath array for n CAFOs.
#It also accepts a vector of flowpath lengths.
#It returns an mxnx2 array, where the (i,j,1) entry is the minimum distance
#from the ith well to the jth cafo's flowpath, and the (i,j,2) entry is the arclength at that point.


flowpathdistance=function(wlong,wlat,flowpatharray,flowpathlengths){
  m=length(wlong)
  n=dim(flowpatharray)[2]
  distancearray=array(dim=c(m,n,2))
  
  for(i in 1:m){
    print(i)
    for(j in 1:n){
      D=flowpathlengths[j]    #D is the length of the jth flowpath
      distances=rep(0,D)
      
      for(k in 1:D){
        distances[k]=gcd.hfd(wlong[i],wlat[i],flowpatharray[k,j,3],flowpatharray[k,j,4])
      }
      distancearray[i,j,1]=min(distances)
      kmin=which.min(distances)
      distancearray[i,j,2]=flowpatharray[kmin,j,5]
    }
  }
  return(distancearray)
  
}


#buildcafofeature accepts the distancearray returned by flowpathdistance and returns the 
#cafo migration score (CMS) vector.
buildcafofeature=function(distancearray,pathradius,tonsofsolids,exponent){
  return((sapply(distancearray[,,1]/pathradius,K)/(1+distancearray[,,2]^exponent))%*%tonsofsolids)
}


###############################
#LOGISTIC REGRESSION FUNCTIONS#
###############################


################################
#APPLYING FUNCTIONS TO OUR DATA#
################################


#Creating CAFO flowpaths
ncols=466
nrows=348
xllcorner = -100.15047166
yllcorner = 30.43341834
cellsize = 0.00872332
nodatavalue=-9999


L=createflowpaths(gradientmap,ncols,nrows,xllcorner,yllcorner,cellsize,nodatavalue,
                  cafos$longitude,cafos$Lattitude,50)


#Creating distance array
distancearray=flowpathdistance(Wells$long_dec,Wells$lat_dec,L$flowpatharray,L$flowpathlengths)


#Creating CAFO Migration Score (CMS)

exponent=2
tonsofsolids=cafos$Tons_Of_Solids
pathradius=100

CMS=buildcafofeature(distancearray,pathradius,tonsofsolids,exponent)


###########################
#LOGISTIC REGRESSION MODEL#
###########################

#Calculating the amount of nitrate as nitrogen
nitrate=Wells$NO3_Avg*14.007/(14.007+3*15.999)

#Making nitrate a dichotomous variable (only two values)
dnitrate=(nitrate>=3)

#Logistic regression model
model=glm(dnitrate~CMS,family='binomial')
summary(model)

#ROC Curve
pihat=predict(model,type="response")   #Predicted probabilities of nitrate contamination for each well
library(pROC)
plot(roc(response=dnitrate,predictor=pihat))


########################################
#LOGISTIC REGRESSION PLOTTING FUNCTIONS#
########################################

LRtest=function(model0,model){
  1-pchisq(model0$deviance-model$deviance,
           df=model0$df.residual-model$df.residual)}


logit=function(p){
  log(p/(1-p))}


###Given a vector of numerical values x, a response vector Y, and a 
###number of groups k, groupplot returns a list.
###x=mean of x groups
###Pi=mean of Y values in each group (with Wilson's adjustment)
###g=logit of Pi

groupplot=function(x,Y,k){
  sortframe=sort(x,index=TRUE)
  x=sortframe$x
  Y=Y[sortframe$ix]
  xmeans=1:k
  Pi=1:k
  s=floor(length(x)/k)  #groupsize
  
  for(i in 1:k){
    index=(((i-1)*s+1):(i*s))
    xmeans[i]=mean(x[index])
    Pi[i]=(sum(Y[index])+2)/(s+4)}
  
  g=logit(Pi)
  return(list(x=xmeans,g=g,Pi=Pi))}


quantlogitplot=function(Y,x,degree,xname,Yname,numgroups,yrange){
  xrange=range(x)
  n=length(Y)
  X=rep(1,n)
  for(i in 1:degree){
    X=cbind(X,x^i)
  }
  model=glm(Y~X-1,family='binomial')
  L=groupplot(x,Y,numgroups)
  plot(L$x,L$Pi,xlab=xname,ylab=Yname,ylim=yrange)
  a=xrange[1]
  b=xrange[2]
  index=(1:100)/100
  index=a+(b-a)*index
  Index=rep(1,100)
  for(i in 1:degree){
    Index=cbind(Index,index^i)
  }
  betahat=coef(model)
  lines(index,1/(1+exp(-Index%*%betahat)),col='red')
  
}

#############################################################
#PLOTTING THE LOGISTIC REGRESSION MODEL FOR DNITRATE AND CMS#
#############################################################

quantlogitplot(dnitrate,
               CMS,1,
               "CAFO Migration Score",
               "Probability that NO3 Concentration Exceeds 3 mg/L",10,c(0,1))