#Importing Data for Wells, CAFOs, and Gradient Map Wells <- read.csv("http://faculty.tarleton.edu/crawford/documents/Research/Wells.txt") cafos <- read.csv("http://faculty.tarleton.edu/crawford/documents/Research/CAFOs.txt") gradientmap <- read.table("http://faculty.tarleton.edu/crawford/documents/Research/gradientmap.txt", quote="\"") ############################# #GEODESIC DISTANCE FUNCTIONS# ############################# # Calculates the geodesic distance between two points specified by radian latitude/longitude using the # Haversine formula (hf) gcd.hf <- function(long1, lat1, long2, lat2) { R <- 6371 # Earth mean radius [km] delta.long <- (long2 - long1) delta.lat <- (lat2 - lat1) a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2 c <- 2 * asin(min(1,sqrt(a))) d = R * c return(d) # Distance in km } #Uses the haversine formula to calculate GCD, but assumes lat/long are in degrees. Returns distance in km. gcd.hfd=function(long1, lat1, long2, lat2){ return(gcd.hf(long1*pi/180, lat1*pi/180, long2*pi/180, lat2*pi/180)) } #################### #FLOWPATH FUNCTIONS# #################### #createflowpaths accepts gradientmap, a matrix obtained from GIS, and related info, such as ncols, nrows, etc. #It also accepts vectors of lat/longs for CAFO's, and a maximum possible length for a flow path. #It returns the flowpatharray, which contains information about the flowpaths, and flowpathlengths, a vector #containing the length of each flowpath. createflowpaths=function(gradientmap,ncols,nrows,xllcorner,yllcorner,cellsize,nodatavalue, cafolong,cafolat,maxflowpathlength){ xurcorner=xllcorner+(ncols-1)*cellsize yurcorner=yllcorner+(nrows-1)*cellsize #Calculates long/lat for the upper right corner. deltax=gcd.hfd(xllcorner,yllcorner,xurcorner,yllcorner)/(ncols-1) #cellwidth in km deltay=gcd.hfd(xllcorner,yllcorner,xllcorner,yurcorner)/(nrows-1) #cellheight in km theta=atan(deltay/deltax)*180/pi #Important angles for dealing with gradient in degrees. phi=90-theta numcafos=length(cafolat) #number of cafos flowpathlengths=rep(maxflowpathlength,numcafos) #Vector that stores the length of each flowpath. flowpatharray=array(dim=c(maxflowpathlength,numcafos,5)) #5 corresponds to (rowindex,colindex,long,lat,arclength) #This loop initializes the top "row" of flowpath array. for(j in 1:numcafos){ long=cafolong[j] lat=cafolat[j] rowindex=1+round((yurcorner-lat)/cellsize) colindex=1+round((long-xllcorner)/cellsize) flowpatharray[1,j,]=c(rowindex,colindex,long,lat,0) #last component is arclength } for(j in 1:numcafos){ print(j) for(i in 1:(maxflowpathlength-1)){ rowindex=flowpatharray[i,j,1] colindex=flowpatharray[i,j,2] long=flowpatharray[i,j,3] lat=flowpatharray[i,j,4] arclength=flowpatharray[i,j,5] gradient=gradientmap[rowindex,colindex] if(gradient==nodatavalue){ flowpathlengths[j]=i break } if((0<=gradient)&(gradient<=phi/2)){shift=c(-1,0)} if((phi/2nrows)|(newcolindex>ncols)){ flowpathlengths[j]=i break } newlong=long+shift[2]*cellsize newlat=lat-shift[1]*cellsize newarclength=arclength+sqrt((shift[2]*deltax)^2+(shift[1]*deltay)^2) flowpatharray[i+1,j,]=c(newrowindex,newcolindex,newlong,newlat,newarclength) } } return(list(flowpatharray=flowpatharray,flowpathlengths=flowpathlengths)) } #Epanechnikov Kernel K=function(u){ return((3/4*(1-u^2))*(abs(u)<=1)) } uindex=seq(from=0,to=1.2*100,by=.001) plot(uindex,sapply(uindex/100,K),type="l",ylim=c(0,1),xlab="d",ylab="K(d)") #flowpathdistance accepts m-vectors of well long/lat and a flowpath array for n CAFOs. #It also accepts a vector of flowpath lengths. #It returns an mxnx2 array, where the (i,j,1) entry is the minimum distance #from the ith well to the jth cafo's flowpath, and the (i,j,2) entry is the arclength at that point. flowpathdistance=function(wlong,wlat,flowpatharray,flowpathlengths){ m=length(wlong) n=dim(flowpatharray)[2] distancearray=array(dim=c(m,n,2)) for(i in 1:m){ print(i) for(j in 1:n){ D=flowpathlengths[j] #D is the length of the jth flowpath distances=rep(0,D) for(k in 1:D){ distances[k]=gcd.hfd(wlong[i],wlat[i],flowpatharray[k,j,3],flowpatharray[k,j,4]) } distancearray[i,j,1]=min(distances) kmin=which.min(distances) distancearray[i,j,2]=flowpatharray[kmin,j,5] } } return(distancearray) } #buildcafofeature accepts the distancearray returned by flowpathdistance and returns the #cafo migration score (CMS) vector. buildcafofeature=function(distancearray,pathradius,tonsofsolids,exponent){ return((sapply(distancearray[,,1]/pathradius,K)/(1+distancearray[,,2]^exponent))%*%tonsofsolids) } ############################### #LOGISTIC REGRESSION FUNCTIONS# ############################### ################################ #APPLYING FUNCTIONS TO OUR DATA# ################################ #Creating CAFO flowpaths ncols=466 nrows=348 xllcorner = -100.15047166 yllcorner = 30.43341834 cellsize = 0.00872332 nodatavalue=-9999 L=createflowpaths(gradientmap,ncols,nrows,xllcorner,yllcorner,cellsize,nodatavalue, cafos$longitude,cafos$Lattitude,50) #Creating distance array distancearray=flowpathdistance(Wells$long_dec,Wells$lat_dec,L$flowpatharray,L$flowpathlengths) #Creating CAFO Migration Score (CMS) exponent=2 tonsofsolids=cafos$Tons_Of_Solids pathradius=100 CMS=buildcafofeature(distancearray,pathradius,tonsofsolids,exponent) ########################### #LOGISTIC REGRESSION MODEL# ########################### #Calculating the amount of nitrate as nitrogen nitrate=Wells$NO3_Avg*14.007/(14.007+3*15.999) #Making nitrate a dichotomous variable (only two values) dnitrate=(nitrate>=3) #Logistic regression model model=glm(dnitrate~CMS,family='binomial') summary(model) #ROC Curve pihat=predict(model,type="response") #Predicted probabilities of nitrate contamination for each well library(pROC) plot(roc(response=dnitrate,predictor=pihat)) ######################################## #LOGISTIC REGRESSION PLOTTING FUNCTIONS# ######################################## LRtest=function(model0,model){ 1-pchisq(model0$deviance-model$deviance, df=model0$df.residual-model$df.residual)} logit=function(p){ log(p/(1-p))} ###Given a vector of numerical values x, a response vector Y, and a ###number of groups k, groupplot returns a list. ###x=mean of x groups ###Pi=mean of Y values in each group (with Wilson's adjustment) ###g=logit of Pi groupplot=function(x,Y,k){ sortframe=sort(x,index=TRUE) x=sortframe$x Y=Y[sortframe$ix] xmeans=1:k Pi=1:k s=floor(length(x)/k) #groupsize for(i in 1:k){ index=(((i-1)*s+1):(i*s)) xmeans[i]=mean(x[index]) Pi[i]=(sum(Y[index])+2)/(s+4)} g=logit(Pi) return(list(x=xmeans,g=g,Pi=Pi))} quantlogitplot=function(Y,x,degree,xname,Yname,numgroups,yrange){ xrange=range(x) n=length(Y) X=rep(1,n) for(i in 1:degree){ X=cbind(X,x^i) } model=glm(Y~X-1,family='binomial') L=groupplot(x,Y,numgroups) plot(L$x,L$Pi,xlab=xname,ylab=Yname,ylim=yrange) a=xrange[1] b=xrange[2] index=(1:100)/100 index=a+(b-a)*index Index=rep(1,100) for(i in 1:degree){ Index=cbind(Index,index^i) } betahat=coef(model) lines(index,1/(1+exp(-Index%*%betahat)),col='red') } ############################################################# #PLOTTING THE LOGISTIC REGRESSION MODEL FOR DNITRATE AND CMS# ############################################################# quantlogitplot(dnitrate, CMS,1, "CAFO Migration Score", "Probability that NO3 Concentration Exceeds 3 mg/L",10,c(0,1))