The workflow when using your own programs for simulations

An example can be found also in the help file of the global_envelope_test function.

Read in example data

Download Boreality.txt from https://github.com/jebyrnes/spatial_correction_lavaan/blob/master/Boreality.txt

datafile <- "Boreality.txt"
Boreality <- read.table(datafile, header = TRUE, dec = ".")
head(Boreality)
##   point       x       y Oxalis   boreal nBor nTot       Grn     NDVI     T61
## 1     1 2109.70 2093.52      0 15.38462    2   13 0.0597027 0.480180 296.367
## 2     2 2190.18 2105.71      1 19.04762    4   21 0.0514881 0.483990 296.367
## 3     3 2064.48 2052.77      1 20.00000    6   30 0.0509510 0.489213 296.367
## 4     4 2277.34 2103.42      0 15.38462    2   13 0.0521183 0.473226 296.367
## 5     5 2347.91 2074.81      0  0.00000    0   13 0.0422267 0.405898 296.785
## 6     6 2437.21 2086.95      0 16.66667    1    6 0.0417779 0.424769 296.367
##          Wet
## 1 -0.0264378
## 2 -0.0234048
## 3 -0.0189264
## 4 -0.0280431
## 5 -0.0292287
## 6 -0.0229209

Let us inspect the proportion of species belonging to Boreal Coenosis (call this boreality). Check the data for normality and transform it for approximative normality:

par(mfrow=c(1,2))
hist(Boreality$nBor, main="nBor") # Number of species belonging to Boreal Coenosis
hist(Boreality$nTot, main="nTot") # Total number of species

qqnorm((Boreality$nBor+1)/Boreality$nTot, main="Q-Q Plot for (nBor+1)/nTot")
qqline((Boreality$nBor+1)/Boreality$nTot)
Boreality$Bor <- sqrt(1000*(Boreality$nBor+1)/Boreality$nTot)
qqnorm(Boreality$Bor, main="Q-Q Plot for a sqrt transform")
qqline(Boreality$Bor)

hist(Boreality$Bor)

ggplot(data=Boreality, aes(x=x, y=y, col=Bor)) + geom_point() + coord_fixed()

After the square root transformation, the data look rather normal. Let us then fit first a linear model \(Boreality \sim a+ b*wetness\).

M0.lm <- lm(Bor ~ Wet, data=Boreality)
summary(M0.lm)
## 
## Call:
## lm(formula = Bor ~ Wet, data = Boreality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0918 -2.4214 -0.1382  1.9783 17.6054 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.4880     0.3787   48.82   <2e-16 ***
## Wet         165.8036    10.5991   15.64   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.491 on 531 degrees of freedom
## Multiple R-squared:  0.3155, Adjusted R-squared:  0.3142 
## F-statistic: 244.7 on 1 and 531 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(M0.lm)

The residuals look fine. But, what about the spatial effect? The following plot of residuals in space shows that there is clear spatial inhomogeneity in the residuals, meaning that either a covariate is missing or there is a spatial correlation.

E <- rstandard(M0.lm) # Standardized residuals
mydata <- data.frame(E=E, x=Boreality$x, y=Boreality$y)
coordinates(mydata) = ~x+y
bubble(mydata, main="Standardized residuals")

Let us estimate the variogram to describe the spatial dependence of the residuals.

Let us use the gls for fitting the linear model (we will fit another model after this). The variogram can be computed directly from the gls model (from its residuals) using the nlme package. The argument resType = “pearson” specifies that we want standardized residuals.

f1 <- formula(Bor ~ Wet)
M1 <- gls(f1, data=Boreality)
Vario.gls <- Variogram(M1, form = ~ x+y, robust = TRUE, maxDist = 2000, resType = "pearson")
plot(Vario.gls, smooth = TRUE)

We see that there is some trend in the variogram, but without knowing variability of the variogram under independence of the residuals, we are in general not sure if patterns in the variogram could be due to random variability or not. So, let us compute the 95% global envelope for the independent residuals. A way to sample the residuals under independence is to permute the model residuals. A permutation can be done by the function from R. The variogram model must now be specified for the residuals. The function requires the coordinates for the computations, therefore we create a data frame with the residuals and coordinates.

residdata <- data.frame(E=residuals(M1, "pearson"), x=Boreality$x, y=Boreality$y)
coordinates(residdata) = ~x+y

# Calculate variogram for the data (the same as above with another specification)
Mresid <- gls(E ~ 1, data=residdata) # Model for residuals
Vario_obs <- Variogram(Mresid, form = ~ x+y, robust = TRUE, maxDist = 2000, resType = "pearson")
plot(Vario_obs)

# Vario_obs$dist
# Note: One must check that the distances where variograms are evaluated
# for data and simulations are the same. Here it is the case.
# Otherwise argument Distance could be used.

# Do permutations of residuals and calculate variogram for each simulation
Vario_sim <- sapply(X=1:999, FUN = function(i) {
  residdata$E <- residdata$E[sample(1:nrow(residdata), size=nrow(residdata), replace=FALSE)] # Permuted residuals
  Mresid <- gls(E ~ 1, data=residdata)
  Variogram(Mresid, form = ~ x+y, robust = TRUE, maxDist = 2000, resType = "pearson")$variog
})
dim(Vario_sim)
## [1]  20 999
# Create a curve set of all the variograms
cset <- create_curve_set(list(r=Vario_obs$dist, obs=Vario_obs$variog, sim_m=Vario_sim))
plot(cset)

# Calculate the global envelope
Vario_with_envelope <- global_envelope_test(cset)
plot(Vario_with_envelope, xlab="Distance", ylab="Semivariogram")

If we saw a horizontal variogram with respect to distance (on the x-axis) or if the empirical variogram was within the global envelope, then there would be spatial independence. This is not the case here. Next we add spatial correlation to the model. The best spatial correlation model from the ones that we tested was the exponential correlation model (M2), according to the AIC or BIC criteria. We choose it. This model can be fitted to the data as follows:

M2 <- gls(f1, correlation = corExp(form = ~x+y, nugget = TRUE), data = Boreality)
summary(M2)
## Generalized least squares fit by REML
##   Model: f1 
##   Data: Boreality 
##        AIC      BIC    logLik
##   2732.224 2753.597 -1361.112
## 
## Correlation Structure: Exponential spatial correlation
##  Formula: ~x + y 
##  Parameter estimate(s):
##       range      nugget 
## 481.1721128   0.4849349 
## 
## Coefficients:
##                Value Std.Error   t-value p-value
## (Intercept) 15.02343  0.825323 18.203089       0
## Wet         75.43165 13.541701  5.570323       0
## 
##  Correlation: 
##     (Intr)
## Wet 0.571 
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.1902927 -0.5385566  0.1088981  0.7354905  5.0260467 
## 
## Residual standard error: 3.707339 
## Degrees of freedom: 533 total; 531 residual

Have we removed the spatial autocorrelation from the residuals?

VarioE <- Variogram(M2, form = ~ x+y, robust = TRUE, maxDist = 2000, resType = "pearson")
plot(VarioE, smooth = TRUE)

# Normalized residues are now horizontal, so we removed the spatial autocorrelation from the model
VarioEN <- Variogram(M2, form = ~ x+y, robust = TRUE, maxDist = 2000, resType = "normalized")
plot(VarioEN, smooth = TRUE)

# Let us look at the envelopes
residdata <- data.frame(E=residuals(M2, "normalized"), x=Boreality$x, y=Boreality$y)
coordinates(residdata) = ~x+y

# Calculate variogram for the data
Mresid <- gls(E ~ 1, data=residdata) # Model for residuals
Vario_obs <- Variogram(Mresid, form = ~ x+y, robust = TRUE, maxDist = 2000, resType = "normalized")
plot(Vario_obs)

# Do permutations of residuals and calculate variogram for each simulation
Vario_sim <- sapply(X=1:999, FUN = function(i) {
  residdata$E <- residdata$E[sample(1:nrow(residdata), size=nrow(residdata), replace=FALSE)]
  Mresid <- gls(E ~ 1, data=residdata)
  Variogram(Mresid, form = ~ x+y, robust = TRUE, maxDist = 2000, resType = "normalized")$variog
})

# Create a curve set of all the variograms
cset2 <- create_curve_set(list(r=Vario_obs$dist, obs=Vario_obs$variog, sim_m=Vario_sim))

# Calculate the global envelope
Vario2_with_envelope <- global_envelope_test(cset2)
plot(Vario2_with_envelope, xlab="Distance", ylab="Semivariogram")