Preparations

Required libraries

library(rgdal)
library(randomForest)
library(ranger)
library(ggplot2)
library(Metrics)
library(sp)
library(raster)
library(Boruta)
library(caret)
library(tictoc)
library(quantregForest)
library(snow)
library(forcats)
library(gridExtra)
library(ggpubr)
library(RColorBrewer)
library(RStoolbox)

Define some variables for later use

seed <- 42
setwd("N:/Prosjekter/311700_MAREANO/311720_Havbunnskart/c_Metodeprosjekter/xxx_Karbonlagring/sedrate.model")
dir("data")
##  [1] "land.dbf"                                 
##  [2] "land.prj"                                 
##  [3] "land.shp"                                 
##  [4] "land.shx"                                 
##  [5] "predictors"                               
##  [6] "schema.ini"                               
##  [7] "sedrate.cpg"                              
##  [8] "sedrate.dbf"                              
##  [9] "sedrate.prj"                              
## [10] "sedrate.sbn"                              
## [11] "sedrate.sbx"                              
## [12] "sedrate.shp"                              
## [13] "sedrate.shp.MAR-PROS02.6196.16084.sr.lock"
## [14] "sedrate.shp.xml"                          
## [15] "sedrate.shx"                              
## [16] "sedrate.txt"                              
## [17] "sedrate.txt.xml"

Load and prepare the data

Read shapefile with response variable

samples <- readOGR(dsn = "./data", layer = "sedrate")
## OGR data source with driver: ESRI Shapefile 
## Source: "N:\Prosjekter\311700_MAREANO\311720_Havbunnskart\c_Metodeprosjekter\xxx_Karbonlagring\sedrate.model\data", layer: "sedrate"
## with 219 features
## It has 14 fields
str(samples@data)
## 'data.frame':    219 obs. of  14 variables:
##  $ StnNo    : chr  NA NA NA "95-12" ...
##  $ lat      : chr  NA NA NA "60" ...
##  $ latmin   : chr  NA NA NA "29.97" ...
##  $ lon      : chr  NA NA NA "0" ...
##  $ lonmin   : chr  NA NA NA "0" ...
##  $ Latitude : num  60.7 60.7 60.7 60.5 60.3 ...
##  $ Longitude: num  4.5 4.17 3.35 0 3.37 ...
##  $ SedRate  : num  0.03 0.28 0.06 0 0.14 0.06 0.14 0.16 0.09 0.12 ...
##  $ Unit     : chr  "cm/yr" "cm/yr" "cm/yr" "cm/yr" ...
##  $ Method   : chr  "210Pb" "210Pb" "210Pb" "210Pb" ...
##  $ Remarks  : chr  NA NA NA "no net sedimentation" ...
##  $ Source   : chr  "de Haas et al. (1996)" "de Haas et al. (1996)" "de Haas et al. (1996)" "de Haas et al. (1997)" ...
##  $ POINT_X  : num  4019906 4002124 3957396 3771876 3954069 ...
##  $ POINT_Y  : num  4192156 4192112 4197521 4194247 4154079 ...
plot(samples)

Load environmental predictor variables

env.vars <- stack(list.files(path="./data/predictors", pattern='.tif$', full.names=T), RAT=F)

Create data frame with response and predictor variables

ov.sr <- as.data.frame(extract(env.vars, samples))

rm.sr <- cbind(samples$SedRate, samples$POINT_X,samples$POINT_Y, ov.sr)
names(rm.sr)[1] <- "SedRate"
names(rm.sr)[2] <- "POINT_X"
names(rm.sr)[3] <- "POINT_Y"
rm.sr <- na.omit(rm.sr)
str(rm.sr)
## 'data.frame':    219 obs. of  16 variables:
##  $ SedRate   : num  0.03 0.28 0.06 0 0.14 0.06 0.14 0.16 0.09 0.12 ...
##  $ POINT_X   : num  4019906 4002124 3957396 3771876 3954069 ...
##  $ POINT_Y   : num  4192156 4192112 4197521 4194247 4154079 ...
##  $ Bathy     : num  -374 -315 -328 -116 -282 ...
##  $ delta_star: num  0.0702 0.0669 0.0718 0.5582 0.0869 ...
##  $ DistCoast : num  12388 29055 71638 41591 82658 ...
##  $ Folk      : num  12 12 12 10 12 12 8 12 10 12 ...
##  $ Geomorph  : num  3 3 3 1 3 3 3 3 1 3 ...
##  $ Gravel    : num  0.139275 0.002758 0.001096 0.023923 0.000521 ...
##  $ M2Speed   : num  0.0487 0.0453 0.0436 0.1395 0.0568 ...
##  $ Mud       : num  82.74 70.07 87.31 3.72 75.96 ...
##  $ PkOrbVel  : num  0.0146 0.013 0.0143 0.1598 0.02 ...
##  $ Sand      : num  17.1 29.9 12.7 96.3 24 ...
##  $ SedEnv    : num  2 2 2 1 2 2 2 2 1 2 ...
##  $ SPM_summer: num  0.741 0.768 0.945 0.915 0.943 ...
##  $ SPM_winter: num  0.621 0.604 0.56 0.866 0.587 ...

Boruta variable selection

set.seed(seed)
B1 <- Boruta(rm.sr[[1]] ~ .,data=rm.sr[4:ncol(rm.sr)], pValue = 0.05,
             maxRuns = 500)
implist <- names(B1$finalDecision[B1$finalDecision =='Confirmed'])

print(B1)
## Boruta performed 366 iterations in 17.61587 secs.
##  12 attributes confirmed important: Bathy, delta_star, DistCoast,
## Geomorph, Gravel and 7 more;
##  1 attributes confirmed unimportant: Folk;
plot(B1, las=2, colCode = c("greenyellow", "yellow2", "red3", "cadetblue"), xlab = "")

seldata <- rm.sr[implist]

Plot of predictor variables

plot(subset(env.vars,implist))

Environmental space

A visual check to what extent the samples cover the environmental space. This is useful as legacy data were used and no formal sampling design was applied in the analysis.

smp <- as.data.frame(sampleRandom(x = subset(env.vars,implist), size = 10000))

for (i in 1:ncol(seldata)) {
    
  print(ggplot() +
          geom_density(data = seldata, aes(x=seldata[,i]),colour="darkblue",fill="darkblue", alpha=0.1,size=1)+
          geom_density(data = smp, aes(x=smp[,i]), colour="red",fill="red", alpha=0.1, size=1)+
          scale_x_continuous(name = names(seldata[i])))
        
    }

Model preparation

Convert simple data frame into a spatial data frame object

dat <- rm.sr[c("SedRate", "POINT_X", "POINT_Y", implist)]
coordinates(dat)= ~ POINT_X+POINT_Y

proj4string(dat)<- CRS("+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 
                       +y_0=3210000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs")

Define Random Forest model formula

fm <- as.formula(paste(names(rm.sr[1]), " ~ ", paste(implist, collapse = "+")))
fm
## SedRate ~ Bathy + delta_star + DistCoast + Geomorph + Gravel + 
##     M2Speed + Mud + PkOrbVel + Sand + SedEnv + SPM_summer + SPM_winter

10-fold cross-validation with 3 repeats

ctrl <- trainControl(method="repeatedcv", number=10, repeats=3, search="grid")

Search for the best mtry parameter

tic()

set.seed(seed)
tunegrid <- expand.grid(.mtry=c(2:length(implist)))
rf_gridsearch <- train(fm, data=dat@data, method="rf", tuneGrid=tunegrid, trControl=ctrl)
print(rf_gridsearch)
## Random Forest 
## 
## 219 samples
##  12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 197, 197, 197, 197, 197, 197, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE       
##    2    0.1208535  0.6567253  0.05773963
##    3    0.1211717  0.6593099  0.05745631
##    4    0.1225664  0.6515976  0.05782771
##    5    0.1229608  0.6524133  0.05794256
##    6    0.1235871  0.6492473  0.05818822
##    7    0.1239611  0.6484806  0.05797254
##    8    0.1255456  0.6419638  0.05882261
##    9    0.1251281  0.6440528  0.05882117
##   10    0.1249790  0.6422116  0.05860344
##   11    0.1258311  0.6382230  0.05913286
##   12    0.1271401  0.6335688  0.05943402
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
plot(rf_gridsearch, xlab="mtry", ylab="RMSE")

mtry <- rf_gridsearch$bestTune$mtry

toc()
## 129.13 sec elapsed

Quantile regression forest with resampling

How many resampling iterations?

imax <- 25

Generate empty dataframes for validation statistics and variable importance

validation <- data.frame(rmse=numeric(), r2=numeric())
importance <- data.frame(matrix(nrow = length(implist),ncol= imax))
rownames(importance) <- implist
colnames(importance) <- paste0("modn",1:imax)

Build and predict quantregForest model for mean and standard deviation with resampling

pred <- predict(env.vars, rf_gridsearch) # These predictions (pred and unc) are just dummies
unc <- predict(env.vars, rf_gridsearch)

tic()
beginCluster()
## 8 cores detected, using 7
set.seed(seed)

for (i in 1:imax){
  # Splits of sample data into training (70%) and test sets (30%)
  smp_size <- floor(0.7 * nrow(dat))
  train_ind <- sample(seq_len(nrow(dat)), size = smp_size)
  train <- dat[train_ind, ]
  test <- dat[-train_ind, ]
  
  # Quantile regression forest model
  modn <- quantregForest(y=train@data$SedRate, x=train@data[,2:(length(implist)+1)],
          ntree=500, keep.inbag=TRUE, mtry = mtry)
  
  # Prediction of mean and standard deviation
  pred <- stack(pred, clusterR(env.vars, predict, args=list(model=modn,what=mean)))
  unc <- stack(unc, clusterR(env.vars, predict, args=list(model=modn,what=sd)))
  
  # The predicted values are written into a dataframe and NA values omitted
  test$pred <- extract(pred[[i+1]], test)
  df <- data.frame(test$SedRate, test$pred)
  df <- na.omit(df, na.action="omit")
  
  # Validation results are stored in a dataframe
  validation[i, 1] <- rmse(df$test.SedRate, df$test.pred)
  validation[i, 2] <- cor(df$test.SedRate, df$test.pred)^2
  
  # Store variable importance in a dataframe
  importance[,i] <- modn$importance
}

endCluster()
toc()
## 2855.02 sec elapsed
pred <- dropLayer(pred, 1) #Removal of the dummy layers
unc <- dropLayer(unc, 1)

Validation metrics

summary(validation)
##       rmse               r2        
##  Min.   :0.06864   Min.   :0.4228  
##  1st Qu.:0.11337   1st Qu.:0.5225  
##  Median :0.13158   Median :0.5937  
##  Mean   :0.12840   Mean   :0.5846  
##  3rd Qu.:0.14377   3rd Qu.:0.6192  
##  Max.   :0.17240   Max.   :0.8241
print(paste0("RMSE = ", round(mean(validation$rmse), 2), " cm/yr ± ", round(sd(validation$rmse), 2), " cm/yr"))
## [1] "RMSE = 0.13 cm/yr ± 0.03 cm/yr"
print(paste0("R^2 = ", round(mean(validation$r2), 2), " ± ", round(sd(validation$r2), 2)))
## [1] "R^2 = 0.58 ± 0.09"

Variable importance

imp <- as.data.frame(rowMeans(importance))
imp$Var <- rownames(imp)
rownames(imp) <- NULL
colnames(imp) <- c("IncNodePurity", "Predictor")
imp <- imp[order(imp[1],decreasing=T),c(2,1)]
imp
##     Predictor IncNodePurity
## 6     M2Speed     0.8421705
## 2  delta_star     0.6816144
## 8    PkOrbVel     0.5273061
## 9        Sand     0.5232604
## 7         Mud     0.5208298
## 1       Bathy     0.4474648
## 10     SedEnv     0.3442520
## 12 SPM_winter     0.3084469
## 3   DistCoast     0.2925455
## 5      Gravel     0.2251134
## 11 SPM_summer     0.2002119
## 4    Geomorph     0.1532224
impfig <- imp %>%
  mutate(Predictor = fct_reorder(Predictor, IncNodePurity)) %>%
  ggplot( aes(x=Predictor, y=IncNodePurity)) +
    geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
    coord_flip() +
    xlab("") +
    ylab("Increase in node purity") +
    theme_bw() +
    ggtitle("Sedimentation rate")

impfig

Variable importance figure

tiff("./figs/VarImp.tif", width = 10, height = 12, units = "cm", res = 500, compression = "lzw")
impfig
dev.off()
## png 
##   2

QRF mean prediction and model uncertainty

pred_mean <- calc(pred, fun = mean, na.rm = TRUE)
unc_mean <- calc(unc, fun = mean, na.rm = TRUE)

plot(pred_mean, main='Sed. Rate, mean estimate (cm/yr)')

plot(unc_mean, main='Sed. Rate, model uncertainty (cm/yr)')

Sensitivity map of the mean prediction

The sensitivity map shows the dispersion of all individual models

sensitivity <- calc(pred, fun = sd, na.rm = TRUE)
plot(sensitivity, main='Sensitivity of the mean')

The total uncertainty is the sum of sensitivity and model uncertainty

tot.unc <- unc_mean + sensitivity
tot.unc.percent <- 100*tot.unc/pred_mean
plot(tot.unc, main='Total uncertainty (cm/yr)')

plot(tot.unc.percent, main='Total uncertainty (% of mean)')

Save the resulting maps as *.tif files

writeRaster(pred_mean, file='output/SedRate3_quantrf_mean.tif',
overwrite=TRUE)
writeRaster(unc_mean, file='output/SedRate3_quantrf_unc.tif',
overwrite=TRUE)
writeRaster(tot.unc, file='output/SedRate3_quantrf_tot.unc.tif',
overwrite=TRUE)
writeRaster(tot.unc.percent, file='output/SedRate3_quantrf_tot.unc.percent.tif',
overwrite=TRUE)

Figures for publication

col.pal1 <- brewer.pal(8,"YlGnBu")
col.pal2 <- brewer.pal(8,"YlOrRd")

AoI <- extent(3450000,4450000,3100000,4300000)
land <- readOGR(dsn = "data", layer = "land")
## OGR data source with driver: ESRI Shapefile 
## Source: "N:\Prosjekter\311700_MAREANO\311720_Havbunnskart\c_Metodeprosjekter\xxx_Karbonlagring\sedrate.model\data", layer: "land"
## with 8186 features
## It has 1 fields
land <- crop(land, AoI)
sr <- crop(raster("./output/SedRate3_quantrf_mean.tif"), AoI)
sr.unc <- crop(tot.unc, AoI)
sr.unc.pc <- crop(tot.unc.percent, AoI)

# Sedimentation rates
p1 <- ggR(sr, geom_raster = TRUE) + 
  geom_polygon(data = land, aes(x = long, y = lat, group = group)) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle(expression(Sedimentation~Rate~(cm~yr^{-1}))) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p2 <- ggR(sr.unc, geom_raster = TRUE) + 
  geom_polygon(data = land, aes(x = long, y = lat, group = group)) + 
  scale_fill_gradientn(colours = col.pal2, na.value = NA, name = "") +
  ggtitle(expression(Total~Uncertainty~(cm~yr^{-1}))) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p3 <- ggR(sr.unc.pc, geom_raster = TRUE) + 
  geom_polygon(data = land, aes(x = long, y = lat, group = group)) + 
  scale_fill_gradientn(colours = col.pal2, na.value = NA, name = "") +
  ggtitle(expression(Total~Uncertainty~('%'~of~Mean))) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

tiff("./figs/sr_figure_v1.tif", width = 30, height = 10, units = "cm", res = 500, compression = "lzw")
grid.arrange(p1, p2, p3, ncol = 3)
dev.off()
## png 
##   2
tiff("./figs/sr_figure_v2.tif", width = 20, height = 10, units = "cm", res = 500, compression = "lzw")
grid.arrange(p1, p2, ncol = 2)
dev.off()
## png 
##   2
# Predictor variables
p4 <- ggR(crop(subset(env.vars,implist)[[1]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = rev(col.pal1), na.value = NA, name = "") +
  ggtitle("Bathymetry (m)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p5 <- ggR(crop(subset(env.vars,implist)[[2]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("delta star (-)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p6 <- ggR((crop(subset(env.vars,implist)[[3]], AoI)/1000), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Dist. to coast (km)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p7 <- ggR(crop(subset(env.vars,implist)[[4]], AoI), geom_raster = TRUE, forceCat = T) + 
  scale_fill_manual(values = c("#edf8b1", "#7fcdbb", "#2c7fb8"), name = "", na.translate = FALSE) +
  ggtitle("Geomorphology") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p8 <- ggR(crop(subset(env.vars,implist)[[5]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Gravel (%)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p9 <- ggR(crop(subset(env.vars,implist)[[6]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("M2 speed (m/s)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p10 <- ggR(crop(subset(env.vars,implist)[[7]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Mud (%)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p11 <- ggR(crop(subset(env.vars,implist)[[8]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Pk. orb. vel. (m/s)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p12 <- ggR(crop(subset(env.vars,implist)[[9]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Sand (%)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p13 <- ggR(crop(subset(env.vars,implist)[[10]], AoI), geom_raster = TRUE, forceCat = TRUE) + 
  scale_fill_manual(values = c("#edf8b1", "#2c7fb8"), name = "", na.translate = FALSE) +
  ggtitle("Sedimentary env.") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p14 <- ggR(crop(subset(env.vars,implist)[[11]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle(expression(SPM~summer~(g~m^{-3}))) +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p15 <- ggR(crop(subset(env.vars,implist)[[12]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle(expression(SPM~winter~(g~m^{-3}))) +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

tiff("./figs/predictors.tif", width = 21, height = 29.7, units = "cm", res = 500, compression = "lzw")
grid.arrange(p4, p6, p10, p12, p8, p14, p15, p9, p11, p5, p7, p13, nrow = 4, ncol = 3)
dev.off()
## png 
##   2

Save sessionInfo to file

sessionInfo = sessionInfo()
save(sessionInfo, file = "sessionInfo.Rdata")
rm("sessionInfo")