Preparations

Required libraries

library(rgdal)
library(randomForest)
library(ranger)
library(ggplot2)
library(Metrics)
library(sp)
library(raster)
library(Boruta)
library(caret)
library(tictoc)
library(quantregForest)
library(snow)
library(forcats)
library(gridExtra)
library(ggpubr)
library(RColorBrewer)
library(RStoolbox)
library(spatialEco)

Define some variables for later use

seed <- 42
setwd("N:/Prosjekter/311700_MAREANO/311720_Havbunnskart/c_Metodeprosjekter/xxx_Karbonlagring/ocdensity.model")

Load and prepare the data

Read shapefile with response variable

samples <- readOGR(dsn = "./data", layer = "ocdens")
## OGR data source with driver: ESRI Shapefile 
## Source: "N:\Prosjekter\311700_MAREANO\311720_Havbunnskart\c_Metodeprosjekter\xxx_Karbonlagring\ocdensity.model\data", layer: "ocdens"
## with 373 features
## It has 21 fields
str(samples@data)
## 'data.frame':    373 obs. of  21 variables:
##  $ Station   : chr  "NGU-2" "NGU-3" "NGU-4" "NGU-5" ...
##  $ Latitude  : num  59 59 58.9 58.9 58.7 ...
##  $ Longitude : num  10.7 10.4 10.5 10.6 10.5 ...
##  $ Depth_m   : num  460 108 159 120 140 167 184 211 349 184 ...
##  $ Date      : chr  NA NA NA NA ...
##  $ OC_from_cm: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ OC_to_cm  : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ Count_OC  : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ OC        : num  2.13 1.28 1.95 1.8 1.86 ...
##  $ OC_unit   : chr  "percent" "percent" "percent" "percent" ...
##  $ Porosity  : num  84.1 72.8 80.9 83.7 78.1 ...
##  $ Por_unit  : chr  "percent" "percent" "percent" "percent" ...
##  $ Por_dep_cm: chr  "4-8" "4-8" "4-8" "4-8" ...
##  $ DBD       : num  436 748 524 448 602 ...
##  $ DBD_unit  : chr  "kg/m3" "kg/m3" "kg/m3" "kg/m3" ...
##  $ OC_density: num  9.29 9.59 10.21 8.09 11.22 ...
##  $ OCden_unit: chr  "kg/m3" "kg/m3" "kg/m3" "kg/m3" ...
##  $ Source    : chr  "NGU rapport 95.054 (Porosity, DBD)" "NGU rapport 95.054 (Porosity, DBD)" "NGU rapport 95.054 (Porosity, DBD)" "NGU rapport 95.054 (Porosity, DBD)" ...
##  $ Remarks   : chr  "DBD calculated from weight of dry sample and volume of wet sample. Porosity calculated with grain density of 2700 kg/m3." "DBD calculated from weight of dry sample and volume of wet sample. Porosity calculated with grain density of 2700 kg/m3." "DBD calculated from weight of dry sample and volume of wet sample. Porosity calculated with grain density of 2700 kg/m3." "DBD calculated from weight of dry sample and volume of wet sample. Porosity calculated with grain density of 2700 kg/m3." ...
##  $ POINT_X   : num  4359821 4342811 4348766 4354597 4349066 ...
##  $ POINT_Y   : num  3988344 3984789 3979800 3972880 3955068 ...
plot(samples)

Load environmental predictor variables

env.vars <- stack(list.files(path="./data/predictors", pattern='.tif$', full.names=T), RAT=F)

Create data frame with response and predictor variables

ov.oc <- as.data.frame(extract(env.vars, samples))

rm.oc <- cbind(samples$OC_density, samples$POINT_X,samples$POINT_Y, ov.oc)
names(rm.oc)[1] <- "OC_density"
names(rm.oc)[2] <- "POINT_X"
names(rm.oc)[3] <- "POINT_Y"
rm.oc <- na.omit(rm.oc)
str(rm.oc)
## 'data.frame':    371 obs. of  16 variables:
##  $ OC_density : num  9.29 9.59 10.21 8.09 11.22 ...
##  $ POINT_X    : num  4359821 4342811 4348766 4354597 4349066 ...
##  $ POINT_Y    : num  3988344 3984789 3979800 3972880 3955068 ...
##  $ Bathy      : num  -483 -97.9 -157.1 -124.2 -139 ...
##  $ DistCoast  : num  9485 8323 12582 19728 33028 ...
##  $ M2Speed    : num  0.00762 0.01034 0.00973 0.00966 0.00854 ...
##  $ Mud        : num  96.6 46.6 84.4 66.7 68.1 ...
##  $ O2_mean    : num  279 276 276 279 275 ...
##  $ oet        : num  2.38 3.63 3.49 2.95 2.53 ...
##  $ opd        : num  0.932 0.932 0.932 0.932 0.932 ...
##  $ PkOrbVel   : num  0.2798 0.2764 0.0313 0.0373 0.0446 ...
##  $ sedrate    : num  0.395 0.27 0.267 0.318 0.371 ...
##  $ SPM_summer : num  0.987 0.923 0.883 0.856 0.853 ...
##  $ SPM_winter : num  1.65 1.57 1.76 1.86 1.88 ...
##  $ surfPP_mean: num  0.015 0.01335 0.01379 0.01237 0.00794 ...
##  $ Temp_mean  : num  7.21 6.89 6.9 7.21 6.84 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2] 257 275
##   ..- attr(*, "names")= chr [1:2] "257" "275"

Boruta variable selection

set.seed(seed)
B1 <- Boruta(rm.oc[[1]] ~ .,data=rm.oc[4:ncol(rm.oc)], pValue = 0.05,
             maxRuns = 500)
implist <- names(B1$finalDecision[B1$finalDecision =='Confirmed'])

print(B1)
## Boruta performed 12 iterations in 1.637282 secs.
##  13 attributes confirmed important: Bathy, DistCoast, M2Speed, Mud,
## O2_mean and 8 more;
##  No attributes deemed unimportant.
plot(B1, las=2, colCode = c("greenyellow", "yellow2", "red3", "cadetblue"), xlab = "")

seldata <- rm.oc[implist]

Plot of predictor variables

plot(subset(env.vars,implist))

Environmental space

A visual check to what extent the samples cover the environmental space. This is useful as legacy data were used and no formal sampling design was applied in the analysis.

smp <- as.data.frame(sampleRandom(x = subset(env.vars,implist), size = 10000))

for (i in 1:ncol(seldata)) {
    
  print(ggplot() +
          geom_density(data = seldata, aes(x=seldata[,i]),colour="darkblue",fill="darkblue", alpha=0.1,size=1)+
          geom_density(data = smp, aes(x=smp[,i]), colour="red",fill="red", alpha=0.1, size=1)+
          scale_x_continuous(name = names(seldata[i])))
        
    }

Model preparation

Convert simple data frame into a spatial data frame object

dat <- rm.oc[c("OC_density", "POINT_X", "POINT_Y", implist)]
coordinates(dat)= ~ POINT_X+POINT_Y

proj4string(dat)<- CRS("+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 
                       +y_0=3210000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs")

Define Random Forest model formula

fm <- as.formula(paste(names(rm.oc[1]), " ~ ", paste(implist, collapse = "+")))
fm
## OC_density ~ Bathy + DistCoast + M2Speed + Mud + O2_mean + oet + 
##     opd + PkOrbVel + sedrate + SPM_summer + SPM_winter + surfPP_mean + 
##     Temp_mean

10-fold cross-validation with 3 repeats

ctrl <- trainControl(method="repeatedcv", number=10, repeats=3, search="grid")

Search for the best mtry parameter

tic()

set.seed(seed)
tunegrid <- expand.grid(.mtry=c(2:length(implist)))
rf_gridsearch <- train(fm, data=dat@data, method="rf", tuneGrid=tunegrid, trControl=ctrl)
print(rf_gridsearch)
## Random Forest 
## 
## 371 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 334, 335, 334, 334, 334, 335, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    2.045792  0.7338706  1.435533
##    3    2.045823  0.7331795  1.433483
##    4    2.045405  0.7333726  1.431264
##    5    2.054203  0.7315382  1.433213
##    6    2.061669  0.7292277  1.439491
##    7    2.066628  0.7279335  1.442571
##    8    2.066495  0.7280685  1.443463
##    9    2.074459  0.7259418  1.447701
##   10    2.077866  0.7251686  1.454114
##   11    2.080117  0.7242338  1.455811
##   12    2.085741  0.7231652  1.458989
##   13    2.090266  0.7219301  1.462332
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 4.
plot(rf_gridsearch, xlab="mtry", ylab="RMSE")

mtry <- rf_gridsearch$bestTune$mtry

toc()
## 315.69 sec elapsed

Quantile regression forest with resampling

How many resampling iterations?

imax <- 25

Generate empty dataframes for validation statistics and variable importance

validation <- data.frame(rmse=numeric(), r2=numeric())
importance <- data.frame(matrix(nrow = length(implist),ncol= imax))
rownames(importance) <- implist
colnames(importance) <- paste0("modn",1:imax)

Build and predict quantregForest model for mean and standard deviation with resampling

pred <- predict(env.vars, rf_gridsearch) # These predictions (pred and unc) are just dummies
unc <- predict(env.vars, rf_gridsearch)

tic()
beginCluster()
## 8 cores detected, using 7
set.seed(seed)

for (i in 1:imax){
  # Splits of sample data into training (70%) and test sets (30%)
  smp_size <- floor(0.7 * nrow(dat))
  train_ind <- sample(seq_len(nrow(dat)), size = smp_size)
  train <- dat[train_ind, ]
  test <- dat[-train_ind, ]
  
  # Quantile regression forest model
  modn <- quantregForest(y=train@data$OC_density, x=train@data[,2:(length(implist)+1)],
          ntree=500, keep.inbag=TRUE, mtry = mtry)
  
  # Prediction of mean and standard deviation
  pred <- stack(pred, clusterR(env.vars, predict, args=list(model=modn,what=mean)))
  unc <- stack(unc, clusterR(env.vars, predict, args=list(model=modn,what=sd)))
  
  # The predicted values are written into a dataframe and NA values omitted
  test$pred <- extract(pred[[i+1]], test)
  df <- data.frame(test$OC_density, test$pred)
  df <- na.omit(df, na.action="omit")
  
  # Validation results are stored in a dataframe
  validation[i,1] <- rmse(df$test.OC_density, df$test.pred)
  validation[i,2] <- cor(df$test.OC_density, df$test.pred)^2

  # Store variable importance in a dataframe
  importance[,i] <- modn$importance
}

endCluster()
toc()
## 3045.38 sec elapsed
pred <- dropLayer(pred, 1) #Removal of the dummy layers
unc <- dropLayer(unc, 1)

Validation metrics

summary(validation)
##       rmse             r2        
##  Min.   :1.660   Min.   :0.5881  
##  1st Qu.:1.963   1st Qu.:0.6836  
##  Median :2.134   Median :0.7270  
##  Mean   :2.158   Mean   :0.7157  
##  3rd Qu.:2.368   3rd Qu.:0.7513  
##  Max.   :2.605   Max.   :0.8317
print(paste0("RMSE = ", round(mean(validation$rmse), 2), " kg/m3 ± ", round(sd(validation$rmse), 2), " kg/m3"))
## [1] "RMSE = 2.16 kg/m3 ± 0.25 kg/m3"
print(paste0("R^2 = ", round(mean(validation$r2), 2), " ± ", round(sd(validation$r2), 2)))
## [1] "R^2 = 0.72 ± 0.06"

Variable importance

imp <- as.data.frame(rowMeans(importance))
imp$Var <- rownames(imp)
rownames(imp) <- NULL
colnames(imp) <- c("IncNodePurity", "Predictor")
imp <- imp[order(imp[1],decreasing=T),c(2,1)]
imp
##      Predictor IncNodePurity
## 1        Bathy     691.28779
## 9      sedrate     658.84057
## 13   Temp_mean     605.49615
## 6          oet     557.76469
## 4          Mud     342.84628
## 8     PkOrbVel     284.05071
## 3      M2Speed     196.89683
## 5      O2_mean      98.18211
## 2    DistCoast      97.68523
## 12 surfPP_mean      92.65550
## 11  SPM_winter      90.22662
## 7          opd      71.58293
## 10  SPM_summer      64.64438
impfig <- imp %>%
  mutate(Predictor = fct_reorder(Predictor, IncNodePurity)) %>%
  ggplot( aes(x=Predictor, y=IncNodePurity)) +
    geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
    coord_flip() +
    xlab("") +
    ylab("Increase in node purity") +
    theme_bw() +
    ggtitle("Organic carbon density")

impfig

Variable importance figure

tiff("./figs/VarImp.tif", width = 10, height = 12, units = "cm", res = 500, compression = "lzw")
impfig
dev.off()
## png 
##   2

QRF mean prediction and model uncertainty

pred_mean <- calc(pred, fun = mean, na.rm = TRUE)
unc_mean <- calc(unc, fun = mean, na.rm = TRUE)

plot(pred_mean, main='OC density, mean estimate (kg/m3)')

plot(unc_mean, main='OC density model uncertainty (kg/m3)')

Sensitivity map of the mean prediction

The sensitivity map shows the dispersion of all individual models

sensitivity <- calc(pred, fun = sd, na.rm = TRUE)
plot(sensitivity, main='Sensitivity of the mean (kg/m3)')

The total uncertainty is the sum of sensitivity and model uncertainty

tot.unc <- unc_mean + sensitivity
tot.unc.percent <- 100*tot.unc/pred_mean
plot(tot.unc, main='Total uncertainty (kg/m3)')

plot(tot.unc.percent, main='Total uncertainty (% of mean)')

Calculation of the OC stock and the associated uncertainty

d <- 0.1 #reference depth
A <- res(pred_mean)[1]*res(pred_mean)[2] #Area of one pixel

oc_stock <- cellStats(pred_mean, sum)*d*A/1000000000
oc_stock_unc <- cellStats(tot.unc, sum)*d*A/1000000000

print(paste0("OC stock = ", round(oc_stock, 1), " Tg ± ", round(oc_stock_unc, 1), " Tg"))
## [1] "OC stock = 230.5 Tg ± 134.5 Tg"

Calculation of OC stock of the Norwegian Trough

shp <- readOGR(dsn = "./data/shp", layer = "NorthSeaSkagerrak_NorwegianTrough")
## OGR data source with driver: ESRI Shapefile 
## Source: "N:\Prosjekter\311700_MAREANO\311720_Havbunnskart\c_Metodeprosjekter\xxx_Karbonlagring\ocdensity.model\data\shp", layer: "NorthSeaSkagerrak_NorwegianTrough"
## with 2 features
## It has 3 fields
## Integer64 fields read as strings:  Id gridcode
oc_sum <- zonal.stats(x = shp, y = pred_mean, stats = "sum")
oc_sum_unc <- zonal.stats(x = shp, y = tot.unc, stats = "sum")

oc_sum <- oc_sum*d*A/1000000000
oc_sum_unc <- oc_sum_unc*d*A/1000000000

print(paste0("OC stock Norwegian Trough = ", round(oc_sum[2,], 1), " Tg ± ", round(oc_sum_unc[2,], 1), " Tg"))
## [1] "OC stock Norwegian Trough = 60.1 Tg ± 18.3 Tg"

Save the resulting maps as *.tif files

writeRaster(pred_mean, file='output/OCdensity_quantrf_mean.tif',
overwrite=TRUE)
writeRaster(unc_mean, file='output/OCdensity_quantrf_unc.tif',
overwrite=TRUE)
writeRaster(tot.unc, file='output/OCdensity_quantrf_tot.unc.tif',
overwrite=TRUE)
writeRaster(tot.unc.percent, file='output/OCdensity_quantrf_tot.unc.percent.tif',
overwrite=TRUE)

Figures for publication

col.pal1 <- brewer.pal(8,"YlGnBu")
col.pal2 <- brewer.pal(8,"YlOrRd")

AoI <- extent(3450000,4450000,3100000,4300000)
land <- readOGR(dsn = "data", layer = "land")
## OGR data source with driver: ESRI Shapefile 
## Source: "N:\Prosjekter\311700_MAREANO\311720_Havbunnskart\c_Metodeprosjekter\xxx_Karbonlagring\ocdensity.model\data", layer: "land"
## with 8186 features
## It has 1 fields
land <- crop(land, AoI)
oc <- crop(raster("./output/OCdensity_quantrf_mean.tif"), AoI)
oc.unc <- crop(tot.unc, AoI)
oc.unc.pc <- crop(tot.unc.percent, AoI)

p1 <- ggR(oc, geom_raster = TRUE) + 
  geom_polygon(data = land, aes(x = long, y = lat, group = group)) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle(expression(OC~Density~(kg~m^{-3}))) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p2 <- ggR(oc.unc, geom_raster = TRUE) + 
  geom_polygon(data = land, aes(x = long, y = lat, group = group)) + 
  scale_fill_gradientn(colours = col.pal2, na.value = NA, name = "") +
  ggtitle(expression(Total~Uncertainty~(kg~m^{-3}))) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p3 <- ggR(oc.unc.pc, geom_raster = TRUE) + 
  geom_polygon(data = land, aes(x = long, y = lat, group = group)) + 
  scale_fill_gradientn(colours = col.pal2, na.value = NA, name = "") +
  ggtitle(expression(Total~Uncertainty~('%'~of~Mean))) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

tiff("./figs/oc_figure_v1.tif", width = 30, height = 10, units = "cm", res = 500, compression = "lzw")
grid.arrange(p1, p2, p3, ncol = 3)
dev.off()
## png 
##   2
tiff("./figs/oc_figure_v2.tif", width = 20, height = 10, units = "cm", res = 500, compression = "lzw")
grid.arrange(p1, p2, ncol = 2)
dev.off()
## png 
##   2
# Predictor variables
p4 <- ggR(crop(subset(env.vars,implist)[[1]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = rev(col.pal1), na.value = NA, name = "") +
  ggtitle("Bathymetry (m)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p5 <-  ggR((crop(subset(env.vars,implist)[[2]], AoI)/1000), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Dist. to coast (km)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p6 <- ggR(crop(subset(env.vars,implist)[[3]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("M2 speed (m/s)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p7 <- ggR(crop(subset(env.vars,implist)[[4]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Mud (%)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p8 <- ggR(crop(subset(env.vars,implist)[[5]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle(expression(Bottom~water~O[2]~(mol~m^{-3}))) +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p9 <- ggR(crop(subset(env.vars,implist)[[6]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "",
      trans = "log", breaks = c(10, 100, 1000), labels = c(10, 100, 1000)) +
  ggtitle("OET (yr)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p10 <- ggR(crop(subset(env.vars,implist)[[7]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("OPD (cm)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p11 <- ggR(crop(subset(env.vars,implist)[[8]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Pk. orb. vel. (m/s)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p12 <- ggR(crop(subset(env.vars,implist)[[9]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Sedimentation rate (cm/yr)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p13 <- ggR(crop(subset(env.vars,implist)[[10]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle(expression(SPM~summer~(g~m^{-3}))) +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p14 <- ggR(crop(subset(env.vars,implist)[[11]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle(expression(SPM~winter~(g~m^{-3}))) +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p15 <- ggR(crop(subset(env.vars,implist)[[12]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle(expression(Sea~surface~PP~(g~m^{-3}~day^{-1}))) +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

p16 <- ggR(crop(subset(env.vars,implist)[[13]], AoI), geom_raster = TRUE) + 
  scale_fill_gradientn(colours = col.pal1, na.value = NA, name = "") +
  ggtitle("Bottom water temp. (ºC)") +
  scale_x_continuous(breaks = c(3500000, 4000000)) +
  theme(axis.title = element_blank()) +
  ggpubr::rotate_y_text()

tiff("./figs/predictors.tif", width = 21, height = 29.7, units = "cm", res = 500, compression = "lzw")
grid.arrange(p5, p7, p13, p14, p6, p11, p8, p10, p9, p16, p15, p12, nrow = 4, ncol = 3)
dev.off()
## png 
##   2

Save sessionInfo to file

sessionInfo = sessionInfo()
save(sessionInfo, file = "sessionInfo.Rdata")
rm("sessionInfo")