library(rgdal)
## Loading required package: sp
## rgdal: version: 1.4-7, (SVN revision 845)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
## Path to GDAL shared files: C:/Users/diesing_markus/Documents/R/R-3.6.1/library/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
## Path to PROJ.4 shared files: C:/Users/diesing_markus/Documents/R/R-3.6.1/library/rgdal/proj
## Linking to sp version: 1.3-2
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(ranger)
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(Metrics)
## Warning: package 'Metrics' was built under R version 3.6.2
library(sp)
library(raster)
library(Boruta)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(tictoc)
library(quantregForest)
## Warning: package 'quantregForest' was built under R version 3.6.2
## Loading required package: RColorBrewer
library(snow)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.6.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.3
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:raster':
##
## extract
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:raster':
##
## rotate
library(RColorBrewer)
library(RStoolbox)
## Warning: package 'RStoolbox' was built under R version 3.6.3
seed <- 42
setwd("N:/Prosjekter/311700_MAREANO/311720_Havbunnskart/c_Metodeprosjekter/xxx_Karbonlagring/ocdensity.model")
samples <- readOGR(dsn = "./data", layer = "ocdens")
## OGR data source with driver: ESRI Shapefile
## Source: "N:\Prosjekter\311700_MAREANO\311720_Havbunnskart\c_Metodeprosjekter\xxx_Karbonlagring\ocdensity.model\data", layer: "ocdens"
## with 373 features
## It has 21 fields
str(samples@data)
## 'data.frame': 373 obs. of 21 variables:
## $ Station : Factor w/ 373 levels "A84-01","A84-02",..: 160 242 252 263 274 285 295 306 58 69 ...
## $ Latitude : num 59 59 58.9 58.9 58.7 ...
## $ Longitude : num 10.7 10.4 10.5 10.6 10.5 ...
## $ Depth_m : num 460 108 159 120 140 167 184 211 349 184 ...
## $ Date : Factor w/ 38 levels "01/06/1994","01/06/1995",..: NA NA NA NA NA NA NA NA NA NA ...
## $ OC_from_cm: num 0 0 0 0 0 0 0 0 0 0 ...
## $ OC_to_cm : num 10 10 10 10 10 10 10 10 10 10 ...
## $ Count_OC : num 5 5 5 5 5 5 5 5 5 5 ...
## $ OC : num 2.13 1.28 1.95 1.8 1.86 ...
## $ OC_unit : Factor w/ 1 level "percent": 1 1 1 1 1 1 1 1 1 1 ...
## $ Porosity : num 84.1 72.8 80.9 83.7 78.1 ...
## $ Por_unit : Factor w/ 1 level "percent": 1 1 1 1 1 1 1 1 1 1 ...
## $ Por_dep_cm: Factor w/ 12 levels "0-0.5","0-10",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ DBD : num 436 748 524 448 602 ...
## $ DBD_unit : Factor w/ 1 level "kg/m3": 1 1 1 1 1 1 1 1 1 1 ...
## $ OC_density: num 9.29 9.59 10.21 8.09 11.22 ...
## $ OCden_unit: Factor w/ 1 level "kg/m3": 1 1 1 1 1 1 1 1 1 1 ...
## $ Source : Factor w/ 64 levels "de Haas et al. (1997) Mar Geol 144, 131-146.",..: 63 63 63 63 63 63 63 63 63 63 ...
## $ Remarks : Factor w/ 5 levels "DBD calculated from porosity with grain density of 2650 kg/m3.",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ POINT_X : num 4359821 4342811 4348766 4354597 4349066 ...
## $ POINT_Y : num 3988344 3984789 3979800 3972880 3955068 ...
plot(samples)
env.vars <- stack(list.files(path="./data/predictors", pattern='.tif$', full.names=T), RAT=F)
ov.oc <- as.data.frame(extract(env.vars, samples))
rm.oc <- cbind(samples$OC_density, samples$POINT_X,samples$POINT_Y, ov.oc)
names(rm.oc)[1] <- "OC_density"
names(rm.oc)[2] <- "POINT_X"
names(rm.oc)[3] <- "POINT_Y"
rm.oc <- na.omit(rm.oc)
str(rm.oc)
## 'data.frame': 371 obs. of 16 variables:
## $ OC_density : num 9.29 9.59 10.21 8.09 11.22 ...
## $ POINT_X : num 4359821 4342811 4348766 4354597 4349066 ...
## $ POINT_Y : num 3988344 3984789 3979800 3972880 3955068 ...
## $ Bathy : num -483 -97.9 -157.1 -124.2 -139 ...
## $ DistCoast : num 9485 8323 12582 19728 33028 ...
## $ M2Speed : num 0.00762 0.01034 0.00973 0.00966 0.00854 ...
## $ Mud : num 96.6 46.6 84.4 66.7 68.1 ...
## $ O2_mean : num 279 276 276 279 275 ...
## $ oet : num 2.38 3.63 3.49 2.95 2.53 ...
## $ opd : num 0.932 0.932 0.932 0.932 0.932 ...
## $ PkOrbVel : num 0.2798 0.2764 0.0313 0.0373 0.0446 ...
## $ sedrate : num 0.395 0.27 0.267 0.318 0.371 ...
## $ SPM_summer : num 0.987 0.923 0.883 0.856 0.853 ...
## $ SPM_winter : num 1.65 1.57 1.76 1.86 1.88 ...
## $ surfPP_mean: num 0.015 0.01335 0.01379 0.01237 0.00794 ...
## $ Temp_mean : num 7.21 6.89 6.9 7.21 6.84 ...
## - attr(*, "na.action")= 'omit' Named int 257 275
## ..- attr(*, "names")= chr "257" "275"
set.seed(seed)
B1 <- Boruta(rm.oc[[1]] ~ .,data=rm.oc[4:ncol(rm.oc)], pValue = 0.05,
maxRuns = 500)
implist <- names(B1$finalDecision[B1$finalDecision =='Confirmed'])
print(B1)
## Boruta performed 12 iterations in 1.736507 secs.
## 13 attributes confirmed important: Bathy, DistCoast, M2Speed, Mud,
## O2_mean and 8 more;
## No attributes deemed unimportant.
plot(B1, las=2, colCode = c("greenyellow", "yellow2", "red3", "cadetblue"), xlab = "")
seldata <- rm.oc[implist]
A visual check to what extent the samples cover the environmental space. This is useful as legacy data were used and no formal sampling design was applied in the analysis.
Blue: Samples
Red: Environmental data (based on random subsample)
smp <- as.data.frame(sampleRandom(x = subset(env.vars,implist), size = 10000))
for (i in 1:ncol(seldata)) {
print(ggplot() +
geom_density(data = seldata, aes(x=seldata[,i]),colour="darkblue",fill="darkblue", alpha=0.1,size=1)+
geom_density(data = smp, aes(x=smp[,i]), colour="red",fill="red", alpha=0.1, size=1)+
scale_x_continuous(name = names(seldata[i])))
}
dat <- rm.oc[c("OC_density", "POINT_X", "POINT_Y", implist)]
coordinates(dat)= ~ POINT_X+POINT_Y
proj4string(dat)<- CRS("+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000
+y_0=3210000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs")
fm <- as.formula(paste(names(rm.oc[1]), " ~ ", paste(implist, collapse = "+")))
fm
## OC_density ~ Bathy + DistCoast + M2Speed + Mud + O2_mean + oet +
## opd + PkOrbVel + sedrate + SPM_summer + SPM_winter + surfPP_mean +
## Temp_mean
ctrl <- trainControl(method="repeatedcv", number=10, repeats=3, search="grid")
tic()
set.seed(seed)
tunegrid <- expand.grid(.mtry=c(2:length(implist)))
rf_gridsearch <- train(fm, data=dat@data, method="rf", tuneGrid=tunegrid, trControl=ctrl)
print(rf_gridsearch)
## Random Forest
##
## 371 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 334, 335, 334, 334, 334, 335, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 2.045792 0.7338706 1.435533
## 3 2.045823 0.7331795 1.433483
## 4 2.045405 0.7333726 1.431264
## 5 2.054203 0.7315382 1.433213
## 6 2.061669 0.7292277 1.439491
## 7 2.066628 0.7279335 1.442571
## 8 2.066495 0.7280685 1.443463
## 9 2.074459 0.7259418 1.447701
## 10 2.077866 0.7251686 1.454114
## 11 2.080117 0.7242338 1.455811
## 12 2.085741 0.7231652 1.458989
## 13 2.090266 0.7219301 1.462332
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 4.
plot(rf_gridsearch, xlab="mtry", ylab="RMSE")
mtry <- rf_gridsearch$bestTune$mtry
toc()
## 318.77 sec elapsed
imax <- 25
validation <- data.frame(rmse=numeric(), r2=numeric(), ve=numeric())
importance <- data.frame(matrix(nrow = length(implist),ncol= imax))
rownames(importance) <- implist
colnames(importance) <- paste0("modn",1:imax)
pred <- predict(env.vars, rf_gridsearch) # These predictions (pred and unc) are just dummies
unc <- predict(env.vars, rf_gridsearch)
tic()
beginCluster()
## 8 cores detected, using 7
set.seed(seed)
for (i in 1:imax){
# Splits of sample data into training (70%) and test sets (30%)
smp_size <- floor(0.7 * nrow(dat))
train_ind <- sample(seq_len(nrow(dat)), size = smp_size)
train <- dat[train_ind, ]
test <- dat[-train_ind, ]
# Quantile regression forest model
modn <- quantregForest(y=train@data$OC_density, x=train@data[,2:(length(implist)+1)],
ntree=500, keep.inbag=TRUE, mtry = mtry)
# Prediction of mean and standard deviation
pred <- stack(pred, clusterR(env.vars, predict, args=list(model=modn,what=mean)))
unc <- stack(unc, clusterR(env.vars, predict, args=list(model=modn,what=sd)))
# The predicted values are written into a dataframe and NA values omitted
test$pred <- extract(pred[[i+1]], test)
df <- data.frame(test$OC_density, test$pred)
df <- na.omit(df, na.action="omit")
# Validation results are stored in a dataframe
validation[i,1] <- rmse(df$test.OC_density, df$test.pred)
validation[i,2] <- cor(df$test.OC_density, df$test.pred)^2
validation[i,3] <- 1-(mse(df$test.OC_density, df$test.pred)/var(df$test.OC_density, df$test.pred))
# Store variable importance in a dataframe
importance[,i] <- modn$importance
}
endCluster()
toc()
## 2439.7 sec elapsed
pred <- dropLayer(pred, 1) #Removal of the dummy layers
unc <- dropLayer(unc, 1)
summary(validation)
## rmse r2 ve
## Min. :1.660 Min. :0.5881 Min. :0.3333
## 1st Qu.:1.963 1st Qu.:0.6836 1st Qu.:0.5251
## Median :2.134 Median :0.7270 Median :0.6049
## Mean :2.158 Mean :0.7157 Mean :0.5802
## 3rd Qu.:2.368 3rd Qu.:0.7513 3rd Qu.:0.6496
## Max. :2.605 Max. :0.8317 Max. :0.7837
print(paste0("RMSE = ", round(mean(validation$rmse), 2), " kg/m3 ± ", round(sd(validation$rmse), 2), " kg/m3"))
## [1] "RMSE = 2.16 kg/m3 ± 0.25 kg/m3"
print(paste0("R^2 = ", round(mean(validation$r2), 2), " ± ", round(sd(validation$r2), 2)))
## [1] "R^2 = 0.72 ± 0.06"
print(paste0("VE = ", round(mean(validation$ve), 2), " ± ", round(sd(validation$ve), 2)))
## [1] "VE = 0.58 ± 0.11"
imp <- as.data.frame(rowMeans(importance))
imp$Var <- rownames(imp)
rownames(imp) <- NULL
colnames(imp) <- c("IncNodePurity", "Predictor")
imp <- imp[order(imp[1],decreasing=T),c(2,1)]
imp
pred_mean <- calc(pred, fun = mean, na.rm = TRUE)
unc_mean <- calc(unc, fun = mean, na.rm = TRUE)
plot(pred_mean, main='OC density, mean estimate (kg/m3)')
plot(unc_mean, main='OC density model uncertainty (kg/m3)')
The sensitivity map shows the dispersion of all individual models
sensitivity <- calc(pred, fun = sd, na.rm = TRUE)
plot(sensitivity, main='Sensitivity of the mean (kg/m3)')
tot.unc <- unc_mean + sensitivity
tot.unc.percent <- 100*tot.unc/pred_mean
plot(tot.unc, main='Total uncertainty (kg/m3)')
plot(tot.unc.percent, main='Total uncertainty (% of mean)')
d <- 0.1 #reference depth
A <- res(pred_mean)[1]*res(pred_mean)[2] #Area of one pixel
oc_stock <- cellStats(pred_mean, sum)*d*A/1000000000
oc_stock_unc <- cellStats(tot.unc, sum)*d*A/1000000000
print(paste0("OC stock = ", round(oc_stock, 1), " Tg ± ", round(oc_stock_unc, 1), " Tg"))
## [1] "OC stock = 230.5 Tg ± 134.5 Tg"
writeRaster(pred_mean, file='output/OCdensity_quantrf_mean.tif',
overwrite=TRUE)
writeRaster(unc_mean, file='output/OCdensity_quantrf_unc.tif',
overwrite=TRUE)
writeRaster(tot.unc, file='output/OCdensity_quantrf_tot.unc.tif',
overwrite=TRUE)
writeRaster(tot.unc.percent, file='output/OCdensity_quantrf_tot.unc.percent.tif',
overwrite=TRUE)
col.pal1 <- brewer.pal(8,"YlGnBu")
col.pal2 <- brewer.pal(8,"YlOrRd")
AoI <- extent(3450000,4450000,3100000,4300000)
land <- readOGR(dsn = "N:/Prosjekter/311700_MAREANO/311720_Havbunnskart/c_Metodeprosjekter/xxx_Karbonlagring/GIS", layer = "Europe_coastline_poly")
## OGR data source with driver: ESRI Shapefile
## Source: "N:\Prosjekter\311700_MAREANO\311720_Havbunnskart\c_Metodeprosjekter\xxx_Karbonlagring\GIS", layer: "Europe_coastline_poly"
## with 1 features
## It has 1 fields
land <- crop(land, AoI)
oc <- crop(raster("./output/OCdensity_quantrf_mean.tif"), AoI)
oc.unc <- crop(tot.unc, AoI)
oc.unc.pc <- crop(tot.unc.percent, AoI)
p1 <- ggR(oc, geom_raster = TRUE) +
geom_polygon(data = land, aes(x = long, y = lat, group = group)) +
scale_fill_gradientn(colours = col.pal1, na.value = "", name = "") +
ggtitle(expression(OC~Density~(kg~m^{-3}))) +
theme(axis.title = element_blank()) +
ggpubr::rotate_y_text()
## Regions defined for each Polygons
p2 <- ggR(oc.unc, geom_raster = TRUE) +
geom_polygon(data = land, aes(x = long, y = lat, group = group)) +
scale_fill_gradientn(colours = col.pal2, na.value = "", name = "") +
ggtitle(expression(Total~Uncertainty~(kg~m^{-3}))) +
theme(axis.title = element_blank()) +
ggpubr::rotate_y_text()
## Regions defined for each Polygons
p3 <- ggR(oc.unc.pc, geom_raster = TRUE) +
geom_polygon(data = land, aes(x = long, y = lat, group = group)) +
scale_fill_gradientn(colours = col.pal2, na.value = "", name = "") +
ggtitle(expression(Total~Uncertainty~('%'~of~Mean))) +
theme(axis.title = element_blank()) +
ggpubr::rotate_y_text()
## Regions defined for each Polygons
tiff("./figs/oc_figure_v1.tif", width = 30, height = 10, units = "cm", res = 500, compression = "lzw")
grid.arrange(p1, p2, p3, ncol = 3)
dev.off()
## png
## 2
tiff("./figs/oc_figure_v2.tif", width = 20, height = 10, units = "cm", res = 500, compression = "lzw")
grid.arrange(p1, p2, ncol = 2)
dev.off()
## png
## 2