## ----randfor_req, message=FALSE----------------------------------------------- library(agrodata) library(agro) library(randomForest) library(rpart) ## ----randfor_1---------------------------------------------------------------- d <- reagro_data("soilfert") dim(d) head(d) ## ----randfor_10--------------------------------------------------------------- set.seed(2019) i <- sample(nrow(d), 0.5*nrow(d)) d1 <- d[i,] d2 <- d[-i,] ## ----randfor_20, echo=FALSE--------------------------------------------------- dtab <- data.frame(variable=c('temp', 'precip', 'ExchP', 'TotK', 'ExchAl', 'TotN', 'sand', 'clay', 'SOC', 'pH', 'AWC', 'fert'), description=c('Average temperature', 'Annual precipitation', 'Soil exchangeble P', 'Soil total K', 'Soil exchangeble Al', 'Soil total N', 'Soil franction sand (%)', 'Soil fraction clay (%)', 'Soil organic carbon (g/kg)', 'Soil pH', 'Soil water holding capacity', 'fertilizer (index) kg/ha')) knitr::kable(dtab) ## ----randfor_30, fig.width=10------------------------------------------------- par(mfrow=c(2,3), mai=rep(0.5, 4)) for (v in c("SOC", "pH", "precip", "temp", "clay", "sand")) { boxplot(d[,v], main=v) } ## ----randfor_40--------------------------------------------------------------- model <- SOC~pH+precip+temp+sand+clay lrm <- lm(model, data=d1) summary(lrm) agro::RMSE_null(d2$SOC, predict(lrm, d2)) ## ----randfor_50--------------------------------------------------------------- cart <- rpart::rpart(model, data=d1) agro::RMSE_null(d2$SOC, predict(cart, d2)) ## ----randfor_60, fig.width=10, fig.width=8------------------------------------ plot(cart) text(cart) ## ----randfor_70--------------------------------------------------------------- cart ## ----randfor_75--------------------------------------------------------------- nulldev <- function(x) { sum((x - mean(x))^2) } nulldev(d1$SOC) ## ----randfor_80--------------------------------------------------------------- cdev <- sum(cart$frame[cart$frame$var == "", "dev"]) cdev rdev <- round(100 * cdev / nulldev(d1$SOC)) rdev ## ----randfor_90--------------------------------------------------------------- cart2 <- rpart::rpart(model, data=d2) agro::RMSE_null(d1$SOC, predict(cart2, d1)) ## ----randfor_95--------------------------------------------------------------- # model 1, test data agro::RMSE_null(d2$SOC, predict(cart, d2)) # model 1, train data agro::RMSE_null(d1$SOC, predict(cart, d1)) # model 2, test data agro::RMSE_null(d1$SOC, predict(cart2, d1)) # model 2, train data agro::RMSE_null(d2$SOC, predict(cart2, d2)) ## ----randfor_100, fig.width=10------------------------------------------------ par(mfrow=c(1,2)) plot(cart) text(cart, cex=0.8) plot(cart2) text(cart2, cex=0.8) ## ----randfor_110-------------------------------------------------------------- mean(replicate(10000, length(unique(sample(100, replace=TRUE))))) ## ----randfor_120-------------------------------------------------------------- n <- 10 set.seed(99) predictions <- matrix(nrow=nrow(d2), ncol=n) eval <- rep(NA, n) for (i in 1:n) { k <- sample(nrow(d1), replace=TRUE) cartmod <- rpart(model, data=d1[k,]) p <- predict(cartmod, d2) eval[i] <- agro::RMSE_null(d2$SOC, p) predictions[, i] <- p } ## ----randfor_130-------------------------------------------------------------- head(predictions) ## ----randfor_140-------------------------------------------------------------- pavg <- apply(predictions, 1, mean) ## ----------------------------------------------------------------------------- round(eval, 3) ## ----randfor_150-------------------------------------------------------------- mean(eval) agro::RMSE_null(d2$SOC, pavg) ## ----randfor_160-------------------------------------------------------------- agro::RMSE_null(d2$SOC, predict(cart, d2)) ## ----randfor_170-------------------------------------------------------------- library(randomForest) rf <- randomForest(model, data=d1) rf ## ----randfor_180-------------------------------------------------------------- p <- predict(rf, d2) agro::RMSE_null(d2$SOC, p) ## ----randfor_190-------------------------------------------------------------- # Mean of squared residuals agro::RMSE(d2$SOC, p)^2 # % Var explained: round(100 * (1 - var(d2$SOC - p) / var(d2$SOC)), 1) ## ----randfor_200-------------------------------------------------------------- n <- 5 set.seed(31415) k <- agro::make_groups(d, n) table(k) ## ----randfor_210-------------------------------------------------------------- rfRMSE <- rfRMSE_null <- rfvarexp <- rfcor <- rep(NA, n) for (i in 1:n) { test <- d[k==i, ] train <- d[k!=i, ] m <- randomForest(model, data=train) p <- predict(m, test) rfRMSE[i] <- agro::RMSE(test$SOC, p) rfRMSE_null[i] <- agro::RMSE_null(test$SOC, p) rfvarexp[i] <- var(test$SOC - p) / var(test$SOC) rfcor[i] <- cor(test$SOC, p) } mean(rfRMSE) mean(rfRMSE_null) mean(rfvarexp) mean(rfcor) ## ----randfor_220-------------------------------------------------------------- lmRMSE <- lmRMSE_null <- lmvarexp <- lmcor <- rep(NA, n) for (i in 1:n) { test <- d[k==i, ] train <- d[k!=i, ] m <- lm(model, data=train) p <- predict(m, test) lmRMSE[i] <- agro::RMSE(test$SOC, p) lmRMSE_null[i] <- agro::RMSE_null(test$SOC, p) lmvarexp[i] <- var(test$SOC - p) / var(test$SOC) lmcor[i] <- cor(test$SOC, p) } mean(lmRMSE) mean(lmRMSE_null) mean(lmvarexp) mean(lmcor) ## ----randfor_230-------------------------------------------------------------- rfm <- randomForest(model, data=d) varImpPlot(rfm) ## ----randfor_240-------------------------------------------------------------- agro::varImportance ## ----randfor_250-------------------------------------------------------------- predvars <- c("pH", "precip", "clay", "sand", "temp") vi <- agro::varImportance(rfm, d, predvars) vimean <- colMeans(vi) p <- predict(m, d) RMSEfull <- agro::RMSE(d$SOC, p) x <- sort(vimean - RMSEfull) dotchart(x) ## ----randfor_260-------------------------------------------------------------- mlr <- lm(model, data=d) vi <- agro::varImportance(mlr, d, predvars) vimean <- colMeans(vi) p <- predict(m, d) RMSEfull <- agro::RMSE(d$SOC, p) x <- sort(vimean - RMSEfull) dotchart(x) ## ----randfor_270-------------------------------------------------------------- par(mfrow=c(2,2), mai=c(.75,rep(0.5,3))) partialPlot(rfm, d, "pH") partialPlot(rfm, d, "clay") partialPlot(rfm, d, "sand") partialPlot(rfm, d, "precip") ## ----randfor_280-------------------------------------------------------------- agro::partialResponse ## ----randfor_290-------------------------------------------------------------- pr_pH <- agro::partialResponse(rfm, d, "pH") plot(pr_pH, type="l") rug(quantile(d$pH, seq(0, 1, 0.1))) ## ----randfor_300-------------------------------------------------------------- lrm <- lm(model, data=d) pr_pH <- agro::partialResponse(lrm, d, "pH") plot(pr_pH, type="l") rug(quantile(d$pH, seq(0, 1, 0.1)))