% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ModelBoot.R
\name{ModelBoot}
\alias{ModelBoot}
\title{Statistics and ALE data for a bootstrapped model}
\usage{
ModelBoot(
  model,
  data = NULL,
  ...,
  model_call_string = NULL,
  model_call_string_vars = character(),
  parallel = "all",
  model_packages = NULL,
  y_col = NULL,
  positive = TRUE,
  pred_fun = function(object, newdata, type = pred_type) {
     stats::predict(object =
    object, newdata = newdata, type = type)
 },
  pred_type = "response",
  boot_it = 100,
  boot_alpha = 0.05,
  boot_centre = "mean",
  seed = 0,
  output_model_stats = TRUE,
  output_model_coefs = TRUE,
  output_ale = TRUE,
  output_boot_data = FALSE,
  ale_options = list(),
  ale_p = "auto",
  tidy_options = list(),
  glance_options = list(),
  silent = FALSE
)
}
\arguments{
\item{model}{Required. See documentation for \code{\link[=ALE]{ALE()}}}

\item{data}{dataframe. Dataset to be bootstrapped. This must be the same data on which the \code{model} was trained. If not provided, \code{ModelBoot()} will try to detect it automatically. For non-standard models, \code{data} should be provided.}

\item{...}{not used. Inserted to require explicit naming of subsequent arguments.}

\item{model_call_string}{character(1). If \code{NULL} (default), \code{ModelBoot()} tries to automatically detect and construct the call for bootstrapped datasets. If it cannot, the function will fail early. In that case, a character string of the full call for the model must be provided that includes \code{boot_data} as the data argument for the call. See examples.}

\item{model_call_string_vars}{character. Names of variables included in \code{model_call_string} that are not columns in \code{data}. If any such variables exist, they must be specified here or else parallel processing may produce an error. If parallelization is disabled with \code{parallel = 0}, then this is not a concern. See documentation for the \code{model_packages} argument in \code{\link[=ALE]{ALE()}}.}

\item{parallel, model_packages}{See documentation for \code{\link[=ALE]{ALE()}}}

\item{y_col, pred_fun, pred_type}{See documentation for \code{\link[=ALE]{ALE()}}. Used to calculate bootstrapped performance measures. If left at their default values, then the relevant performance measures are calculated only if these arguments can be automatically detected. Otherwise, they should be specified.}

\item{positive}{any single atomic value. If the model represented by \code{model} or \code{model_call_string} is a binary classification model, \code{positive} specifies the 'positive' value of \code{y_col} (the target outcome), that is, the value of interest that is considered \code{TRUE}; any other value of \code{y_col} is considered \code{FALSE}. This argument is ignored if the model is not a binary classification model. For example, if 2 means \code{TRUE} and 1 means \code{FALSE}, then set \code{positive = 2}.}

\item{boot_it}{non-negative integer(1). Number of bootstrap iterations for full-model bootstrapping. For bootstrapping of ALE values, see details to verify if \code{\link[=ALE]{ALE()}} with bootstrapping is not more appropriate than \code{\link[=ModelBoot]{ModelBoot()}}. If \code{boot_it = 0}, then the model is run as normal once on the full \code{data} with no bootstrapping.}

\item{boot_alpha}{numeric(1) from 0 to 1. Alpha for percentile-based confidence interval range for the bootstrap intervals; the bootstrap confidence intervals will range from the \code{boot_alpha / 2} quantile to the \code{1 - boot_alpha / 2} quantile. For example, if \code{boot_alpha = 0.05} (default), the intervals will be from the 2.5th to the 97.5th percentile.}

\item{boot_centre}{character(1) in c('mean', 'median'). When bootstrapping, the main estimate for the ALE y value is considered to be \code{boot_centre}. Regardless of the value specified here, both the mean and median will be available.}

\item{seed}{integer. Random seed. Supply this between runs to assure identical bootstrap samples are generated each time on the same data. See documentation for \code{\link[=ALE]{ALE()}} for further details.}

\item{output_model_stats}{logical(1). If \code{TRUE} (default), return overall model statistics using \code{\link[broom:reexports]{broom::glance()}} (if available for \code{model}) and bootstrap-validated statistics if \code{boot_it > 0}.}

\item{output_model_coefs}{logical(1). If \code{TRUE} (default), return model coefficients using \code{\link[broom:reexports]{broom::tidy()}} (if available for \code{model}).}

\item{output_ale}{logical(1). If \code{TRUE} (default), return ALE data and statistics.}

\item{output_boot_data}{logical(1). If \code{TRUE}, return the full raw data for each bootstrap iteration, specifically, the bootstrapped models and the model row indices. Default \code{FALSE} does not return this large, detailed data.}

\item{ale_options, tidy_options, glance_options}{list of named arguments. Arguments to pass to the \code{\link[=ALE]{ALE()}} constructor when \code{output_ale = TRUE}, \code{\link[broom:reexports]{broom::tidy()}} when \code{output_model_coefs = TRUE}, or \code{\link[broom:reexports]{broom::glance()}} when \code{output_model_stats = TRUE}, respectively, beyond (or overriding) their defaults. Note: to obtain p-values for ALE statistics, see the \code{ale_p} argument.}

\item{ale_p}{Same as the \code{p_values} argument for the \code{\link[=ALE]{ALE()}} constructor; see documentation there. This argument overrides the \code{p_values} element of the \code{ale_options} argument.}

\item{silent}{See documentation for \code{\link[=ALE]{ALE()}}}
}
\value{
An object of class \code{ModelBoot} with properties \code{model_stats}, \code{model_coefs}, \code{ale}, \code{boot_data}, and \code{params}.
}
\description{
A \code{ModelBoot} S7 object contains full-model bootstrapped statistics and ALE data for a trained model. Full-model bootstrapping (as distinct from data-only bootstrapping) retrains a model for each bootstrap iteration. Thus, it can be rather slow, though it is much more reliable. However, for obtaining bootstrapped ALE data, plots, and statistics, full-model bootstrapping as provided by \code{ModelBoot} is only necessary for models that have not been developed by cross-validation. For cross-validated models, it is sufficient (and much faster) to create a regular \code{\link[=ALE]{ALE()}} object with bootstrapping by setting the \code{boot_it} argument in its constructor. In fact, full-model bootstrapping with \code{ModelBoot} is often infeasible for slow machine-learning models trained on large datasets, which should rather be cross-validated to assure their reliability. However, for models that have not been cross-validated, full-model bootstrapping with \code{ModelBoot} is necessary for reliable results. Further details follow below; see also \code{vignette('ale-statistics')}.
}
\section{Properties}{

\describe{
\item{model_stats}{
A \code{tibble} of bootstrapped results from \code{\link[broom:reexports]{broom::glance()}}. \code{NULL} if the \code{output_model_stats} argument is \code{FALSE}. In general, only \code{\link[broom:reexports]{broom::glance()}} results that make sense when bootstrapped are included, such as \code{df} and \code{adj.r.squared}. Results that are incomparable across bootstrapped datasets (such as \code{aic}) are excluded. In addition, certain model performance measures are included; these are bootstrap-validated with the .632 correction (Efron & Tibshirani 1986) (NOT the .632+ correction):
\itemize{
\item For regression (numeric prediction) models:
\itemize{
\item \code{mae}: mean absolute error (MAE)
\item \code{sa_mae}: standardized accuracy of the MAE referenced on the mean absolute deviation
\item \code{rmse}: root mean squared error (RMSE)
\item \code{sa_rmse}: standardized accuracy of the RMSE referenced on the standard deviation
}
\item For binary or categorical classification (probability) models:
\itemize{
\item \code{auc}: area under the ROC curve
}
}
}

\item{model_coefs}{
A \code{tibble} of bootstrapped results from \code{\link[broom:reexports]{broom::tidy()}}.
\code{NULL} if the \code{output_model_coefs} argument is \code{FALSE}.
}
\item{ale}{
A list of bootstrapped ALE results using default \code{\link[=ALE]{ALE()}} settings unless overridden with \code{ale_options}. \code{NULL} if the \code{output_ale} argument is \code{FALSE}. Elements are:

\itemize{
\item \code{single}: an \code{ALE} object of ALE calculations on the full dataset without bootstrapping.
\item \code{boot}: a list of bootstrapped ALE data and statistics. This element is not an \code{ALE} object; it uses a special internal format.
}

}
\item{boot_data}{
A \code{tibble} of bootstrap results. Each row represents a bootstrap iteration. \code{NULL} if the \code{output_boot_data} argument is \code{FALSE}. The columns are:

\itemize{
\item \code{it}: the specific bootstrap iteration from 0 to \code{boot_it} iterations. Iteration 0 is the results from the full dataset (not bootstrapped).
\item \code{row_idxs}: the row indexes for the bootstrapped sample for that iteration. To save space, the row indexes are returned rather than the full datasets. So, for example, the bootstrap sample for iteration 1 (the second row, since iteration 0 comes first) can be reproduced by \code{data[ModelBoot_obj@boot_data$row_idxs[[2]], ]} where \code{data} is the dataset and \code{ModelBoot_obj} is the result of \code{ModelBoot()}.
\item \code{model}: the model object trained on that iteration.
\item \code{ale}: the results of \code{ALE()} on that iteration.
\item \code{tidy}: the results of \code{broom::tidy(model)} on that iteration.
\item \code{stats}: the results of \code{broom::glance(model)} on that iteration.
\item \code{perf}: performance measures on the entire dataset. These are the measures specified above for regression and classification models.
}

}

\item{params}{
Parameters used to calculate bootstrapped data. Most of these repeat the arguments passed to \code{ModelBoot()}: they are either the values provided by the user or the defaults if the user did not change them. In addition, the following objects created internally are also provided:

\itemize{
\item \code{y_cats}: same as \code{ALE@params$y_cats} (see documentation there).
\item \code{y_type}: same as \code{ALE@params$y_type} (see documentation there).
\item \code{model}: same as \code{ALE@params$model} (see documentation there).
\item \code{data}: same as \code{ALE@params$data} (see documentation there).
}

}
}
}

\section{Full-model bootstrapping}{

No modelling results, with or without ALE, should be considered reliable without appropriate validation. For ALE, both the trained model itself and the ALE that explains the trained model must be validated. ALE must be validated by bootstrapping. The trained model might be validated either by cross-validation or by bootstrapping. For ALE that explains trained models that have been developed by cross-validation, it is sufficient to bootstrap just the training data. That is what the \code{ALE} object does with its \code{boot_it} argument. However, unvalidated models must be validated by bootstrapping them along with the calculation of ALE; this is what the \code{ModelBoot} object does with its \code{boot_it} argument.

\code{\link[=ModelBoot]{ModelBoot()}} carries out full-model bootstrapping to validate models. Specifically, it:
\itemize{
\item Creates multiple bootstrap samples (default 100; the user can specify any number);
\item Creates a model on each bootstrap sample;
\item Calculates overall model statistics, variable coefficients, and ALE values for each model on each bootstrap sample;
\item Calculates the mean, median, and lower and upper confidence intervals for each of those values across all bootstrap samples.
}
}

\examples{

# attitude dataset
attitude

## ALE for generalized additive models (GAM)
## GAM is tweaked to work on the small dataset.
gam_attitude <- mgcv::gam(rating ~ complaints + privileges + s(learning) +
                            raises + s(critical) + advance,
                          data = attitude)
summary(gam_attitude)

\donttest{
# Full model bootstrapping

# # To generate the object yourself, uncomment the following lines.
# # For speed, this example loads a pre-created ModelBoot object.
# # For standard models like lm that store their data,
# # there is no need to specify the data argument.
# # 100 bootstrap iterations by default.
# mb_gam_attitude <- ModelBoot(gam_attitude)
# saveRDS(mb_gam_attitude, file.choose())
mb_gam_attitude <- url(paste0(
  'https://github.com/tripartio/ale/raw/main/download/',
  'mb_gam_attitude.0.5.2.rds'
  )) |>
  readRDS()

# If the model is not standard, supply model_call_string with 'data = boot_data'
# in the string instead of the actual dataset name (in addition to the actual dataset
# as the 'data' argument directly to the `ModelBoot` constructor).
# mb_gam_attitude <- ModelBoot(
#   gam_attitude,
#   data = attitude,  # the actual dataset
#   model_call_string = 'mgcv::gam(
#     rating ~ complaints + privileges + s(learning) +
#       raises + s(critical) + advance,
#     data = boot_data  # required for model_call_string
#   )'
# )

# Model statistics and coefficients
mb_gam_attitude@model_stats
mb_gam_attitude@model_coefs

# Plot ALE
plot(mb_gam_attitude)

# Retrieve ALE data
get(mb_gam_attitude, type = 'boot')    # bootstrapped
get(mb_gam_attitude, type = 'single')  # full (unbootstrapped) model
# See get.ALE() for other options

}

}
\references{
Okoli, Chitu. 2023. “Statistical Inference Using Machine Learning and Classical Techniques Based on Accumulated Local Effects (ALE).” arXiv. \doi{10.48550/arXiv.2310.09877}.

Efron, Bradley, and Robert Tibshirani. "Bootstrap methods for standard errors, confidence intervals, and other measures of statistical accuracy." Statistical science (1986): 54-75. \doi{10.1214/ss/1177013815}
}
