% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rfpi.R
\name{rfpi}
\alias{rfpi}
\title{Prediction intervals with random forests}
\usage{
rfpi(
  formula,
  traindata,
  testdata,
  alpha = 0.05,
  split_rule = c("ls", "l1", "spi"),
  pi_method = c("lm", "spi", "quant", "hdr", "chdr"),
  calibration = TRUE,
  rf_package = c("rfsrc", "ranger"),
  params_rfsrc = list(ntree = 2000, mtry = ceiling(px/3), nodesize = 5, samptype =
    "swr"),
  params_ranger = list(num.trees = 2000, mtry = ceiling(px/3), min.node.size = 5,
    replace = TRUE),
  params_calib = list(range = c(1 - alpha - 0.005, 1 - alpha + 0.005), start = (1 -
    alpha), step = 0.01, refine = TRUE)
)
}
\arguments{
\item{formula}{Object of class \code{formula} or \code{character} describing
the model to fit.}

\item{traindata}{Training data of class \code{data.frame}.}

\item{testdata}{Test data of class \code{data.frame}.}

\item{alpha}{Significance (miscoverage) level. (1 - \code{alpha}) is the
desired coverage level. The default is \code{alpha} = 0.05 for the 95\%
prediction interval.}

\item{split_rule}{Split rule for building a forest. Options are \code{"ls"}
for CART with least-squares (LS) splitting rule, \code{"l1"} for CART with
L1 splitting rule, \code{"spi"} for CART with shortest prediction interval
(SPI) splitting rule. The default is \code{"ls"}.}

\item{pi_method}{Methods for building a prediction interval. Options are
\code{"lm"} for classical method, \code{"spi"} for shortest prediction
interval, \code{"quant"} for quantile method, \code{"hdr"} for highest
density region, and \code{"chdr"} for contiguous HDR. The default is to use
all methods for PI construction. A single method or a subset of methods can
be applied.}

\item{calibration}{Apply OOB calibration for finding working level of
\code{alpha}, i.e. \eqn{\alpha_w}. See below for details. The default is
\code{TRUE}.}

\item{rf_package}{Random forest package that can be used for RF training.
Options are \code{"rfsrc"} for \code{randomForestSRC} and \code{"ranger"}
for \code{ranger} packages. Split rule \code{"ls"} can be used with both
packages. However, \code{"l1"} and \code{"spi"} split rules can only be
used with \code{"rfsrc"}. The default is \code{"rfsrc"}.}

\item{params_rfsrc}{List of parameters that should be passed to
\code{randomForestSRC}. In the default parameter set, \code{ntree} = 2000,
\code{mtry} = \eqn{px/3} (rounded up), where \eqn{px} is the number of
predictors, \code{nodesize} = 5, \code{samptype} = "swr". See
\code{randomForestSRC} for possible parameters.}

\item{params_ranger}{List of parameters that should be passed to
\code{ranger}. In the default parameter set, \code{num.trees} = 2000,
\code{mtry} = \eqn{px/3} (rounded up), where \eqn{px} is the number of
predictors, \code{min.node.size} = 5, \code{replace} = TRUE. See
\code{ranger} for possible parameters.}

\item{params_calib}{List of parameters for calibration procedure.
\code{range} is the allowed target calibration range for coverage level.
The value that provides a coverage level within the range is chosen as
\eqn{\alpha_w}. \code{start} is the initial coverage level to start
calibration procedure. \code{step} is the coverage step size for each
calibration iteration. \code{refine} controls whether the \code{step} value
is gradually decreased when close to the target coverage level; the default
is \code{TRUE}, which allows the gradual decrease.}
}
\value{
A list with the following components:

\item{lm_interval}{Prediction intervals for test data with the classical
method. A list containing lower and upper bounds.}
\item{spi_interval}{Prediction intervals for test data with SPI method. A
list containing lower and upper bounds.}
\item{hdr_interval}{Prediction intervals for test data with HDR method. A
list containing lower and upper bounds of prediction interval for each test
observation. There may be multiple PIs for a single observation.}
\item{chdr_interval}{Prediction intervals for test data with contiguous HDR
method. A list containing lower and upper bounds.}
\item{quant_interval}{Prediction intervals for test data with quantile
method. A list containing lower and upper bounds.}
\item{test_pred}{Random forest predictions for test data.}
\item{alphaw}{Working level of \code{alpha}, i.e. \eqn{\alpha_w}. A numeric
array for the PI methods entered with \code{pi_method}. If
\code{calibration = FALSE}, it returns \code{NULL}.}
\item{split_rule}{Split rule used for building the random forest.}
\item{rf_package}{Random forest package that was used for RF training.}
}
\description{
Constructs prediction intervals with 15 distinct variations proposed by Roy
and Larocque (2020). The variations include two aspects: The method used to
build the forest and the method used to build the prediction interval. There
are three methods to build the forest, (i) least-squares (LS), (ii) L1 and
(iii) shortest prediction interval (SPI) from the CART paradigm. There are
five methods for constructing prediction intervals, classical method,
shortest prediction interval, quantile method, highest density region, and
contiguous HDR.
}
\section{Details}{


\strong{Calibration process}

The calibration procedure uses the "Bag of Observations for Prediction"
(BOP) idea. BOP for a new observation is built with the set of inbag
observations that are in the same terminal nodes as the new observation.
The calibration procedure uses the BOPs constructed for the training
observations. BOP for a training observation is built using only the trees
where this training observation is out-of-bag (OOB).

Let (\eqn{1-\alpha}) be the target coverage level. The goal of the
calibration is to find the value of \eqn{\alpha_w}, which Roy and Larocque
(2020) call the working level of \eqn{\alpha}, such that the
coverage level of the prediction intervals for the training observations is
closest to the target coverage level. The idea is to find the value of
\eqn{\alpha_w} using the OOB-BOPs. Once found, (\eqn{1-\alpha_w}) becomes
the level used to build the prediction intervals for the new observations.
}

\examples{
## load example data
data(BostonHousing, package = "RFpredInterval")
set.seed(2345)

## define train/test split
trainindex <- sample(1:nrow(BostonHousing),
  size = round(nrow(BostonHousing) * 0.7), replace = FALSE)
traindata <- BostonHousing[trainindex, ]
testdata <- BostonHousing[-trainindex, ]
px <- ncol(BostonHousing) - 1

## construct 90\% PI with "l1" split rule and "spi" PI method with calibration
out <- rfpi(formula = medv ~ ., traindata = traindata,
  testdata = testdata, alpha = 0.1, calibration = TRUE,
  split_rule = "l1", pi_method = "spi", params_rfsrc = list(ntree = 50),
  params_calib = list(range = c(0.89, 0.91), start = 0.9, step = 0.01,
  refine = TRUE))

## get the PI with "spi" method for first observation in the testdata
c(out$spi_interval$lower[1], out$spi_interval$upper[1])

## get the random forest predictions for testdata
out$test_pred

## get the working level of alpha (alphaw)
out$alphaw

## construct 95\% PI with "ls" split rule, "lm" and "quant" PI methods
## with calibration and use "ranger" package for RF training
out2 <- rfpi(formula = medv ~ ., traindata = traindata,
  testdata = testdata, split_rule = "ls", pi_method = c("lm", "quant"),
  rf_package = "ranger", params_ranger = list(num.trees = 50))

## get the PI with "quant" method for the testdata
cbind(out2$quant_interval$lower, out2$quant_interval$upper)

}
\references{
Roy, M. H., & Larocque, D. (2020). Prediction intervals with
random forests. Statistical methods in medical research, 29(1), 205-229.
doi:10.1177/0962280219829885.
}
\seealso{
\code{\link{pibf}} \code{\link{piall}}
}
