% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hotdeck.R
\name{impute_hotdeck}
\alias{impute_hotdeck}
\alias{impute_rhd}
\alias{impute_shd}
\alias{impute_pmm}
\alias{impute_knn}
\title{Hot deck imputation}
\usage{
impute_rhd(dat, formula, pool = c("complete", "univariate",
  "multivariate"), prob, backend = getOption("simputation.hdbackend",
  default = c("simputation", "VIM")), ...)

impute_shd(dat, formula, pool = c("complete", "univariate",
  "multivariate"), order = c("locf", "nocb"),
  backend = getOption("simputation.hdbackend", default = c("simputation",
  "VIM")), ...)

impute_pmm(dat, formula, predictor = impute_lm, pool = c("complete",
  "univariate", "multivariate"), ...)

impute_knn(dat, formula, pool = c("complete", "univariate",
  "multivariate"), k = 5, backend = getOption("simputation.hdbackend",
  default = c("simputation", "VIM")), ...)
}
\arguments{
\item{dat}{\code{[data.frame]}, with variables to be imputed and their
predictors.}

\item{formula}{\code{[formula]} imputation model description (see Details below).}

\item{pool}{\code{[character]} Specify donor pool when \code{backend="simputation"}
\itemize{
\item{\code{"complete"}. Only records for which the variables on the
   left-hand-side of the model formula are complete are used as donors. If a
   record has multiple missings, all imputations are taken from a single 
   donor.}
\item{\code{"univariate"}. Imputed variables are treated one by one and
   independently so the order of variable imputation is unimportant. If a 
   record has multiple missings, separate donors are drawn for each missing 
   value.}
\item{\code{"multivariate"}. A donor pool is created for each missing data 
   pattern. If a record has multiple missings, all imputations are taken from 
   a single donor.}
}}

\item{prob}{\code{[numeric]} Sampling probability weights (passed through to
\code{\link[base]{sample}}). Must be of length \code{nrow(dat)}.}

\item{backend}{\code{[character]} Choose the backend for imputation.}

\item{...}{further arguments passed to
\itemize{
  \item{\code{\link[base]{order}} for \code{impute_shd} when
  \code{backend="simputation"}.}
  \item{\code{\link[VIM:hotdeck]{VIM::hotdeck}}
  for \code{impute_shd} and \code{impute_rhd} when \code{backend="VIM"}.}
  \item{\code{\link[VIM:kNN]{VIM::kNN}} for \code{impute_knn} when 
  \code{backend="VIM"}.}
  \item{The \code{predictor} function for \code{impute_pmm}.}
}}

\item{order}{\code{[character]} Last Observation Carried Forward or Next
Observation Carried Backward. Only for \code{backend="simputation"}.}

\item{predictor}{\code{[function]} Imputation to use for predictive part in
predictive mean matching. Any of the \code{impute_} functions of this
package (it makes no sense to use a hot-deck imputation).}

\item{k}{\code{[numeric]} Number of nearest neighbours to draw the donor from.}
}
\description{
Hot-deck imputation methods include random and sequential hot deck, 
k-nearest neighbours imputation and predictive mean matching.
}
\section{Model specification}{


Formulas are of the form

\code{IMPUTED_VARIABLES ~ MODEL_SPECIFICATION [ | GROUPING_VARIABLES ] }

The left-hand-side of the formula object lists the variable or variables to 
be imputed. The interpretation of the independent variables on the
right-hand-side depends on the imputation method.

\itemize{
\item{\code{impute_rhd} Variables in \code{MODEL_SPECIFICATION} and/or 
\code{GROUPING_VARIABLES} are used to split the data set into groups prior to
imputation. Use \code{~ 1} to specify that no grouping is to be applied.}
\item{\code{impute_shd} Variables in \code{MODEL_SPECIFICATION} are used to 
sort the data. When multiple variables are specified, each variable after
the first serves as tie-breaker for the previous one.}
\item{\code{impute_knn} The predictors are used to determine Gower's distance
between records (see \code{\link[gower]{gower_topn}}). This may include the
variables to be imputed.}
\item{\code{impute_pmm} Predictive mean matching. The
 \code{MODEL_SPECIFICATION} is passed through to the \code{predictor}
 function.}
} 


If grouping variables are specified, the data set is split according to the
values of those variables, and model estimation and imputation occur
independently for each group.

Grouping using \code{dplyr::group_by} is also supported. If groups are 
defined in both the formula and using \code{dplyr::group_by}, the data is 
grouped by the union of grouping variables. Any missing value in one of the 
grouping variables results in an error.
}

\section{Methodology}{


\bold{Random hot deck imputation} with \code{impute_rhd} can be applied to
numeric, categorical or mixed data. A missing value is copied from a sampled
record. Optionally samples are taken within a group, or with non-uniform
sampling probabilities. See Andridge and Little (2010) for an overview
of hot deck imputation methods.

\bold{Sequential hot deck imputation} with \code{impute_shd} can be applied
to numeric, categorical, or mixed data. The dataset is sorted using the
\sQuote{predictor variables}. Missing values or combinations thereof are copied
from the previous record where the value(s) are available in the case
of LOCF and from the next record in the case of NOCB.
  
\bold{Predictive mean matching} with \code{impute_pmm} can be applied to
numeric data. Missing values or combinations thereof are first imputed using
a predictive model. Next, these predictions are replaced with observed
(combinations of) values nearest to the prediction. The nearest value is the
observed value with the smallest absolute deviation from the prediction.

\bold{K-nearest neighbour imputation} with \code{impute_knn} can be applied 
to numeric, categorical, or mixed data. For each record containing missing 
values, the \eqn{k} most similar completed records are determined based on
Gower's (1971) similarity coefficient. From these records the actual donor is
sampled.
}

\section{Using the VIM backend}{


The \href{https://CRAN.R-project.org/package=VIM}{VIM} package has efficient
implementations of several popular imputation methods. In particular, its 
random and sequential hotdeck implementation is faster and more
memory-efficient than that of the current package. Moreover, \pkg{VIM} offers
more fine-grained control over the imputation process than \pkg{simputation}.

If you have this package installed, it can be used by setting
\code{backend="VIM"} for functions supporting this option. Alternatively, one
can set \code{options(simputation.hdbackend="VIM")} so it becomes the
default. 


Simputation will map the simputation call to a function in the
\pkg{VIM} package. In particular:

 \itemize{
 \item{\code{impute_rhd} is mapped to \code{VIM::hotdeck} where imputed
 variables are passed to the \code{variable} argument and the union of
 predictor and grouping variables is passed to \code{domain_var}.
 Extra arguments in \code{...} are passed to \code{VIM::hotdeck} as well.
 Argument \code{pool} is ignored.}
 \item{\code{impute_shd} is mapped to \code{VIM::hotdeck} where
 imputed variables are passed to the \code{variable} argument, predictor
 variables to \code{ord_var} and grouping variables to \code{domain_var}.
 Extra arguments in \code{...} are passed to \code{VIM::hotdeck} as well.
 Arguments \code{pool} and \code{order} are ignored. In \code{VIM} the donor pool
 is determined on a per-variable basis, equivalent to setting \code{pool="univariate"}
 with the simputation backend. \pkg{VIM} is LOCF-based. Differences between
 \pkg{simputation} and \pkg{VIM} are likely to occur when the sorting variables contain missings.}
 \item{\code{impute_knn} is mapped to \code{VIM::kNN} where imputed variables
 are passed to \code{variable}, predictor variables are passed to \code{dist_var}
 and grouping variables are ignored with a message. 
 Extra arguments in \code{...} are passed to \code{VIM::kNN} as well.
 Argument \code{pool} is ignored.
 Note that simputation adheres strictly to Gower's original
 definition of the distance measure, while \pkg{VIM} uses a generalized variant
 that can take ordered factors into account.
 }
}
By default, \pkg{VIM}'s imputation functions add indicator variables to the
original data to trace what values have been imputed. This is switched off by
default for consistency with the rest of the simputation package, but it may
be turned on again by setting \code{imp_var=TRUE}.
}

\references{
Andridge, R.R. and Little, R.J., 2010. A review of hot deck imputation for
survey non-response. International statistical review, 78(1), pp.40-64.

Gower, J.C., 1971. A general coefficient of similarity and some of its
properties. Biometrics, pp.857--871.
}
\seealso{
Other imputation: \code{\link{impute_cart}},
  \code{\link{impute_lm}}, \code{\link{impute}}
}
\concept{imputation}
