% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/train_spectra.R
\name{train_spectra}
\alias{train_spectra}
\title{Train a model to predict reference values with spectral data}
\usage{
train_spectra(
  df,
  num.iterations,
  test.data = NULL,
  k.folds = 5,
  proportion.train = 0.7,
  tune.length = 50,
  model.method = "pls",
  best.model.metric = "RMSE",
  stratified.sampling = TRUE,
  cv.scheme = NULL,
  trial1 = NULL,
  trial2 = NULL,
  trial3 = NULL,
  split.test = FALSE,
  seed = 1,
  verbose = TRUE,
  save.model = deprecated(),
  rf.variable.importance = deprecated(),
  output.summary = deprecated(),
  return.model = deprecated()
)
}
\arguments{
\item{df}{\code{data.frame} object. First column contains unique identifiers,
second contains reference values, followed by spectral columns. Include no
other columns to right of spectra! Column names of spectra must start with
"X" and reference column must be named "reference"}

\item{num.iterations}{Number of training iterations to perform}

\item{test.data}{\code{data.frame} with same specifications as \code{df}. Use
if specific test set is desired for hyperparameter tuning. If \code{NULL},
function will automatically train with a stratified sample of 70\%. Default
is \code{NULL}.}

\item{k.folds}{Number indicating the number of folds for k-fold
cross-validation during model training. Default is 5.}

\item{proportion.train}{Fraction of samples to include in the training set.
Default is 0.7.}

\item{tune.length}{Number delineating search space for tuning of the PLSR
hyperparameter \code{ncomp}. Must be set to 5 when using the random forest
algorithm (\code{model.method == "rf"}). Default is 50.}

\item{model.method}{Model type to use for training. Valid options include:
\itemize{ \item "pls": Partial least squares regression (Default) \item
"rf": Random forest \item "svmLinear": Support vector machine with linear
kernel \item "svmRadial": Support vector machine with radial kernel }}

\item{best.model.metric}{Metric used to decide which model is best. Must be
either "RMSE" or "Rsquared". Default is "RMSE".}

\item{stratified.sampling}{If \code{TRUE}, training and test sets will be
selected using stratified random sampling. This term is only used if
\code{test.data == NULL}. Default is \code{TRUE}.}

\item{cv.scheme}{A cross validation (CV) scheme from Jarquín et al., 2017.
Options for \code{cv.scheme} include:
\itemize{
    \item "CV1": untested lines in tested environments
    \item "CV2": tested lines in tested environments
    \item "CV0": tested lines in untested environments
    \item "CV00": untested lines in untested environments
}}

\item{trial1}{\code{data.frame} object that is for use only when
\code{cv.scheme} is provided. Contains the trial to be tested in subsequent
model training functions. The first column contains unique identifiers,
second contains genotypes, third contains reference values, followed by
spectral columns. Include no other columns to right of spectra! Column
names of spectra must start with "X", reference column must be named
"reference", and genotype column must be named "genotype".}

\item{trial2}{\code{data.frame} object that is for use only when
\code{cv.scheme} is provided. This data.frame contains a trial that has
overlapping genotypes with \code{trial1} but that were grown in a different
site/year (different environment). Formatting must be consistent with
\code{trial1}.}

\item{trial3}{\code{data.frame} object that is for use only when
\code{cv.scheme} is provided. This data.frame contains a trial that may or
may not contain genotypes that overlap with \code{trial1}. Formatting must
be consistent with \code{trial1}.}

\item{split.test}{Boolean that allows for a fixed training set and a split
test set. For example, train a model on data from two breeding programs and
a stratified subset (70\%) of a third, and test on the remaining samples
(30\%) of the third. If \code{FALSE}, the entire provided test set
\code{test.data} will remain as a testing set or, if none is provided, 30\%
of the provided \code{df} will be used for testing. Default is
\code{FALSE}.}

\item{seed}{Integer to be used internally as input for \code{set.seed()}.
Only used if \code{stratified.sampling = TRUE}. In all other cases, seed
is set to the current iteration number. Default is 1.}

\item{verbose}{If \code{TRUE}, the number of rows removed through filtering
will be printed to the console. Default is \code{TRUE}.}

\item{save.model}{DEPRECATED \code{save.model = FALSE} is no
longer supported; this function will always return a saved model.}

\item{rf.variable.importance}{DEPRECATED
\code{rf.variable.importance = FALSE} is no longer supported; variable
importance results are always returned if the \code{model.method} is
set to "pls" or "rf".}

\item{output.summary}{DEPRECATED \code{output.summary = FALSE}
is no longer supported; a summary of output is always returned alongside
the full performance statistics.}

\item{return.model}{DEPRECATED \code{return.model = FALSE}
is no longer supported; a trained model object is always returned
alongside the full performance statistics and summary.}
}
\value{
list of the following:
\enumerate{
  \item \code{model} is a model object trained with all rows of \code{df}.
  \item \code{summary.model.performance} is a \code{data.frame} with model
  performance statistics in summary format (2 rows, one with mean and one
  with standard deviation of all training iterations).
  \item \code{full.model.performance} is a \code{data.frame} with model
  performance statistics in long format
  (number of rows = \code{num.iterations}).
  \item \code{predictions} is a \code{data.frame} containing predicted values
  for each test set entry at each iteration of model training.
  \item \code{importance} is a \code{data.frame} that contains variable
  importance for each wavelength. Only available for \code{model.method}
  options "rf" and "pls".
  }
Included summary statistics:
\itemize{
  \item Tuned parameters depending on the model algorithm:
  \itemize{
    \item \strong{Best.n.comp}, the best number of components
    \item \strong{Best.ntree}, the best number of trees in an RF model
    \item \strong{Best.mtry}, the best number of variables to include at
    every decision point in an RF model
    }
  \item \strong{RMSECV}, the root mean squared error of cross-validation
  \item \strong{R2cv}, the coefficient of multiple determination of
  cross-validation for PLSR models
  \item \strong{RMSEP}, the root mean squared error of prediction
  \item \strong{R2p}, the squared Pearson’s correlation between predicted and
  observed test set values
  \item \strong{RPD}, the ratio of standard deviation of observed test set
  values to RMSEP
  \item \strong{RPIQ}, the ratio of performance to interquartile difference
  \item \strong{CCC}, the concordance correlation coefficient
  \item \strong{Bias}, the average difference between the predicted and
  observed values
  \item \strong{SEP}, the standard error of prediction
  \item \strong{R2sp}, the squared Spearman’s rank correlation between
  predicted and observed test set values
}
}
\description{
Trains spectral prediction models using one of several
algorithms and sampling procedures.
}
\examples{
\donttest{
library(magrittr)
ikeogu.2017 \%>\%
  dplyr::filter(study.name == "C16Mcal") \%>\%
  dplyr::rename(reference = DMC.oven,
                unique.id = sample.id) \%>\%
  dplyr::select(unique.id, reference, dplyr::starts_with("X")) \%>\%
  na.omit() \%>\%
  train_spectra(
    df = .,
    tune.length = 3,
    num.iterations = 3,
    best.model.metric = "RMSE",
    stratified.sampling = TRUE
  ) \%>\%
  summary()
}
}
\author{
Jenna Hershberger \email{jmh579@cornell.edu}
}
