% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/spmtree.R
\name{spmtree}
\alias{spmtree}
\title{Simple Precision Medicine Tree}
\usage{
spmtree(
  formula,
  data,
  types = NULL,
  nmin = 5,
  maxdepth = Inf,
  print = TRUE,
  dataframe = FALSE,
  prune = FALSE
)
}
\arguments{
\item{formula}{A description of the model to be fit with format
\code{Y ~ treatment | X1 + X2} for data with a
continuous outcome variable Y and 
\code{Surv(Y, delta) ~ treatment | X1 + X2} for data with
a right-censored survival outcome variable Y and
a status indicator delta}

\item{data}{A matrix or data frame of the data}

\item{types}{A vector, data frame, or matrix of the types
of each variable in the data; if left blank, the
default is to assume all of the candidate split
variables are ordinal; otherwise, all variables in 
the data must be specified, and the possible variable 
types are: "response", "treatment", "status", "binary", 
"ordinal", and "nominal" for outcome variable Y, the 
treatment variable, the status indicator (if 
applicable), binary candidate split variables, ordinal
candidate split variables, and nominal candidate split
variables respectively}

\item{nmin}{An integer specifying the minimum node size of
the overall classification tree}

\item{maxdepth}{An integer specifying the maximum depth of the
overall classification tree; this argument is 
optional but useful for shortening computation 
time; if left blank, the default is to grow the 
full tree until the minimum node size \code{nmin} 
is reached}

\item{print}{A boolean (TRUE/FALSE) value, where TRUE prints
a more readable version of the final tree to the
screen}

\item{dataframe}{A boolean (TRUE/FALSE) value, where TRUE returns
the final tree as a data frame instead of a \code{party} object}

\item{prune}{A boolean (TRUE/FALSE) value, where TRUE prunes
the final tree using the \code{pmprune} function}
}
\value{
\code{spmtree} returns the final classification tree as a
        \code{party} object by default, or as a data frame when
        \code{dataframe = TRUE}. See Hothorn and Zeileis (2015)
        for details on \code{party} objects. The data frame
        contains the following columns of information:
        \item{node}{Unique integer values that identify each node
                    in the tree, where all of the nodes are
                    indexed starting from 1}
        \item{splitvar}{Integers that represent the candidate split
                        variable used to split each node, where
                        all of the variables are indexed starting
                        from 1; for terminal nodes, i.e., nodes
                        without child nodes, the value is set 
                        equal to NA}
        \item{splitvar_name}{The names of the candidate split 
                             variables used to split each node
                             obtained from the column names of the
                             supplied data; for terminal nodes,
                             the value is set equal to NA}
        \item{type}{Characters that denote the type of each 
                    candidate split variable; "bin" is for binary
                    variables, "ord" for ordinal, and "nom" for
                    nominal; for terminal nodes, the value is set
                    equal to NA}
        \item{splitval}{Values of the left child node of the 
                        current split/node; for binary variables,
                        a value of 0 is printed, and subjects with
                        values of 0 for the current \code{splitvar}
                        are in the left child node, while subjects
                        with values of 1 are in the right child
                        node; for ordinal variables,
                        \code{splitval} is numeric and implies
                        that subjects with values of the current
                        \code{splitvar} less than or equal to
                        \code{splitval} are in the left child 
                        node, while the remaining subjects with 
                        values greater than \code{splitval} are in 
                        the right child node; for nominal
                        variables, the \code{splitval} is a set of
                        integers separated by commas, and subjects
                        in that set of categories are in the left
                        child node, while the remaining subjects
                        are in the right child node; for terminal
                        nodes, the value is set equal to NA}
        \item{lchild}{Integers that represent the index (i.e.,
                      \code{node} value) of each node's left
                      child node; for terminal nodes, the value is
                      set equal to NA}
        \item{rchild}{Integers that represent the index (i.e.,
                      \code{node} value) of each node's right
                      child node; for terminal nodes, the value is
                      set equal to NA}
        \item{depth}{Integers that specify the depth of each
                     node; the root node has depth 1, its 
                     children have depth 2, etc.}
        \item{nsubj}{Integers that count the total number of
                     subjects within each node}
        \item{besttrt}{Integers that denote the identified best 
                       treatment assignment of each node}
}
\description{
This function creates a classification tree
             designed to identify subgroups in which subjects
             perform especially well or especially poorly in a
             given treatment group.
}
\details{
To identify the best split at each node of the
         classification tree, all possible splits of all
         candidate split variables are considered. The single
         split with the highest split criterion score is
         identified as the best split of the node. For data with
         a continuous outcome variable, the split criterion is the
         DIFF value that was first proposed for use in the
         relative-effectiveness based method (Zhang et al. (2010),
         Tsai et al. (2016)). For data with a survival outcome
         variable, the split criterion is the squared test
         statistic that tests the significance of the
         split-by-treatment interaction term in a Cox proportional
         hazards model.

         When using \code{spmtree}, note the following
         requirements for the supplied data. First, the dataset
         must contain an outcome variable Y and a treatment
         variable. If Y is a right-censored survival time
         outcome, then there must also be a status indicator
         delta, where values of 1 denote the occurrence of the 
         (harmful) event of interest, and values of 0 denote
         censoring. If there are only two treatment groups, then
         the two possible values must be 0 or 1. If there are
         more than two treatment groups, then the possible values
         must be integers starting from 1 to the total number of
         treatment assignments. In regard to the candidate split
         variables, if a variable is binary, then the variable
         must take values of 0 or 1. If a variable is nominal,
         then the values must be integers starting from 1 to the
         total number of categories. There cannot be any missing
         values in the dataset. For candidate split variables
         with missing values, the missings together (MT) method
         proposed by Zhang et al. (1996) is helpful.
}
\examples{

#
# ... an example with a continuous outcome variable
#     and two treatment groups
#

# number of subjects; the seed is fixed so the simulated data are reproducible
N = 300
set.seed(123)

# generate binary treatments
# (each subject is assigned to group 0 or 1 with probability 0.5)
treatment = rbinom(N, 1, 0.5)

# generate candidate split variables
# (five standard normal covariates, stored as the columns of the matrix X)
X1 = rnorm(n = N, mean = 0, sd = 1)
X2 = rnorm(n = N, mean = 0, sd = 1)
X3 = rnorm(n = N, mean = 0, sd = 1)
X4 = rnorm(n = N, mean = 0, sd = 1)
X5 = rnorm(n = N, mean = 0, sd = 1)
X = cbind(X1, X2, X3, X4, X5)
colnames(X) = paste0("X", 1:5)

# generate continuous outcome variable
calculateLink = function(X, treatment){

    # region indicators: the first split is on X1, the second on X2 or X3
    lowX1 = X[, 1] <= 0
    highX1 = X[, 1] > 0
    lowX2 = X[, 2] <= 0
    highX2 = X[, 2] > 0
    lowX3 = X[, 3] <= 0
    highX3 = X[, 3] > 0

    # equals 1 for subjects with treatment 0 and 0 for subjects with treatment 1
    control = 1 - treatment

    # exactly one of the four subgroup terms is nonzero for each subject;
    # within a subgroup the mean depends on the treatment received
    (lowX1 & lowX2) * (25 * control + 8 * treatment) +
    (lowX1 & highX2) * (18 * control + 20 * treatment) +
    (highX1 & lowX3) * (20 * control + 18 * treatment) +
    (highX1 & highX3) * (8 * control + 25 * treatment)
}

# outcome: normal noise with sd 1 around the subgroup-specific mean
Link = calculateLink(X, treatment)
Y = rnorm(N, mean = Link, sd = 1)

# combine variables in a data frame
data = data.frame(X, Y, treatment)

# fit a classification tree
# (types is left at its default, so all candidate split variables are
#  treated as ordinal; maxdepth = 3 limits the depth of the tree)
tree1 = spmtree(Y ~ treatment | ., data, maxdepth = 3)
# predict optimal treatment for new subjects
# (FUN reads the opt_trt entry stored in the info of each node and
#  converts it to a number)
predict(tree1, newdata = head(data), 
FUN = function(n)  as.numeric(n$info$opt_trt))

\donttest{
#
# ... an example with a continuous outcome variable
#     and three treatment groups
#

# number of subjects; the seed is fixed so the simulated data are reproducible
N = 600
set.seed(123)

# generate treatments
# (three groups coded 1, 2, 3, drawn with equal probability)
treatment = sample(1:3, N, replace = TRUE)

# generate candidate split variables
# X1, X2: standard normal draws rounded to 4 decimal places
# X3, X4: categories coded 1 to 4 and 1 to 5
# X5, X6, X7: 0/1 variables with success probability 0.5
X1 = round(rnorm(n = N, mean = 0, sd = 1), 4)
X2 = round(rnorm(n = N, mean = 0, sd = 1), 4)
X3 = sample(1:4, N, replace = TRUE)
X4 = sample(1:5, N, replace = TRUE)
X5 = rbinom(N, 1, 0.5)
X6 = rbinom(N, 1, 0.5)
X7 = rbinom(N, 1, 0.5)
X = cbind(X1, X2, X3, X4, X5, X6, X7)
colnames(X) = paste0("X", 1:7)

# generate continuous outcome variable
calculateLink = function(X, treatment){

    # only treatment 1 shifts the intercept and interacts with X1;
    # all other treatment groups share the same mean function
    onTrt1 = (treatment == 1)
    x1 = X[, 1]
    x2 = X[, 2]

    10.2 - 0.3 * onTrt1 - 0.1 * x1 +
    2.1 * onTrt1 * x1 +
    1.2 * x2
}

# outcome: normal noise with sd 1 around the mean returned by calculateLink
Link = calculateLink(X, treatment)
Y = rnorm(N, mean = Link, sd = 1)

# combine variables in a data frame
data = data.frame(X, Y, treatment)

# create vector of variable types
# (one entry per column of data, listed in the same order as the columns:
#  X1-X2 ordinal, X3-X4 nominal, X5-X7 binary, then Y and treatment)
types = c(rep("ordinal", 2), rep("nominal", 2), rep("binary", 3),
        "response", "treatment")

# fit a classification tree
# (maxdepth is not set, so the tree is grown until the minimum node
#  size nmin is reached)
tree2 = spmtree(Y ~ treatment | ., data, types = types)

#
# ... an example with a survival outcome variable
#     and two treatment groups
#

# number of subjects; the seed is fixed so the simulated data are reproducible
N = 300
set.seed(321)

# generate binary treatments
# (each subject is assigned to group 0 or 1 with probability 0.5)
treatment = rbinom(N, 1, 0.5)

# generate candidate split variables
# (five standard normal covariates, stored as the columns of the matrix X)
X1 = rnorm(n = N, mean = 0, sd = 1)
X2 = rnorm(n = N, mean = 0, sd = 1)
X3 = rnorm(n = N, mean = 0, sd = 1)
X4 = rnorm(n = N, mean = 0, sd = 1)
X5 = rnorm(n = N, mean = 0, sd = 1)
X = cbind(X1, X2, X3, X4, X5)
colnames(X) = paste0("X", 1:5)

# generate survival outcome variable
calculateLink = function(X, treatment){

    # part of the link that does not involve treatment
    mainEffect = X[, 1] + 0.5 * X[, 3]

    # treatment coded 0/1 is mapped to -1.5/+1.5, and its effect is
    # scaled by how far the absolute value of X5 lies from 0.67
    trtContrast = 3 * treatment - 1.5
    modifier = abs(X[, 5]) - 0.67

    mainEffect + trtContrast * modifier
}

Link = calculateLink(X, treatment)
# event times are exponential with rate exp(-Link); the variable is named
# eventTime rather than T so that the built-in shorthand T for TRUE is
# not masked in the session that runs this example
eventTime = rexp(N, exp(-Link))
# censoring times are exponential with a rate that depends on X5 and X2
C0 = rexp(N, 0.1 * exp(X[, 5] + X[, 2]))
# observed time is the earlier of the event time and the censoring time
Y = pmin(eventTime, C0)
# status indicator: TRUE when the event is observed, FALSE when censored
delta = (eventTime <= C0)

# combine variables in a data frame
data = data.frame(X, Y, delta, treatment)

# fit a classification tree
# (Surv(Y, delta) on the left-hand side marks Y as a right-censored
#  survival outcome with status indicator delta; maxdepth = 2 limits
#  the depth of the tree)
tree3 = spmtree(Surv(Y, delta) ~ treatment | ., data, maxdepth = 2)

#
# ... an example with a survival outcome variable
#     and four treatment groups
#

# number of subjects; the seed is fixed so the simulated data are reproducible
N = 800
set.seed(321)

# generate treatments
# (four groups coded 1, 2, 3, 4, drawn with equal probability)
treatment = sample(1:4, N, replace = TRUE)

# generate candidate split variables
# X1, X2: standard normal draws rounded to 4 decimal places
# X3, X4: categories coded 1 to 4 and 1 to 5
# X5, X6, X7: 0/1 variables with success probability 0.5
X1 = round(rnorm(n = N, mean = 0, sd = 1), 4)
X2 = round(rnorm(n = N, mean = 0, sd = 1), 4)
X3 = sample(1:4, N, replace = TRUE)
X4 = sample(1:5, N, replace = TRUE)
X5 = rbinom(N, 1, 0.5)
X6 = rbinom(N, 1, 0.5)
X7 = rbinom(N, 1, 0.5)
X = cbind(X1, X2, X3, X4, X5, X6, X7)
colnames(X) = paste0("X", 1:7)

# generate survival outcome variable
calculateLink = function(X, treatment){

    # only treatment 1 shifts the link and interacts with X1;
    # the other three treatment groups share the same link function
    -0.2 * (treatment == 1) -
    1.1 * X[, 1] +
    1.2 * (treatment == 1) * X[, 1] +
    1.2 * X[, 2]
}

Link = calculateLink(X, treatment)
# event times are Weibull with shape 2 and scale exp(Link); the variable
# is named eventTime rather than T so that the built-in shorthand T for
# TRUE is not masked in the session that runs this example
eventTime = rweibull(N, shape = 2, scale = exp(Link))
# censoring times are exponential with a subject-specific random rate
Cnoise = runif(n = N) + runif(n = N)
C0 = rexp(N, exp(0.3 * -Cnoise))
# observed time is the earlier of the event time and the censoring time
Y = pmin(eventTime, C0)
# status indicator: TRUE when the event is observed, FALSE when censored
delta = (eventTime <= C0)

# combine variables in a data frame
data = data.frame(X, Y, delta, treatment)

# create vector of variable types
# (one entry per column of data, listed in the same order as the columns:
#  X1-X2 ordinal, X3-X4 nominal, X5-X7 binary, then Y, delta and treatment)
types = c(rep("ordinal", 2), rep("nominal", 2), rep("binary", 3),
        "response", "status", "treatment")

# fit two classification trees
# (tree4 lists the candidate split variables with a dot, while tree5 names
#  only X3 and X4 after the bar; both calls pass the full types vector
#  and limit the depth of the tree to 2)
tree4 = spmtree(Surv(Y, delta) ~ treatment | ., data, types = types, maxdepth = 2)
tree5 = spmtree(Surv(Y, delta) ~ treatment | X3 + X4, data, types = types,
             maxdepth = 2)
}
}
\references{
Chen, V., Li, C., and Zhang, H. (2022). The dipm R 
            package: implementing the depth importance in 
            precision medicine (DIPM) tree and forest based method.
            \emph{Manuscript}.
            
            Chen, V. and Zhang, H. (2022). Depth importance in 
            precision medicine (DIPM): A tree- and forest-based 
            method for right-censored survival outcomes. 
            \emph{Biostatistics} \strong{23}(1), 157-172.
            
            Chen, V. and Zhang, H. (2020). Depth importance in 
            precision medicine (DIPM): a tree and forest based method. 
            In \emph{Contemporary Experimental Design, 
            Multivariate Analysis and Data Mining}, 243-259.

            Tsai, W.-M., Zhang, H., Buta, E., O'Malley, S., and
            Gueorguieva, R. (2016). A modified classification
            tree method for personalized medicine decisions.
            \emph{Statistics and its Interface} \strong{9}, 
            239-253.

            Zhang, H., Holford, T., and Bracken, M.B. (1996).
            A tree-based method of analysis for prospective
            studies. \emph{Statistics in Medicine} \strong{15},
            37-49.

            Zhang, H., Legro, R.S., Zhang, J., Zhang, L., Chen,
            X., et al. (2010). Decision trees for identifying
            predictors of treatment effectiveness in clinical
            trials and its application to ovulation in a study of
            women with polycystic ovary syndrome. \emph{Human
            Reproduction} \strong{25}, 2612-2621.
            
            Hothorn, T. and Zeileis, A. (2015). partykit: 
            a modular toolkit for recursive partytioning in R. 
            \emph{The Journal of Machine Learning Research} 
            \strong{16}(1), 3905-3909.
}
\seealso{
\code{\link{dipm}}
}
