\name{ptycho}
\alias{ptycho}
\alias{ptycho.all}
\title{Sample From Posterior Distributions}
\description{
  Generate MCMC samples from the posterior distribution.  Two interfaces are
  provided: \code{ptycho} generates samples for one design matrix and response
  matrix, while \code{ptycho.all} runs \code{ptycho} in batch over an object
  generated by \code{\link{createData}}.
}
\usage{
ptycho(X, y, initStates, groups = NULL,
       tau.min = 0.01, tau.max = 10, tau.sd = (tau.max - tau.min)/4,
       doGPrior = TRUE, doDetPrior = FALSE, prob.varadd = 0.5,
       isOmegaFixed = FALSE, omega = NULL, omega.grp = NULL,
       probs.grp = NULL, rho.alpha = 10, rho.lambda = rho.alpha,
       only.means = FALSE, nburn = 0, nthin = 1, nSavePerChain, ncpu = 1,
       chainIterator = ifelse(ncpu == 1, "chainLoop", "chainLoopRNG"))
ptycho.all(data, across=c("none","traits","sites"), doGrpIndicator,
           dir.out, nreplicates=NULL, ncpu.replicates=1,
           replicateIterator=ifelse(ncpu.replicates==1,
                                    "replicateLoop","replicateLoopMC"),
           doSetSeed=TRUE, ncolumns=NULL, ...)
}
\arguments{
  \item{X}{\eqn{n}-by-\eqn{p} design matrix}
  \item{y}{\eqn{n}-by-\eqn{q} matrix containing response(s)}
  \item{initStates}{List containing initial states for chains.  Each state is a
   list with components:
    \describe{
      \item{\code{indic.var}}{\eqn{p}-by-\eqn{q} logical matrix.  If \eqn{(j,k)}
        entry is \code{TRUE}, then covariate \eqn{j} is initially in the model
        for response \eqn{k}.}
      \item{\code{tau}}{Scalar}
      \item{\code{indic.grp}}{Logical vector of length equal to the number of
        groups; analogous to \code{indic.var}; \code{NULL} to use priors that do
        not incorporate a second-level indicator variable
    }
  }}
  \item{groups}{To combine information across variants, list containing
    \describe{
      \item{\code{var2group}}{Integer vector of length \eqn{p}, with entry
        \eqn{j} being the index of the group containing covariate \eqn{j}}
      \item{\code{group2var}}{List of length \eqn{G}, each entry of which is an
        integer vector containing the indices of the covariates belonging to
        that group}
      \item{\code{sizes}}{Vector of length \eqn{G} containing the number of
          covariates in each group}
    }
    Otherwise, \code{NULL}.
  }
  \item{tau.min, tau.max}{Endpoints of uniform prior distribution on \code{tau}}
  \item{tau.sd}{Standard deviation of the Metropolis-Hastings proposal
    distribution for \code{tau}}
  \item{doGPrior}{Logical indicating whether to use the g-prior for effect
    sizes}
  \item{doDetPrior}{Unsupported; use default value}
  \item{prob.varadd}{If \code{initStates[[1]]$indic.grp} is \code{NULL}, the
    probability that the Metropolis-Hastings proposal changes one entry of
    \code{indic.var} from \code{FALSE} to \code{TRUE}.  Otherwise, the
    probability of this event given that the proposal does not change
    \code{indic.grp}.}
  \item{isOmegaFixed}{Logical indicating whether \code{omega} is known}
  \item{omega}{If \code{isOmegaFixed} is \code{TRUE}, a \eqn{p}-by-\eqn{q}
    matrix containing the known probabilities.  Otherwise, a matrix containing
    the parameters for the Beta prior distribution on \code{omega}.  Such a
    matrix has columns \dQuote{A} and \dQuote{B}; the number of rows should be:
      \itemize{
       \item 1 if \eqn{q=1} and \code{initStates[[1]]$indic.grp} is \code{NULL},
       \item \code{length(groups$group2var)} if that is nonzero, or
       \item \eqn{p} otherwise.
      }
    If \code{omega} is \code{NULL} and \code{isOmegaFixed} is \code{FALSE},
    defaults to uniform priors.}
  \item{omega.grp}{If \code{isOmegaFixed} is \code{TRUE}, the known probability
    that entries in \code{indic.grp} are \code{TRUE}.  Otherwise, a vector with
    names \dQuote{A} and \dQuote{B} containing the parameters for the Beta prior
    distribution on \code{omega.grp}.  If \code{NULL}, defaults to uniform
    priors.  Unused if \code{initStates[[1]]$indic.grp} is \code{NULL}.}
  \item{probs.grp}{Vector containing the probabilities that the
    Metropolis-Hastings proposal will add, leave unchanged, or remove,
    respectively, a group.  If \code{NULL}, defaults to \code{c(0.25,0.5,0.25)}.
    Unused if \code{initStates[[1]]$indic.grp} is \code{NULL}.}
  \item{rho.alpha, rho.lambda}{Parameters for the Gamma prior distribution on
    \eqn{\rho}, which is the precision of the noise.  Here, the
    Gamma\eqn{(\alpha,\lambda)} distribution has density function proportional
    to \eqn{x^{\alpha-1} e^{-\lambda x}}.}
  \item{only.means}{If \code{FALSE}, return the samples; if \code{TRUE}, return
    only the means of the samples in each chain.  Can also be a vector
    containing the iterations (after the burn-in interval) at which to save the
    running means.}
  \item{nburn}{Number of MCMC samples to make before starting to save samples or
    to compute means}
  \item{nthin}{Interval between saved samples; default value 1 saves all
    samples.  Unused if \code{only.means} is \code{TRUE} or a vector.}
  \item{nSavePerChain}{If \code{only.means} is \code{FALSE}, number of MCMC
    samples to return from each chain, which means a total of
    \code{nthin * nSavePerChain + nburn} samples are drawn per chain.  If
    \code{only.means} is \code{TRUE}, then \code{nSavePerChain + nburn} samples
    are drawn, and only the averages of the last \code{nSavePerChain} samples
    are returned.  Unused if \code{only.means} is not a logical.}
  \item{ncpu}{Number of CPUs over which to parallelize chains}
  \item{chainIterator}{Name of function for looping over chains.  Package
    includes \code{chainLoop}, which runs chains serially (ignoring
    \code{ncpu}), and \code{chainLoopRNG}, which uses multicores as implemented
    by the \pkg{doMC} package and also uses the \pkg{doRNG} package to enable
    reproducibility.  See \code{\link{mcmcLoop}} for information about writing
    functions to use different parallelization packages.}
  \item{data}{Data in format output by \code{\link{createData}}}
  \item{across}{Whether to combine information across traits, sites, or
    neither}
  \item{doGrpIndicator}{Whether to use priors that incorporate \code{indic.grp}}
  \item{dir.out}{Directory to which to \code{\link{save}} samples or means}
  \item{nreplicates}{Vector of replicates to run; if \code{NULL}, all will be run}
  \item{ncpu.replicates}{Number of CPUs over which to parallelize replicates}
  \item{replicateIterator}{Name of function for looping over replicates.
    Package includes \code{replicateLoop} to run replicates serially and
    \code{replicateLoopMC}, which uses multicores as implemented by the
    \pkg{doMC} package.  See \code{\link{mcmcLoop}} for information about
    writing functions to use different parallelization packages.}
  \item{doSetSeed}{If \code{TRUE}, call \code{set.seed(n.repl)} before running
    samples.}
  \item{ncolumns}{Scalar.  If \code{across} is \dQuote{none} or
    \dQuote{sites}, each of the first \code{ncolumns} of \code{repl$y} will
    be used in turn, running all columns by default.  Ignored if \code{across}
    is \dQuote{traits}.}
  \item{\dots}{Additional arguments passed to \code{ptycho}}
}
\details{
  These functions run MCMC sampling from the posterior of the linear regression
  models using hierarchical priors described in Stell and Sabatti (2015).  The
  function \code{ptycho.all} is a wrapper of \code{ptycho} to simplify running
  the simulation experiments in that paper.  These functions determine which
  priors to use as follows:
  \itemize{
    \item Standard spike and slab priors that do not combine information
    (\emph{Basic})\cr
    For \code{ptycho.all}, argument \code{across} is \dQuote{none} and
    \code{doGrpIndicator} is \code{FALSE}.\cr
    For \code{ptycho}, argument \code{y} has one column, \code{groups} is
    \code{NULL}, and \code{indic.grp} is \code{NULL} or missing in each entry of
    \code{initStates}.
    \item Combine information across traits (\emph{Across Traits})\cr
    For \code{ptycho.all}, argument \code{across} is \dQuote{traits} and
    \code{doGrpIndicator} is \code{TRUE}.\cr
    For \code{ptycho}, argument \code{y} has \eqn{q > 1} columns, \code{groups}
    is \code{NULL}, and \code{indic.grp} is a logical vector of length \eqn{p}
    in each entry of \code{initStates}.
    \item Combine information across variants (\emph{Across Sites})\cr
    For \code{ptycho.all}, argument \code{across} is \dQuote{sites} and
    \code{doGrpIndicator} is \code{TRUE}.\cr
    For \code{ptycho}, argument \code{y} has one column, \code{groups} specifies
    how to combine information, and \code{indic.grp} in each entry of
    \code{initStates} is a logical vector of the same length as
    \code{groups$group2var}.
    \item Combine information across traits \emph{incorrectly}
    (\emph{Unadjusted})\cr
    For \code{ptycho.all}, argument \code{across} is \dQuote{traits} and
    \code{doGrpIndicator} is \code{FALSE}.\cr
    For \code{ptycho}, argument \code{y} has \eqn{q > 1} columns, \code{groups}
    is \code{NULL}, and \code{indic.grp} is \code{NULL} in each entry of
    \code{initStates}.\cr
    This prior does not properly correct for multiple hypothesis testing and is
    only included because it is needed to reproduce results in Stell and Sabatti
    (2015).
  }
  Combining information across both phenotypes and variants is planned for a
  future release.  These functions perform some checks for compatibility of
  \code{X}, \code{y}, \code{groups}, and \code{initStates}; but invalid input
  could lead to unpredictable behavior.  Singular \eqn{X} can result in an
  error; even strongly correlated covariates can cause difficulties as described
  by Stell and Sabatti (2015).

The MCMC sampler is written in R.  Our actual data has 5335 subjects, 764
variants and three traits.  An \code{mcmc.list} containing 50,000 samples for
each of four chains can take about 5 GB.  Running chains in parallel, it takes
less than an hour (on a Linux computer with 2.6 GHz processors) to perform
510,000 samples per chain.  The run time depends primarily on the number of
entries that are \code{TRUE} in the sampled \code{indic.var} matrices;
increasing this will increase run times.  A chain that initially has all entries
of \code{indic.var} set to \code{TRUE} will take longer than one where the model
is initially empty.  Priors that inflate the posterior expectation of
\code{indic.var[j,k]} (such as combining information across responses without
using \code{indic.grp}) will also take longer.

The simplest way to run the simulations in Stell and Sabatti (2015) is, for
example,
\preformatted{
  data <- createPubData("pleiotropy")
  ptycho.all(data=data, across="traits", doGrpIndicator=TRUE,
             dir.out="/path/to/output/dir/",
             only.means=50000*(1:10), nburn=10000)
  ptycho.all(data=data, across="sites", doGrpIndicator=TRUE,
             dir.out="/path/to/another/dir/",
             groups=createGroupsSim(G=10, ncol(data$X)),
             only.means=50000*(1:10), nburn=10000)
}
}
\value{
  The results of \code{ptycho.all} are written to files by \code{\link{save}}.
  For priors that use only one response, the output for replicate \eqn{r} and
  column \eqn{c} will be written to \file{rpl<r>col<c>.Rdata} in the directory
  specified by \code{dir.out}.  For priors that use multiple responses,
  \code{ptycho} is called only once for each replicate, and the file name will
  be \file{rpl<r>col1.Rdata}.  The object in each such file has the name
  \code{smpl} and is the value of a call to \code{ptycho}.  The format of these
  objects depends upon the argument \code{only.means}.  In all cases, however,
  it has attribute \code{params} set to a list containing most of the arguments
  in the call to \code{ptycho}.

  If \code{only.means} is \code{FALSE}, then \code{ptycho} returns an
  \code{\link{mcmc.list}} whose length is the same as the length of
  \code{initStates}.  Each entry in this list is an \code{\link{mcmc}} object
  with \code{nSavePerChain} rows and a column for each entry of \code{indic.var}
  and \code{indic.grp} plus a column for \code{tau}.

  Otherwise, \code{ptycho} returns an object of class \code{ptycho}, which is
  actually a matrix.  The matrix has a column for each sampled indicator
  variable, for \code{tau} and its square (so that its variance can be
  computed), and for the chain and iteration numbers.  If \code{only.means} is
  \code{TRUE}, then each row contains the means of the samples in one chain and
  there will be \code{length(initStates)} rows.  If \code{only.means} is a
  vector, then there will be \code{length(initStates) * length(only.means)}
  rows.
}
\author{
  Laurel Stell and Chiara Sabatti\cr
  Maintainer: Laurel Stell \email{lstell@stanford.edu}
}
\references{
  Stell, L. and Sabatti, C. (2015) Genetic variant selection: learning across
  traits and sites, arXiv:1504.00946.
}
\seealso{
  \code{\link{createData}} for simulating input data.

  \code{\link{checkConvergence}} and \link{PosteriorStatistics} for analyzing
  output of \code{ptycho}.
  
  \link{Data} describes \code{tinysim} in example below as well as an object
  created with \code{ptycho}.
}
\examples{
# Load the example data set tinysim (described on the Data help page).
data(tinysim)
# Use replicate 4.
# p is the number of covariates, i.e. the number of columns of the
# design matrix X.
X <- tinysim$X; p <- ncol(X); nr <- 4
# COMBINE INFORMATION ACROSS RESPONSES
# Y holds all responses for replicate nr; q is the number of responses.
Y <- tinysim$replicates[[nr]]$y; q <- ncol(Y)
# Run 2 chains.
# Each initial state is a list with components indic.grp, indic.var and tau.
# The first chain starts with every indicator FALSE, the second with every
# indicator TRUE.  Here indic.grp has one entry per covariate and indic.var
# is a p-by-q matrix.
state <- list(list(indic.grp=rep(FALSE,p),
                   indic.var=matrix(FALSE,nrow=p,ncol=q), tau=1),
              list(indic.grp=rep(TRUE,p),
                   indic.var=matrix(TRUE,nrow=p,ncol=q), tau=1))
# In each chain, discard first 10 burn-in samples, then generate
# 100 samples and save running means after every 20 samples.
# Because only.means is a vector, the value is an object of class ptycho
# with length(state) * length(only.means) = 10 rows.
smpl.ph <- ptycho(X=X, y=Y, initStates=state, only.means=20*(1:5),
                  nburn=10)
# COMBINE INFORMATION ACROSS VARIANTS
# Use two groups of variants.
# See createGroupsSim for how the p covariates are assigned to the G groups.
G <- 2; groups <- createGroupsSim(G, p)
# Run 2 chains.
# indic.grp now has one entry per group, and indic.var has a single column
# because only one response is used.
state <- list(list(indic.grp=rep(FALSE,G),
                   indic.var=matrix(FALSE,nrow=p,ncol=1), tau=1),
              list(indic.grp=rep(TRUE,G),
                   indic.var=matrix(TRUE,nrow=p,ncol=1), tau=1))
# Use response 3.
# drop=FALSE keeps y as a one-column matrix rather than a plain vector.
y <- tinysim$replicates[[nr]]$y[,3,drop=FALSE]
# Same burn-in and saving schedule as above.  nthin is unused when
# only.means is a vector, and 1 is its default value anyway.
smpl.var <- ptycho(X=X, y=y, groups=groups, initStates=state,
                   only.means=c(20*(1:5)), nburn=10, nthin=1)
}
\keyword{models}
\keyword{regression}
