## ----setup, include = FALSE----------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ---- results='hide'-----------------------------------------------------
# install the "adaptMT" package from github.
# will be submitted to CRAN very soon.
# devtools::install_github("lihualei71/adaptMT")
library("adaptMT")

## ---- out.width = "75%", out.height = "250px", fig.align = "center", echo = FALSE----
knitr::include_graphics('flowchart_AdaPT.pdf')

## ----results='hide'------------------------------------------------------
# load the data.
data("estrogen")
# Take the first 5000 genes
if (!requireNamespace("dplyr")){
  install.packages("dplyr")
}
library("dplyr")
estrogen <- select(estrogen, pvals, ord_high) %>% 
  filter(ord_high <= 5000)
rownames(estrogen) <- NULL

## ------------------------------------------------------------------------
head(estrogen, 5)

## ------------------------------------------------------------------------
summary(estrogen)

## ---- echo=FALSE, fig.height=5, fig.width=7------------------------------
  plot(estrogen$ord, estrogen$pvals, pch = ".", xlab = "order", ylab = "p-values", xaxs = "i", yaxs = "i", main = "Scatter plot of p-values in the (sub-sampled) estrogen dataset")

## ------------------------------------------------------------------------
# prepare the inputs of AdaPT
# need "splines" package to construct the formula for glm
library("splines")
pvals <- as.numeric(estrogen$pvals)
x <- data.frame(x = as.numeric(estrogen$ord))
formulas <- paste0("ns(x, df = ", 6:10, ")")
formulas

## ---- warning=FALSE------------------------------------------------------
# run AdaPT
res_glm <- adapt_glm(x = x, pvals = pvals, pi_formulas = formulas, mu_formulas = formulas)

## ---- echo=FALSE, fig.height=5, fig.width=7------------------------------
par(mfrow = c(2, 3))
for (alpha in seq(0.3, 0.05, -0.05)){
  nrejs <- res_glm$nrejs[floor(alpha * 100)]
  title <- paste0("alpha = ", alpha, ", nrejs = ", nrejs)
  plot_1d_thresh(res_glm, x, pvals, alpha, title, disp_ymax = 1, xlab = "order")
}

## ---- fig.height=5, fig.width=7------------------------------------------
par(mfrow = c(2, 3))
for (alpha in seq(0.3, 0.05, -0.05)){
  nrejs <- res_glm$nrejs[floor(alpha * 100)]
  title <- paste0("alpha = ", alpha, ", nrejs = ", nrejs)
  plot_1d_lfdr(res_glm, x, pvals, alpha, title, disp_ymax = 0.25, xlab = "order")
}

## ------------------------------------------------------------------------
# Generate a 2-dim x
n <- 400
x1 <- x2 <- seq(-100, 100, length.out = 20)
x <- expand.grid(x1, x2)
colnames(x) <- c("x1", "x2")

# Generate p-values (one-sided z test)
H0 <- apply(x, 1, function(coord){sum(coord^2) < 40^2})
mu <- ifelse(H0, 2, 0)
set.seed(0)
zvals <- rnorm(n) + mu
pvals <- 1 - pnorm(zvals)

## ------------------------------------------------------------------------
# install.packages("mgcv")
library("mgcv")

## ------------------------------------------------------------------------
formula <- "s(x1, x2)"
res_gam <- adapt_gam(x = x, pvals = pvals, pi_formulas = formula, mu_formulas = formula)

## ---- fig.height=3.5, fig.width=7----------------------------------------
par(mfrow = c(1, 2), mar = c(4, 4, 2, 2))
# Truth
plot(x[, 1], x[, 2], type = "p", xlab = "", ylab = "", col = ifelse(H0, "#000000", "#A9A9A9"), cex = 2, pch = 15)
# Rejection set
rej <- pvals <= res_gam$s[, 10]
plot(x[, 1], x[, 2], type = "p", xlab = "", ylab = "", col = ifelse(rej, "#800000", "#FFB6C1"), cex = 2, pch = 15)

## ---- fig.height=4.5, fig.width=6----------------------------------------
plot_2d_thresh(res_gam, x, pvals, 0.1, "P-Value Threshold", xlab = "", ylab = "")

## ---- fig.height=4.5, fig.width=6----------------------------------------
par(mfrow = c(2, 2), mar = c(4, 4, 2, 2))
for (targetp in c(0.001, 0.005, 0.01, 0.05)){
  title <- paste0("Local FDR Estimates (p = ", targetp, ")")
  plot_2d_lfdr(res_gam, x, pvals, 0.1, title, targetp, xlab = "", ylab = "")
}

## ------------------------------------------------------------------------
set.seed(0)
m <- 50
n <- 500
x <- matrix(runif(n * m), n, m)

## ------------------------------------------------------------------------
inv_logit <- function(x) {exp(x) / (1 + exp(x))}
pi <- inv_logit(x[, 1] * 2 + x[, 2] * 2 - 3)
mu <- pmax(1, x[, 1] * 2 + x[, 2] * 2)

## ------------------------------------------------------------------------
H0 <- as.logical(ifelse(runif(n) < pi, 1, 0))
y <- ifelse(H0, rexp(n, 1/mu), rexp(n, 1))
pvals <- exp(-y)

## ------------------------------------------------------------------------
res <- adapt_glmnet(x, pvals, s0 = rep(0.15, n), nfits = 5)

## ---- out.width = "450px", out.height = "250px", fig.align = "center", echo = FALSE----
knitr::include_graphics('flowchart_EM.pdf')

## ---- out.width = "500px", out.height = "250px", fig.align = "center", echo = FALSE----
knitr::include_graphics('flowchart_EM_ms.pdf')

## ------------------------------------------------------------------------
# g function
g <- function(x){
    tmp <- -log(x)
    pmax(pmin(tmp, -log(10^-15)), -log(1-10^-15))
}
# inverse function of g
ginv <- function(x){
    tmp <- exp(-x)
    pmax(pmin(tmp, 1 - 10^-15), 10^-15)
}
# eta function (mapping from the mean parameter to the natural parameter)
eta <- function(mu){-1/mu}
# mu* (the mean parameter corresponding to U([0, 1]))
mustar <- 1
# A function (the partition function)
A <- log
# Set a name; optional
name <- "beta"
# Set the default family as Gamma() (for glm, gam, glmnet, etc.); optional
family <- Gamma()
# Generate beta_family()
beta_family <- gen_exp_family(g, ginv, eta, mustar, A, name, family)

## ------------------------------------------------------------------------
pifun_glm <- function(formula, data, weights, ...){
  formula <- as.formula(formula) # Transform the argument `formula` from character to formula
  fit <- stats::glm(formula, data, weights = weights,
                    family = quasibinomial(), ...) 
  # Here quasibinomial() family gets rid of meaningless warning messages
  fitv <- predict(fit, type = "response") # fitted values for pi1(x)
  info <- list(df = fit$rank) # degree-of-freedom being the rank of the design matrix
  return(list(fitv = fitv, info = info))
}

mufun_glm <- function(formula, data, weights, ...){
  formula <- as.formula(formula) # Transform the argument `formula` from character to formula
  fit <- stats::glm(formula, data, weights = weights,
                family = Gamma(), ...)
  fitv <- predict(fit, type = "response") # fitted values for mu(x)
  info <- list(df = fit$rank) # degree-of-freedom being the rank of the design matrix
  return(list(fitv = fitv, info = info))
}

## ------------------------------------------------------------------------
pifun_glmnet <- function(x, y, weights, ...){
  fit <- glmnet::cv.glmnet(x, y, weights = weights, family = "binomial", ...) 
  fitv <- predict(fit, newx = x, s = "lambda.min", type = "response") # fitted values for pi1(x)
  beta <- coef(fit, s = "lambda.min")
  info <- list(df = sum(beta != 0)) # degree-of-freedom being the number of non-zero coefficients
  return(list(fitv = fitv, info = info))
}

mufun_glmnet <- function(x, y, weights, ...){
  fit <- HDtweedie::cv.HDtweedie(x, y, p = 2, weights = weights, ...)
  fitv <- predict(fit, newx = x, s = "lambda.min", type = "response") # fitted values for mu(x)
  beta <- coef(fit, s = "lambda.min")
  info <- list(df = sum(beta != 0)) # degree-of-freedom being the number of non-zero coefficients
  return(list(fitv = fitv, info = info))
}

## ------------------------------------------------------------------------
pifun_init_glm <- function(x, pvals, s, 
                       formula, ...){
    # The input `formula` is a half-formula of type character, e.g. ~ ns(x, df = 8)
    J <- ifelse(
        pvals < s | pvals > 1 - s, 1,
        2 * s / (2 * s - 1)
        )

    # Complete the formula by adding the response variable J
    complete_formula <- as.formula(paste("J", formula)) 
    data <- data.frame(x, J = J)
    mod <- stats::glm(complete_formula, family = gaussian(), data = data)
    
    fitv <- as.numeric(predict(mod, type = "response"))
    info <- list(df = fit$rank)
    
    return(list(fitv = fitv, info = info))
}

mufun_init_glm <- function(x, pvals, s, ...){
    phat <- ifelse(
        pvals < s | pvals > 1 - s,
        pmin(pvals, 1 - pvals),
        pvals
        )
    yhat <- dist$g(phat)

    # Complete the formula by adding the response variable yhat
    complete_formula <- as.formula(paste("yhat", formula)) 
    data <- data.frame(x, yhat = yhat)
    mod <- stats::glm(complete_formula, family = Gamma(), data = data)
    
    fitv <- as.numeric(predict(mod, type = "response"))
    info <- list(df = fit$rank)
    
    return(list(fitv = fitv, info = info))
}

## ------------------------------------------------------------------------
library("splines") 
piargs <- muargs <- list(formula = "~ ns(x, df = 5)")
adapt_model_glm_example <- gen_adapt_model(
  pifun = pifun_glm, mufun = mufun_glm, 
  pifun_init = pifun_init_glm, mufun_init = mufun_init_glm,
  piargs = piargs, muargs = muargs,
  piargs_init = piargs, muargs_init = muargs
)

## ------------------------------------------------------------------------
adapt_model_glm_example2 <- gen_adapt_model(
  name = "glm",
  piargs = piargs, muargs = muargs
)

## ------------------------------------------------------------------------
data(estrogen)
pvals <- as.numeric(estrogen$pvals)
x <- data.frame(x = as.numeric(estrogen$ord_high))
dist <- beta_family()
inds <- (x$x <= 1000)
pvals <- pvals[inds]
x <- x[inds,,drop = FALSE]

res1 <- adapt_glm(x = x, pvals = pvals, pi_formulas = "~ ns(x, df = 5)", mu_formulas = "~ ns(x, df = 5)")
res2 <- adapt(x = x, pvals = pvals, models = adapt_model_glm_example2)
identical(res1$s, res2$s)