## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.dim = c(6, 4)
)

## ----setup--------------------------------------------------------------------
library(bakR)

# Packages that are NOT automatically installed when bakR is installed
library(DESeq2)

# Packages which are installed when bakR is installed
library(dplyr) 
library(magrittr) 
library(ggplot2) 
library(stats)

# Set the seed for reproducibility
set.seed(123)

## -----------------------------------------------------------------------------
# Simulate a nucleotide recoding dataset
sim_data <- Simulate_bakRData(1000,
                         num_kd_DE = c(0, 0),
                         num_ks_DE = c(0, 200))
  # This will simulate 500 features, 2 experimental conditions
  # and 3 replicates for each experimental condition
  # See ?Simulate_bakRData for details regarding tunable parameters

# Extract simulated bakRData object
bakRData <- sim_data$bakRData

# Extract simualted ground truths
sim_truth <- sim_data$sim_list

## Run the efficient model
# We'll tell it what the mutation rates are just for efficiency's sake
Fit <- bakRFit(bakRData, pnew = rep(0.05, times = 6), pold = 0.001)



## ----results = 'hide'---------------------------------------------------------

# Get the count matrix from bakR
Counts <- Fit$Data_lists$Count_Matrix

# Experimental conditions for each sample
# There are 6 s4U treated samples (3 replicates of each condition)
# In addition, there are 2 -s4U control samples (1 for each condition)

## s4U conditions
# 1st three samples are reference (ref) samples
# Next three samples are experimental (exp) samples
conditions_s4U <- as.factor(rep(c("ref", "exp"), each = 3))

## -s4U control conditions
# 1st sample is reference, next is experimental
conditions_ctl <- as.factor(c("ref", "exp"))

# Combined s4U and -s4U control conditions
conditions <- c(conditions_s4U, conditions_ctl)

# Make the colData input for DESeq2
colData <- data.frame(conditions = conditions)
rownames(colData) <- colnames(Counts)

# Take a look at the colData object
print(t(colData))


## ----echo = FALSE, warning = FALSE--------------------------------------------
knitr::kable(t(colData))


## -----------------------------------------------------------------------------
dds <- DESeqDataSetFromMatrix(countData = Counts,
                              colData = colData,
                              design = ~conditions)


## -----------------------------------------------------------------------------
ddso <- DESeq(dds)

# Extract results of experimental vs. reference comparison
reso <- results(ddso, contrast = c("conditions", "exp", "ref"))

# Look at the column names of the reso object
colnames(as.data.frame(reso))
                

## -----------------------------------------------------------------------------
ksyn_df <- data.frame(L2FC = reso$log2FoldChange + Fit$Fast_Fit$Effects_df$L2FC_kdeg,
                      Gene = Fit$Fast_Fit$Effects_df$XF)
                

## -----------------------------------------------------------------------------
ksyn_df$se <- sqrt(reso$lfcSE^2 + (Fit$Fast_Fit$Effects_df$se*log2(exp(1)))^2 ) 
                

## -----------------------------------------------------------------------------
# Calculate p-value using asymptotic Wald test
ksyn_df <- ksyn_df %>%
  mutate(pval = 2*pnorm(-abs(L2FC/se)),
         padj = p.adjust(pval, method = "BH"))
                

## ----fig.align='center'-------------------------------------------------------

# Add conclusion at 0.01 FDR control
ksyn_df <- ksyn_df %>%
  mutate(conclusion = as.factor(ifelse(padj < 0.01, 
                             ifelse(L2FC < 0, "Decreased txn", "Increased txn"),
                             "Not sig.")))

# Make volcano plot
ksyn_volc <- ggplot(ksyn_df, aes(x = L2FC, y = -log10(padj), color = conclusion)) + 
  theme_classic() + 
  geom_point(size = 1) + 
  xlab("L2FC(ksyn)") + 
  ylab("-log10(padj)") + 
  scale_color_manual(values = c("blue", "orange", "gray"))

# Observe volcano plot
ksyn_volc


## ----fig.align='center'-------------------------------------------------------

# Add conclusion at 0.01 FDR control
ksyn_df <- ksyn_df %>%
  mutate(result = as.factor(ifelse(padj < 0.01, 
                             ifelse(as.integer(Gene) <= 800, "FP", "TP"),
                             ifelse(as.integer(Gene) <= 800, "TN", "FN"))))

# Make volcano plot
ksyn_results <- ggplot(ksyn_df, aes(x = L2FC, y = -log10(padj), color = result)) + 
  theme_classic() + 
  geom_point(size = 1) + 
  xlab("L2FC(ksyn)") + 
  ylab("-log10(padj)") + 
  scale_color_manual(values = c("black", "gray", "forestgreen", "blue"))

# Observe volcano plot
ksyn_results