## ---- include = FALSE--------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, include = FALSE--------------------------------------------------- library(vacuum) library(ggplot2) library(magrittr) library(tidyr) library(dplyr) ## ----echo = FALSE, fig.width = 7, fig.height = 4------------------------------ # Normal Q-Q plot for Tukey's sample data data.frame(y = table_1) %>% ggplot(aes(sample = y)) + stat_qq() + stat_qq_line() + #geom_abline(aes(intercept = 0, slope = 1)) + labs(title = 'Normal Q-Q plot of sample data') ## ----echo = FALSE, fig.width = 7, fig.height = 4------------------------------ x <- table_1 n <- length(x) y_split <- median(x) dat <- data.frame(i = 1:n, y_sorted = sort(x), a = a_qnorm(1:n, n), z = (sort(x) - y_split) / a_qnorm(1:n, n) ) # find the point with the greatest slope key_pt <- dat[which.max(dat$z), ] # plot the points and the slope of the point key point ggplot(dat, aes(x = a, y = y_sorted)) + geom_point(color = 'darkgrey') + geom_point(dat = key_pt, aes(x = a, y = y_sorted), color = 'black') + geom_point(aes(x = 0, y = y_split), color = 'red') + # example line geom_segment(aes(x = 0, y = y_split, xend = key_pt$a, yend = key_pt$y_sorted)) + labs(title = 'Sorted values of y plotted against theoretical distribution a(y)', y = 'y') ## ----echo = FALSE, , fig.width = 7, fig.height = 4---------------------------- A <- 0 B <- 1.5 tmp <- funop(table_1, A, B) # need z & special for every point tmp$z <- (tmp$y - median(tmp$y)) / a_qnorm(tmp$i, n) tmp$special[is.na(tmp$special)] <- FALSE # need outer to get special and outer to add up as a factor tmp$outer <- !tmp$middle z_split <- attr(tmp, 'z_split') y_split <- attr(tmp, 'y_split') extreme_B <- B * z_split extreme_A <- A * z_split ggplot(tmp, aes(x = y, y = z)) + geom_point(aes(color = as.factor(outer + special))) + geom_hline(yintercept = z_split) + geom_hline(yintercept = extreme_B, color = 'blue', alpha = 0.5) + geom_vline(xintercept = y_split, color = 'green3', alpha = 0.75) + geom_rect( xmin = -Inf, xmax = Inf, ymin = extreme_B, ymax = Inf, alpha = 0.0075, fill = 'blue' ) + geom_rect( xmin = y_split - extreme_A, xmax = y_split + extreme_A, ymin = -Inf, ymax = Inf, alpha = 0.01, fill = 'green' ) + labs(title = 'FUNOP: slope (z) plotted against original values') + labs(x = 'original values') + scale_color_manual( name = 'values', labels = c( 'excluded', 'undistinguished', 'special attention' ), values = c('green3', 'black', 'red') ) ## ----------------------------------------------------------------------------- # Table 1 contains example data from Tukey's paper table_1 result <- funop(table_1) result # value of the outliers result[which(result$special == TRUE), 'y'] ## ----echo = FALSE, fig.width = 7, fig.height = 4------------------------------ # cleanse the table cleansed_t2 <- funor_funom(table_2) data.frame( i = 1:length(table_2), original = as.vector(table_2), cleansed = as.vector(cleansed_t2) ) %>% ggplot(aes(x = i)) + geom_point(aes(y = original), color = 'black') + geom_segment(aes( x = i, y = original, xend = i, yend = cleansed ), color = 'black', linetype = 'dashed') + geom_point(aes(y = cleansed), color = 'grey') + labs(title = 'Changes to Table 2', y = 'value', x = '') ## ----------------------------------------------------------------------------- # create a random dataset, representing a two-way table set.seed(42) dat <- matrix(rnorm(16), nrow = 4) # add some noise dat[2, 1] <- rnorm(1, mean = 10) # cleanse the table of outliers cleansed_dat <- funor_funom(dat) # look at the two tables, before and after dat cleansed_dat # look at the difference between the two tables # only one non-zero difference dat - cleansed_dat ## ----------------------------------------------------------------------------- # convert the output of vacuum_cleaner into a long data frame dat <- vacuum_cleaner(table_8) %>% as.data.frame() %>% dplyr::mutate('row' = dplyr::row_number()) %>% tidyr::pivot_longer(cols = dplyr::starts_with('V'), names_to = 'col', values_to = 'value') # since the table_8 was a two-way table, the rows are factors dat$row <- as.factor(dat$row) # conduct analysis of variance aov_results <- aov(value ~ row + col, dat) summary(aov_results)