## -----------------------------------------------------------------------------
# Attach vtreat and report the installed version and how to cite it.
# NOTE(review): `unpack[]` and `%.>%` used below are wrapr operators; this
# script relies on them being attached together with vtreat -- confirm.
library("vtreat")
packageVersion("vtreat")
citation('vtreat')

## -----------------------------------------------------------------------------
# categorical example
set.seed(23525)

# we set up our raw training and application data
# (both input columns contain NA values; y is a logical outcome)
dTrainC <- data.frame(
  x = c('a', 'a', 'a', 'b', 'b', NA, NA),
  z = c(1, 2, 3, 4, NA, 6, NA),
  y = c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE)
)

# the application data has no outcome column and includes a level ('c')
# that never occurs in the training data
dTestC <- data.frame(
  x = c('a', 'b', 'c', NA),
  z = c(10, 20, 30, NA)
)

# we perform a vtreat cross frame experiment
# and unpack the results into treatmentsC
# and dTrainCTreated
unpack[
  treatmentsC = treatments,
  dTrainCTreated = crossFrame
] <- mkCrossFrameCExperiment(
  dframe = dTrainC,
  varlist = setdiff(colnames(dTrainC), 'y'),  # every column except the outcome
  outcomename = 'y',
  outcometarget = TRUE,                       # y == TRUE is the target level
  verbose = FALSE
)

# the treatments include a score frame relating new
# derived variables to original columns
treatmentsC$scoreFrame[, c('origName', 'varName', 'code', 'rsq',
                           'sig', 'extraModelDegrees', 'recommended')] %.>%
  knitr::kable(.)

# the treated frame is a "cross frame" which
# is a transform of the training data built
# as if the treatment were learned on a different
# disjoint training set to avoid nested model
# bias and over-fit.
dTrainCTreated %.>%
  head(.) %.>%
  knitr::kable(.)

# Any future application data is prepared with
# the prepare method.
# pruneSig = NULL: no significance threshold is supplied here.
dTestCTreated <- prepare(treatmentsC, dTestC, pruneSig = NULL)

dTestCTreated %.>%
  head(.) %.>%
  knitr::kable(.)
## -----------------------------------------------------------------------------
# numeric example
# Attaching vtreat again is harmless when it is already attached and lets this
# chunk be run on its own.
# NOTE(review): `unpack[]` and `%.>%` used below are wrapr operators; this
# script relies on them being attached together with vtreat -- confirm.
library("vtreat")
set.seed(23525)

# we set up our raw training and application data
# (both input columns contain NA values; y is a numeric 0/1 outcome)
dTrainN <- data.frame(
  x = c('a', 'a', 'a', 'a', 'b', 'b', NA, NA),
  z = c(1, 2, 3, 4, 5, NA, 7, NA),
  y = c(0, 0, 0, 1, 0, 1, 1, 1)
)

# the application data has no outcome column and includes a level ('c')
# that never occurs in the training data
dTestN <- data.frame(
  x = c('a', 'b', 'c', NA),
  z = c(10, 20, 30, NA)
)

# we perform a vtreat cross frame experiment
# and unpack the results into treatmentsN
# and dTrainNTreated
unpack[
  treatmentsN = treatments,
  dTrainNTreated = crossFrame
] <- mkCrossFrameNExperiment(
  dframe = dTrainN,
  varlist = setdiff(colnames(dTrainN), 'y'),  # every column except the outcome
  outcomename = 'y',
  verbose = FALSE
)

# the treatments include a score frame relating new
# derived variables to original columns
treatmentsN$scoreFrame[, c('origName', 'varName', 'code', 'rsq',
                           'sig', 'extraModelDegrees')] %.>%
  knitr::kable(.)

# the treated frame is a "cross frame" which
# is a transform of the training data built
# as if the treatment were learned on a different
# disjoint training set to avoid nested model
# bias and over-fit.
dTrainNTreated %.>%
  head(.) %.>%
  knitr::kable(.)

# Any future application data is prepared with
# the prepare method.
# pruneSig = NULL: no significance threshold is supplied here.
dTestNTreated <- prepare(treatmentsN, dTestN, pruneSig = NULL)

dTestNTreated %.>%
  head(.) %.>%
  knitr::kable(.)