Title: | Interpret Tree Ensembles |
Version: | 1.4 |
Date: | 2024-04-22 |
Imports: | RRF, arules, gbm, xtable, xgboost, data.table, methods |
Description: | For tree ensembles such as random forests, regularized random forests and gradient boosted trees, this package provides functions for: extracting, measuring and pruning rules; selecting a compact rule set; summarizing rules into a learner; calculating frequent variable interactions; formatting rules in latex code. Reference: Interpreting tree ensembles with inTrees (Houtao Deng, 2019, <doi:10.1007/s41060-018-0144-8>). |
Maintainer: | Houtao Deng <softwaredeng@gmail.com> |
BugReports: | https://github.com/softwaredeng/inTrees/issues |
License: | GPL (≥ 3) |
Packaged: | 2024-04-23 05:26:26 UTC; houtaodeng |
NeedsCompilation: | no |
Repository: | CRAN |
Date/Publication: | 2024-04-23 06:10:03 UTC |
Author: | Houtao Deng [aut, cre], Xin Guan [aut], Vadim Khotilovich [aut] |
Transform gbm object to a list of trees
Description
Transform gbm object to a list of trees that can be used for rule condition extraction
Usage
GBM2List(gbm1,X)
Arguments
gbm1 |
gbm object |
X |
predictor variable matrix |
Value
a list of trees in an inTrees-required format
See Also
Examples
library(gbm)
data(iris)
X <- iris[, 1:(ncol(iris) - 1)]
target <- iris[, "Species"]
# `n.trees` is spelled out in full: the abbreviated `n.tree` only reached
# gbm() through partial argument matching.
gbmFit <- gbm(Species ~ ., data = iris, n.trees = 400,
              interaction.depth = 10, distribution = "multinomial")
# transform the gbm fit to a tree list, then extract the rule conditions
treeList <- GBM2List(gbmFit, X)
ruleExec <- extractRules(treeList, X)
ruleExec <- unique(ruleExec)
#ruleExec <- ruleExec[1:min(2000,length(ruleExec)),,drop=FALSE]
# measure, prune and de-duplicate the rules
ruleMetric <- getRuleMetric(ruleExec, X, target)
ruleMetric <- pruneRule(ruleMetric, X, target)
ruleMetric <- unique(ruleMetric)
# build the simplified learner and apply it to the training data
learner <- buildLearner(ruleMetric, X, target)
pred <- applyLearner(learner, X)
readableLearner <- presentRules(learner, colnames(X)) # more readable format
# share of rows whose prediction differs from the target
err <- 1 - sum(pred == target) / length(pred)
internal function
Description
internal function
Usage
Num2Level(rfList, splitV)
Arguments
rfList |
|
splitV |
Value
data frame with numeric variables converted to categorical variables.
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Discretize the "prediction" column of every tree in a tree list.
#
# Arguments:
#   rfList  list with a tree count `ntree` and a `list` element holding one
#           six-column tree table per tree
#   splitV  forwarded as the second argument of dicretizeVector()
#
# Value: rfList, where each of the first `ntree` trees has been converted to a
# data frame with a discretized "prediction" column and the column names
# "left daughter", "right daughter", "split var", "split point", "status",
# "prediction".
function (rfList, splitV)
{
  # seq_len() yields no iterations when ntree is 0, whereas 1:0 would visit
  # the indices 1 and 0.
  for (i in seq_len(rfList$ntree)) {
    tree <- data.frame(rfList$list[[i]])
    tree[, "prediction"] <- data.frame(dicretizeVector(tree[, "prediction"],
                                                       splitV))
    # data.frame() may have mangled the names, so they are set again here
    colnames(tree) <- c("left daughter", "right daughter",
                        "split var", "split point", "status", "prediction")
    rfList$list[[i]] <- tree
  }
  return(rfList)
}
Transform a random forest object to a list of trees
Description
Transform a random forest object to a list of trees
Usage
RF2List(rf)
Arguments
rf |
random forest object |
Value
a list of trees
See Also
Examples
library(RRF)
data(iris)
# predictors: every column except the last one; target: the Species column
X <- iris[, -ncol(iris)]
target <- iris$Species
# build an ordinary RF with 100 trees
rf <- RRF(X, as.factor(target), ntree = 100)
# convert the forest to a tree list, then to R-executable rules
treeList <- RF2List(rf)
ruleExec <- extractRules(treeList, X)
Transform an xgboost object to a list of trees
Description
Transform an xgboost object to a list of trees
Usage
XGB2List(xgb, X)
Arguments
xgb |
xgboost object |
X |
predictor variable matrix |
Value
a list of trees in an inTrees-required format
See Also
Examples
library(data.table)
library(xgboost)
# test data set 1: iris
# predictors are all columns except Species; Species is the label
X <- within(iris, rm("Species"))
Y <- iris[, "Species"]
# design matrix without an intercept column
model_mat <- model.matrix(~ . - 1, data = X)
# labels are shifted to start at 0
xgb <- xgboost(model_mat, label = as.numeric(Y) - 1, nrounds = 20,
               objective = "multi:softprob", num_class = 3)
tree_list <- XGB2List(xgb, model_mat)
apply a simplified tree ensemble learner (STEL) to data
Description
apply STEL to data and get predictions
Usage
applyLearner(learner, X)
Arguments
learner |
a matrix with rules ordered by priority |
X |
predictor variable matrix |
Value
predictions for the data
See Also
build a simplified tree ensemble learner (STEL)
Description
Build a simplified tree ensemble learner (STEL). Currently works only for classification problems.
Usage
buildLearner(ruleMetric, X, target, minFreq = 0.01)
Arguments
ruleMetric |
a matrix including the conditions, predictions, and and metrics |
X |
predictor variable matrix |
target |
target variable |
minFreq |
minimum frequency of a rule condition in order to be included in STEL. |
Value
a matrix including the conditions, prediction, and metrics, ordered by priority.
Author(s)
Houtao Deng
References
Houtao Deng, Interpreting Tree Ensembles with inTrees, technical report, 2014
Examples
data(iris)
library(RRF)
# predictors: all columns but the last; target: the Species column
X <- iris[, seq_len(ncol(iris) - 1)]
target <- iris[["Species"]]
# build an ordinary RF and turn it into a list of trees
rf <- RRF(X, as.factor(target), ntree = 100)
treeList <- RF2List(rf)
# extract the rule conditions and drop duplicates
ruleExec <- unique(extractRules(treeList, X))
# measure the rules, then prune each rule
ruleMetric <- getRuleMetric(ruleExec, X, target)
ruleMetric <- pruneRule(ruleMetric, X, target)
#ruleMetric <- selectRuleRRF(ruleMetric,X,target) # rule selection
# build the simplified learner and apply it to the data
learner <- buildLearner(ruleMetric, X, target)
pred <- applyLearner(learner, X)
# more readable format: column names instead of X[,i]
read <- presentRules(learner, colnames(X))
# format the rule and metrics as a table in latex code
library(xtable)
print(xtable(read), include.rownames = FALSE)
print(xtable(ruleMetric[1:2, ]), include.rownames = FALSE)
compute rule information
Description
compute rule information
Usage
computeRuleInfor(instIx, pred, target)
Arguments
instIx |
indices of the intances |
pred |
prediction from a rule |
target |
target values for the instances |
Value
return error and frequency
Examples
# this is an internal function.
Simulate data
Description
Simulate data
Usage
dataSimulate(flag = 1, nCol = 20, nRow = 1000)
Arguments
flag |
1 (default): team optimization; 2: non-linear; 3: linear. |
nCol |
the number of columns in the data set. must >= 2. |
nRow |
the number of rows in the data set. |
Value
predictor variable matrix and target variable
Examples
# simulate a data set with the default scenario (flag = 1)
res <- dataSimulate(flag = 1)
# split the result into predictors and target
X <- res$X
target <- res$target
discretize a variable
Description
discretize a variable
Usage
dicretizeVector(v, K = 3)
Arguments
v |
vector |
K |
discretize into up to K levels with equal frequency |
Value
discretized levels for v
Examples
data(iris)
# discretize the first iris column into up to 3 levels
firstColumn <- iris[, 1]
dicretizeVector(firstColumn, 3)
Extract rules from a list of trees
Description
Extract rule conditions from a list of trees. Use functions RF2List/GBM2List to transform RF/GBM objects to list of trees.
Usage
extractRules(treeList, X, ntree = 100, maxdepth = 6, random = FALSE, digits = NULL)
Arguments
treeList |
tree list |
X |
predictor variable matrix |
ntree |
conditions are extracted from the first ntree trees |
maxdepth |
conditions are extracted from the top maxdepth levels from each tree |
random |
the max depth for each tree is an integer randomly chosen between 1 and maxdepth |
digits |
digits for rounding |
Value
a set of rule conditions
Examples
library(RRF)
data(iris)
# predictors: all columns but the last; target: the Species column
nPredictors <- ncol(iris) - 1
X <- iris[, 1:nPredictors]
target <- iris[, "Species"]
# build an ordinary RF
rf <- RRF(X, as.factor(target), ntree = 100)
treeList <- RF2List(rf)
# transform to R-executable rules (digits = 4), then drop duplicates
ruleExec <- extractRules(treeList, X, digits = 4)
ruleExec <- unique(ruleExec)
internal
Description
internal
Usage
formatGBM(gbmList, splitBin,X)
Arguments
gbmList |
|
splitBin |
|
X |
predictor variable matrix |
Value
No return value
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Reformat every tree table in gbmList$list into a five-column layout.
#
# Arguments:
#   gbmList   list whose `list` element holds one tree table per tree, with
#             the columns LeftNode, RightNode, MissingNode, SplitVar and
#             SplitCodePred
#   splitBin  lookup vector; for splits on unordered factors the stored
#             SplitCodePred value (plus 1) is used as an index into it
#   X         predictor variable matrix / data frame
#
# Value: gbmList with each tree replaced by a table with the columns
# "left daughter", "right daughter", "split var", "split point", "status".
function (gbmList, splitBin, X)
{
  # Positions of the unordered factor columns of X. This does not depend on
  # the tree, so it is computed once; vapply() always returns a logical vector.
  catCols <- which(vapply(X, is.factor, logical(1)) &
                     !vapply(X, is.ordered, logical(1)))
  nodeCols <- c("SplitVar", "LeftNode", "RightNode", "MissingNode")
  # seq_along() makes the loop a no-op for an empty tree list
  for (j in seq_along(gbmList$list)) {
    a <- gbmList$list[[j]]
    rownames(a) <- seq_len(nrow(a))
    # status starts out as a copy of SplitVar
    a$status <- a$SplitVar
    a <- a[, c("LeftNode", "RightNode", "MissingNode", "SplitVar",
               "SplitCodePred", "status")]
    # Rows with SplitVar >= 0 get their variable and node references
    # incremented by one (presumably a shift from 0-based to 1-based
    # indices; confirm against the gbm tree format).
    splitRows <- which(a[, "SplitVar"] >= 0)
    a[splitRows, nodeCols] <- a[splitRows, nodeCols] + 1
    # Rows referenced as MissingNode by some split get status 10
    ix <- a$MissingNode[which(a$MissingNode > 0)]
    if (length(ix) > 0) {
      a$status[ix] <- 10
    }
    a <- a[, c("LeftNode", "RightNode", "SplitVar", "SplitCodePred",
               "status")]
    # For splits on unordered factors, replace the stored code by the
    # matching entry of splitBin
    ix <- which(a[, "SplitVar"] %in% catCols)
    for (i in ix) {
      a[i, "SplitCodePred"] <- splitBin[a[i, "SplitCodePred"] + 1]
    }
    colnames(a) <- c("left daughter", "right daughter", "split var",
                     "split point", "status")
    gbmList$list[[j]] <- a
  }
  return(gbmList)
}
calculate frequent variable interactions
Description
calculate frequent variable interactions
Usage
getFreqPattern(ruleMetric, minsup = 0.01, minconf = 0.5, minlen = 1, maxlen = 4)
Arguments
ruleMetric |
a matrix including conditions, predictions, and the metrics |
minsup |
minimum support of conditions in a tree ensemble |
minconf |
minimum confidence of the rules |
minlen |
minimum length of the conditions |
maxlen |
max length of the conditions |
Value
a matrix including frequent variable interations (in a form of conditions), predictions, length, support, and confidence.
Examples
library(RRF)
library(arules)
data(iris)
# predictors: everything except Species; target: Species
X <- subset(iris, select = -Species)
target <- iris[, "Species"]
# build an ordinary RF and convert it to a list of trees
rf <- RRF(X, as.factor(target), ntree = 100)
treeList <- RF2List(rf)
# transform to R-executable rules and measure them
ruleExec <- extractRules(treeList, X)
ruleMetric <- getRuleMetric(ruleExec, X, target)
# frequent variable interactions, then their metrics on the data
freqPattern <- getFreqPattern(ruleMetric)
freqPatternMetric <- getRuleMetric(freqPattern, X, target)
Assign outcomes to a conditions, and measure the rules
Description
Assign outcomes to a conditions, and measure the rules
Usage
getRuleMetric(ruleExec, X, target)
Arguments
ruleExec |
a set of rule conditions |
X |
predictor variable matrix |
target |
target variable |
Value
a matrix including the condictions, predictions, and metrics
References
Houtao Deng, Interpreting Tree Ensembles with inTrees, technical report, 2014
Examples
library(RRF)
data(iris)
# predictors: every column not named Species; target: Species
X <- iris[, names(iris) != "Species"]
target <- iris[["Species"]]
# build an ordinary RF
rf <- RRF(X, as.factor(target), ntree = 100)
treeList <- RF2List(rf)
# transform to R-executable rules and drop duplicates
ruleExec <- unique(extractRules(treeList, X))
# measure rules
ruleMetric <- getRuleMetric(ruleExec, X, target)
get type of each variable
Description
get type of each variable: numeric or categorical
Usage
getTypeX(X)
Arguments
X |
Value
A vector indicating each variable's type: numeric: 1; categorical: 2
internal
Description
internal
Usage
lookupRule(rules, strList)
Arguments
rules |
|
strList |
Value
rules that matched to strList
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Return the rows of `rules` whose "condition" entry matches every pattern in
# `strList`. The patterns are handed to grep(), i.e. they are treated as
# regular expressions. NULL is returned when no row matches all patterns.
function (rules, strList)
{
  conditions <- rules[, "condition"]
  # rows matching the first pattern ...
  matched <- grep(strList[1], conditions)
  # ... narrowed down by each of the remaining patterns
  for (k in seq_along(strList)[-1]) {
    matched <- intersect(matched, grep(strList[k], conditions))
  }
  if (length(matched) == 0) {
    return(NULL)
  }
  # drop = FALSE keeps a matrix even when a single row is left
  rules[matched, , drop = FALSE]
}
internal
Description
internal
Usage
measureRule(ruleExec, X, target, pred = NULL, regMethod = "mean")
Arguments
ruleExec |
|
X |
|
target |
|
pred |
|
regMethod |
Value
data frame including rule's length, frequency, error, rule condition and prediction
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Measure a single rule condition against the data.
#
# Arguments:
#   ruleExec  rule condition as R code; it is evaluated inside this function,
#             so it can refer to X
#   X         predictor variable matrix
#   target    target variable
#   pred      optional outcome to assign to the rule (non-numeric targets)
#
# Value: character vector named "len", "freq", "err", "condition", "pred".
# A rule that matches no row yields c("-1", "-1", "-1", "", "").
function (ruleExec, X, target, pred = NULL)
{
  metricNames <- c("len", "freq", "err", "condition", "pred")
  condition <- ruleExec
  # number of " & "-separated terms in the condition
  nTerms <- length(unlist(strsplit(condition, split = " & ")))
  # row indices for which the condition holds
  matched <- eval(parse(text = paste("which(", condition, ")")))
  if (length(matched) == 0) {
    return(structure(c("-1", "-1", "-1", "", ""), names = metricNames))
  }
  ys <- target[matched]
  freq <- round(length(ys) / nrow(X), digits = 3)
  if (is.numeric(target)) {
    # numeric target: predict the mean, error is the mean squared deviation
    ysMost <- mean(ys)
    err <- sum((ysMost - ys)^2) / length(ys)
  } else {
    # otherwise: use `pred` when given, else the most frequent class;
    # error is 1 minus the rounded share of that class among the matches
    classCounts <- table(ys)
    ysMost <- if (length(pred) > 0) pred else names(which.max(classCounts))
    err <- 1 - round(classCounts[ysMost] / sum(classCounts), digits = 3)
  }
  out <- c(nTerms, freq, err, condition, ysMost)
  names(out) <- metricNames
  return(out)
}
Present a learner using column names instead of X[i,]
Description
Present a learner using column names instead of X[i,]
Usage
presentRules(rules, colN, digits)
Arguments
rules |
a set of rules |
colN |
a vector including the column names |
digits |
digits for rounding |
Value
a matrix including the conditions (with column names), etc.
See Also
Examples
# See function "buildLearner"
Prune irrevant variable-value pair from a rule condition
Description
Prune irrevant variable-value pair from a rule condition
Usage
pruneRule(rules, X, target, maxDecay = 0.05, typeDecay = 2)
Arguments
rules |
A metrix including the rules and metrics |
X |
predictor variable matrix |
target |
target variable vector |
maxDecay |
threshold of decay |
typeDecay |
1: relative error; 2: error; default :2 |
Value
A matrix including the rules each being pruned, and metrics
Author(s)
Houtao Deng
References
Houtao Deng, Interpreting Tree Ensembles with inTrees, technical report, 2014
See Also
Examples
# see function "buildLearner"
internal
Description
internal
Usage
pruneSingleRule(rule, X, target, maxDecay, typeDecay)
Arguments
rule |
|
X |
|
target |
|
maxDecay |
|
typeDecay |
Value
a pruned rule and its metrics.
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Prune one rule by dropping terms of its condition, last term first.
#
# Arguments:
#   rule       named vector with at least the entries "condition" and "pred"
#   X          predictor variable matrix
#   target     target variable
#   maxDecay   a term is dropped when the decay does not exceed this value
#   typeDecay  1: decay relative to the original error; anything else:
#              absolute increase in error
#
# Value: the metrics (as returned by measureRule) of the pruned rule.
function (rule, X, target, maxDecay, typeDecay)
{
  best <- measureRule(rule["condition"], X, target)
  # error of the unpruned rule; every candidate is compared against it
  baseErr <- as.numeric(best["err"])
  terms <- unlist(strsplit(rule["condition"], split = " & "))
  pred <- rule["pred"]
  if (length(terms) == 1) {
    return(best)
  }
  # Walk from the last term to the first; dropping term i leaves the
  # positions of the not-yet-visited terms (< i) untouched.
  for (i in seq(length(terms), 1)) {
    candidate <- measureRule(paste(terms[-i], collapse = " & "), X, target,
                             pred)
    decay <- as.numeric(candidate["err"]) - baseErr
    if (typeDecay == 1) {
      decay <- decay / max(baseErr, 1e-06)
    }
    if (decay > maxDecay) {
      next
    }
    # accept the shorter rule
    terms <- terms[-i]
    best <- candidate
    if (length(terms) <= 1) {
      break
    }
  }
  return(best)
}
internal function
Description
internal function
Usage
rule2Table(ruleExec, X, target)
Arguments
ruleExec |
|
X |
|
target |
Value
a matrix of indicators matching each rule condition and each row of data
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Indicator vector for one rule condition: 1 for every row of X on which the
# condition holds, 0 otherwise. `ruleExec` is R code that is evaluated inside
# this function, so it can refer to X. `target` is not used.
function (ruleExec, X, target)
{
  indicator <- numeric(nrow(X))
  hits <- eval(parse(text = paste("which(", ruleExec, ")")))
  # assigning through an empty index leaves the vector unchanged
  indicator[hits] <- 1
  unname(indicator)
}
internal
Description
internal
Usage
ruleList2Exec(X, allRulesList)
Arguments
X |
|
allRulesList |
Value
data frame containing rule conditions
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Turn a list of rules into a one-column matrix of rule condition strings,
# with the column named "condition".
function (X, allRulesList)
{
  varTypes <- getTypeX(X)
  # one condition per rule list entry
  conditions <- sapply(allRulesList, singleRuleList2Exec, typeX = varTypes)
  # NOTE(review): unique() is applied to the transposed result, i.e. to its
  # rows -- confirm that this de-duplicates what is intended.
  asRows <- unique(t(conditions))
  ruleExec <- t(asRows)
  colnames(ruleExec) <- "condition"
  ruleExec
}
select a set of relevant and non-redundant rules
Description
select a set of relevant and non-redundant rules using regularized random forests
Usage
selectRuleRRF(ruleMetric, X, target)
Arguments
ruleMetric |
a matrix including the rules and metrics |
X |
predictor variable matrix |
target |
response variable |
Value
a matrix including a set of relevant and non-redundant rules, and their metrics
Author(s)
Houtao Deng
See Also
Examples
# See function "buildLearner:
internal
Description
internal
Usage
singleRuleList2Exec(ruleList, typeX)
Arguments
ruleList |
|
typeX |
Value
data frame containing rule conditions
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Build the R-executable condition string for a single rule.
#
# Arguments:
#   ruleList  per-variable conditions, named by the variable's column index
#   typeX     variable types indexed by column; the value 2 selects the
#             "%in%" form built from the stored values, any other value uses
#             the stored condition as is
#
# Value: the conditions of all variables joined by " & ", ordered by column
# index; "" when ruleList holds no variables.
function (ruleList, typeX)
{
  ruleExec <- ""
  vars <- ls(ruleList)
  vars <- vars[order(as.numeric(vars))]
  # seq_along() skips the loop for an empty rule list; 1:length(vars) would
  # run with i = 1 and i = 0 and fail on the NA type lookup.
  for (i in seq_along(vars)) {
    if (typeX[as.numeric(vars[i])] == 2) {
      # quote every value and wrap them as c('a','b',...)
      quoted <- paste0("'", ruleList[[vars[i]]], "'")
      values <- paste0("c(", paste(quoted, collapse = ","), ")")
      tmp <- paste0("X[,", vars[i], "] %in% ", values)
    } else {
      tmp <- ruleList[[vars[i]]]
    }
    if (i == 1) {
      ruleExec <- paste0(ruleExec, tmp)
    } else {
      ruleExec <- paste0(ruleExec, " & ", tmp)
    }
  }
  return(c(ruleExec))
}
internal
Description
internal
Usage
sortRule(M, decreasing = TRUE)
Arguments
M |
|
decreasing |
Value
sorted rule conditions
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Sort the rows of a rule matrix.
#
# Arguments:
#   M           matrix with the columns "err", "freq" and "len"
#   decreasing  passed on to order()
#
# Value: M with its rows reordered by 1 - err, then freq, then -len.
function (M, decreasing = TRUE)
{
  # The keys are taken from the argument M. They used to be read from a
  # variable `ruleMetric` that is not defined in this function.
  qIx <- order(1 - as.numeric(M[, "err"]),
               as.numeric(M[, "freq"]),
               -as.numeric(M[, "len"]),
               decreasing = decreasing)
  return(M[qIx, ])
}
internal function
Description
internal function
Usage
treeVisit(tree, rowIx, count, ruleSet, rule, levelX, length, max_length, digits)
Arguments
tree |
|
rowIx |
|
count |
|
ruleSet |
|
rule |
|
levelX |
|
length |
|
max_length |
|
digits |
Value
a list containing rules and the count
internal
Description
Predictions from a rule set
Usage
voteAllRules(ruleMetric, X, type = "r", method = "median")
Arguments
ruleMetric |
rules and metrics |
X |
predictor variable matrix |
type |
regression or classification |
method |
for regression, use median or average |
Value
predictions from the rule set
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
# Predictions from a rule set: every rule votes for the rows it matches.
#
# Arguments:
#   ruleMetric  matrix with the columns "condition" and "pred"; the
#               conditions are executed as R code inside this function (they
#               can refer to X), so only pass trusted rule text
#   X           predictor variable matrix
#   type        "r": numeric predictions; "c": most frequent vote
#   method      for type "r": "median" uses the median of the votes, any
#               other value the mean
#
# Value: one prediction per row of X. Rows matched by no rule are NA for
# type "r" and "" otherwise.
function (ruleMetric, X, type = "r", method = "median")
{
  nObs <- nrow(X)
  xVoteList <- vector("list", nObs)
  # seq_len() makes an empty rule set a no-op instead of an indexing error
  for (i in seq_len(nrow(ruleMetric))) {
    ixMatch <- eval(parse(text = paste("which(", ruleMetric[i,
      "condition"], ")")))
    if (length(ixMatch) == 0) {
      next
    }
    # append this rule's prediction to the votes of every matched row
    xVoteList[ixMatch] <- lapply(xVoteList[ixMatch], c,
                                 ruleMetric[i, "pred"])
  }
  # Numeric predictions are kept in a numeric vector from the start, so they
  # are not stored as character and converted back.
  predY <- if (type == "r") rep(NA_real_, nObs) else rep("", nObs)
  for (i in seq_along(xVoteList)) {
    thisV <- xVoteList[[i]]
    if (length(thisV) == 0) {
      next
    }
    if (type == "c") {
      predY[i] <- names(which.max(table(thisV)))
    }
    if (type == "r") {
      thisV <- as.numeric(thisV)
      predY[i] <- if (method == "median") median(thisV) else mean(thisV)
    }
  }
  return(predY)
}