
R Under development (unstable) (2018-06-03 r74839) -- "Unsuffered Consequences"
Copyright (C) 2018 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> 
> library("partykit")
Loading required package: grid
Loading required package: libcoin
Loading required package: mvtnorm
> library("sandwich")
> 
> set.seed(29)
> n <- 1000
> x <- runif(n)
> z <- runif(n)
> y <- rnorm(n, mean = x * c(-1, 1)[(z > 0.7) + 1], sd = 3)
> z_noise <- factor(sample(1:3, size = n, replace = TRUE))
> d <- data.frame(y = y, x = x, z = z, z_noise = z_noise)
> 
> 
> fmla <- as.formula("y ~ x | z + z_noise")
> fmly <- gaussian()
> fit <- partykit:::glmfit
> 
> # versions of the data
> d1 <- d
> d1$z <- signif(d1$z, digits = 1)
> 
> k <- 20
> zs_noise <- matrix(rnorm(n*k), nrow = n)
> colnames(zs_noise) <- paste0("z_noise_", 1:k)
> d2 <- cbind(d, zs_noise)
> fmla2 <- as.formula(paste("y ~ x | z + z_noise +",
+                           paste0("z_noise_", 1:k, collapse = " + ")))
> 
> 
> d3 <- d2
> d3$z <- factor(sample(1:3, size = n, replace = TRUE, prob = c(0.1, 0.5, 0.4)))
> d3$y <- rnorm(n, mean = x * c(-1, 1)[(d3$z == 2) + 1], sd = 3)
> 
> ## check weights
> w <- rep(1, n)
> w[1:10] <- 2
> (mw1 <- glmtree(formula = fmla, data = d, weights = w))
Generalized linear model tree (family: gaussian)

Model formula:
y ~ x | z + z_noise

Fitted party:
[1] root
|   [2] z <= 0.70311: n = 706
|       (Intercept)           x 
|        -0.1447422  -0.8138701 
|   [3] z > 0.70311: n = 304
|       (Intercept)           x 
|        0.07006626  0.73278593 

Number of inner nodes:    1
Number of terminal nodes: 2
Number of parameters per node: 2
Objective function (negative log-likelihood): 2551.48
> (mw2 <- glmtree(formula = fmla, data = d, weights = w, caseweights = FALSE))
Generalized linear model tree (family: gaussian)

Model formula:
y ~ x | z + z_noise

Fitted party:
[1] root
|   [2] z <= 0.70311: n = 704
|       (Intercept)           x 
|        -0.1447422  -0.8138701 
|   [3] z > 0.70311: n = 296
|       (Intercept)           x 
|        0.07006626  0.73278593 

Number of inner nodes:    1
Number of terminal nodes: 2
Number of parameters per node: 2
Objective function (negative log-likelihood): 2551.48
> 
> 
> 
> ## check dfsplit
> (mmfluc2 <- mob(formula = fmla, data = d, fit = partykit:::glmfit))
Model-based recursive partitioning (partykit:::glmfit)

Model formula:
y ~ x | z + z_noise

Fitted party:
[1] root
|   [2] z <= 0.70311: n = 704
|       (Intercept)           x 
|        -0.1619978  -0.7896293 
|   [3] z > 0.70311: n = 296
|       (Intercept)           x 
|        0.08683535  0.65598287 

Number of inner nodes:    1
Number of terminal nodes: 2
Number of parameters per node: 2
Objective function: 2551.673
> (mmfluc3 <- glmtree(formula = fmla, data = d))
Generalized linear model tree (family: gaussian)

Model formula:
y ~ x | z + z_noise

Fitted party:
[1] root
|   [2] z <= 0.70311: n = 704
|       (Intercept)           x 
|        -0.1619978  -0.7896293 
|   [3] z > 0.70311: n = 296
|       (Intercept)           x 
|        0.08683535  0.65598287 

Number of inner nodes:    1
Number of terminal nodes: 2
Number of parameters per node: 2
Objective function (negative log-likelihood): 2551.673
> (mmfluc3_dfsplit <- glmtree(formula = fmla, data = d, dfsplit = 10))
Generalized linear model tree (family: gaussian)

Model formula:
y ~ x | z + z_noise

Fitted party:
[1] root
|   [2] z <= 0.70311: n = 704
|       (Intercept)           x 
|        -0.1619978  -0.7896293 
|   [3] z > 0.70311: n = 296
|       (Intercept)           x 
|        0.08683535  0.65598287 

Number of inner nodes:    1
Number of terminal nodes: 2
Number of parameters per node: 2
Objective function (negative log-likelihood): 2551.673
> 
> 
> ## check tests
> library("strucchange")
Loading required package: zoo

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

> sctest(mmfluc3, node = 1) # does not yet work
                     z   z_noise
statistic 2.292499e+01 0.6165335
p.value   7.780038e-04 0.9984952
> 
> x <- mmfluc3
> (tst3 <- nodeapply(x, ids = nodeids(x), function(n) n$info$criterion))
$`1`
NULL

$`2`
NULL

$`3`
NULL

> 
> 
> 
> 
> ## check logLik and AIC
> logLik(mmfluc2)
'log Lik.' -2551.673 (df=7)
> logLik(mmfluc3)
'log Lik.' -2551.673 (df=7)
> logLik(mmfluc3_dfsplit)
'log Lik.' -2551.673 (df=16)
> logLik(glm(y ~ x, data = d))
'log Lik.' -2563.694 (df=3)
> 
> AIC(mmfluc3)
[1] 5117.347
> AIC(mmfluc3_dfsplit)
[1] 5135.347
> 
> ## check pruning
> pr2 <- prune.modelparty(mmfluc2)
> AIC(mmfluc2)
[1] 5117.347
> AIC(pr2)
[1] 5117.347
> 
> mmfluc_dfsplit3 <- glmtree(formula = fmla, data = d, alpha = 0.5, dfsplit = 3)
> mmfluc_dfsplit4 <- glmtree(formula = fmla, data = d, alpha = 0.5, dfsplit = 4)
> pr_dfsplit3 <- prune.modelparty(mmfluc_dfsplit3)
> pr_dfsplit4 <- prune.modelparty(mmfluc_dfsplit4)
> AIC(mmfluc_dfsplit3)
[1] 5142.774
> AIC(mmfluc_dfsplit4)
[1] 5156.774
> AIC(pr_dfsplit3)
[1] 5142.774
> AIC(pr_dfsplit4)
[1] 5124.456
> 
> width(mmfluc_dfsplit3)
[1] 8
> width(mmfluc_dfsplit4)
[1] 8
> width(pr_dfsplit3)
[1] 8
> width(pr_dfsplit4)
[1] 3
> 
> ## check inner and terminal
> options <- list(NULL,
+                 "object",
+                 "estfun",
+                 c("object", "estfun"))
> 
> arguments <- list("inner",
+                   "terminal",
+                   c("inner", "terminal"))
> 
> 
> for (o in options) {
+   print(o)
+   x <- glmtree(formula = fmla, data = d, inner = o)
+   str(nodeapply(x, ids = nodeids(x), function(n) n$info[c("object", "estfun")]), 2)
+ }
NULL
List of 3
 $ 1:List of 2
  ..$ NA: NULL
  ..$ NA: NULL
 $ 2:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
 $ 3:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
[1] "object"
List of 3
 $ 1:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
 $ 2:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
 $ 3:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
[1] "estfun"
List of 3
 $ 1:List of 2
  ..$ NA    : NULL
  ..$ estfun: num [1:1000, 1:2] -0.1375 -0.0583 -0.0553 0.1043 -0.0744 ...
  .. ..- attr(*, "dimnames")=List of 2
 $ 2:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
 $ 3:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
[1] "object" "estfun"
List of 3
 $ 1:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ estfun: num [1:1000, 1:2] -0.1375 -0.0583 -0.0553 0.1043 -0.0744 ...
  .. ..- attr(*, "dimnames")=List of 2
 $ 2:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
 $ 3:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
> 
> for (o in options) {
+   print(o)
+   x <- glmtree(formula = fmla, data = d, terminal = o)
+   str(nodeapply(x, ids = nodeids(x), function(n) n$info[c("object", "estfun")]), 2)
+ }
NULL
List of 3
 $ 1:List of 2
  ..$ NA: NULL
  ..$ NA: NULL
 $ 2:List of 2
  ..$ NA: NULL
  ..$ NA: NULL
 $ 3:List of 2
  ..$ NA: NULL
  ..$ NA: NULL
[1] "object"
List of 3
 $ 1:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
 $ 2:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
 $ 3:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ NA    : NULL
[1] "estfun"
List of 3
 $ 1:List of 2
  ..$ NA    : NULL
  ..$ estfun: num [1:1000, 1:2] -0.1375 -0.0583 -0.0553 0.1043 -0.0744 ...
  .. ..- attr(*, "dimnames")=List of 2
 $ 2:List of 2
  ..$ NA    : NULL
  ..$ estfun: num [1:704, 1:2] -0.1291 0.5104 -0.0603 -0.1868 -0.0981 ...
  .. ..- attr(*, "dimnames")=List of 2
 $ 3:List of 2
  ..$ NA    : NULL
  ..$ estfun: num [1:296, 1:2] -0.1053 -0.0877 0.0544 -0.1581 0.43 ...
  .. ..- attr(*, "dimnames")=List of 2
[1] "object" "estfun"
List of 3
 $ 1:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ estfun: num [1:1000, 1:2] -0.1375 -0.0583 -0.0553 0.1043 -0.0744 ...
  .. ..- attr(*, "dimnames")=List of 2
 $ 2:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ estfun: num [1:704, 1:2] -0.1291 0.5104 -0.0603 -0.1868 -0.0981 ...
  .. ..- attr(*, "dimnames")=List of 2
 $ 3:List of 2
  ..$ object:List of 23
  .. ..- attr(*, "class")= chr [1:2] "glm" "lm"
  ..$ estfun: num [1:296, 1:2] -0.1053 -0.0877 0.0544 -0.1581 0.43 ...
  .. ..- attr(*, "dimnames")=List of 2
> 
> 
> ## check model
> m_mt <- glmtree(formula = fmla, data = d, model = TRUE)
> m_mf <- glmtree(formula = fmla, data = d, model = FALSE)
> 
> dim(m_mt$data)
[1] 1000    4
> dim(m_mf$data)
[1] 0 4
> 
> 
> ## check multiway
> (m_mult <- glmtree(formula = fmla2, data = d3, catsplit = "multiway", minsize = 80))
Generalized linear model tree (family: gaussian)

Model formula:
y ~ x | z + z_noise + z_noise_1 + z_noise_2 + z_noise_3 + z_noise_4 + 
    z_noise_5 + z_noise_6 + z_noise_7 + z_noise_8 + z_noise_9 + 
    z_noise_10 + z_noise_11 + z_noise_12 + z_noise_13 + z_noise_14 + 
    z_noise_15 + z_noise_16 + z_noise_17 + z_noise_18 + z_noise_19 + 
    z_noise_20

Fitted party:
[1] root
|   [2] z in 1: n = 76
|       (Intercept)           x 
|         0.9859847  -3.2600047 
|   [3] z in 2: n = 537
|       (Intercept)           x 
|       -0.06970187  1.12305074 
|   [4] z in 3: n = 387
|       (Intercept)           x 
|         0.3824392  -1.8337151 

Number of inner nodes:    1
Number of terminal nodes: 3
Number of parameters per node: 2
Objective function (negative log-likelihood): 2511.927
> 
> 
> ## check parm
> fmla_p <- as.formula("y ~ x + z_noise + z_noise_1 | z + z_noise_2")
> (m_interc <- glmtree(formula = fmla_p, data = d2, parm = 1))
Generalized linear model tree (family: gaussian)

Model formula:
y ~ x + z_noise + z_noise_1 | z + z_noise_2

Fitted party:
[1] root
|   [2] z <= 0.65035: n = 644
|       (Intercept)           x    z_noise2    z_noise3   z_noise_1 
|       -0.05585503 -1.01257554  0.34044520 -0.16384987  0.24197601 
|   [3] z > 0.65035: n = 356
|       (Intercept)           x    z_noise2    z_noise3   z_noise_1 
|        0.06411865  0.78733976 -0.67811149 -0.14240432 -0.01239154 

Number of inner nodes:    1
Number of terminal nodes: 2
Number of parameters per node: 5
Objective function (negative log-likelihood): 2548.32
> 
> (m_p3 <- glmtree(formula = fmla_p, data = d2, parm = 3))
Generalized linear model tree (family: gaussian)

Model formula:
y ~ x + z_noise + z_noise_1 | z + z_noise_2

Fitted party:
[1] root: n = 1000
     (Intercept)            x     z_noise2     z_noise3    z_noise_1 
    -0.058855295 -0.340314311 -0.008404682 -0.109839080  0.154798281  

Number of inner nodes:    0
Number of terminal nodes: 1
Number of parameters per node: 5
Objective function (negative log-likelihood): 2562.32
> 
> 
> ## check trim
> (m_tt <- glmtree(formula = fmla, data = d, trim = 0.2))
Generalized linear model tree (family: gaussian)

Model formula:
y ~ x | z + z_noise

Fitted party:
[1] root
|   [2] z <= 0.70311: n = 704
|       (Intercept)           x 
|        -0.1619978  -0.7896293 
|   [3] z > 0.70311: n = 296
|       (Intercept)           x 
|        0.08683535  0.65598287 

Number of inner nodes:    1
Number of terminal nodes: 2
Number of parameters per node: 2
Objective function (negative log-likelihood): 2551.673
> 
> (m_tf <- glmtree(formula = fmla, data = d, trim = 300, minsize = 300))
Generalized linear model tree (family: gaussian)

Model formula:
y ~ x | z + z_noise

Fitted party:
[1] root
|   [2] z <= 0.6892: n = 691
|       (Intercept)           x 
|        -0.1778199  -0.7692901 
|   [3] z > 0.6892: n = 309
|       (Intercept)           x 
|         0.1065746   0.5562243 

Number of inner nodes:    1
Number of terminal nodes: 2
Number of parameters per node: 2
Objective function (negative log-likelihood): 2552.12
> 
> 
> 
> ## check breakties
> m_bt <- glmtree(formula = fmla, data = d1, breakties = TRUE)
> m_df <- glmtree(formula = fmla, data = d1, breakties = FALSE)
> 
> all.equal(m_bt, m_df)
[1] "Component \"node\": Component \"kids\": Component 1: Component 5: Component 6: Mean relative difference: 0.1237503"
[2] "Component \"node\": Component \"kids\": Component 2: Component 5: Component 5: Mean relative difference: 0.1746109"
[3] "Component \"node\": Component \"kids\": Component 2: Component 5: Component 6: Mean relative difference: 0.0443985"
[4] "Component \"node\": Component \"info\": Component \"p.value\": Mean relative difference: 1.100407"                 
[5] "Component \"node\": Component \"info\": Component \"test\": Mean relative difference: 0.07721086"                  
[6] "Component \"info\": Component \"call\": target, current do not match when deparsed"                                
[7] "Component \"info\": Component \"control\": Component \"breakties\": 1 element mismatch"                            
> 
> unclass(m_bt)$node$info$criterion
NULL
> unclass(m_df)$node$info$criterion
NULL
> 
> 
> ### example from mob vignette
> data("PimaIndiansDiabetes", package = "mlbench")
> 
> logit <- function(y, x, start = NULL, weights = NULL, offset = NULL, ...) {
+   glm(y ~ 0 + x, family = binomial, start = start, ...)
+ }
> 
> pid_formula <- diabetes ~ glucose | pregnant + pressure + triceps +
+   insulin + mass + pedigree + age
> 
> pid_tree <- mob(pid_formula, data = PimaIndiansDiabetes, fit = logit)
> pid_tree
Model-based recursive partitioning (logit)

Model formula:
diabetes ~ glucose | pregnant + pressure + triceps + insulin + 
    mass + pedigree + age

Fitted party:
[1] root
|   [2] mass <= 26.3: n = 167
|       x(Intercept)     xglucose 
|        -9.95150963   0.05870786 
|   [3] mass > 26.3
|   |   [4] age <= 30: n = 304
|   |       x(Intercept)     xglucose 
|   |        -6.70558554   0.04683748 
|   |   [5] age > 30: n = 297
|   |       x(Intercept)     xglucose 
|   |        -2.77095386   0.02353582 

Number of inner nodes:    2
Number of terminal nodes: 3
Number of parameters per node: 2
Objective function: 355.4578
> nodeapply(pid_tree, ids = nodeids(pid_tree), function(n) n$info$criterion)
$`1`
NULL

$`2`
NULL

$`3`
NULL

$`4`
NULL

$`5`
NULL

> 
> 
> 
> 
> proc.time()
   user  system elapsed 
 40.720   0.212  40.948 
