
R : Copyright 1999, The R Development Core Team
Version 0.90.0 Patched (December 15, 1999)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type	"?license" or "?licence" for distribution details.

R is a collaborative project with many contributors.
Type	"?contributors" for a list.

Type	"demo()" for some demos, "help()" for on-line help, or
    	"help.start()" for a HTML browser interface to help.
Type	"q()" to quit R.

> invisible(options(echo = TRUE))
> # Any necessary setup
> library(rpart2)
> library(survival5)

Attaching Package "package:survival5":


	The following object(s) are masked from package:rpart2 :

	 is.Surv 


	The following object(s) are masked from package:base :

	 sort.list 

> data(state)
> data(cu.summary)
> data(kyphosis)
> options(na.action="na.omit")
> #
> # Read the data
> #
> #   Time to progression in years
> #   status  1=progressed, 0= censored
> #   age
> #   early endocrine therapy   1=no 2=yes
> #   % of cells in g2 phase, from flow cytometry
> #   tumor grade (Farrow) 1,2,3,4
> #   Gleason score (competing grading system)
> #   ploidy
> 
> stagec <- read.table('data.stagec',  col.names=c("pgtime", "pgstat", "age",
+ 			"eet", "g2", "grade", "gleason", "ploidy"))
> stagec$ploidy <- factor(stagec$ploidy, levels=1:3,
+ 				labels=c("diploid", "tetraploid", "aneuploid"))
> 
> cox0 <- coxph(Surv(pgtime, pgstat) ~ 1, stagec)
> cox1 <- coxph(Surv(pgtime, pgstat) ~ age + eet + g2 + grade + ploidy, stagec)
> cox1
Call:
coxph(formula = Surv(pgtime, pgstat) ~ age + eet + g2 + grade + 
    ploidy, data = stagec)


                    coef exp(coef) se(coef)      z       p
age              -0.0173     0.983   0.0287 -0.604 5.5e-01
eet               0.0704     1.073   0.3730  0.189 8.5e-01
g2               -0.0527     0.949   0.0267 -1.973 4.9e-02
grade             1.7289     5.634   0.3369  5.132 2.9e-07
ploidytetraploid  1.0444     2.842   0.4225  2.472 1.3e-02
ploidyaneuploid   1.1497     3.157   0.6610  1.739 8.2e-02

Likelihood ratio test=48.6  on 6 df, p=8.92e-09  n=137 (9 observations deleted due to missing)
> 
> fit1 <- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
+ 		stagec, control=rpart.control(usesurrogate=0, cp=0),
+ 		method='poisson')
> fit1
node), split, n, deviance, yval
      * denotes terminal node

  1) root 146 193.500 0.058500  
    2) grade<2.5 61  46.090 0.021290  
      4) g2<11.36 33   9.325 0.007180  
        8) g2>6.545 26   1.847 0.004466 *
        9) g2<6.545 7   5.904 0.027880 *
      5) g2>11.36 27  27.940 0.044910  
       10) g2>14.37 15  12.460 0.030970 *
       11) g2<14.37 12  13.730 0.067920 *
    3) grade>2.5 85 122.600 0.094370  
      6) age>56.5 75 104.300 0.082810  
       12) g2>22.765 12  14.840 0.039570 *
       13) g2<22.765 57  75.780 0.100200  
         26) gleason<7.5 41  55.530 0.076840  
           52) g2<13.475 23  26.750 0.050850  
            104) g2>11.52 7   5.309 0.030860 *
            105) g2<11.52 16  20.210 0.063860 *
           53) g2>13.475 18  25.300 0.112100 *
         27) gleason>7.5 16  15.170 0.163900 *
      7) age<56.5 10  11.780 0.197100 *
> summary(fit1)
Call:
rpart(formula = Surv(pgtime, pgstat) ~ age + eet + g2 + grade + 
    gleason + ploidy, data = stagec, method = "poisson", control = rpart.control(usesurrogate = 0, 
    cp = 0))

        CP nsplit rel error xerror    xstd
1 0.128138      0    1.0000 1.0057 0.06307
2 0.052155      1    0.8719 0.8854 0.07188
3 0.045615      3    0.7676 0.9049 0.07857
4 0.026271      4    0.7219 0.9171 0.07759
5 0.017966      5    0.6957 0.9363 0.08723
6 0.009059      6    0.6777 0.9590 0.08978
7 0.008132      7    0.6686 1.0386 0.09672
8 0.006384      8    0.6605 1.0392 0.09708
9 0.000000      9    0.6541 1.0420 0.09643

Node number 1: 146 observations,    complexity param=0.1281
  events=54,  estimated rate=0.0585 , mean deviance=1.325
  left son=2 (61 obs) right son=3 (85 obs)
  Primary splits:
      grade   < 2.5   to the left,  improve=24.840, (0 missing)
      gleason < 5.5   to the left,  improve=21.620, (3 missing)
      ploidy  splits as  LRR,       improve=13.000, (0 missing)
      g2      < 13.2  to the left,  improve=11.440, (7 missing)
      age     < 53.5  to the right, improve= 2.482, (0 missing)
  Surrogate splits:
      gleason < 5.5   to the left,  agree=0.863, adj=0.672, (0 split)
      ploidy  splits as  LRR,       agree=0.644, adj=0.148, (0 split)
      g2      < 9.945 to the left,  agree=0.630, adj=0.115, (0 split)
      age     < 66.5  to the right, agree=0.589, adj=0.016, (0 split)

Node number 2: 61 observations,    complexity param=0.04562
  events=9,  estimated rate=0.02129 , mean deviance=0.7556
  left son=4 (33 obs) right son=5 (27 obs), 1 observation remains
  Primary splits:
      g2      < 11.36 to the left,  improve=8.9660, (1 missing)
      ploidy  splits as  LRL,       improve=7.4650, (0 missing)
      age     < 68.5  to the right, improve=3.9540, (0 missing)
      gleason < 5.5   to the left,  improve=1.8060, (3 missing)
      eet     < 1.5   to the left,  improve=0.5441, (1 missing)
  Surrogate splits:
      ploidy splits as  LR-,       agree=0.917, adj=0.815, (0 split)
      age    < 65.5  to the left,  agree=0.617, adj=0.148, (0 split)

Node number 3: 85 observations,    complexity param=0.05215
  events=45,  estimated rate=0.09437 , mean deviance=1.442
  left son=6 (75 obs) right son=7 (10 obs)
  Primary splits:
      age     < 56.5  to the right, improve=7.3330, (0 missing)
      g2      < 22.77 to the right, improve=6.8130, (6 missing)
      gleason < 8.5   to the left,  improve=4.8430, (0 missing)
      ploidy  splits as  LLR,       improve=1.7270, (0 missing)
      eet     < 1.5   to the right, improve=0.9572, (1 missing)

Node number 4: 33 observations,    complexity param=0.008132
  events=1,  estimated rate=0.00718 , mean deviance=0.2826
  left son=8 (26 obs) right son=9 (7 obs)
  Primary splits:
      g2      < 6.545 to the right, improve=3.1310, (0 missing)
      age     < 62.5  to the right, improve=1.3530, (0 missing)
      eet     < 1.5   to the left,  improve=0.6605, (0 missing)
      gleason < 5.5   to the right, improve=0.4880, (2 missing)
  Surrogate splits:
      grade < 1.5   to the right, agree=0.848, adj=0.286, (0 split)

Node number 5: 27 observations,    complexity param=0.009059
  events=8,  estimated rate=0.04491 , mean deviance=1.035
  left son=10 (15 obs) right son=11 (12 obs)
  Primary splits:
      g2      < 14.37 to the right, improve=1.818, (0 missing)
      gleason < 5.5   to the left,  improve=1.698, (1 missing)
      age     < 62.5  to the left,  improve=1.137, (0 missing)
  Surrogate splits:
      ploidy  splits as  RL-,       agree=0.741, adj=0.417, (0 split)
      age     < 65    to the left,  agree=0.630, adj=0.167, (0 split)
      gleason < 5.5   to the left,  agree=0.630, adj=0.167, (0 split)

Node number 6: 75 observations,    complexity param=0.05215
  events=37,  estimated rate=0.08281 , mean deviance=1.391
  left son=12 (12 obs) right son=13 (57 obs), 6 observations remain
  Primary splits:
      g2      < 22.77 to the right, improve=4.8340, (6 missing)
      gleason < 7.5   to the left,  improve=4.0830, (0 missing)
      age     < 69.5  to the left,  improve=2.7450, (0 missing)
      ploidy  splits as  LRR,       improve=0.7867, (0 missing)
      eet     < 1.5   to the right, improve=0.1978, (1 missing)

Node number 7: 10 observations
  events=8,  estimated rate=0.1971 , mean deviance=1.178

Node number 8: 26 observations
  events=0,  estimated rate=0.004466 , mean deviance=0.07105

Node number 9: 7 observations
  events=1,  estimated rate=0.02788 , mean deviance=0.8435

Node number 10: 15 observations
  events=3,  estimated rate=0.03097 , mean deviance=0.8305

Node number 11: 12 observations
  events=5,  estimated rate=0.06792 , mean deviance=1.144

Node number 12: 12 observations
  events=4,  estimated rate=0.03957 , mean deviance=1.236

Node number 13: 57 observations,    complexity param=0.02627
  events=30,  estimated rate=0.1002 , mean deviance=1.33
  left son=26 (41 obs) right son=27 (16 obs)
  Primary splits:
      gleason < 7.5   to the left,  improve=5.37200, (0 missing)
      g2      < 13.2  to the left,  improve=4.45500, (0 missing)
      ploidy  splits as  LRR,       improve=3.48400, (0 missing)
      age     < 69.5  to the left,  improve=3.14300, (0 missing)
      eet     < 1.5   to the left,  improve=0.04885, (1 missing)
  Surrogate splits:
      grade < 3.5   to the left,  agree=0.772, adj=0.187, (0 split)

Node number 26: 41 observations,    complexity param=0.01797
  events=18,  estimated rate=0.07684 , mean deviance=1.354
  left son=52 (23 obs) right son=53 (18 obs)
  Primary splits:
      g2      < 13.48 to the left,  improve=3.55300, (0 missing)
      ploidy  splits as  LRR,       improve=3.04200, (0 missing)
      age     < 65.5  to the left,  improve=0.67910, (0 missing)
      eet     < 1.5   to the left,  improve=0.09085, (0 missing)
      gleason < 6.5   to the right, improve=0.01940, (0 missing)
  Surrogate splits:
      ploidy splits as  LRL,       agree=0.976, adj=0.944, (0 split)
      eet    < 1.5   to the right, agree=0.634, adj=0.167, (0 split)
      age    < 58.5  to the right, agree=0.585, adj=0.056, (0 split)

Node number 27: 16 observations
  events=12,  estimated rate=0.1639 , mean deviance=0.9483

Node number 52: 23 observations,    complexity param=0.006384
  events=7,  estimated rate=0.05085 , mean deviance=1.163
  left son=104 (7 obs) right son=105 (16 obs)
  Primary splits:
      g2      < 11.52 to the right, improve=1.40500, (0 missing)
      age     < 62.5  to the right, improve=1.24800, (0 missing)
      gleason < 6.5   to the left,  improve=0.01894, (0 missing)
  Surrogate splits:
      age    < 71    to the right, agree=0.826, adj=0.429, (0 split)
      ploidy splits as  RLR,       agree=0.739, adj=0.143, (0 split)

Node number 53: 18 observations
  events=11,  estimated rate=0.1121 , mean deviance=1.405

Node number 104: 7 observations
  events=1,  estimated rate=0.03086 , mean deviance=0.7585

Node number 105: 16 observations
  events=6,  estimated rate=0.06386 , mean deviance=1.263

> 
> fit2 <- rpart(cox0$residual ~ age + eet + g2+grade+gleason +ploidy,
+ 		stagec)
> fit2
node), split, n, deviance, yval
      * denotes terminal node

 1) root 146 53.420 -4.563e-17  
   2) grade<2.5 61 11.330 -2.892e-01  
     4) g2<13.19 40  3.270 -4.119e-01 *
     5) g2>13.19 21  6.314 -5.547e-02  
      10) g2>14.615 14  3.098 -1.913e-01 *
      11) g2<14.615 7  2.440  2.163e-01 *
   3) grade>2.5 85 33.320  2.075e-01  
     6) age>56.5 75 29.090  1.526e-01  
      12) gleason<7.5 50 18.400  5.706e-02  
        24) g2<13.475 25  7.725 -4.713e-02 *
        25) g2>13.475 25 10.140  1.612e-01  
          50) g2>17.915 14  4.903 -6.099e-02 *
          51) g2<17.915 11  3.662  4.441e-01 *
      13) gleason>7.5 25  9.318  3.436e-01  
        26) g2>17.005 9  3.753  7.496e-02 *
        27) g2<17.005 16  4.551  4.946e-01 *
     7) age<56.5 10  2.304  6.199e-01 *
> summary(fit2)
Call:
rpart(formula = cox0$residual ~ age + eet + g2 + grade + gleason + 
    ploidy, data = stagec)

       CP nsplit rel error xerror    xstd
1 0.16404      0    1.0000 1.0125 0.05732
2 0.03608      1    0.8360 0.8543 0.06605
3 0.02561      3    0.7671 0.9824 0.08384
4 0.01979      4    0.7415 0.9929 0.08621
5 0.01452      7    0.6830 1.0111 0.09059
6 0.01000      8    0.6684 1.0257 0.09503

Node number 1: 146 observations,    complexity param=0.164
  mean=-4.563e-17, MSE=0.3659
  left son=2 (61 obs) right son=3 (85 obs)
  Primary splits:
      grade   < 2.5   to the left,  improve=0.16400, (0 missing)
      gleason < 5.5   to the left,  improve=0.13570, (3 missing)
      ploidy  splits as  LRR,       improve=0.09188, (0 missing)
      g2      < 13.2  to the left,  improve=0.08482, (7 missing)
      age     < 58.5  to the right, improve=0.01991, (0 missing)
  Surrogate splits:
      gleason < 5.5   to the left,  agree=0.863, adj=0.672, (0 split)
      ploidy  splits as  LRR,       agree=0.644, adj=0.148, (0 split)
      g2      < 9.945 to the left,  agree=0.630, adj=0.115, (0 split)
      age     < 66.5  to the right, agree=0.589, adj=0.016, (0 split)

Node number 2: 61 observations,    complexity param=0.03275
  mean=-0.2892, MSE=0.1858
  left son=4 (40 obs) right son=5 (21 obs)
  Primary splits:
      g2      < 13.19 to the left,  improve=0.14990, (1 missing)
      ploidy  splits as  LRL,       improve=0.13360, (0 missing)
      gleason < 5.5   to the left,  improve=0.03917, (3 missing)
      age     < 68.5  to the right, improve=0.03753, (0 missing)
      eet     < 1.5   to the left,  improve=0.01233, (1 missing)
  Surrogate splits:
      ploidy splits as  LR-, agree=0.983, adj=0.952, (0 split)

Node number 3: 85 observations,    complexity param=0.03608
  mean=0.2075, MSE=0.392
  left son=6 (75 obs) right son=7 (10 obs)
  Primary splits:
      age     < 56.5  to the right, improve=0.05784, (0 missing)
      g2      < 23.48 to the right, improve=0.05207, (6 missing)
      gleason < 8.5   to the left,  improve=0.04485, (0 missing)
      ploidy  splits as  LRR,       improve=0.02537, (0 missing)
      eet     < 1.5   to the right, improve=0.00724, (1 missing)

Node number 4: 40 observations
  mean=-0.4119, MSE=0.08176

Node number 5: 21 observations,    complexity param=0.01452
  mean=-0.05547, MSE=0.3007
  left son=10 (14 obs) right son=11 (7 obs)
  Primary splits:
      g2  < 14.62 to the right, improve=0.12280, (0 missing)
      age < 63.5  to the left,  improve=0.05198, (0 missing)

Node number 6: 75 observations,    complexity param=0.02561
  mean=0.1526, MSE=0.3879
  left son=12 (50 obs) right son=13 (25 obs)
  Primary splits:
      gleason < 7.5   to the left,  improve=0.0470300, (0 missing)
      g2      < 23.48 to the right, improve=0.0439100, (6 missing)
      age     < 69.5  to the left,  improve=0.0255400, (0 missing)
      ploidy  splits as  LRR,       improve=0.0191000, (0 missing)
      eet     < 1.5   to the right, improve=0.0009644, (1 missing)
  Surrogate splits:
      grade < 3.5   to the left,  agree=0.733, adj=0.2, (0 split)

Node number 7: 10 observations
  mean=0.6199, MSE=0.2304

Node number 10: 14 observations
  mean=-0.1913, MSE=0.2213

Node number 11: 7 observations
  mean=0.2163, MSE=0.3486

Node number 12: 50 observations,    complexity param=0.01979
  mean=0.05706, MSE=0.3681
  left son=24 (25 obs) right son=25 (25 obs)
  Primary splits:
      g2      < 13.48 to the left,  improve=4.238e-02, (3 missing)
      ploidy  splits as  LRR,       improve=3.143e-02, (0 missing)
      age     < 62.5  to the right, improve=1.709e-02, (0 missing)
      gleason < 6.5   to the left,  improve=5.610e-04, (0 missing)
      eet     < 1.5   to the right, improve=6.361e-05, (0 missing)
  Surrogate splits:
      ploidy  splits as  LRL,       agree=0.979, adj=0.957, (3 split)
      age     < 66.5  to the right, agree=0.596, adj=0.174, (0 split)
      eet     < 1.5   to the right, agree=0.553, adj=0.087, (0 split)
      gleason < 6.5   to the left,  agree=0.553, adj=0.087, (0 split)

Node number 13: 25 observations,    complexity param=0.01899
  mean=0.3436, MSE=0.3727
  left son=26 (9 obs) right son=27 (16 obs)
  Primary splits:
      g2     < 17.01 to the right, improve=1.562e-01, (3 missing)
      age    < 61.5  to the right, improve=5.841e-02, (0 missing)
      ploidy splits as  RLR,       improve=1.562e-02, (0 missing)
      eet    < 1.5   to the right, improve=2.882e-05, (1 missing)
  Surrogate splits:
      age < 57.5  to the left,  agree=0.682, adj=0.125, (3 split)

Node number 24: 25 observations
  mean=-0.04713, MSE=0.309

Node number 25: 25 observations,    complexity param=0.01979
  mean=0.1612, MSE=0.4055
  left son=50 (14 obs) right son=51 (11 obs)
  Primary splits:
      g2      < 17.92 to the right, improve=0.1271000, (1 missing)
      eet     < 1.5   to the right, improve=0.0309400, (0 missing)
      age     < 64.5  to the left,  improve=0.0241800, (0 missing)
      gleason < 6.5   to the right, improve=0.0003038, (0 missing)
  Surrogate splits:
      age     < 65.5  to the left,  agree=0.708, adj=0.364, (1 split)
      eet     < 1.5   to the right, agree=0.667, adj=0.273, (0 split)
      gleason < 6.5   to the right, agree=0.625, adj=0.182, (0 split)

Node number 26: 9 observations
  mean=0.07496, MSE=0.417

Node number 27: 16 observations
  mean=0.4946, MSE=0.2844

Node number 50: 14 observations
  mean=-0.06099, MSE=0.3502

Node number 51: 11 observations
  mean=0.4441, MSE=0.3329

> 
> 
> 
> fit3 <- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
+ 		stagec, control=rpart.control(usesurrogate=1, cp=.001))
> 
> summary(fit3)
Call:
rpart(formula = Surv(pgtime, pgstat) ~ age + eet + g2 + grade + 
    gleason + ploidy, data = stagec, control = rpart.control(usesurrogate = 1, 
    cp = 0.001))

        CP nsplit rel error xerror    xstd
1 0.129126      0    1.0000 1.0136 0.07396
2 0.041689      1    0.8709 0.9063 0.07773
3 0.028803      2    0.8292 0.9906 0.09174
4 0.017195      3    0.8004 1.0453 0.09568
5 0.015181      4    0.7832 1.0699 0.09671
6 0.013514      5    0.7680 1.0763 0.09662
7 0.009607      8    0.7291 1.0911 0.10118
8 0.001000     10    0.7105 1.0884 0.10162

Node number 1: 146 observations,    complexity param=0.1291
  events=54,  estimated rate=1 , mean deviance=1.338
  left son=2 (61 obs) right son=3 (85 obs)
  Primary splits:
      grade   < 2.5   to the left,  improve=25.270, (0 missing)
      gleason < 5.5   to the left,  improve=21.630, (3 missing)
      ploidy  splits as  LRR,       improve=14.020, (0 missing)
      g2      < 13.2  to the left,  improve=12.580, (7 missing)
      age     < 58.5  to the right, improve= 2.796, (0 missing)
  Surrogate splits:
      gleason < 5.5   to the left,  agree=0.863, adj=0.672, (0 split)
      ploidy  splits as  LRR,       agree=0.644, adj=0.148, (0 split)
      g2      < 9.945 to the left,  agree=0.630, adj=0.115, (0 split)
      age     < 66.5  to the right, agree=0.589, adj=0.016, (0 split)

Node number 2: 61 observations,    complexity param=0.04169
  events=9,  estimated rate=0.3617 , mean deviance=0.7373
  left son=4 (33 obs) right son=5 (28 obs)
  Primary splits:
      g2      < 11.36 to the left,  improve=9.0400, (1 missing)
      ploidy  splits as  LRL,       improve=7.6600, (0 missing)
      age     < 68.5  to the right, improve=4.1550, (0 missing)
      gleason < 5.5   to the left,  improve=1.6610, (3 missing)
      eet     < 1.5   to the left,  improve=0.6096, (1 missing)
  Surrogate splits:
      ploidy splits as  LR-,       agree=0.917, adj=0.815, (0 split)
      age    < 65.5  to the left,  agree=0.617, adj=0.148, (1 split)

Node number 3: 85 observations,    complexity param=0.0288
  events=45,  estimated rate=1.623 , mean deviance=1.472
  left son=6 (75 obs) right son=7 (10 obs)
  Primary splits:
      age     < 56.5  to the right, improve=6.3660, (0 missing)
      g2      < 23.48 to the right, improve=4.6720, (6 missing)
      gleason < 8.5   to the left,  improve=4.2320, (0 missing)
      ploidy  splits as  LRR,       improve=1.5970, (0 missing)
      eet     < 1.5   to the right, improve=0.8234, (1 missing)

Node number 4: 33 observations,    complexity param=0.008959
  events=1,  estimated rate=0.122 , mean deviance=0.2767
  left son=8 (26 obs) right son=9 (7 obs)
  Primary splits:
      g2      < 6.545 to the right, improve=3.2920, (0 missing)
      age     < 62.5  to the right, improve=1.4220, (0 missing)
      eet     < 1.5   to the left,  improve=0.6913, (0 missing)
      gleason < 5.5   to the right, improve=0.5440, (2 missing)
  Surrogate splits:
      grade < 1.5   to the right, agree=0.848, adj=0.286, (0 split)

Node number 5: 28 observations,    complexity param=0.01351
  events=8,  estimated rate=0.7341 , mean deviance=0.9894
  left son=10 (19 obs) right son=11 (8 obs), 1 observation remains
  Primary splits:
      gleason < 5.5   to the left,  improve=2.129, (1 missing)
      age     < 67.5  to the right, improve=1.667, (0 missing)
      g2      < 14.37 to the right, improve=1.397, (1 missing)

Node number 6: 75 observations,    complexity param=0.0172
  events=37,  estimated rate=1.432 , mean deviance=1.386
  left son=12 (50 obs) right son=13 (25 obs)
  Primary splits:
      gleason < 7.5   to the left,  improve=3.4240, (0 missing)
      g2      < 23.48 to the right, improve=3.3370, (6 missing)
      age     < 69.5  to the left,  improve=2.2340, (0 missing)
      ploidy  splits as  LRR,       improve=1.1470, (0 missing)
      eet     < 1.5   to the right, improve=0.1427, (1 missing)
  Surrogate splits:
      grade < 3.5   to the left,  agree=0.733, adj=0.2, (0 split)

Node number 7: 10 observations
  events=8,  estimated rate=3.198 , mean deviance=1.552

Node number 8: 26 observations
  events=0,  estimated rate=0.07451 , mean deviance=0.07119

Node number 9: 7 observations
  events=1,  estimated rate=0.5041 , mean deviance=0.7898

Node number 10: 19 observations
  events=4,  estimated rate=0.5521 , mean deviance=0.7319

Node number 11: 8 observations
  events=4,  estimated rate=1.314 , mean deviance=1.395

Node number 12: 50 observations,    complexity param=0.01271
  events=21,  estimated rate=1.149 , mean deviance=1.33
  left son=24 (25 obs) right son=25 (25 obs)
  Primary splits:
      g2      < 13.48 to the left,  improve=1.94600, (3 missing)
      ploidy  splits as  LRR,       improve=1.49100, (0 missing)
      age     < 62.5  to the right, improve=0.80830, (0 missing)
      gleason < 6.5   to the left,  improve=0.02956, (0 missing)
      eet     < 1.5   to the right, improve=0.01135, (0 missing)
  Surrogate splits:
      ploidy  splits as  LRL,       agree=0.979, adj=0.957, (3 split)
      age     < 66.5  to the right, agree=0.596, adj=0.174, (0 split)
      eet     < 1.5   to the right, agree=0.553, adj=0.087, (0 split)
      gleason < 6.5   to the left,  agree=0.553, adj=0.087, (0 split)

Node number 13: 25 observations,    complexity param=0.01518
  events=16,  estimated rate=2.028 , mean deviance=1.365
  left son=26 (10 obs) right son=27 (15 obs)
  Primary splits:
      g2     < 15.29 to the right, improve=3.92900, (3 missing)
      age    < 68.5  to the left,  improve=1.34300, (0 missing)
      ploidy splits as  RLR,       improve=0.83660, (0 missing)
      eet    < 1.5   to the right, improve=0.00105, (1 missing)
  Surrogate splits:
      ploidy  splits as  RLR,       agree=0.727, adj=0.4, (3 split)
      gleason < 8.5   to the right, agree=0.636, adj=0.2, (0 split)
      age     < 57.5  to the left,  agree=0.591, adj=0.1, (0 split)
      eet     < 1.5   to the left,  agree=0.591, adj=0.1, (0 split)

Node number 24: 25 observations,    complexity param=0.009607
  events=8,  estimated rate=0.8817 , mean deviance=1.164
  left son=48 (8 obs) right son=49 (17 obs)
  Primary splits:
      g2      < 11.52 to the right, improve=1.19200, (2 missing)
      age     < 65.5  to the right, improve=1.16600, (0 missing)
      gleason < 6.5   to the left,  improve=0.06199, (0 missing)
  Surrogate splits:
      age    < 70    to the right, agree=0.826, adj=0.429, (2 split)
      ploidy splits as  RLR,       agree=0.739, adj=0.143, (0 split)

Node number 25: 25 observations,    complexity param=0.01271
  events=13,  estimated rate=1.408 , mean deviance=1.442
  left son=50 (14 obs) right son=51 (11 obs)
  Primary splits:
      g2      < 17.92 to the right, improve=3.122000, (1 missing)
      eet     < 1.5   to the right, improve=1.212000, (0 missing)
      age     < 64.5  to the left,  improve=0.633000, (0 missing)
      gleason < 6.5   to the right, improve=0.001267, (0 missing)
  Surrogate splits:
      age     < 65.5  to the left,  agree=0.708, adj=0.364, (1 split)
      eet     < 1.5   to the right, agree=0.667, adj=0.273, (0 split)
      gleason < 6.5   to the right, agree=0.625, adj=0.182, (0 split)

Node number 26: 10 observations
  events=5,  estimated rate=1.214 , mean deviance=1.181

Node number 27: 15 observations
  events=11,  estimated rate=2.702 , mean deviance=1.291

Node number 48: 8 observations
  events=1,  estimated rate=0.4749 , mean deviance=0.6646

Node number 49: 17 observations
  events=7,  estimated rate=1.144 , mean deviance=1.289

Node number 50: 14 observations
  events=5,  estimated rate=0.8795 , mean deviance=1.337

Node number 51: 11 observations
  events=8,  estimated rate=2.183 , mean deviance=1.246

> 
> #
> # In order to compare the x-vals estimates of the mainline and S versions,
> #  it is necessary that we use stratified xval sets (like the mainline
> #  does).  
> 
> mystate <- data.frame(state.x77, region=factor(state.region))
> names(mystate) <- c("population","income" , "illiteracy","life" ,
+        "murder", "hs.grad", "frost",     "area",      "region")
> 
> xvals <- 1:nrow(mystate)
> xvals[order(mystate$income)] <- rep(1:10, length=nrow(mystate))
> 
> mystate <- data.frame(state.x77, region=factor(state.region))
> names(mystate) <- c("population","income" , "illiteracy","life" ,
+        "murder", "hs.grad", "frost",     "area",      "region")
> 
> fit4 <- rpart(income ~ population + region + illiteracy +life + murder +
+ 			hs.grad + frost , mystate,
+ 		   control=rpart.control(minsplit=10, xval=xvals))
> 
> summary(fit4)
Call:
rpart(formula = income ~ population + region + illiteracy + life + 
    murder + hs.grad + frost, data = mystate, control = rpart.control(minsplit = 10, 
    xval = xvals))

       CP nsplit rel error xerror   xstd
1 0.42831      0    1.0000 1.0157 0.2265
2 0.13514      1    0.5717 0.5948 0.1747
3 0.06458      2    0.4365 0.7005 0.1802
4 0.05485      3    0.3720 0.8606 0.2299
5 0.02479      5    0.2637 0.8289 0.2644
6 0.01940      6    0.2389 0.8196 0.2641
7 0.01394      7    0.2195 0.8196 0.2641
8 0.01000      8    0.2055 0.8117 0.2640

Node number 1: 50 observations,    complexity param=0.4283
  mean=4436, MSE=370000
  left son=2 (10 obs) right son=3 (40 obs)
  Primary splits:
      hs.grad    < 44.3  to the left,  improve=0.4283, (0 missing)
      illiteracy < 1.55  to the right, improve=0.3249, (0 missing)
      region     splits as  RLRR,      improve=0.2285, (0 missing)
      murder     < 11.55 to the right, improve=0.2012, (0 missing)
      life       < 68.9  to the left,  improve=0.1672, (0 missing)
  Surrogate splits:
      illiteracy < 1.55  to the right, agree=0.90, adj=0.5, (0 split)
      life       < 69.26 to the left,  agree=0.90, adj=0.5, (0 split)
      region     splits as  RLRR,      agree=0.88, adj=0.4, (0 split)
      murder     < 11.55 to the right, agree=0.88, adj=0.4, (0 split)
      frost      < 81    to the left,  agree=0.82, adj=0.1, (0 split)

Node number 2: 10 observations,    complexity param=0.0194
  mean=3640, MSE=66880
  left son=4 (7 obs) right son=5 (3 obs)
  Primary splits:
      population < 3990  to the left,  improve=0.53670, (0 missing)
      frost      < 55    to the left,  improve=0.30250, (0 missing)
      illiteracy < 2.2   to the right, improve=0.29240, (0 missing)
      hs.grad    < 40.8  to the right, improve=0.14540, (0 missing)
      murder     < 12.85 to the left,  improve=0.08289, (0 missing)

Node number 3: 40 observations,    complexity param=0.1351
  mean=4635, MSE=247700
  left son=6 (34 obs) right son=7 (6 obs)
  Primary splits:
      life       < 70.26 to the right, improve=0.25230, (0 missing)
      murder     < 10    to the left,  improve=0.16790, (0 missing)
      hs.grad    < 60.95 to the left,  improve=0.12350, (0 missing)
      illiteracy < 0.75  to the left,  improve=0.11520, (0 missing)
      population < 659   to the right, improve=0.08106, (0 missing)
  Surrogate splits:
      population < 613.5 to the right, agree=0.875, adj=0.167, (0 split)
      murder     < 11.2  to the left,  agree=0.875, adj=0.167, (0 split)
      hs.grad    < 64.55 to the left,  agree=0.875, adj=0.167, (0 split)

Node number 4: 7 observations
  mean=3516, MSE=38440

Node number 5: 3 observations
  mean=3929, MSE=13610

Node number 6: 34 observations,    complexity param=0.06458
  mean=4530, MSE=169000
  left son=12 (12 obs) right son=13 (22 obs)
  Primary splits:
      population < 1374  to the left,  improve=0.20790, (0 missing)
      life       < 70.41 to the left,  improve=0.19010, (0 missing)
      frost      < 33.5  to the right, improve=0.10930, (0 missing)
      illiteracy < 0.75  to the left,  improve=0.08942, (0 missing)
      hs.grad    < 59.7  to the left,  improve=0.06429, (0 missing)
  Surrogate splits:
      frost      < 152   to the right, agree=0.794, adj=0.417, (0 split)
      life       < 70.41 to the left,  agree=0.735, adj=0.250, (0 split)
      murder     < 2     to the left,  agree=0.706, adj=0.167, (0 split)
      region     splits as  RRRL,      agree=0.676, adj=0.083, (0 split)
      illiteracy < 1.85  to the right, agree=0.676, adj=0.083, (0 split)

Node number 7: 6 observations
  mean=5230, MSE=276800

Node number 12: 12 observations,    complexity param=0.05485
  mean=4276, MSE=194200
  left son=24 (3 obs) right son=25 (9 obs)
  Primary splits:
      population < 994.5 to the right, improve=0.4355, (0 missing)
      life       < 71.89 to the left,  improve=0.2955, (0 missing)
      frost      < 172.5 to the left,  improve=0.2333, (0 missing)
      illiteracy < 0.75  to the left,  improve=0.1965, (0 missing)
      murder     < 2.55  to the right, improve=0.1847, (0 missing)
  Surrogate splits:
      life < 70.47 to the left,  agree=0.833, adj=0.333, (0 split)

Node number 13: 22 observations,    complexity param=0.05346
  mean=4668, MSE=101000
  left son=26 (4 obs) right son=27 (18 obs)
  Primary splits:
      hs.grad    < 52.05 to the left,  improve=0.44500, (0 missing)
      region     splits as  RLLR,      improve=0.35050, (0 missing)
      life       < 71.57 to the left,  improve=0.12220, (0 missing)
      murder     < 5.65  to the right, improve=0.10520, (0 missing)
      population < 2981  to the left,  improve=0.06276, (0 missing)
  Surrogate splits:
      region splits as  RLRR, agree=0.864, adj=0.25, (0 split)

Node number 24: 3 observations
  mean=3772, MSE=32610

Node number 25: 9 observations
  mean=4444, MSE=135300

Node number 26: 4 observations
  mean=4219, MSE=27700

Node number 27: 18 observations,    complexity param=0.02479
  mean=4768, MSE=62360
  left son=54 (8 obs) right son=55 (10 obs)
  Primary splits:
      region     splits as  RRLR,      improve=0.4086, (0 missing)
      illiteracy < 1     to the left,  improve=0.3653, (0 missing)
      murder     < 3.05  to the left,  improve=0.1805, (0 missing)
      hs.grad    < 52.75 to the right, improve=0.1507, (0 missing)
      population < 2413  to the left,  improve=0.1427, (0 missing)
  Surrogate splits:
      illiteracy < 1     to the left,  agree=0.833, adj=0.625, (0 split)
      frost      < 108.5 to the right, agree=0.833, adj=0.625, (0 split)
      life       < 72.31 to the right, agree=0.778, adj=0.500, (0 split)
      murder     < 3.05  to the left,  agree=0.778, adj=0.500, (0 split)
      hs.grad    < 59.95 to the left,  agree=0.667, adj=0.250, (0 split)

Node number 54: 8 observations
  mean=4590, MSE=10090

Node number 55: 10 observations,    complexity param=0.01394
  mean=4911, MSE=58320
  left son=110 (7 obs) right son=111 (3 obs)
  Primary splits:
      frost      < 109   to the left,  improve=0.4423, (0 missing)
      population < 2821  to the left,  improve=0.3546, (0 missing)
      hs.grad    < 57.05 to the right, improve=0.3103, (0 missing)
      region     splits as  RL-L,      improve=0.2564, (0 missing)
      illiteracy < 1.2   to the right, improve=0.1921, (0 missing)
  Surrogate splits:
      life < 71.94 to the left,  agree=0.8, adj=0.333, (0 split)

Node number 110: 7 observations
  mean=4806, MSE=29690

Node number 111: 3 observations
  mean=5156, MSE=39140

> 
> 
> #
> # Check out xpred.rpart
> #
> meany <- mean(mystate$income)
> xpr <- xpred.rpart(fit4, xval=xvals)
> xpr2 <- (xpr - mystate$income)^2
> risk0 <- mean((mystate$income - meany)^2)
> xpmean <- as.vector(apply(xpr2, 2, mean))   #kill the names
> all.equal(xpmean/risk0, as.vector(fit4$cptable[,'xerror']))
[1] TRUE
> 
> xpstd <- as.vector(apply((sweep(xpr2, 2, xpmean))^2, 2, sum))
> xpstd <- sqrt(xpstd)/(50*risk0)
> all.equal(xpstd, as.vector(fit4$cptable[,'xstd']))
[1] TRUE
> 
> #
> # recreate subset #3 of the xval
> #
> tfit4 <- rpart(income ~ population + region + illiteracy +life + murder +
+ 			hs.grad + frost , mystate,  subset=(xvals!=3),
+ 		   control=rpart.control(minsplit=10, xval=0))
> tpred <- predict(tfit4, mystate[xvals==3,])
Warning message: 
variable 2 is not a factor in: model.frame.default(Terms, newdata, na.action = act, xlev = attr(object,  
> all.equal(tpred, xpr[xvals==3,ncol(xpr)])
[1] TRUE
> 
> # How much does this differ from the "real" formula, more complex,
> #   found on page 309 of Breiman et al. ?
> #xtemp <- (xpr2/outer(rep(1,50),xpmean)) -  ((mystate$income - meany)^2)/risk0
> #real.se<- xpmean* sqrt(apply(xtemp^2,2,sum))/(risk0*50)
> 
> fit5 <- rpart(factor(pgstat) ~  age + eet + g2+grade+gleason +ploidy,
+ 	  stagec)
> 
> fit5
node), split, n, loss, yval, (yprob)
      * denotes terminal node

 1) root 146 54 0 ( 0.6301 0.3699 )  
   2) grade<2.5 61  9 0 ( 0.8525 0.1475 ) *
   3) grade>2.5 85 40 1 ( 0.4706 0.5294 )  
     6) g2<13.2 40 17 0 ( 0.5750 0.4250 )  
      12) ploidy:diploid,tetraploid 31 11 0 ( 0.6452 0.3548 )  
        24) g2>11.845 7  1 0 ( 0.8571 0.1429 ) *
        25) g2<11.845 24 10 0 ( 0.5833 0.4167 )  
          50) g2<11.005 17  5 0 ( 0.7059 0.2941 ) *
          51) g2>11.005 7  2 1 ( 0.2857 0.7143 ) *
      13) ploidy:aneuploid 9  3 1 ( 0.3333 0.6667 ) *
     7) g2>13.2 45 17 1 ( 0.3778 0.6222 )  
      14) g2>17.91 22  8 0 ( 0.6364 0.3636 )  
        28) age>62.5 15  4 0 ( 0.7333 0.2667 ) *
        29) age<62.5 7  3 1 ( 0.4286 0.5714 ) *
      15) g2<17.91 23  3 1 ( 0.1304 0.8696 ) *
> 
> fit6 <- rpart(factor(pgstat) ~  age + eet + g2+grade+gleason +ploidy,
+ 		stagec, parm=list(prior=c(.5,.5)))
> summary(fit6)
Call:
rpart(formula = factor(pgstat) ~ age + eet + g2 + grade + gleason + 
    ploidy, data = stagec, parms = list(prior = c(0.5, 0.5)))

       CP nsplit rel error xerror    xstd
1 0.39855      0    1.0000 1.1417 0.09340
2 0.02335      1    0.6014 0.6014 0.07304
3 0.01892      5    0.5093 0.7246 0.08264
4 0.01530      7    0.4714 0.7661 0.08616
5 0.01000      9    0.4408 0.7705 0.08780

Node number 1: 146 observations,    complexity param=0.3986
  predicted class= 0  expected loss= 0.5 
     class counts:  92 54 
    probabilities:  0.5 0.5 
  left son=2 (61 obs) right son=3 (85 obs)
  Primary splits:
      grade   < 2.5   to the left,  improve=12.490, (0 missing)
      gleason < 5.5   to the left,  improve=10.510, (3 missing)
      ploidy  splits as  LRR,       improve= 9.018, (0 missing)
      g2      < 13.2  to the left,  improve= 8.281, (7 missing)
      age     < 58.5  to the right, improve= 1.520, (0 missing)
  Surrogate splits:
      gleason < 5.5   to the left,  agree=0.863, adj=0.672, (0 split)
      ploidy  splits as  LRR,       agree=0.644, adj=0.148, (0 split)
      g2      < 9.945 to the left,  agree=0.630, adj=0.115, (0 split)
      age     < 66.5  to the right, agree=0.589, adj=0.016, (0 split)

Node number 2: 61 observations,    complexity param=0.0153
  predicted class= 0  expected loss= 0.1995 
     class counts:  52  9 
    probabilities:  0.7723 0.2277 
  left son=4 (33 obs) right son=5 (28 obs)
  Primary splits:
      g2      < 11.36 to the left,  improve=3.5470, (1 missing)
      ploidy  splits as  LRL,       improve=3.2970, (0 missing)
      age     < 68.5  to the right, improve=1.2020, (0 missing)
      gleason < 5.5   to the left,  improve=0.6474, (3 missing)
      eet     < 1.5   to the left,  improve=0.1910, (1 missing)
  Surrogate splits:
      ploidy splits as  LR-,       agree=0.917, adj=0.815, (0 split)
      age    < 65.5  to the left,  agree=0.617, adj=0.148, (1 split)

Node number 3: 85 observations,    complexity param=0.02335
  predicted class= 1  expected loss= 0.3734 
     class counts:  40 45 
    probabilities:  0.3429 0.6571 
  left son=6 (40 obs) right son=7 (45 obs)
  Primary splits:
      g2      < 13.2  to the left,  improve=1.96800, (6 missing)
      ploidy  splits as  LRR,       improve=1.84100, (0 missing)
      age     < 56.5  to the right, improve=1.32300, (0 missing)
      gleason < 8.5   to the left,  improve=1.26900, (0 missing)
      eet     < 1.5   to the right, improve=0.09632, (1 missing)
  Surrogate splits:
      ploidy  splits as  LRL,       agree=0.962, adj=0.914, (6 split)
      age     < 68.5  to the right, agree=0.608, adj=0.114, (0 split)
      gleason < 6.5   to the left,  agree=0.582, adj=0.057, (0 split)

Node number 4: 33 observations
  predicted class= 0  expected loss= 0.04097 
     class counts:  32  1 
    probabilities:  0.9495 0.0505 

Node number 5: 28 observations,    complexity param=0.0153
  predicted class= 0  expected loss= 0.3862 
     class counts:  20  8 
    probabilities:  0.5947 0.4053 
  left son=10 (20 obs) right son=11 (8 obs)
  Primary splits:
      gleason < 5.5   to the left,  improve=1.1580, (1 missing)
      age     < 67.5  to the right, improve=1.1420, (0 missing)
      g2      < 14.37 to the right, improve=0.8086, (1 missing)

Node number 6: 40 observations,    complexity param=0.02335
  predicted class= 1  expected loss= 0.4563 
     class counts:  23 17 
    probabilities:  0.4426 0.5574 
  left son=12 (7 obs) right son=13 (33 obs)
  Primary splits:
      g2      < 11.85 to the right, improve=1.42400, (5 missing)
      ploidy  splits as  LLR,       improve=1.31000, (0 missing)
      gleason < 7.5   to the left,  improve=0.86340, (0 missing)
      age     < 61.5  to the right, improve=0.57940, (0 missing)
      eet     < 1.5   to the left,  improve=0.05363, (0 missing)
  Surrogate splits:
      ploidy splits as  RLR, agree=0.857, adj=0.286, (5 split)

Node number 7: 45 observations,    complexity param=0.02275
  predicted class= 1  expected loss= 0.2998 
     class counts:  17 28 
    probabilities:  0.2627 0.7373 
  left son=14 (22 obs) right son=15 (23 obs)
  Primary splits:
      g2      < 17.91 to the right, improve=4.07900, (1 missing)
      age     < 62.5  to the right, improve=0.66100, (0 missing)
      gleason < 7.5   to the left,  improve=0.16170, (0 missing)
      eet     < 1.5   to the right, improve=0.09922, (1 missing)
  Surrogate splits:
      age     < 61.5  to the right, agree=0.614, adj=0.190, (1 split)
      eet     < 1.5   to the right, agree=0.591, adj=0.143, (0 split)
      grade   < 3.5   to the right, agree=0.545, adj=0.048, (0 split)
      gleason < 6.5   to the right, agree=0.545, adj=0.048, (0 split)

Node number 10: 20 observations
  predicted class= 0  expected loss= 0.2704 
     class counts:  16  4 
    probabilities:  0.7013 0.2987 

Node number 11: 8 observations
  predicted class= 1  expected loss= 0.3967 
     class counts:  4 4 
    probabilities:  0.3699 0.6301 

Node number 12: 7 observations
  predicted class= 0  expected loss= 0.1931 
     class counts:  6 1 
    probabilities:  0.7788 0.2212 

Node number 13: 33 observations,    complexity param=0.01892
  predicted class= 1  expected loss= 0.4088 
     class counts:  17 16 
    probabilities:  0.3841 0.6159 
  left son=26 (25 obs) right son=27 (8 obs)
  Primary splits:
      g2      < 11.01 to the left,  improve=1.06300, (5 missing)
      ploidy  splits as  L-R,       improve=0.73950, (0 missing)
      gleason < 6.5   to the left,  improve=0.57220, (0 missing)
      age     < 62.5  to the right, improve=0.13320, (0 missing)
      eet     < 1.5   to the left,  improve=0.05492, (0 missing)
  Surrogate splits:
      age < 70.5  to the left,  agree=0.821, adj=0.286, (5 split)

Node number 14: 22 observations,    complexity param=0.02275
  predicted class= 0  expected loss= 0.4916 
     class counts:  14  8 
    probabilities:  0.5067 0.4933 
  left son=28 (15 obs) right son=29 (7 obs)
  Primary splits:
      age     < 62.5  to the right, improve=0.976400, (0 missing)
      g2      < 23.48 to the right, improve=0.284300, (1 missing)
      gleason < 7.5   to the left,  improve=0.003765, (0 missing)
  Surrogate splits:
      gleason < 6.5   to the right, agree=0.773, adj=0.286, (0 split)

Node number 15: 23 observations
  predicted class= 1  expected loss= 0.1035 
     class counts:   3 20 
    probabilities:  0.0809 0.9191 

Node number 26: 25 observations,    complexity param=0.01892
  predicted class= 1  expected loss= 0.4443 
     class counts:  14 11 
    probabilities:  0.4276 0.5724 
  left son=52 (17 obs) right son=53 (8 obs)
  Primary splits:
      ploidy  splits as  L-R,       improve=2.157000, (0 missing)
      gleason < 6.5   to the left,  improve=1.972000, (0 missing)
      age     < 61.5  to the right, improve=0.971800, (0 missing)
      g2      < 7.65  to the right, improve=0.404500, (4 missing)
      eet     < 1.5   to the left,  improve=0.002614, (0 missing)
  Surrogate splits:
      eet < 1.5   to the right, agree=0.72, adj=0.125, (0 split)
      g2  < 4.33  to the right, agree=0.72, adj=0.125, (0 split)

Node number 27: 8 observations
  predicted class= 1  expected loss= 0.2976 
     class counts:  3 5 
    probabilities:  0.2605 0.7395 

Node number 28: 15 observations
  predicted class= 0  expected loss= 0.3605 
     class counts:  11  4 
    probabilities:  0.6175 0.3825 

Node number 29: 7 observations
  predicted class= 1  expected loss= 0.3401 
     class counts:  3 4 
    probabilities:  0.3057 0.6943 

Node number 52: 17 observations
  predicted class= 0  expected loss= 0.3976 
     class counts:  12  5 
    probabilities:  0.5848 0.4152 

Node number 53: 8 observations
  predicted class= 1  expected loss= 0.1984 
     class counts:  2 6 
    probabilities:  0.1636 0.8364 

> #
> # Fit a classification model to the car data.
> #  Now, since Reliability is an ordered category, this model doesn't
> # make a lot of statistical sense, but it does test out some
> # areas of the code that nothing else does
> #
> 
> carfit <- rpart(Reliability ~ Price + Country + Mileage + Type,
+ 		   method='class', data=cu.summary)
> 
> summary(carfit)
Call:
rpart(formula = Reliability ~ Price + Country + Mileage + Type, 
    data = cu.summary, method = "class")

       CP nsplit rel error xerror    xstd
1 0.30508      0    1.0000 1.0000 0.07200
2 0.08475      1    0.6949 0.6949 0.07808
3 0.05085      2    0.6102 0.6780 0.07800
4 0.03390      3    0.5593 0.6271 0.07747
5 0.01000      4    0.5254 0.6102 0.07721

Node number 1: 85 observations,    complexity param=0.3051
  predicted class= average  expected loss= 0.6941 
     class counts:  18 12 26  8 21 
    probabilities:  0.2118 0.1412 0.3059 0.0941 0.2471 
  left son=2 (58 obs) right son=3 (27 obs)
  Primary splits:
      Country splits as  ---LRRLLLL, improve=15.220, (0 missing)
      Type    splits as  RLLRLL,     improve= 4.288, (0 missing)
      Price   < 11970 to the right,  improve= 3.200, (0 missing)
      Mileage < 24.5  to the left,   improve= 2.476, (36 missing)

Node number 2: 58 observations,    complexity param=0.08475
  predicted class= average  expected loss= 0.6034 
     class counts:  18 12 23  5  0 
    probabilities:  0.3103 0.2069 0.3966 0.0862 0.0000 
  left son=4 (9 obs) right son=5 (49 obs)
  Primary splits:
      Type    splits as  RRRRLR,     improve=3.187, (0 missing)
      Price   < 11230 to the left,   improve=2.564, (0 missing)
      Mileage < 24.5  to the left,   improve=1.802, (30 missing)
      Country splits as  ---L--RLRL, improve=1.329, (0 missing)

Node number 3: 27 observations
  predicted class= Much better  expected loss= 0.2222 
     class counts:   0  0  3  3 21 
    probabilities:  0.0000 0.0000 0.1111 0.1111 0.7778 

Node number 4: 9 observations
  predicted class= Much worse  expected loss= 0.2222 
     class counts:  7 0 2 0 0 
    probabilities:  0.7778 0.0000 0.2222 0.0000 0.0000 

Node number 5: 49 observations,    complexity param=0.05085
  predicted class= average  expected loss= 0.5714 
     class counts:  11 12 21  5  0 
    probabilities:  0.2245 0.2449 0.4286 0.1020 0.0000 
  left son=10 (27 obs) right son=11 (22 obs)
  Primary splits:
      Type    splits as  RLLR-L,     improve=2.880, (0 missing)
      Mileage < 24.5  to the left,   improve=2.500, (25 missing)
      Price   < 11470 to the right,  improve=2.424, (0 missing)
      Country splits as  ---R--LRLR, improve=1.027, (0 missing)
  Surrogate splits:
      Price   < 11470 to the right,  agree=0.898, adj=0.773, (0 split)
      Country splits as  ---R--RRRL, agree=0.755, adj=0.455, (0 split)

Node number 10: 27 observations
  predicted class= average  expected loss= 0.4074 
     class counts:   7  4 16  0  0 
    probabilities:  0.2593 0.1481 0.5926 0.0000 0.0000 

Node number 11: 22 observations,    complexity param=0.0339
  predicted class= worse  expected loss= 0.6364 
     class counts:  4 8 5 5 0 
    probabilities:  0.1818 0.3636 0.2273 0.2273 0.0000 
  left son=22 (14 obs) right son=23 (8 obs)
  Primary splits:
      Country splits as  ---R--LRRL, improve=1.5190, (0 missing)
      Price   < 8646  to the left,   improve=1.2720, (0 missing)
      Type    splits as  L--R--,     improve=0.1909, (0 missing)
  Surrogate splits:
      Price < 13970 to the left,  agree=0.864, adj=0.625, (0 split)

Node number 22: 14 observations
  predicted class= worse  expected loss= 0.5714 
     class counts:  4 6 1 3 0 
    probabilities:  0.2857 0.4286 0.0714 0.2143 0.0000 

Node number 23: 8 observations
  predicted class= average  expected loss= 0.5 
     class counts:  0 2 4 2 0 
    probabilities:  0.00 0.25 0.50 0.25 0.00 

> #
> # Simplest weight test: treble the weights
> #
> #  By using the unshrunken estimates the weights will nearly cancel
> #   out:  frame$wt, frame$dev, frame$yval2, and improvement will all
> #   be threefold larger; other things will be the same.
> # The improvement is in the splits matrix, column 3, rows with n>0.  Other
> #   rows are surrogate splits.
> 
> tempc <- rpart.control(maxsurrogate=0, cp=0, xval=0)
> fit1 <- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
+                 stagec, control=tempc,
+                 method='poisson', parms=list(shrink=0))
> wts <- rep(3, nrow(stagec))
> fit1b <- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
+                 stagec, control= tempc, parms=list(shrink=0),
+                 method='poisson', weights=wts)
> fit1b$frame$wt   <- fit1b$frame$wt/3
> fit1b$frame$dev  <- fit1b$frame$dev/3
> fit1b$frame$yval2<- fit1b$frame$yval2/3
> fit1b$splits[,3] <- fit1b$splits[,3]/3
> all.equal(fit1[-3], fit1b[-3])   #all but the "call"
[1] TRUE
> 
> #
> # Compare a pair of multiply weighted fits
> #  In this one, the lengths of where and y won't match
> # I have to set minsplit to the smallest possible, because otherwise
> #  the replicated data set will sometimes have enough "n" to split, but
> #  the weighted one won't.  Use of CP keeps the degenerate splits
> #  (n=2, several covariates with exactly the same improvement) at bay.
> # For larger trees, the weighted split will sometimes have fewer
> #  surrogates, because of the "at least two obs" rule.
> #
> .Random.seed <- c(53, 33, 62, 6, 45, 2, 43, 33, 16, 10, 39, 3)
> wts <- sample(1:5, nrow(stagec), replace=T)
> temp <- rep(1:nrow(stagec), wts)             #row replicates
> xgrp <- rep(1:10, length=146)[order(runif(146))]
> xgrp2<- rep(xgrp, wts)
> #  Direct: replicate rows in the data set, and use unweighted
> fit2 <- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
+ 	      control=rpart.control(minsplit=2, xval=xgrp2, cp=.025),
+ 	      data=stagec[temp,], method='poisson')
> #  Weighted
> fit2b<- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
+ 	      control=rpart.control(minsplit=2, xval=xgrp, cp=.025),
+ 	      data=stagec, method='poisson', weight=wts)
> 
> all.equal(fit2$frame[-2],  fit2b$frame[-2])  # the "n" component won't match
[1] TRUE
> all.equal(fit2$cptable,    fit2b$cptable)
[1] TRUE
> all.equal(fit2$splits[,-1],fit2b$splits[,-1]) #fails
[1] TRUE
> all.equal(fit2$splits[-24,-1],fit2b$splits[-24,-1]) #ok
[1] TRUE
> all.equal(fit2$csplit,    fit2b$csplit)
[1] TRUE
> # Line 24 is a surrogate split in a group whose 2 smallest ages are
> #  47 and 48.  The weighted fit won't split there because it wants to
> #  send at least 2 obs to the left; the replicate fit thinks that there
> #  are several 47's.
> 
> 
> 
> 
> 
> #
> # Test weights in a regression problem
> #
> 
> xgrp <- rep(1:10,5)
> fit4 <- rpart(income ~ population + region + illiteracy +life + murder +
+                         hs.grad + frost , mystate,
+                    control=rpart.control(minsplit=10, xval=xgrp))
> wts <- rep(3, nrow(mystate))
> fit4b <-  rpart(income ~ population + region + illiteracy +life + murder +
+                         hs.grad + frost , mystate,
+                    control=rpart.control(minsplit=10, xval=xgrp), weights=wts)
> fit4b$frame$wt   <- fit4b$frame$wt/3
> fit4b$frame$dev  <- fit4b$frame$dev/3
> fit4b$cptable[,5] <- fit4b$cptable[,5] * sqrt(3)
> temp <- c('frame', 'where', 'splits', 'csplit', 'cptable')
> all.equal(fit4[temp], fit4b[temp])  
[1] TRUE
> 
> 
> # Next is a very simple case, but worth keeping
> dummy <- data.frame(y=1:10, x1=c(10:4, 1:3), x2=c(1,3,5,7,9,2,4,6,8,0))
> 
> xx1 <- rpart(y ~ x1 + x2, dummy, minsplit=4, xval=0)
> xx2 <- rpart(y ~ x1 + x2, dummy, weights=rep(2,10), minsplit=4, xval=0)
> 
> all.equal(xx1$frame$dev, c(82.5, 10, 2, .5, 10, .5, 2))
[1] TRUE
> all.equal(xx2$frame$dev, c(82.5, 10, 2, .5, 10, .5, 2)*2)
[1] TRUE
> summary(xx2)
Call:
rpart(formula = y ~ x1 + x2, data = dummy, weights = rep(2, 10), 
    minsplit = 4, xval = 0)

       CP nsplit rel error
1 0.75758      0   1.00000
2 0.09091      1   0.24242
3 0.01000      3   0.06061

Node number 1: 10 observations,    complexity param=0.7576
  mean=5.5, MSE=8.25
  left son=2 (5 obs) right son=3 (5 obs)
  Primary splits:
      x1 < 5.5 to the right, improve=0.7576, (0 missing)
      x2 < 0.5 to the right, improve=0.2727, (0 missing)
  Surrogate splits:
      x2 < 2.5 to the right, agree=0.6, adj=0.2, (0 split)

Node number 2: 5 observations,    complexity param=0.09091
  mean=3, MSE=2
  left son=4 (3 obs) right son=5 (2 obs)
  Primary splits:
      x1 < 7.5 to the right, improve=0.75, (0 missing)
      x2 < 4   to the left,  improve=0.75, (0 missing)
  Surrogate splits:
      x2 < 6   to the left,  agree=1, adj=1, (0 split)

Node number 3: 5 observations,    complexity param=0.09091
  mean=8, MSE=2
  left son=6 (2 obs) right son=7 (3 obs)
  Primary splits:
      x1 < 3.5 to the right, improve=0.75, (0 missing)
      x2 < 1   to the right, improve=0.50, (0 missing)
  Surrogate splits:
      x2 < 5   to the left,  agree=0.8, adj=0.5, (0 split)

Node number 4: 3 observations
  mean=2, MSE=0.6667

Node number 5: 2 observations
  mean=4.5, MSE=0.25

Node number 6: 2 observations
  mean=6.5, MSE=0.25

Node number 7: 3 observations
  mean=9, MSE=0.6667

> 
> 
> # Now for a set of non-equal weights
> nn <- nrow(mystate)
> wts <- sample(1:5, nn, replace=T)
> temp <- rep(1:nn, wts)             #row replicates
> xgrp <- rep(1:10, length=nn)[order(runif(nn))]
> xgrp2<- rep(xgrp, wts)
> tempc <- rpart.control(minsplit=2, xval=xgrp2, maxsurrogate=0)
> #  Direct: replicate rows in the data set, and use unweighted
> fit5 <-  rpart(income ~ population + region + illiteracy +life + murder +
+                         hs.grad + frost , data=mystate[temp,], control=tempc)
> #  Weighted
> tempc <- rpart.control(minsplit=2, xval=xgrp, maxsurrogate=0)
> fit5b <-  rpart(income ~ population + region + illiteracy +life + murder +
+                         hs.grad + frost , data=mystate, control=tempc,
+                         weights=wts)
> all.equal(fit5$frame[-2],  fit5b$frame[-2])  # the "n" component won't match
[1] TRUE
> all.equal(fit5$cptable,    fit5b$cptable)
[1] TRUE
> all.equal(fit5$splits[,-1],fit5b$splits[,-1]) #fails
[1] TRUE
> all.equal(fit5$csplit,    fit5b$csplit)
[1] TRUE
> #
> # The treble test for classification trees
> #
> nn <- sum(!is.na(cu.summary$Reliability))
> xgrp <- rep(1:10,length=nn)
> carfit <- rpart(Reliability ~ Price + Country + Mileage + Type,
+ 		 method='class', data=cu.summary, 
+ 		 control=rpart.control(xval=xgrp))
> 
> carfit2 <- rpart(Reliability ~ Price + Country + Mileage + Type,
+ 		 method='class', data=cu.summary, 
+ 		 weight=rep(3,nrow(cu.summary)),
+ 		 control=rpart.control(xval=xgrp))
> 
> all.equal(carfit$frame$wt,    carfit2$frame$wt/3)
[1] TRUE
> all.equal(carfit$frame$dev,   carfit2$frame$dev/3)
[1] TRUE
> all.equal(carfit$frame$yval2, carfit2$frame$yval2/3)
[1] TRUE
> all.equal(carfit$frame[,5:10], carfit2$frame[,5:10])
[1] TRUE
> all.equal(carfit[c('where', 'csplit')],
+ 	  carfit2[c('where', 'csplit')])
[1] TRUE
> xx <- carfit2$splits
> xx[,'improve'] <- xx[,'improve'] / ifelse(xx[,1]==0,1,3)
> all.equal(xx, carfit$splits)
[1] TRUE
> all.equal(as.vector(carfit$cptable), 
+ 	  as.vector(carfit2$cptable%*% diag(c(1,1,1,1,sqrt(3)))))
[1] "Mean relative  difference: 0.03722347"
> 
> summary(carfit2)
Call:
rpart(formula = Reliability ~ Price + Country + Mileage + Type, 
    data = cu.summary, weights = rep(3, nrow(cu.summary)), method = "class", 
    control = rpart.control(xval = xgrp))

       CP nsplit rel error xerror    xstd
1 0.30508      0    1.0000 1.0000 0.04157
2 0.08475      1    0.6949 0.6949 0.04508
3 0.05085      2    0.6102 0.7458 0.04508
4 0.03390      3    0.5593 0.7288 0.04511
5 0.01000      4    0.5254 0.7966 0.04486

Node number 1: 85 observations,    complexity param=0.3051
  predicted class= average  expected loss= 0.6941 
     class counts:  54 36 78 24 63 
    probabilities:  0.2118 0.1412 0.3059 0.0941 0.2471 
  left son=2 (58 obs) right son=3 (27 obs)
  Primary splits:
      Country splits as  ---LRRLLLL, improve=45.660, (0 missing)
      Type    splits as  RLLRLL,     improve=12.860, (0 missing)
      Price   < 11970 to the right,  improve= 9.600, (0 missing)
      Mileage < 24.5  to the left,   improve= 7.429, (36 missing)

Node number 2: 58 observations,    complexity param=0.08475
  predicted class= average  expected loss= 0.6034 
     class counts:  54 36 69 15  0 
    probabilities:  0.3103 0.2069 0.3966 0.0862 0.0000 
  left son=4 (9 obs) right son=5 (49 obs)
  Primary splits:
      Type    splits as  RRRRLR,     improve=9.560, (0 missing)
      Price   < 11230 to the left,   improve=7.691, (0 missing)
      Mileage < 24.5  to the left,   improve=5.405, (30 missing)
      Country splits as  ---L--RLRL, improve=3.988, (0 missing)

Node number 3: 27 observations
  predicted class= Much better  expected loss= 0.2222 
     class counts:   0  0  9  9 63 
    probabilities:  0.0000 0.0000 0.1111 0.1111 0.7778 

Node number 4: 9 observations
  predicted class= Much worse  expected loss= 0.2222 
     class counts:  21  0  6  0  0 
    probabilities:  0.7778 0.0000 0.2222 0.0000 0.0000 

Node number 5: 49 observations,    complexity param=0.05085
  predicted class= average  expected loss= 0.5714 
     class counts:  33 36 63 15  0 
    probabilities:  0.2245 0.2449 0.4286 0.1020 0.0000 
  left son=10 (27 obs) right son=11 (22 obs)
  Primary splits:
      Type    splits as  RLLR-L,     improve=8.639, (0 missing)
      Mileage < 24.5  to the left,   improve=7.500, (25 missing)
      Price   < 11470 to the right,  improve=7.271, (0 missing)
      Country splits as  ---R--LRLR, improve=3.080, (0 missing)
  Surrogate splits:
      Price   < 11470 to the right,  agree=0.898, adj=0.773, (0 split)
      Country splits as  ---R--RRRL, agree=0.755, adj=0.455, (0 split)

Node number 10: 27 observations
  predicted class= average  expected loss= 0.4074 
     class counts:  21 12 48  0  0 
    probabilities:  0.2593 0.1481 0.5926 0.0000 0.0000 

Node number 11: 22 observations,    complexity param=0.0339
  predicted class= worse  expected loss= 0.6364 
     class counts:  12 24 15 15  0 
    probabilities:  0.1818 0.3636 0.2273 0.2273 0.0000 
  left son=22 (14 obs) right son=23 (8 obs)
  Primary splits:
      Country splits as  ---R--LRRL, improve=4.5580, (0 missing)
      Price   < 8646  to the left,   improve=3.8160, (0 missing)
      Type    splits as  L--R--,     improve=0.5727, (0 missing)
  Surrogate splits:
      Price < 13970 to the left,  agree=0.864, adj=0.625, (0 split)

Node number 22: 14 observations
  predicted class= worse  expected loss= 0.5714 
     class counts:  12 18  3  9  0 
    probabilities:  0.2857 0.4286 0.0714 0.2143 0.0000 

Node number 23: 8 observations
  predicted class= average  expected loss= 0.5 
     class counts:   0  6 12  6  0 
    probabilities:  0.00 0.25 0.50 0.25 0.00 

> 
> #
> # Treble test for class trees with 2 outcomes
> #
> fit1 <- rpart(Kyphosis ~ Age + Number + Start, data=kyphosis, 
+                control=rpart.control(maxsurrogate=0, cp=0, xval=0),
+                parms=list(prior=c(.7,.3), 
+                           loss=matrix(c(0,1,2,0),nrow=2,ncol=2)))
> wts <- rep(3, nrow(kyphosis))
> fit1b <- rpart(Kyphosis ~ Age + Number + Start, data=kyphosis, 
+                 control=rpart.control(maxsurrogate=0, cp=0, xval=0),
+ 	       weights=wts,
+                parms=list(prior=c(.7,.3), 
+                           loss=matrix(c(0,1,2,0),nrow=2,ncol=2)))
> fit1b$frame$wt   <- fit1b$frame$wt/3
> fit1b$frame$dev  <- fit1b$frame$dev/3
> fit1b$frame$yval2<- fit1b$frame$yval2/3
> fit1b$splits[,3] <- fit1b$splits[,3]/3
> all.equal(fit1[-3], fit1b[-3])   #all but the "call"
[1] TRUE
> 
> 
> # Now for a set of non-equal weights
> nn <- nrow(kyphosis)
> wts <- sample(1:5, nn, replace=T)
> temp <- rep(1:nn, wts)             #row replicates
> xgrp <- rep(1:10, length=nn)[order(runif(nn))]
> xgrp2<- rep(xgrp, wts)
> tempc <- rpart.control(minsplit=2, xval=xgrp2, maxsurrogate=0)
> #  Direct: replicate rows in the data set, and use unweighted
> fit2 <- rpart(Kyphosis ~ Age + Number + Start, data=kyphosis[temp,], 
+                control=tempc, 
+                parms=list(prior=c(.7,.3), 
+                           loss=matrix(c(0,1,2,0),nrow=2,ncol=2)))
> #  Weighted
> tempc <- rpart.control(minsplit=2, xval=xgrp, maxsurrogate=0)
> fit2b <- rpart(Kyphosis ~ Age + Number + Start, data=kyphosis, 
+                control=tempc, weights=wts,
+                parms=list(prior=c(.7,.3), 
+                           loss=matrix(c(0,1,2,0),nrow=2,ncol=2)))
> 
> all.equal(fit2$frame[-2],  fit2b$frame[-2])  # the "n" component won't match
[1] TRUE
> all.equal(fit2$cptable,    fit2b$cptable)
[1] TRUE
> all.equal(fit2$splits[,-1],fit2b$splits[,-1]) #fails
[1] "Mean relative  difference: 0.0248211"
> all.equal(fit2$csplit,    fit2b$csplit)
[1] TRUE
> q()
