## ----echo = FALSE------------------------------------------------------------- knitr::opts_chunk$set( fig.width = 5 , fig.height = 3.5, fig.align = 'center' ) oldpar <- list(mar = par()$mar, mfrow = par()$mfrow) ## ----------------------------------------------------------------------------- library(classmap) ## ----------------------------------------------------------------------------- data(TopGear, package = "robustHD") cars = TopGear; rm(TopGear) dim(cars) # [1] 297 32 rownames(cars) = paste(cars[,1],cars[,2]) # Now the rownames are make and model of the cars. colnames(cars) colnames(cars)[4:17] = c("fuel","price","cyl","displ","drive", "hp","torque","accel","topspeed", "MPG","weight","length","width", "height") summary(cars$accel) # zero minimum is impossible rownames(cars)[which(cars$accel == 0)] # [1] "Citroen C5 Tourer" "Ford Mondeo" "Lotus Elise" # [4] "Renault Twizy" "Ssangyong Rodius" cars$accel[cars$accel == 0] = NA summary(cars$accel) # OK cars$weight[order(cars$weight)[1:4]] rownames(cars)[order(cars$weight)[1:4]] cars$weight[cars$weight == 210] = NA summary(cars$weight) # OK # save(cars, file = "Topgear.RData") # load("Topgear.RData") ## ----------------------------------------------------------------------------- summary(cars$topspeed) # in mph summary(cars$displ) # in cc summary(cars$length) # in mm cars$length = cars$length/1000 # in meters fit = lm(hp ~ topspeed + length + displ, data = cars) summary(fit) predsplot(fit, main = "Top Gear data") fact = 0.51 width = fact*10; height = fact*8 maxnchar = 6 main = "Top Gear data" pdf(file = "topgear_numerical_predsplot.pdf", width = width, height = height) predsplot(fit, main=main, maxchar.level = maxnchar) dev.off() predscor(fit, sort.by.stdev = FALSE) # fact = 0.6 # width = fact*10; height = fact*8 # pdf(file = "topgear_numerical_predscor.pdf", # width = height, height = height) # predscor(fit, sort.by.stdev = FALSE) # dev.off() ## ----------------------------------------------------------------------------- cars$GPM = 1/cars$MPG fit4 = lm(GPM ~ accel + drive + weight + fuel, data = cars) summary(fit4) ## Figure 2: ## fact = 0.52 width = fact*10; height = fact*8 maxnchar = 6 main = "Top Gear data" # pdf(file = "topgear_4_predsplot.pdf", # width = width, height = height) predsplot(fit4, main=main, maxchar.level = maxnchar) # dev.off() ## ----------------------------------------------------------------------------- ## # fact = 0.6 # width = fact*10; height = fact*8 # pdf(file = "topgear_4_predscor.pdf", # width = height, height = height) predscor(fit4, sort.by.stdev = FALSE) # dev.off() car = "Bentley Continental" # pdf(file = "topgear_4_predsplot_1_dens.pdf", # width = width, height = height) predsplot(fit4, main = main, casetoshow=car, displaytype = "density", maxchar.level = maxnchar, xlab = paste0("prediction for ",car)) # dev.off() car = "Kia Rio" # pdf(file = "topgear_4_predsplot_2_stairs.pdf", # width = width, height = height) predsplot(fit4, main = main, casetoshow=car, staircase = TRUE, maxchar.level = maxnchar, xlab = paste0("prediction for ",car)) # dev.off() ## ----------------------------------------------------------------------------- summary(cars$AlarmSystem) cars$alarm = (cars$AlarmSystem == "standard") summary(cars$alarm) summary(cars$SatNav) cars$navig = "y" cars$navig[cars$SatNav == "no"] = "n" summary(cars$navig) table(cars$navig) class(cars$navig) str(cars) fitcombin = lm(1/MPG ~ accel + log(weight) + accel:torque + alarm + navig, data = cars) summary(fitcombin) fact = 0.45 width = fact*10; height = fact*8 main = "Example with transformation, interaction, logical, character" # pdf(file = "topgear_logical+char_predsplot.pdf", width = width, height = height) predsplot(fitcombin, main=main) # dev.off() fact = 0.6 width = fact*10; height = fact*8 # pdf(file = "topgear_logical+char_predscor.pdf", # width = height, height = height) predscor(fitcombin) # dev.off() ## ----------------------------------------------------------------------------- data(data_titanic, package = "classmap") titanic = data_titanic; rm(data_titanic) dim(titanic) # 1309 13 # A data frame with 1309 observations on the following variables. # passengerId: a unique identifier for each passenger. # pclass: travel class of the passenger. # name: name of the passenger. # sex: sex of the passenger. # age: age of the passenger. # sibsp: number of siblings and spouses traveling with the passenger. # parch: number of parents and children traveling with the passenger. # ticket: ticket number of the passenger. # fare: fare paid for the ticket. # Cabin: cabin number of the passenger. # embarked: Port of embarkation. Takes the values # C (Cherbourg), Q (Queenstown) and # S (Southampton). # y: factor indicating casualty or survivor. # dataType: vector taking the values “train” or “test” # indicating whether the observation belongs # to training or test data. # colnames(titanic) = c("passengerId","pclass","name","sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "survival", "dataType") str(titanic) unique(titanic$pclass) # 3 1 2 titanic$pclass = factor(titanic$pclass, levels = unique(titanic$pclass)) # unique(titanic$sex) titanic$sex = factor(titanic$sex, levels = unique(titanic$sex) ) titanic$sex = factor(titanic$sex, labels = c("M","F")) head(titanic) # save(titanic, file="Titanic.RData") # load("Titanic.RData") ## ----------------------------------------------------------------------------- fit5 <- glm(survival ~ sex + age + sibsp + parch + pclass, family=binomial, data = titanic) summary(fit5) ## ----------------------------------------------------------------------------- fact = 0.5 width = fact*10; height = fact*8 main = "Titanic data" # pdf(file = "titanic_5_predsplot_dens.pdf", # width = width, height = height) predsplot(fit5, main=main, displaytype = "density") # The glm fit used the logit link function. # dev.off() # With other options for density estimation: predsplot(fit5, main=main, displaytype = "density", bw = "SJ", adjust = 0.5) ## ----------------------------------------------------------------------------- # pdf(file = "titanic_5_predsplot_1.pdf", # width = width, height = height) predsplot(fit5, main = main, casetoshow=1) # dev.off() # pdf(file = "titanic_5_predsplot_2_stairs.pdf", # width = width, height = height) predsplot(fit5, main = main, casetoshow=2, staircase = TRUE) # dev.off() # fact = 0.6 # width = fact*10; height = fact*8 # pdf(file = "titanic_5_predscor.pdf", # width = height, height = height) predscor(fit5, sort.by.stdev = FALSE) # dev.off() ## ----------------------------------------------------------------------------- data(german.credit, package = "fairml") credit = german.credit; rm(german.credit) dim(credit) # 1000 21 str(credit) # # Shorten variable names for plotting: colnames(credit) <- c("checking","months","history","purpose","amount","savings","employment","rate","guarantors","residence","property","age","inst_plans","housing","nloans","job","nclients","phone","foreign","credit","sex") # # Give factors sex and purpose shorter labels for plotting: credit$sex = factor(as.numeric(credit$sex), labels = c("F","M")) levels(credit$purpose) # [10] "retrainin" levels(credit$purpose) = c("business","n.car","u.car","appliances","education","furniture","other","TV","repairs","retraining") # save(credit, file = "German_credit.RData") # load("German_credit.RData") ## ----------------------------------------------------------------------------- fit7 <- glm(credit ~ months + purpose + amount + rate + age + nclients + sex, family=binomial, data = credit) ## ----------------------------------------------------------------------------- main = "German credit data" # fact = 0.48 # width = fact*10; height = fact*8 # pdf(file = "germancredit_7_predsplot.pdf", # width = width, height = height) predsplot(fit7, main = main) # dev.off() ## ----------------------------------------------------------------------------- # fact = 0.48 # width = fact*10; height = fact*8 # pdf(file = "germancredit_7_predsplot_1.pdf", # width = width, height = height) predsplot(fit7, main = main, casetoshow=1, displaytype = "density") # dev.off() ## ----------------------------------------------------------------------------- # pdf(file = "germancredit_7_predsplot_2_stairs.pdf", # width = width, height = height) predsplot(fit7, main = main, casetoshow=2, staircase = TRUE) # dev.off() ## ----------------------------------------------------------------------------- # fact = 0.6 # width = fact*10; height = fact*8 # pdf(file = "germancredit_7_predscor_equalsizes.pdf", # width = height, height = height) predscor(fit7, sort.by.stdev = FALSE, cell.length = "equal") # dev.off() # # pdf(file = "germancredit_7_predscor.pdf", # width = height, height = height) predscor(fit7, sort.by.stdev = FALSE) # dev.off() ## ----------------------------------------------------------------------------- ## Prediction for a new case: newc = list("u.car", 36, 2, 6000, 55, "F", 1) names(newc) = c("purpose","months","rate","amount", "age","sex","nclients") # fact = 0.48 # width = fact*10; height = fact*8 # pdf(file = "germancredit_7_predsplot_new_stairs.pdf", # width = width, height = height) predsplot(fit7, main = main, casetoshow = newc, staircase = TRUE) # dev.off() ## Figure with profile: # pdf(file = "germancredit_7_predsplot_1_dens_profile.pdf", # width = width, height = height) predsplot(fit7, main = main, casetoshow=1, displaytype = "density", profile = TRUE) # dev.off() ## ----------------------------------------------------------------------------- credit$x1 = credit$months + credit$nclients credit$x2 = credit$months - credit$nclients cor(credit$x1,credit$x2) # 0.998199 # But minus that for prediction terms! fitart <- glm(credit ~ x1 + x2 + purpose + amount + rate + age + sex, family=binomial, data = credit) ## Figure 7: ## # fact = 0.48 # width = fact*10; height = fact*8 # main = "German credit data with artificial variables x1 and x2" # pdf(file = "germancredit_7_artif_predsplot.pdf", # width = width, height = height) predsplot(fitart, main = main) # dev.off() ## Figure 8: ## # fact = 0.6 # width = fact*10; height = fact*8 # pdf(file = "germancredit_7_artif_predscor.pdf", # width = height, height = height) predscor(fitart, sort.by.stdev = FALSE) # dev.off()