Using information criteria
Model comparison
library(rethinking)
data(milk)
d <- milk[ complete.cases(milk), ] # remove NA values
d$neocortex <- d$neocortex.perc / 100
dim(d)
head(d)
We will predict kcal.per.g using the predictors neocortex and the logarithm of mass. For this, we use four different models (all with flat priors):
a.start <- mean(d$kcal.per.g)
sigma.start <- log( sd( d$kcal.per.g ))
m6.11 <- map(
alist(
kcal.per.g ~ dnorm( a, exp(log.sigma) )
), data=d, start=list(a=a.start, log.sigma=sigma.start) )
m6.12 <- map(
alist(
kcal.per.g ~ dnorm( mu, exp(log.sigma) ),
mu <- a + bn*neocortex
), data=d, start=list(a=a.start, bn=0, log.sigma=sigma.start)
)
m6.13 <- map(
alist(
kcal.per.g ~ dnorm( mu, exp(log.sigma) ),
mu <- a + bm*log(mass)
), data=d, start=list(a=a.start, bm=0, log.sigma=sigma.start)
)
m6.14 <- map(
alist(
kcal.per.g ~ dnorm( mu, exp(log.sigma) ),
mu <- a + bn*neocortex + bm*log(mass)
), data=d, start=list(a=a.start, bn=0, bm=0, log.sigma=sigma.start)
)
To compute the WAIC value, we can use the function WAIC()
from the rethinking
package:
WAIC( m6.14 )
The package also provides a handy function that computes WAIC for all models and ranks from best to worst:
set.seed(2405)
( milk.models <- compare( m6.11, m6.12, m6.13, m6.14) )
plot( milk.models, SE=TRUE, dSE=TRUE )
How can we interpret the weights (Akaike weights): here, the model m6.14 has probability of 94% to make the best predictions on new data (compared to the other three models). There’s a caveat: Uncertainty propagates to the weights as well (and there is a lot of uncertainty since we only have few observations).
As an example, consider the difference between m6.14
and m6.11
, centered at 7.3 with a standard deviation of 7.29. We can compute the probability that this difference is negative, i.e. the two models are actually rank-reversed:
diff <- rnorm(1e5, 7.3, 7.29)
sum( diff < 0 ) / 1e5
Comparing estimates
To compare parameter estimates across different models, we can use the handy function coeftab
from McElreath’s package:
coeftab( m6.11, m6.12, m6.13, m6.14 )
plot( coeftab( m6.11, m6.12, m6.13, m6.14 ) )
One can use different options for coeftab_plot
:
# sort by model
plot( coeftab( m6.11, m6.12, m6.13, m6.14), by.model=TRUE )
# show only certain parameters
plot( coeftab( m6.11, m6.12, m6.13, m6.14), by.model=FALSE, pars=c("bn", "bm") )
Model averaging
In model averaging, we use the predictions for each model and average according to their Akaike weights.
Let’s compare first with the counterfactual predictions for the minimum-WAIC model m6.14
, holding mass fixed.
nc.seq <- seq(from=0.5, to =0.8, length.out = 30)
d.predict <- list(
kcal.per.g = rep(0, 30), # empty outcome
neocortex = nc.seq, # sequence of neocortex
mass = rep(median(d$mass), 30) # average mass (no idea where the 4.5 came from)
)
pred.m6.14 <- link( m6.14, data=d.predict)
mu <- apply( pred.m6.14, 2, mean )
mu.PI <- apply( pred.m6.14, 2, PI )
# plot it all
plot( kcal.per.g ~ neocortex, d, col=rangi2 )
lines( nc.seq, mu, lty=2 )
lines( nc.seq, mu.PI[1, ], lty=2)
lines( nc.seq, mu.PI[2, ], lty=2)
Now comes the ensemble part: Average the posterior predictions, using ensemble
, another handy function from rethinking
:
milk.ensemble <- ensemble( m6.11, m6.12, m6.13, m6.14, data=d.predict )
mu <- apply(milk.ensemble$link, 2, mean )
mu.PI <- apply( milk.ensemble$link, 2, PI )
kcal.per.g.PI <- apply(milk.ensemble$sim, 2, PI )
# same plot as before
plot( kcal.per.g ~ neocortex, d, col=rangi2 )
lines( nc.seq, mu, lty=2 )
lines( nc.seq, mu.PI[1, ], lty=2)
lines( nc.seq, mu.PI[2, ], lty=2)
# added ensemble predictions and their uncertainty
lines( nc.seq, mu )
shade( mu.PI, nc.seq )
shade( kcal.per.g.PI, nc.seq ) # yay even more uncertainty!
If we want to compute the ensemble average ourselves, we can do as follow:
# code mostly coppied from the ensemble function
n <- 1e3
ctab <- compare( m6.11, m6.12, m6.13, m6.14, sort = F)
weights <- ctab@output$weight
L <- list( m6.11, m6.12, m6.13, m6.14 )
# for each "observation" in d.predict, link computes 1000 samples of the posterior mu
# (this for each model)
link.list <- lapply( L, function(m) link(m, data=d.predict, n=n))
# simulate for each "observation" in d.predict an outcome using a sampled mu + sigma + dnorm
sim.list <- lapply( L, function(m) sim(m, data=d.predict, n=n))
# combine values
# for each model, we have a matrix of samples (both of mu and the outcome)
# we combine this into one matrix with 1000 samples taking from each model matrix
# rows according to their weight
idx <- round( weights * n)
idx_start <- rep(1,length(idx))
idx_end <- idx
if ( length(L)>1 )
for ( i in 2:length(idx) ) {
idx_start[i] <- min( idx_start[i-1] + idx[i-1] , n )
idx_end[i] <- min( idx_start[i] + idx[i] - 1 , n )
if ( i==length(idx) ) idx_end[i] <- n
}
link_out <- link.list[[1]] # initiate with first model matrix
sim_out <- sim.list[[1]]
for ( i in 1:length(idx) ) {
if ( idx[i]>0 ) {
idxrange <- idx_start[i]:idx_end[i]
link_out[idxrange,] <- link.list[[i]][idxrange,]
sim_out[idxrange,] <- sim.list[[i]][idxrange,]
}
} # alternatively, we could also use sample with prob = weight