# Using information criteria

## Model comparison

library(rethinking)
data(milk)
d <- milk[ complete.cases(milk), ]     # remove NA values
d$neocortex <- d$neocortex.perc / 100
dim(d)

head(d)

We will predict kcal.per.g using the predictors neocortex and the logarithm of mass. For this, we use four different models (all with flat priors):

a.start <- mean(d$kcal.per.g) sigma.start <- log( sd( d$kcal.per.g ))

m6.11 <- map(
alist(
kcal.per.g ~ dnorm( a, exp(log.sigma) )
), data=d, start=list(a=a.start, log.sigma=sigma.start) )

m6.12 <- map(
alist(
kcal.per.g ~ dnorm( mu, exp(log.sigma) ),
mu <- a + bn*neocortex
), data=d, start=list(a=a.start, bn=0, log.sigma=sigma.start)
)

m6.13 <- map(
alist(
kcal.per.g ~ dnorm( mu, exp(log.sigma) ),
mu <- a + bm*log(mass)
), data=d, start=list(a=a.start, bm=0, log.sigma=sigma.start)
)

m6.14 <- map(
alist(
kcal.per.g ~ dnorm( mu, exp(log.sigma) ),
mu <- a + bn*neocortex + bm*log(mass)
), data=d, start=list(a=a.start, bn=0, bm=0, log.sigma=sigma.start)
)

To compute the WAIC value, we can use the function WAIC() from the rethinking package:

WAIC( m6.14 )

The package also provides a handy function that computes WAIC for all models and ranks from best to worst:

set.seed(2405)
( milk.models <- compare( m6.11, m6.12, m6.13, m6.14) )
plot( milk.models, SE=TRUE, dSE=TRUE )

How can we interpret the weights (Akaike weights): here, the model m6.14 has probability of 94% to make the best predictions on new data (compared to the other three models). There’s a caveat: Uncertainty propagates to the weights as well (and there is a lot of uncertainty since we only have few observations).

As an example, consider the difference between m6.14 and m6.11, centered at 7.3 with a standard deviation of 7.29. We can compute the probability that this difference is negative, i.e. the two models are actually rank-reversed:

diff <- rnorm(1e5, 7.3, 7.29)
sum( diff < 0 ) / 1e5

## Comparing estimates

To compare parameter estimates across different models, we can use the handy function coeftab from McElreath’s package:

coeftab( m6.11, m6.12, m6.13, m6.14 )
plot( coeftab( m6.11, m6.12, m6.13, m6.14 ) )

One can use different options for coeftab_plot:

# sort by model
plot( coeftab( m6.11, m6.12, m6.13, m6.14), by.model=TRUE )
# show only certain parameters
plot( coeftab( m6.11, m6.12, m6.13, m6.14), by.model=FALSE, pars=c("bn", "bm") )    

## Model averaging

In model averaging, we use the predictions for each model and average according to their Akaike weights. Let’s compare first with the counterfactual predictions for the minimum-WAIC model m6.14, holding mass fixed.

nc.seq <- seq(from=0.5, to =0.8, length.out = 30)
d.predict <- list(
kcal.per.g = rep(0, 30),            # empty outcome
neocortex = nc.seq,                 # sequence of neocortex
mass = rep(median(d$mass), 30) # average mass (no idea where the 4.5 came from) ) pred.m6.14 <- link( m6.14, data=d.predict) mu <- apply( pred.m6.14, 2, mean ) mu.PI <- apply( pred.m6.14, 2, PI ) # plot it all plot( kcal.per.g ~ neocortex, d, col=rangi2 ) lines( nc.seq, mu, lty=2 ) lines( nc.seq, mu.PI[1, ], lty=2) lines( nc.seq, mu.PI[2, ], lty=2) Now comes the ensemble part: Average the posterior predictions, using ensemble, another handy function from rethinking: milk.ensemble <- ensemble( m6.11, m6.12, m6.13, m6.14, data=d.predict ) mu <- apply(milk.ensemble$link, 2, mean )
mu.PI <- apply( milk.ensemble$link, 2, PI ) kcal.per.g.PI <- apply(milk.ensemble$sim, 2, PI )

# same plot as before
plot( kcal.per.g ~ neocortex, d, col=rangi2 )
lines( nc.seq, mu, lty=2 )
lines( nc.seq, mu.PI[1, ], lty=2)
lines( nc.seq, mu.PI[2, ], lty=2)

# added ensemble predictions and their uncertainty
lines( nc.seq, mu )
shade( kcal.per.g.PI, nc.seq )    # yay even more uncertainty!

If we want to compute the ensemble average ourselves, we can do as follow:

# code mostly coppied from the ensemble function
n <- 1e3
ctab <- compare( m6.11, m6.12, m6.13, m6.14, sort = F)
weights <- ctab@output\$weight

L <- list( m6.11, m6.12, m6.13, m6.14 )
# for each "observation" in d.predict, link computes 1000 samples of the posterior mu
# (this for each model)
# simulate for each "observation" in d.predict an outcome using a sampled mu + sigma + dnorm
sim.list <- lapply( L, function(m) sim(m, data=d.predict, n=n))

# combine values
# for each model, we have a matrix of samples (both of mu and the outcome)
# we combine this into one matrix with 1000 samples taking from each model matrix
# rows according to their weight
idx <- round( weights * n)

idx_start <- rep(1,length(idx))
idx_end <- idx
if ( length(L)>1 )
for ( i in 2:length(idx) ) {
idx_start[i] <- min( idx_start[i-1] + idx[i-1] , n )
idx_end[i] <- min( idx_start[i] + idx[i] - 1 , n )
if ( i==length(idx) ) idx_end[i] <- n
}

} # alternatively, we could also use sample with prob = weight