Samples of Thoughts

about data, statistics and everything in between

Using Information Critera

Corrie July 4, 2018

Using information criteria

Model comparison

library(rethinking)
data(milk)
d <- milk[ complete.cases(milk), ]     # remove NA values
d$neocortex <- d$neocortex.perc / 100
dim(d)
## [1] 17  9
head(d)
##               clade            species kcal.per.g perc.fat perc.protein
## 1     Strepsirrhine     Eulemur fulvus       0.49    16.60        15.42
## 6  New World Monkey Alouatta seniculus       0.47    21.22        23.58
## 7  New World Monkey         A palliata       0.56    29.66        23.46
## 8  New World Monkey       Cebus apella       0.89    53.41        15.80
## 10 New World Monkey         S sciureus       0.92    50.58        22.33
## 11 New World Monkey   Cebuella pygmaea       0.80    41.35        20.85
##    perc.lactose mass neocortex.perc neocortex
## 1         67.98 1.95          55.16    0.5516
## 6         55.20 5.25          64.54    0.6454
## 7         46.88 5.37          64.54    0.6454
## 8         30.79 2.51          67.64    0.6764
## 10        27.09 0.68          68.85    0.6885
## 11        37.80 0.12          58.85    0.5885

We will predict kcal.per.g using the predictors neocortex and the logarithm of mass. For this, we use four different models (all with flat priors):

a.start <- mean(d$kcal.per.g)
sigma.start <- log( sd( d$kcal.per.g ))

m6.11 <- map( 
  alist(
    kcal.per.g ~ dnorm( a, exp(log.sigma) )
  ), data=d, start=list(a=a.start, log.sigma=sigma.start) )

m6.12 <- map(
  alist(
    kcal.per.g ~ dnorm( mu, exp(log.sigma) ),
    mu <- a + bn*neocortex
  ), data=d, start=list(a=a.start, bn=0, log.sigma=sigma.start)
)

m6.13 <- map(
  alist(
    kcal.per.g ~ dnorm( mu, exp(log.sigma) ),
    mu <- a + bm*log(mass)
  ), data=d, start=list(a=a.start, bm=0, log.sigma=sigma.start)
)

m6.14 <- map(
  alist(
    kcal.per.g ~ dnorm( mu, exp(log.sigma) ),
    mu <- a + bn*neocortex + bm*log(mass)
  ), data=d, start=list(a=a.start, bn=0, bm=0, log.sigma=sigma.start)
)

To compute the WAIC value, we can use the function WAIC() from the rethinking package:

WAIC( m6.14 )
## Constructing posterior predictions

## [ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

## [1] -14.59507
## attr(,"lppd")
## [1] 12.33105
## attr(,"pWAIC")
## [1] 5.033517
## attr(,"se")
## [1] 7.774764

The package also provides a handy function that computes WAIC for all models and ranks from best to worst:

set.seed(2405)
( milk.models <- compare( m6.11, m6.12, m6.13, m6.14) )
##        WAIC pWAIC dWAIC weight   SE  dSE
## m6.14 -15.3   4.7   0.0   0.94 7.47   NA
## m6.11  -8.0   2.0   7.3   0.02 4.60 7.29
## m6.13  -7.8   3.0   7.4   0.02 5.63 5.26
## m6.12  -6.7   2.7   8.6   0.01 4.23 7.55
plot( milk.models, SE=TRUE, dSE=TRUE )

How can we interpret the weights (Akaike weights): here, the model m6.14 has probability of 94% to make the best predictions on new data (compared to the other three models). There’s a caveat: Uncertainty propagates to the weights as well (and there is a lot of uncertainty since we only have few observations).

As an example, consider the difference between m6.14 and m6.11, centered at 7.3 with a standard deviation of 7.29. We can compute the probability that this difference is negative, i.e. the two models are actually rank-reversed:

diff <- rnorm(1e5, 7.3, 7.29)
sum( diff < 0 ) / 1e5
## [1] 0.1572

Comparing estimates

To compare parameter estimates across different models, we can use the handy function coeftab from McElreath’s package:

coeftab( m6.11, m6.12, m6.13, m6.14 )
##           m6.11   m6.12   m6.13   m6.14  
## a            0.66    0.35    0.71   -1.09
## log.sigma   -1.79   -1.80   -1.85   -2.16
## bn             NA    0.45      NA    2.79
## bm             NA      NA   -0.03   -0.10
## nobs           17      17      17      17
plot( coeftab( m6.11, m6.12, m6.13, m6.14 ) )

One can use different options for coeftab_plot:

# sort by model 
plot( coeftab( m6.11, m6.12, m6.13, m6.14), by.model=TRUE )    

 # show only certain parameters
plot( coeftab( m6.11, m6.12, m6.13, m6.14), by.model=FALSE, pars=c("bn", "bm") )    

Model averaging

In model averaging, we use the predictions for each model and average according to their Akaike weights. Let’s compare first with the counterfactual predictions for the minimum-WAIC model m6.14, holding mass fixed.

nc.seq <- seq(from=0.5, to =0.8, length.out = 30)
d.predict <- list(
  kcal.per.g = rep(0, 30),            # empty outcome
  neocortex = nc.seq,                 # sequence of neocortex
  mass = rep(median(d$mass), 30)      # average mass   (no idea where the 4.5 came from)
)
pred.m6.14 <- link( m6.14, data=d.predict)
mu <- apply( pred.m6.14, 2, mean )
mu.PI <- apply( pred.m6.14, 2, PI )


# plot it all
plot( kcal.per.g ~ neocortex, d, col=rangi2 )
lines( nc.seq, mu, lty=2 )
lines( nc.seq, mu.PI[1, ], lty=2)
lines( nc.seq, mu.PI[2, ], lty=2)

Now comes the ensemble part: Average the posterior predictions, using ensemble, another handy function from rethinking:

milk.ensemble <- ensemble( m6.11, m6.12, m6.13, m6.14, data=d.predict )
## Constructing posterior predictions
## Constructing posterior predictions
## Constructing posterior predictions
## Constructing posterior predictions
mu <- apply(milk.ensemble$link, 2, mean )
mu.PI <- apply( milk.ensemble$link, 2, PI )
kcal.per.g.PI <- apply(milk.ensemble$sim, 2, PI )

# same plot as before
plot( kcal.per.g ~ neocortex, d, col=rangi2 )
lines( nc.seq, mu, lty=2 )
lines( nc.seq, mu.PI[1, ], lty=2)
lines( nc.seq, mu.PI[2, ], lty=2)

# added ensemble predictions and their uncertainty
lines( nc.seq, mu )
shade( mu.PI, nc.seq )
shade( kcal.per.g.PI, nc.seq )    # yay even more uncertainty!

If we want to compute the ensemble average ourselves, we can do as follow:

# code mostly coppied from the ensemble function
n <- 1e3
ctab <- compare( m6.11, m6.12, m6.13, m6.14, sort = F)
weights <- ctab@output$weight

L <- list( m6.11, m6.12, m6.13, m6.14 )
# for each "observation" in d.predict, link computes 1000 samples of the posterior mu 
# (this for each model)
link.list <- lapply( L, function(m) link(m, data=d.predict, n=n))
# simulate for each "observation" in d.predict an outcome using a sampled mu + sigma + dnorm
sim.list <- lapply( L, function(m) sim(m, data=d.predict, n=n))

# combine values
# for each model, we have a matrix of samples (both of mu and the outcome)
# we combine this into one matrix with 1000 samples taking from each model matrix 
# rows according to their weight
idx <- round( weights * n)

idx_start <- rep(1,length(idx))
idx_end <- idx
if ( length(L)>1 )
  for ( i in 2:length(idx) ) {
    idx_start[i] <- min( idx_start[i-1] + idx[i-1] , n )
    idx_end[i] <- min( idx_start[i] + idx[i] - 1 , n )
    if ( i==length(idx) ) idx_end[i] <- n
  }

link_out <- link.list[[1]]    # initiate with first model matrix
sim_out <- sim.list[[1]]
for ( i in 1:length(idx) ) {
  if ( idx[i]>0 ) {
    idxrange <- idx_start[i]:idx_end[i]
    link_out[idxrange,] <- link.list[[i]][idxrange,]
    sim_out[idxrange,] <- sim.list[[i]][idxrange,]
  }
} # alternatively, we could also use sample with prob = weight

Corrie

Read more posts by this author.