Data from a small experiment

Treatment: 4 diet groups
Response: Coagulation time
The Data–

group1	group2	group3	group4
62	63	68	56
60	67	66	62
63	71	71	60
59	64	67	61
	65	68	63
	66	68	64
			63
			59

The Bayesian hierarchical model

Model:

\[\large{ \begin{eqnarray} y_{ij} &\sim& N(\theta_j,\sigma^2) \\ \theta_j &\sim& N(\mu,\tau^2) \\ (\mu,\text{log}\sigma,\tau) &\sim& \text{uniform prior} \end{eqnarray} }\]

Notation:

\(\large{y_{ij},i=1,\ldots,n_j,\; j=1,\ldots,J}\): coagulation time for the \(\large{i}\)-th animal in the \(\large{j}\)-th group.

\(\large{\theta_j,j=1,\ldots,J}\): mean for the \(\large{j}\)-th treatment group.

Let’s apply MCMC to fit the model !

Gibbs Sampler
Metropolis algorithm
Softwares: JAGS, Stan

Gibbs Sampler: choose starting points

\(\large{\theta_j}:\) choose overdispersed starting points by simply taking random points from group j.
\(\large{\mu}:\) take the average of \(\large{\theta_j}\)
\(\large{\tau, \sigma}:\) don’t need; can be drawn as the first steps in the Gibbs sampler.

Gibbs Sampler: determine conditional distributions

\(\large{\theta_j | \mu,\sigma,\tau,y \sim N(\hat{\theta_j},V_{\theta_j})}\), where \(\large{\hat{\theta_j}=\frac{\frac{\mu}{\tau^2}+\frac{n_j}{\sigma^2}\bar{y}_{.j}}{\frac{1}{\tau^2}+\frac{n_j}{\sigma^2}}}\), \(\large{V_{\theta_j}=\frac{1}{\frac{1}{\tau^2}+\frac{n_j}{\sigma^2}}}\)
\(\large{\mu|\theta,\sigma,\tau,y \sim N(\hat{\mu},\tau^2/J)}\), where \(\large{\hat{\mu}=\frac{1}{J}\sum_{j=1}^J \theta_j}\)
\(\large{\sigma|\theta,\mu,\tau,y \sim \text{Inv-}\chi^2(n,\hat{\sigma}^2)}\), where \(\large{\hat{\sigma}^2=\frac{1}{n}\sum_{j=1}^{J}\sum_{i=1}^{n_j} (y_{ij}-\theta_j)^2}\)
\(\large{\tau^2|\theta,\mu,\sigma,y \sim \text{Inv-}\chi^2(J-1,\hat{\tau}^2)}\), where \(\large{\hat{\tau}^2=\frac{1}{J-1}\sum_{j=1}^J (\theta_j-\mu)^2}\)

Gibbs Sampler: the code

library(geoR) #Generate the (scaled) inverse chi-squared random numbers
gibbs=function(N,nthin,ndiscard,nchains){ 
  ## N: number of total iterations per chain
  ## nthin: thinning rate--thin the sequences by keeping every nthin-th simulation draw
  ## ndiscard: number of iterations to discard at the beginning
  ## nchains: number of Markov chains 
  arr=array(NA,dim = c(N,nparams,nchains),dimnames = list(c(1:N),
                                                          c(paste('theta',1:4,sep = ""),
                                                            "mu","sigma","tau"),
                                                          paste('chain',1:nchains,sep = ""))) 
  ## arr: save sampling result
  
  #Starting points
  theta=aggregate(y~group,data = dat,FUN = sample,nchains,replace=T)[,2]
  mu=colMeans(theta)
  
  #Gibbs sampling
  for(i in 1:N){
    #sigma
    sigma2_hat=colSums((y-matrix(rep(theta,times=rep(as.vector(table(group)),nchains)),
                                ncol = nchains))^2)/n
    sigma2=rinvchisq(nchains,df=n,scale = sigma2_hat)
    
    #tau
    tau2_hat=colSums((theta-mu)^2)/(J-1)
    tau2=rinvchisq(nchains,df=J-1,scale = tau2_hat)
    
    #theta
    theta_hat=aggregate(y~group,data = dat,
                        FUN = function(y){
                          num=mu/tau2+sum(y)/sigma2
                          den=1/tau2+length(y)/sigma2
                          return(num/den)
                        })[,2]
    V=aggregate(y~group,data = dat,
                FUN = function(y){
                  return(1/(1/tau2+length(y)/sigma2))
                })[,2]
    theta=rnorm(length(theta_hat[,1]),mean = theta_hat[,1],sd=sqrt(V[,1]))
    for(ch in 2:nchains){
      theta=cbind(theta,
                  rnorm(length(theta_hat[,ch]),mean = theta_hat[,ch],sd=sqrt(V[,ch])))
    }
   # theta=matrix(rnorm(J*nchains,mean = theta_hat,sd=sqrt(V)),ncol = nchains)
    
    #mu
    mu_hat=colMeans(theta)
    mu=rnorm(nchains,mean = mu_hat,sd=sqrt(tau2/J))
    
    #write down the i iteration
    arr[i,,]=rbind(theta,mu,sqrt(sigma2),sqrt(tau2))
  }
  #discard and thin
  arr1=arr[(ndiscard+1):N,,]
  arr2=arr[seq(1,(N-ndiscard),by=nthin),,]
  return(list(arr=arr,arr_discard=arr1,arr_thin=arr2))
}

set.seed(222)
n=length(y);J=4;nparams=7 
N=1500;nthin=1;nchains=5
samples_g=gibbs(N,nthin,ndiscard=N/2,nchains)

Gibbs Sampler: make inference

summary statistics

apply(samples_g$arr_thin,2,FUN = mean)

##    theta1    theta2    theta3    theta4        mu     sigma       tau 
## 61.226810 65.928245 67.817807 61.124255 64.147440  2.489103  8.753039

apply(samples_g$arr_thin,2,FUN = sd)

##    theta1    theta2    theta3    theta4        mu     sigma       tau 
## 1.7697192 1.3031052 1.4384974 1.1344045 6.3882468 0.8963711 9.0296164

apply(samples_g$arr_thin,2,FUN = quantile)

##         theta1    theta2    theta3    theta4         mu     sigma
## 0%    56.61572  61.26747  29.90920  48.87796   6.088071  1.486691
## 25%   60.31920  65.26394  67.14252  60.52523  62.025483  2.176331
## 50%   61.21053  65.92861  67.85173  61.11924  64.062131  2.415885
## 75%   62.06461  66.58551  68.49813  61.68879  66.113474  2.719351
## 100% 124.02273 114.74153 115.07477 102.41647 186.983664 48.232329
##              tau
## 0%     0.7070118
## 25%    4.0747481
## 50%    6.3479288
## 75%   10.1655766
## 100% 225.4439367

density curve

for(i in 1:nparams){
  plot(density(as.vector(samples_g$arr_thin[,i,])),main = dimnames(samples_g$arr_thin)[[2]][i])
}

Gibbs Sampler: assess convergence

trace plots

arr_thin_g=samples_g$arr_thin

for(i in 1:nparams){
  plot(seq(N/2+1,N,nthin),arr_thin_g[,i,1],type = 'l',ylab=dimnames(arr_thin_g)[[2]][i],xlab = "iteration")
  for(chain in 2:nchains){
    lines(seq(N/2+1,N,nthin),arr_thin_g[,i,chain],type='l',col=chain)
  }
}

Potential scale reduction factor

arr1_g=samples_g$arr_discard
mat=arr1_g[,,1]
for(i in 2:dim(arr1_g)[3]){
  mat=rbind(mat,arr1_g[,,i])
}
mat=as.data.frame(mat)
chain=factor(rep(1:(2*nchains),each=dim(arr1_g)[1]/2))
Rhat=apply(mat,2,FUN = function(y){
  B=anova(aov(y~chain))$`Mean Sq`[1]
  W=anova(aov(y~chain))$`Mean Sq`[2]
  nsim=dim(arr1_g)[1]/2
  var_hat=(nsim-1)*W/nsim+B/nsim
  R_hat=sqrt(var_hat/W)
  return(R_hat)
})
Rhat

##    theta1    theta2    theta3    theta4        mu     sigma       tau 
## 1.0021045 1.0004658 1.0017665 1.0001103 0.9995815 1.0007163 1.0030033

Effective number of indepedent simulation draws

n_eff=apply(mat,2,FUN = function(y){
  B=anova(aov(y~chain))$`Mean Sq`[1]
  W=anova(aov(y~chain))$`Mean Sq`[2]
  nsim=dim(arr1_g)[1]/2
  m=nchains*2
  var_hat=(nsim-1)*W/nsim+B/nsim
  neff=min(m*nsim*var_hat/B,m*nsim)
  return(neff)
})
n_eff

##   theta1   theta2   theta3   theta4       mu    sigma      tau 
## 1459.573 2781.563 1617.868 3464.217 3750.000 2442.674 1158.706

Metropolis algorithm: choose starting points

\(\large{\theta_j}:\) choose overdispersed starting points by simply taking random points from group j.
\(\large{\mu}:\) take the average of \(\large{\theta_j}\)
\(\large{\sigma^2=\frac{1}{J}\sum_{j=1}^J \sum_{i=1}^{n_j} (y_{ij}-\bar{y}_{.j})^2/(n_j-1)}\)
\(\large{\tau^2=\sum_{j=1}^J (\theta_j-\bar{\theta})^2/(J-1)}\)

Metropolis algorithm: determine jumping rules

Work more efficiently in a lower-dimensional space: \(\large{(\mu,\sigma,\tau)}\)
Taking advantage of the conjugacy: compute \(\large{p(\mu,\text{log}\sigma,\text{log}\tau|y)}\)
Steps–
1. Use Metroplis algorithm to jump through the marginal posterior distrubtion of \(\large{(\mu,\text{log}\sigma,\text{log}\tau)}\);
2. Draw \(\large{\theta}\) from its normal conditional posterior distribution.
Efficient jumping rules: \(\large{J(\zeta^\star|\zeta^{t-1})=N(\zeta^\star|\zeta^{t-1},c^2\Sigma)}\), where \(\large{\zeta=(\mu,\text{log}\sigma,\text{log}\tau)}\), \(\large{d=\text{number of parameters}=3}\), and \(\large{\Sigma}\) equals that of a normal approximation, i.e.,

\[\large{\Sigma=\large[ -\frac{d^2 \text{log}p(\zeta|y)}{d \zeta^2} \large]_{\zeta=\hat{\zeta}}^{-1}}\]

Three things we need–

The marginal posterior distribution \[\large{\begin{eqnarray} p(\zeta|y)=p(\mu,\text{log}\sigma,\text{log}\tau |y) &=& \frac{p(\theta,\mu,\text{log}\sigma,\text{log}\tau |y)}{p(\theta|\mu,\text{log}\sigma,\text{log}\tau ,y)} \\ &\propto& \tau \prod_{j=1}^J N(\hat{\theta}|\mu,\tau^2)\prod_{j=1}^J\prod_{i=1}^{n_j} N(y_{ij}|\hat{\theta},\sigma^2) \prod_{j=1}^J V_{\theta_j}^{1/2} \end{eqnarray}},\]

where \(\large{\hat{\theta_j}=\frac{\frac{\mu}{\tau^2}+\frac{n_j}{\sigma^2}\bar{y}_{.j}}{\frac{1}{\tau^2}+\frac{n_j}{\sigma^2}}}\), \(\large{V_{\theta_j}=\frac{1}{\frac{1}{\tau^2}+\frac{n_j}{\sigma^2}}}\)

The second derivatives of the log marginal posterior density– approximate them numerically using finite differences:

\[\large{\frac{d^2 L(\zeta)}{d\zeta_i d\zeta_j} \approx [L(\zeta+\delta_i e_i+\delta_j e_j)-L(\zeta-\delta_i e_i+\delta_j e_j)-L(\zeta+\delta_i e_i-\delta_j e_j)+L(\zeta-\delta_i e_i-\delta_j e_j)]/ (4\delta_i \delta_j)}\]

\(\large{\hat{\zeta}}\) is the mode of the marginal posterior distribution – Use EM algorithm to find:
- Starting points: the crude estimate
- E-step: \[\large{E_{old}((\theta_j-\mu)^2)=(\hat{\theta}_j-\mu)^2+V_{\theta_j}}\] \[\large{E_{old}((y_{ij}-\theta_j)^2)=(y_{ij}-\hat{\theta}_j)^2+V_{\theta_j}}\]
- M-step： \[\large{ \begin{eqnarray} \mu^{new} &=& \frac{1}{J}\sum_{j=1}^J \hat{\theta}_j \\ \sigma^{new} &=& \large(\frac{1}{n}\sum_{j=1}^{J}\sum_{i=1}^{n_j} ((y_{ij}-\hat{\theta}_j)^2+V_{\theta_j}) \large)^{1/2} \\ \tau^{new} &=& \large(\frac{1}{J-1}\sum_{j=1}^{J} ((\hat{\theta}_j-\mu^{new})^2+V_{\theta_j})\large)^{1/2} \end{eqnarray} }\]

Metropolis algorithm: calculate acceptance ratio

the acceptance ratio–

\[\large{r=\frac{p(\mu^\star,\text{log}\sigma^\star,\text{log}\tau^\star |y)}{p(\mu^{t-1},\text{log}\sigma^{t-1},\text{log}\tau^{t-1} |y)}}\]

Metropolis algorithm: the code

library(MASS)

##Use EM algorithm to find the marginal posterior mode
  #crude estimates as starting points
  theta_new=tapply(y,group,mean);mu_new=mean(theta_new)
  sigma2_new=mean(tapply(y,group,var));tau2_new=var(theta_new)
  
  theta_old=0;mu_old=0;sigma2_old=0;tau2_old=0
  
  niter=0
  
  while( (abs(mu_old-mu_new)>1e-04 | abs(sigma2_old-sigma2_new)>1e-06 | abs(tau2_old-tau2_new)>1e-04 ) & niter<10000){
    theta_old=theta_new;mu_old=mu_new;sigma2_old=sigma2_new;tau2_old=tau2_new
    
    theta_hat_m=aggregate(y~group,data = dat,
                        FUN = function(y){
                          num=mu_old/tau2_old+sum(y)/sigma2_old
                          den=1/tau2_old+length(y)/sigma2_old
                          return(num/den)
                        })[,2]
    V_m=aggregate(y~group,data = dat,
                FUN = function(y){
                  return(1/(1/tau2_old+length(y)/sigma2_old))
                })[,2]
    
    mu_new=mean(theta_hat_m)
    sigma2_new= (sum((y-rep(theta_hat_m,times=as.vector(table(group))) )^2)
               +sum(as.vector(table(group))*V_m) )/n
    tau2_new= sum((theta_hat_m-mu_new)^2+V_m)/(J-1)
    
    niter=niter+1
    
  }
  
  mu_mode=mu_new;sigma2_mode=sigma2_new;tau2_mode=tau2_new
  
  ##Approximate the second derivatives of the log marginal posterior distribution numerically
  delta=1e-04
  L=function(mu,log_sigma,log_tau){
    sigma2=exp(log_sigma)^2
    tau2=exp(log_tau)^2
    theta_hat=aggregate(y~group,data = dat,
                        FUN = function(y){
                          num=mu/tau2+sum(y)/sigma2
                          den=1/tau2+length(y)/sigma2
                          return(num/den)
                        })[,2]
    V=aggregate(y~group,data = dat,
                FUN = function(y){
                  return(1/(1/tau2+length(y)/sigma2))
                })[,2]
    L=log_tau+sum(dnorm(theta_hat,mu,sqrt(tau2),log=T))+sum(dnorm(y,
        mean = rep(theta_hat,times=as.vector(table(group))),sd=sqrt(sigma2),log=T))
    +sum(0.5*log(V))
    
    return(L)
  }
  #mu denote by 1; log_sigma denote by 2; log_tau denote by 3
  H11=(L(mu_mode+delta*2,log(sigma2_mode)*0.5,log(tau2_mode)*0.5)
      -2*L(mu_mode,log(sigma2_mode)*0.5,log(tau2_mode)*0.5)
      +L(mu_mode-delta*2,log(sigma2_mode)*0.5,log(tau2_mode)*0.5) )/(4*delta^2)
  
  H22=(L(mu_mode,log(sigma2_mode)*0.5+delta*2,log(tau2_mode)*0.5)
      -2*L(mu_mode,log(sigma2_mode)*0.5,log(tau2_mode)*0.5)
      +L(mu_mode,log(sigma2_mode)*0.5-delta*2,log(tau2_mode)*0.5) )/(4*delta^2)
  
  H33=(L(mu_mode,log(sigma2_mode)*0.5,log(tau2_mode)*0.5+delta*2)
      -2*L(mu_mode,log(sigma2_mode)*0.5,log(tau2_mode)*0.5)
      +L(mu_mode,log(sigma2_mode)*0.5,log(tau2_mode)*0.5-delta*2) )/(4*delta^2)
  
  H12=(L(mu_mode+delta,log(sigma2_mode)*0.5+delta,log(tau2_mode)*0.5)
      -L(mu_mode+delta,log(sigma2_mode)*0.5-delta,log(tau2_mode)*0.5)
      -L(mu_mode-delta,log(sigma2_mode)*0.5+delta,log(tau2_mode)*0.5)
      +L(mu_mode-delta,log(sigma2_mode)*0.5-delta,log(tau2_mode)*0.5) )/(4*delta^2)
  
  H13=(L(mu_mode+delta,log(sigma2_mode)*0.5,log(tau2_mode)*0.5+delta)
      -L(mu_mode+delta,log(sigma2_mode)*0.5,log(tau2_mode)*0.5-delta)
      -L(mu_mode-delta,log(sigma2_mode)*0.5,log(tau2_mode)*0.5+delta)
      +L(mu_mode-delta,log(sigma2_mode)*0.5,log(tau2_mode)*0.5-delta) )/(4*delta^2)
  
  H23=(L(mu_mode,log(sigma2_mode)*0.5+delta,log(tau2_mode)*0.5+delta)
      -L(mu_mode,log(sigma2_mode)*0.5+delta,log(tau2_mode)*0.5-delta)
      -L(mu_mode,log(sigma2_mode)*0.5-delta,log(tau2_mode)*0.5+delta)
      +L(mu_mode,log(sigma2_mode)*0.5-delta,log(tau2_mode)*0.5-delta) )/(4*delta^2)
    
  ##The covariance matrix for the normal jumping kernel
  H1=matrix(c(H11,H12,H13,H12,H22,H23,H13,H23,H33),ncol = 3)
  H=solve(H1)*(-1)


metroplis=function(N,nthin,ndiscard,nchains){
  arr=array(NA,dim = c(N,nparams,nchains),dimnames = list(c(1:N),
                                                          c(paste('theta',1:4,sep = ""),
                                                            "mu","sigma","tau"),
                                                          paste('chain',1:nchains,sep = "")
  )) 
  
  ##Overdispersed starting points for Metropolis algorithm
  theta=aggregate(y~group,data = dat,FUN = sample,nchains,replace=T)[,2]
  mu=colMeans(theta)
  sigma2=rep(mean(tapply(y,group,FUN = var)),nchains)
  tau2=apply(theta,2,var)
  
  ##Metropolis algorithm
  for(i in 1:N){
    
    ##Sample proposal params from the jumping distribution
    d=3
    params_star=mvrnorm(1,mu=c(mu[1],log(sigma2[1])*0.5,log(tau2[1])*0.5),
                        Sigma = H)
    for(ch in 2:nchains){
      params_star=rbind(params_star,
                        mvrnorm(1,mu=c(mu[ch],log(sigma2[ch])*0.5,log(tau2[ch])*0.5),
                        Sigma = H))
    }
    mu_star=params_star[,1]
    log_sigma_star=params_star[,2];sigma2_star=exp(log_sigma_star)^2
    log_tau_star=params_star[,3];tau2_star=exp(log_tau_star)^2
    
    
    ##Calculate the ratio for acceptance
    log_num=L(mu_star[1],log(sigma2_star[1])*0.5,log(tau2_star[1])*0.5)
    for(ch in 2:nchains){
      log_num=c(log_num,L(mu_star[ch],log(sigma2_star[ch])*0.5,log(tau2_star[ch])*0.5))
    }
   
    log_den=L(mu[1],log(sigma2[1])*0.5,log(tau2[1])*0.5)
    for(ch in 2:nchains){
      log_den=c(log_den,L(mu[ch],log(sigma2[ch])*0.5,log(tau2[ch])*0.5))
    }
    
    #acceptance ratio 
    r=exp(log_num-log_den)
    
    
    ##Generate new (mu, log_sigma, log_tau )
    #binary random number with success probability equals the acceptance ratio
    accept=rbinom(nchains,size = 1,prob = (r>=1)*1+(r<1)*r )
    
    #new mu
    mu=rowSums(cbind(mu,mu_star)*cbind(1-accept,accept))
    
    #new sigma
    sigma2=rowSums(cbind(sigma2,sigma2_star)*cbind(1-accept,accept))
    
    #new tau
    tau2=rowSums(cbind(tau2,tau2_star)*cbind(1-accept,accept))
    
    ##Sample new theta from its conditional posterior distribution (same as Gibbs)
    theta_hat=aggregate(y~group,data = dat,
                        FUN = function(y){
                          num=mu/tau2+sum(y)/sigma2
                          den=1/tau2+length(y)/sigma2
                          return(num/den)
                        })[,2]
    V=aggregate(y~group,data = dat,
                FUN = function(y){
                  return(1/(1/tau2+length(y)/sigma2))
                })[,2]
    theta=matrix(rnorm(J*nchains,mean = theta_hat,sd=sqrt(V)),ncol = nchains)
    
    ##write down the i iteration
    arr[i,,]=rbind(theta,mu,sqrt(sigma2),sqrt(tau2))
  }
  #discard and thin
  arr1=arr[(ndiscard+1):N,,]
  arr2=arr[seq(1,(N-ndiscard),by=nthin),,]
  return(list(arr=arr,arr_discard=arr1,arr_thin=arr2))
}  

set.seed(222)
n=length(y);J=4;nparams=7 
N=1500;nthin=1;nchains=5;ndiscard=N/2
samples_m=metroplis(N,nthin,ndiscard,nchains)

Metropolis algorithm: make inference

summary statistics

apply(samples_m$arr_thin,2,FUN = mean)

##    theta1    theta2    theta3    theta4        mu     sigma       tau 
## 61.268668 65.870548 67.757266 61.152226 63.942952  2.247331  5.546194

apply(samples_m$arr_thin,2,FUN = sd)

##    theta1    theta2    theta3    theta4        mu     sigma       tau 
## 1.1349269 0.9299713 0.9381882 0.8023437 2.5217698 0.3265883 4.2516998

apply(samples_m$arr_thin,2,FUN = quantile)

##        theta1   theta2   theta3   theta4       mu    sigma        tau
## 0%   56.28354 62.34475 64.05446 58.43413 51.77725 1.425035  0.5496124
## 25%  60.52192 65.25554 67.14452 60.63496 62.53167 2.022264  3.0907738
## 50%  61.24985 65.87636 67.77855 61.13761 63.89958 2.210399  4.2766813
## 75%  62.02316 66.48300 68.38120 61.65718 65.16939 2.441658  6.4380150
## 100% 65.59524 69.56063 71.18273 64.12564 79.06300 4.067672 54.2553655

density curve

for(i in 1:nparams){
  plot(density(as.vector(samples_m$arr_thin[,i,])),main = dimnames(samples_m$arr_thin)[[2]][i])
}

Metropolis algorithm: assess convergence

trace plots

arr_thin_m=samples_m$arr_thin

for(i in 1:nparams){
  plot(seq(N/2+1,N,nthin),arr_thin_m[,i,1],type = 'l',ylab=dimnames(arr_thin_m)[[2]][i],xlab = "iteration")
  for(chain in 2:nchains){
    lines(seq(N/2+1,N,nthin),arr_thin_m[,i,chain],type='l',col=chain)
  }
}

Potential scale reduction factor

arr1_m=samples_m$arr_discard
mat=arr1_m[,,1]
for(i in 2:dim(arr1_m)[3]){
  mat=rbind(mat,arr1_m[,,i])
}
mat=as.data.frame(mat)
chain=factor(rep(1:(2*nchains),each=dim(arr1_m)[1]/2))
Rhat=apply(mat,2,FUN = function(y){
  B=anova(aov(y~chain))$`Mean Sq`[1]
  W=anova(aov(y~chain))$`Mean Sq`[2]
  nsim=dim(arr1_m)[1]/2
  var_hat=(nsim-1)*W/nsim+B/nsim
  R_hat=sqrt(var_hat/W)
  return(R_hat)
})
Rhat

##    theta1    theta2    theta3    theta4        mu     sigma       tau 
## 1.0030404 0.9990871 0.9996229 0.9998591 1.1513643 1.0306355 1.0919592

Effective number of indepedent simulation draws

n_eff=apply(mat,2,FUN = function(y){
  B=anova(aov(y~chain))$`Mean Sq`[1]
  W=anova(aov(y~chain))$`Mean Sq`[2]
  nsim=dim(arr1_m)[1]/2
  m=nchains*2
  var_hat=(nsim-1)*W/nsim+B/nsim
  neff=min(m*nsim*var_hat/B,m*nsim)
  return(neff)
})
n_eff

##     theta1     theta2     theta3     theta4         mu      sigma 
## 1148.92594 3750.00000 3750.00000 3750.00000   40.37813  163.72875 
##        tau 
##   61.13442

JAGS: write jags model

library(R2jags)

mu_a=min(y); mu_b=max(y); log_sigma_b=2*log(sd(y));tau_b=2*sd(y)

#Write the model use jags language
cat("model{

    ##specify the distribution for observations
    for(i in 1:n){
      y[i]~dnorm(theta[group[i]],1/sigma2)
    }

    ##specify the prior for theta
    for(j in 1:J){
      theta[j]~dnorm(mu,1/tau2)
    }

    ##specify the prior for hyperparameters
    mu~dunif(55,75)
    
    log_sigma~dunif(-10,3)
    sigma2<-exp(2*log_sigma)
    sigma<-exp(log_sigma)
    
    tau~dunif(0,8)
    tau2<-pow(tau,2)

}",file="model.jag")

JAGS: call jags from R

#Data prepare
datlist=list('y','group','J','n')

#Call JAGS from R
samples_jags=jags(data = datlist,parameters.to.save = c('theta','mu','sigma','tau'),
                  model.file = 'model.jag',n.chains = nchains,n.iter = N,
                  n.burnin = ndiscard,n.thin = nthin)

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 24
##    Unobserved stochastic nodes: 7
##    Total graph size: 130
## 
## Initializing model

JAGS: display results

summary statistics

samples_jags$BUGSoutput$summary

##                mean        sd       2.5%        25%        50%        75%
## deviance 110.809734 3.6656153 106.036715 108.079751 110.105366 112.699181
## mu        63.981084 2.2854255  59.152783  62.576723  64.022952  65.375105
## sigma      2.460028 0.4211305   1.805325   2.162487   2.396482   2.688747
## tau        4.430385 1.5592327   1.894403   3.193914   4.291078   5.522600
## theta[1]  61.292702 1.2217256  58.908916  60.488746  61.280975  62.065920
## theta[2]  65.865279 1.0059236  63.845164  65.207928  65.867914  66.525188
## theta[3]  67.712821 1.0275781  65.672868  67.040757  67.722647  68.387651
## theta[4]  61.174376 0.8821369  59.449596  60.602598  61.169142  61.739311
##               97.5%     Rhat n.eff
## deviance 119.973573 1.008294   560
## mu        68.619629 1.001795  2300
## sigma      3.424235 1.008148   490
## tau        7.569826 1.003249  1000
## theta[1]  63.827046 1.003446   970
## theta[2]  67.851504 1.001379  2900
## theta[3]  69.695605 1.001410  2800
## theta[4]  62.959155 1.000878  3800

plots

#Display results
plot(as.mcmc(samples_jags),auto.layout = F )

#densplot(as.mcmc(samples_jags))
#traceplot(as.mcmc(samples_jags))

#Gelman and Rubin's convergence diagnostic
gelman.diag(as.mcmc(samples_jags))

## Potential scale reduction factors:
## 
##          Point est. Upper C.I.
## deviance       1.01       1.02
## mu             1.00       1.00
## sigma          1.01       1.03
## tau            1.00       1.01
## theta[1]       1.00       1.01
## theta[2]       1.00       1.00
## theta[3]       1.00       1.00
## theta[4]       1.00       1.00
## 
## Multivariate psrf
## 
## 1.01

gelman.plot(as.mcmc(samples_jags))

Stan: write Stan model

library(rstan)

#Write the model use stan language
cat("
    data{
      int<lower=0> n; //number of observations
      int<lower=0> J; //number of groups
      real y[n]; //observations
      int group[n]; //group of the observations
    }
    parameters{
      vector[J] theta;
      real<lower=0,upper=3> log_sigma;
      real<lower=55,upper=75> mu;
      real<lower=0,upper=5> tau;
    }
    transformed parameters{
      real<lower=0> tau2;
      real<lower=0> sigma2;
      real<lower=0> sigma;
      tau2<-pow(tau,2);
      sigma2<-exp(2*log_sigma);
      sigma<-exp(log_sigma);
    }
    model{
      for(i in 1:n){
        y[i]~normal(theta[group[i]],sqrt(sigma2)); //distribution of the observations
      }
      theta~normal(mu,sqrt(tau2));   //prior for theta
      mu~uniform(55,75);
      tau~uniform(0,5);
      log_sigma~uniform(0,3);
    }
    ",file="model.stan")

Stan: call Stan from R

#Data prepare
datlist=list('y','group','J','n')

#Call Stan from R
samples_stan<-stan(file = "model.stan",data = datlist,pars=c('theta','mu','tau','sigma'),
                   iter = N,chains = nchains,warmup = N/2,thin = nthin,init = 'random',
                   verbose = FALSE)

## 
## SAMPLING FOR MODEL 'model' NOW (CHAIN 1).
## 
## Chain 1, Iteration:    1 / 1500 [  0%]  (Warmup)
## Chain 1, Iteration:  150 / 1500 [ 10%]  (Warmup)
## Chain 1, Iteration:  300 / 1500 [ 20%]  (Warmup)
## Chain 1, Iteration:  450 / 1500 [ 30%]  (Warmup)
## Chain 1, Iteration:  600 / 1500 [ 40%]  (Warmup)
## Chain 1, Iteration:  750 / 1500 [ 50%]  (Warmup)
## Chain 1, Iteration:  751 / 1500 [ 50%]  (Sampling)
## Chain 1, Iteration:  900 / 1500 [ 60%]  (Sampling)
## Chain 1, Iteration: 1050 / 1500 [ 70%]  (Sampling)
## Chain 1, Iteration: 1200 / 1500 [ 80%]  (Sampling)
## Chain 1, Iteration: 1350 / 1500 [ 90%]  (Sampling)
## Chain 1, Iteration: 1500 / 1500 [100%]  (Sampling)# 
## #  Elapsed Time: 0.062 seconds (Warm-up)
## #                0.047 seconds (Sampling)
## #                0.109 seconds (Total)
## # 
## 
## SAMPLING FOR MODEL 'model' NOW (CHAIN 2).
## 
## Chain 2, Iteration:    1 / 1500 [  0%]  (Warmup)
## Chain 2, Iteration:  150 / 1500 [ 10%]  (Warmup)
## Chain 2, Iteration:  300 / 1500 [ 20%]  (Warmup)
## Chain 2, Iteration:  450 / 1500 [ 30%]  (Warmup)
## Chain 2, Iteration:  600 / 1500 [ 40%]  (Warmup)
## Chain 2, Iteration:  750 / 1500 [ 50%]  (Warmup)
## Chain 2, Iteration:  751 / 1500 [ 50%]  (Sampling)
## Chain 2, Iteration:  900 / 1500 [ 60%]  (Sampling)
## Chain 2, Iteration: 1050 / 1500 [ 70%]  (Sampling)
## Chain 2, Iteration: 1200 / 1500 [ 80%]  (Sampling)
## Chain 2, Iteration: 1350 / 1500 [ 90%]  (Sampling)
## Chain 2, Iteration: 1500 / 1500 [100%]  (Sampling)# 
## #  Elapsed Time: 0.047 seconds (Warm-up)
## #                0.047 seconds (Sampling)
## #                0.094 seconds (Total)
## # 
## 
## SAMPLING FOR MODEL 'model' NOW (CHAIN 3).
## 
## Chain 3, Iteration:    1 / 1500 [  0%]  (Warmup)
## Chain 3, Iteration:  150 / 1500 [ 10%]  (Warmup)
## Chain 3, Iteration:  300 / 1500 [ 20%]  (Warmup)
## Chain 3, Iteration:  450 / 1500 [ 30%]  (Warmup)
## Chain 3, Iteration:  600 / 1500 [ 40%]  (Warmup)
## Chain 3, Iteration:  750 / 1500 [ 50%]  (Warmup)
## Chain 3, Iteration:  751 / 1500 [ 50%]  (Sampling)
## Chain 3, Iteration:  900 / 1500 [ 60%]  (Sampling)
## Chain 3, Iteration: 1050 / 1500 [ 70%]  (Sampling)
## Chain 3, Iteration: 1200 / 1500 [ 80%]  (Sampling)
## Chain 3, Iteration: 1350 / 1500 [ 90%]  (Sampling)
## Chain 3, Iteration: 1500 / 1500 [100%]  (Sampling)# 
## #  Elapsed Time: 0.046 seconds (Warm-up)
## #                0.047 seconds (Sampling)
## #                0.093 seconds (Total)
## # 
## 
## SAMPLING FOR MODEL 'model' NOW (CHAIN 4).
## 
## Chain 4, Iteration:    1 / 1500 [  0%]  (Warmup)
## Chain 4, Iteration:  150 / 1500 [ 10%]  (Warmup)
## Chain 4, Iteration:  300 / 1500 [ 20%]  (Warmup)
## Chain 4, Iteration:  450 / 1500 [ 30%]  (Warmup)
## Chain 4, Iteration:  600 / 1500 [ 40%]  (Warmup)
## Chain 4, Iteration:  750 / 1500 [ 50%]  (Warmup)
## Chain 4, Iteration:  751 / 1500 [ 50%]  (Sampling)
## Chain 4, Iteration:  900 / 1500 [ 60%]  (Sampling)
## Chain 4, Iteration: 1050 / 1500 [ 70%]  (Sampling)
## Chain 4, Iteration: 1200 / 1500 [ 80%]  (Sampling)
## Chain 4, Iteration: 1350 / 1500 [ 90%]  (Sampling)
## Chain 4, Iteration: 1500 / 1500 [100%]  (Sampling)# 
## #  Elapsed Time: 0.063 seconds (Warm-up)
## #                0.047 seconds (Sampling)
## #                0.11 seconds (Total)
## # 
## 
## SAMPLING FOR MODEL 'model' NOW (CHAIN 5).
## 
## Chain 5, Iteration:    1 / 1500 [  0%]  (Warmup)
## Chain 5, Iteration:  150 / 1500 [ 10%]  (Warmup)
## Chain 5, Iteration:  300 / 1500 [ 20%]  (Warmup)
## Chain 5, Iteration:  450 / 1500 [ 30%]  (Warmup)
## Chain 5, Iteration:  600 / 1500 [ 40%]  (Warmup)
## Chain 5, Iteration:  750 / 1500 [ 50%]  (Warmup)
## Chain 5, Iteration:  751 / 1500 [ 50%]  (Sampling)
## Chain 5, Iteration:  900 / 1500 [ 60%]  (Sampling)
## Chain 5, Iteration: 1050 / 1500 [ 70%]  (Sampling)
## Chain 5, Iteration: 1200 / 1500 [ 80%]  (Sampling)
## Chain 5, Iteration: 1350 / 1500 [ 90%]  (Sampling)
## Chain 5, Iteration: 1500 / 1500 [100%]  (Sampling)# 
## #  Elapsed Time: 0.047 seconds (Warm-up)
## #                0.046 seconds (Sampling)
## #                0.093 seconds (Total)
## #

Stan: display results

summary statistics

print(samples_stan)

## Inference for Stan model: model.
## 5 chains, each with iter=1500; warmup=750; thin=1; 
## post-warmup draws per chain=750, total post-warmup draws=3750.
## 
##            mean se_mean   sd   2.5%    25%    50%    75%  97.5% n_eff Rhat
## theta[1]  61.40    0.02 1.16  59.10  60.62  61.41  62.18  63.63  2463    1
## theta[2]  65.78    0.02 0.98  63.81  65.14  65.79  66.42  67.73  2711    1
## theta[3]  67.60    0.02 1.00  65.59  66.97  67.63  68.27  69.50  2069    1
## theta[4]  61.22    0.02 0.86  59.59  60.66  61.20  61.78  62.95  2972    1
## mu        64.01    0.04 1.87  60.22  62.80  64.01  65.24  67.64  2192    1
## tau        3.46    0.02 0.90   1.77   2.79   3.48   4.21   4.92  1473    1
## sigma      2.45    0.01 0.42   1.78   2.17   2.40   2.68   3.37  2208    1
## lp__     -39.54    0.06 2.11 -44.53 -40.71 -39.19 -37.98 -36.51  1156    1
## 
## Samples were drawn using NUTS(diag_e) at Sat Oct 22 16:22:06 2016.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

plots

stan_plot(samples_stan)

stan_trace(samples_stan)

stan_hist(samples_stan)

stan_dens(samples_stan)

stan_diag(samples_stan)

stan_ac(samples_stan)

MCMC Example: A hierarchical normal model

Xiao Liu

2016.10.28