\documentstyle[fleqn,epsf,aima-slides]{article}
\begin{document}
\begin{huge}
\titleslide{Inference in belief networks}{Chapter 15.3--4 + new}
\sf
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Outline}
\blob Exact inference by enumeration
\blob Exact inference by variable elimination
\blob Approximate inference by stochastic simulation
\blob Approximate inference by Markov chain Monte Carlo
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Inference tasks}
\u{Simple queries}: compute posterior marginal $\pv(X_i|\mbf{E}\eq \mbf{e})$\al
e.g., $P(NoGas|Gauge\eq empty,Lights\eq on,Starts\eq false)$
\u{Conjunctive queries}: $\pv(X_i,X_j|\mbf{E}\eq \mbf{e}) =
\pv(X_i|\mbf{E}\eq \mbf{e}) \pv(X_j|X_i,\mbf{E}\eq \mbf{e}) $
\u{Optimal decisions}: decision networks include utility information;\nl
probabilistic inference required for $P(outcome|action,evidence)$
\u{Value of information}: which evidence to seek next?
\u{Sensitivity analysis}: which probability values are most critical?
\u{Explanation}: why do I need a new starter motor?
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Inference by enumeration}
Slightly intelligent way to sum out variables from the joint
without actually constructing its explicit representation
Simple query on the burglary network:\\
$\pv(B|J\eq true,M\eq true)$\\
$= \pv(B,J\eq true,M\eq true)/P(J\eq true,M\eq true)$\\
$= \alpha \pv(B,J\eq true,M\eq true)$\\
$= \alpha \mysum_e \mysum_a \pv(B,e,a,J\eq true,M\eq true)$
Rewrite full joint entries using product of CPT entries:\\
$P(B\eq true|J\eq true,M\eq true)$\\
$= \alpha \mysum_e \mysum_a P(B\eq true)P(e)P(a|B\eq true,e)P(J\eq true|a)P(M\eq true|a) $\\
$= \alpha P(B\eq true)\mysum_e P(e)\mysum_a P(a|B\eq true,e)P(J\eq true|a)P(M\eq true|a)$
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Enumeration algorithm}
Exhaustive depth-first enumeration: $O(n)$ space, $O(d^n)$ time
\input{\file{algorithms}{enumeration-algorithm}}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Inference by variable elimination}
Enumeration is inefficient: repeated computation\al
e.g., computes $P(J\eq true|a)P(M\eq true|a)$ for each value of $e$
Variable elimination: carry out summations right-to-left,\\
storing intermediate results (\u{factors}) to avoid recomputation
$\pv(B|J\eq true,M\eq true)$\nl
$= \alpha \underbrace{\pv(B)}_B
\mysum_e \underbrace{P(e)}_E
\mysum_a \underbrace{\pv(a|B,e)}_A
\underbrace{P(J\eq true|a)}_J
\underbrace{P(M\eq true|a)}_M$\nl
$= \alpha \pv(B)\mysum_e P(e)\mysum_a \pv(a|B,e) P(J\eq true|a) f_M(a)$\nl
$= \alpha \pv(B)\mysum_e P(e)\mysum_a \pv(a|B,e) f_J(a) f_M(a)$\nl
$= \alpha \pv(B)\mysum_e P(e)\mysum_a f_A(a,b,e) f_J(a) f_M(a)$\nl
$= \alpha \pv(B)\mysum_e P(e)f_{\bar{A}JM}(b,e) $ (sum out $A$)\nl
$= \alpha \pv(B)f_{\bar{E}\bar{A}JM}(b)$ (sum out $E$)\nl
$= \alpha f_B(b)\stimes f_{\bar{E}\bar{A}JM}(b)$
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Variable elimination: Basic operations}
\u{Pointwise product} of factors $f_1$ and $f_2$:\al
$f_1(x_1,\ldots,x_j,y_1,\ldots,y_k) \stimes
f_2(y_1,\ldots,y_k,z_1,\ldots,z_l)$\nl
= $f(x_1,\ldots,x_j,y_1,\ldots,y_k,z_1,\ldots,z_l)$\\
E.g., $f_1(a,b) \stimes f_2(b,c) = f(a,b,c)$
To sum out a variable from a product of factors,\\
first move any constant factors outside the summation:
$\mysum_x f_1 \stimes \cdots \stimes f_k =
f_1 \stimes \cdots \stimes f_i\ \mysum_x\; f_{i+1} \stimes \cdots \stimes
f_k = f_1 \stimes \cdots \stimes f_i \stimes f_{\bar{X}}$
assuming $f_1,\ldots,f_i$ do not depend on $X$
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Variable elimination algorithm}
\input{\file{algorithms}{elimination-ask-algorithm}}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Complexity of exact inference}
\u{Singly connected} networks (or \u{polytrees}):\al
-- any two nodes are connected by at most one (undirected) path\al
-- time and space cost of variable elimination are $O(d^k n)$
\u{Multiply connected} networks:\al
-- can reduce 3SAT to exact inference $\implies$ NP-hard\al
-- equivalent to {\em counting} 3SAT models $\implies$ \#P-complete
\vspace*{0.2in}
\epsfxsize=0.75\textwidth
\fig{\file{figures}{bn-3sat.ps}}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Inference by stochastic simulation}
Basic idea:\al
1) Draw $N$ samples from a sampling distribution $S$\al
2) Compute an approximate posterior probability $\hat P$\al
3) Show this converges to the true probability $P$
Outline:\al
-- Sampling from an empty network\al
-- Rejection sampling: reject samples disagreeing with evidence\al
-- Likelihood weighting: use evidence to weight samples\al
-- MCMC: sample from a stochastic process whose stationary\nl
distribution is the true posterior
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Sampling from an empty network}
\input{\file{algorithms}{prior-sampling-algorithm}}
\begin{tabular}{lr}
\hbox{\begin{minipage}[b]{0.55\textwidth}
$\pv(Cloudy) = \<0.5,0.5\>$\nl
sample $\rightarrow$ $true$\\
$\pv(Sprinkler|Cloudy) = \<0.1,0.9\>$\nl
sample $\rightarrow$ $false$\\
$\pv(Rain|Cloudy) = \<0.8,0.2\>$\nl
sample $\rightarrow$ $true$\\
\hbox{$\pv(WetGrass|\lnot Sprinkler,Rain) = \<0.9,0.1\>$}\nl
sample $\rightarrow$ $true$
\end{minipage}}
&
\epsfxsize=0.5\textwidth
\epsffile{\file{figures}{rain-clustering1.ps}}
\end{tabular}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Sampling from an empty network contd.}
Probability that \prog{PriorSample} generates a particular event\al
$S_{PS}(x_1\ldots x_n) = \myprod_{i\eq 1}^n P(x_i | Parents(X_i))
= P(x_1\ldots x_n)$\\
i.e., the true prior probability
Let $N_{PS}(\mbf{Y}\eq \mbf{y})$ be the number of samples generated
for which $\mbf{Y}\eq \mbf{y}$, for any set of variables $\mbf{Y}$.
Then $\hat P(\mbf{Y}\eq \mbf{y}) = N_{PS}(\mbf{Y}\eq \mbf{y})/N$ and
\begin{eqnarray*}
\lim_{N\to\infty} \hat P(\mbf{Y}\eq \mbf{y})
& = & \mysum_{\smbf{h}} S_{PS}(\mbf{Y}\eq \mbf{y},\mbf{H}\eq \mbf{h})\\
& = & \mysum_{\smbf{h}} P(\mbf{Y}\eq \mbf{y},\mbf{H}\eq \mbf{h})\\
& = & P(\mbf{Y}\eq \mbf{y})
\end{eqnarray*}
That is, estimates derived from \prog{PriorSample} are \u{consistent}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Rejection sampling}
$\hat{\pv}(X|\mbf{e})$ estimated from samples agreeing with $\mbf{e}$
\input{\file{algorithms}{rejection-sampling-algorithm}}
E.g., estimate $\pv(Rain|Sprinkler\eq true)$ using 100 samples\al
27 samples have $Sprinkler\eq true$\nl
Of these, 8 have $Rain\eq true$ and 19 have $Rain\eq false$.\\[0.2in]
$\hat{\pv}(Rain|Sprinkler\eq true) = \noprog{Normalize}(\<8,19\>) =
\<0.296,0.704\>$
Similar to a basic real-world empirical estimation procedure
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Analysis of rejection sampling}
$\hat{\pv}(X|\mbf{e}) = \alpha \mbf{N}_{PS}(X,\mbf{e}) $
\qquad (algorithm defn.)\al
$= \mbf{N}_{PS}(X,\mbf{e})/N_{PS}(\mbf{e})$
\qquad (normalized by $N_{PS}(\mbf{e})$)\al
$\approx \pv(X,\mbf{e})/P(\mbf{e})$
\qquad (property of \prog{PriorSample})\al
$= \pv(X|\mbf{e})$
\qquad (defn. of conditional probability)
Hence rejection sampling returns consistent posterior estimates
Problem: hopelessly expensive if $P(\mbf{e})$ is small
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Likelihood weighting}
Idea: fix evidence variables, sample only nonevidence variables,\\
and weight each sample by the likelihood it accords the evidence
\input{\file{algorithms}{likelihood-weighting-algorithm}}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Likelihood weighting example}
Estimate $\pv(Rain|Sprinkler\eq true,WetGrass\eq true)$
\vspace*{0.2in}
\epsfxsize=0.5\textwidth
\epsffile{\file{figures}{rain-clustering-mcmc.ps}}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{LW example contd.}
Sample generation process:\\
1. $w \leftarrow 1.0$\\
2. Sample $\pv(Cloudy) = \<0.5,0.5\>$; say $true$\\
3. $Sprinkler$ has value $true$, so\al
$w \leftarrow w \times P(Sprinkler\eq true|Cloudy\eq true) = 0.1$\\
4. Sample $\pv(Rain|Cloudy\eq true) = \<0.8,0.2\>$; say $true$ \\
5. $WetGrass$ has value $true$, so\al
\hbox{$w \leftarrow w \times P(WetGrass\eq true|Sprinkler\eq true,Rain\eq true) = 0.099$}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Likelihood weighting analysis}
Sampling probability for \prog{WeightedSample} is\al
$S_{WS}(\mbf{y},\mbf{e}) = \myprod_{i\eq 1}^l P(y_i|Parents(Y_i))$\\
Note: pays attention only to evidence in {\em ancestors} of the sampled variables\nl
$\implies$ somewhere ``in between'' prior and posterior distribution
Weight for a given sample $\mbf{y},\mbf{e}$ is\al
$w(\mbf{y},\mbf{e}) = \myprod_{i\eq 1}^m P(e_i | Parents(E_i))$
Weighted sampling probability is\al
$S_{WS}(\mbf{y},\mbf{e}) w(\mbf{y},\mbf{e})$\nl
$= \myprod_{i\eq 1}^l P(y_i|Parents(Y_i))\ \
\myprod_{i\eq 1}^m P(e_i|Parents(E_i))$\nl
$= P(\mbf{y},\mbf{e})$ (by standard global semantics of network)
Hence likelihood weighting returns consistent estimates\\
but performance still degrades with many evidence variables
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Approximate inference using MCMC}
``State'' of network = current assignment to all variables
Generate next state by sampling one variable given Markov blanket\\
Sample each variable in turn, keeping evidence fixed
\input{\file{algorithms}{mcmc-algorithm}}
Approaches \u{stationary distribution}: long-run fraction of time
spent in each state is exactly proportional to its posterior
probability
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{MCMC Example}
Estimate $\pv(Rain|Sprinkler\eq true,WetGrass\eq true)$
Sample $Cloudy$ then $Rain$, repeat.\\
Count number of times $Rain$ is true and false in the samples.
Markov blanket of $Cloudy$ is $Sprinkler$ and $Rain$\\
Markov blanket of $Rain$ is $Cloudy$, $Sprinkler$, and $WetGrass$
\vspace*{0.2in}
\epsfxsize=0.5\textwidth
\epsffile{\file{figures}{rain-clustering-mcmc.ps}}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{MCMC example contd.}
Random initial state: $Cloudy\eq true$ and $Rain\eq false$
1. $\pv(Cloudy|MB(Cloudy)) = \pv(Cloudy|Sprinkler,\lnot Rain)$\nl
sample $\rightarrow$ $false$
2. $\pv(Rain|MB(Rain)) = \pv(Rain|\lnot Cloudy,Sprinkler,WetGrass)$\nl
sample $\rightarrow$ $true$
Visit 100 states\al
31 have $Rain\eq true$, 69 have $Rain\eq false$
$\hat{\pv}(Rain|Sprinkler\eq true,WetGrass\eq true)$\al
$= \noprog{Normalize}(\<31,69\>) = \<0.31,0.69\>$
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{MCMC analysis: Outline}
Transition probability $\transition{\mbf{y}}{\mbf{y}'}$
Occupancy probability $\pi_t(\mbf{y})$ at time $t$
Equilibrium condition on $\pi_t$ defines stationary distribution $\pi(\mbf{y})$\nl
Note: stationary distribution depends on choice of $\transition{\mbf{y}}{\mbf{y}'}$
Pairwise \u{detailed balance} on states guarantees equilibrium
\u{Gibbs sampling} transition probability:\nl
sample each variable given current values of all others\\
$\implies$ detailed balance with the true posterior
For Bayesian networks, Gibbs sampling reduces to\\
sampling conditioned on each variable's Markov blanket
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Stationary distribution}
$\pi_t(\mbf{y})$ = probability in state $\mbf{y}$ at time $t$\\
$\pi_{t+1}(\mbf{y}')$ = probability in state $\mbf{y}'$ at time $t+1$
$\pi_{t+1}$ in terms of $\pi_t$ and $\transition{\mbf{y}}{\mbf{y}'}$
\[
\pi_{t+1}(\mbf{y}') = \mysum_{\smbf{y}} \pi_t(\mbf{y}) \transition{\mbf{y}}{\mbf{y}'}
\]
Stationary distribution: $\pi_t = \pi_{t+1} = \pi$
\[
\pi(\mbf{y}') = \mysum_{\smbf{y}} \pi(\mbf{y}) \transition{\mbf{y}}{\mbf{y}'}
\qquad\mbox{for all }\mbf{y}'
\]
If the chain is ergodic, $\pi$ exists and is unique (specific to $\transition{\mbf{y}}{\mbf{y}'}$)
In equilibrium, expected ``outflow'' = expected ``inflow''
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Detailed balance}
``Outflow'' = ``inflow'' for each pair of states:
\[
\pi(\mbf{y}) \transition{\mbf{y}}{\mbf{y}'}
= \pi(\mbf{y}') \transition{\mbf{y}'}{\mbf{y}}
\qquad\mbox{for all }\mbf{y},\ \mbf{y}'
\]
Detailed balance $\implies$ stationarity:
\begin{eqnarray*}
\mysum_{\smbf{y}} \pi(\mbf{y}) \transition{\mbf{y}}{\mbf{y}'}
& = & \mysum_{\smbf{y}} \pi(\mbf{y}') \transition{\mbf{y}'}{\mbf{y}} \\
& = & \pi(\mbf{y}') \mysum_{\smbf{y}} \transition{\mbf{y}'}{\mbf{y}} \\
& = & \pi(\mbf{y}')
\end{eqnarray*}
MCMC algorithms typically constructed by designing a transition\\
probability $q$ that is in detailed balance with desired $\pi$
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Gibbs sampling}
Sample each variable in turn, given {\em all other variables}
Sampling $Y_i$, let $\bar{\mbf{Y}_i}$ be all other nonevidence variables\\
Current values are $y_i$ and $\bar{\mbf{y}_i}$; $\mbf{e}$ is fixed\\
Transition probability is given by
\[
\transition{\mbf{y}}{\mbf{y}'} =
\transition{y_i,\bar{\mbf{y}_i}}{y_i',\bar{\mbf{y}_i}} =
P(y_i'|\bar{\mbf{y}_i},\mbf{e})
\]
This gives detailed balance with true posterior $P(\mbf{y}|\mbf{e})$:
\begin{eqnarray*}
\pi(\mbf{y}) \transition{\mbf{y}}{\mbf{y}'}
&=& P(\mbf{y}|\mbf{e}) P(y_i'|\bar{\mbf{y}_i},\mbf{e})
= P(y_i,\bar{\mbf{y}_i}|\mbf{e})P(y_i'|\bar{\mbf{y}_i},\mbf{e}) \\
&=& P(y_i|\bar{\mbf{y}_i},\mbf{e})P(\bar{\mbf{y}_i}|\mbf{e})
P(y_i'|\bar{\mbf{y}_i},\mbf{e}) \quad\mbox{(chain rule)} \\
&=& P(y_i|\bar{\mbf{y}_i},\mbf{e})P(y_i',\bar{\mbf{y}_i}|\mbf{e})
\qquad\mbox{(chain rule backwards)} \\
&=& \transition{\mbf{y}'}{\mbf{y}} \pi(\mbf{y}')
= \pi(\mbf{y}') \transition{\mbf{y}'}{\mbf{y}}
\end{eqnarray*}
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Markov blanket sampling}
A variable is independent of all others given its Markov blanket:\al
$P(y_i'|\bar{\mbf{y}_i},\mbf{e}) = P(y_i'|MB(Y_i))$
Probability given the Markov blanket is calculated as follows:\al
$P(y_i'|MB(Y_i)) = P(y_i'|Parents(Y_i))
\myprod_{Z_j\in Children(Y_i)} P(z_j|Parents(Z_j))$
Hence computing the sampling distribution over $Y_i$ for each
flip requires just $cd$ multiplications if $Y_i$ has $c$ children
and $d$ values; can cache it if $c$ not too large.
Main computational problems:\al
1) Difficult to tell if convergence has been achieved\al
2) Can be wasteful if Markov blanket is large:\nl
$P(Y_i|MB(Y_i))$ won't change much (law of large numbers)
%%%%%%%%%%%% Slide %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\heading{Performance of approximation algorithms}
\u{Absolute approximation}:
$|P(X|\mbf{e}) - \hat P(X|\mbf{e})| \leq \epsilon$
\u{Relative approximation}:
$\frac{|P(X|\smbf{e}) - \hat P(X|\smbf{e})|}{P(X|\smbf{e})} \leq \epsilon$
Relative $\implies$ absolute since $0\leq P \leq 1$ (but $P(X|\mbf{e})$ may be $O(2^{-n})$, so relative is much harder)
Randomized algorithms may fail with probability at most $\delta$
Polytime approximation: $\mbox{poly}(n,\epsilon^{-1},\log \delta^{-1})$
Theorem (Dagum and Luby, 1993): both absolute and relative\\
approximation for either deterministic or randomized algorithms\\
are NP-hard for any $\epsilon,\delta<0.5$
(Absolute approximation polytime with no evidence---Chernoff bounds)
\end{huge}
\end{document}