;;;; Basic dynamic programming routines for MDPs (Markov decision processes)
;;; Value iteration, value determination, and policy iteration.
;;; MDP agents pass in an mdp and expect a policy in return.
(defun value-iteration-policy (mdp)
  (optimal-policy (value-iteration mdp) (mdp-model mdp) (mdp-rewards mdp)))
;;; Given an environment model M, value iteration
;;; determines the values U of the states.
;;; Basic equation is U(i) <- r(i) + max_a sum_j M(a,i,j)U(j)
;;; where U(j) MUST be the old value not the new.
(defun value-iteration (mdp &optional (Uold (copy-hash-table
                                             (mdp-rewards mdp) #'identity))
                            &key (epsilon 0.000001)
                            &aux (Unew (copy-hash-table Uold #'identity))
                                 (max-delta infinity)
                                 (M (mdp-model mdp))
                                 (R (mdp-rewards mdp)))
  (do ()
      ((< max-delta epsilon) Unew)
    (setq max-delta 0)
    (rotatef Uold Unew) ;;; swap the two tables; Unew is overwritten below
    (maphash
     #'(lambda (s u)
         (unless (sink? s M)
           (setf (gethash s Unew)
                 (+ (gethash s R)
                    (if (gethash s M)
                        (apply #'max
                               (mapcar
                                #'(lambda (a) (q-value a s Uold M R))
                                (actions s M)))
                        0))))
         (setq max-delta (max max-delta (abs (- (gethash s Unew) u)))))
     Uold)))
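;;; A minimal usage sketch (an illustration, not part of the original code).
;;; It assumes the representations implied by the accessors used above:
;;; rewards R as a hash table from state to reward, model M as a hash table
;;; from state to an alist of (action . transitions) entries, transitions
;;; built with MAKE-TRANSITION, and MAKE-MDP accepting :MODEL and :REWARDS.
(defun value-iteration-example ()
  (let ((M (make-hash-table :test #'equal))
        (R (make-hash-table :test #'equal)))
    ;; Two states: A pays 0, B pays 1.  From A, action GO reaches B with
    ;; probability 0.9 and stays in A with probability 0.1.  B has no
    ;; actions, so SINK? skips it and U(B) stays at its reward.
    (setf (gethash 'a R) 0.0
          (gethash 'b R) 1.0)
    (setf (gethash 'a M)
          (list (cons 'go
                      (list (make-transition :probability 0.9 :destination 'b)
                            (make-transition :probability 0.1 :destination 'a)))))
    ;; U(A) converges to 0 + 0.9*U(B) + 0.1*U(A), i.e. U(A) -> 1.0.
    (value-iteration (make-mdp :model M :rewards R))))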
;;; A state is a sink if no action can lead from it to another state.
;;; Sinks can arise by accident during reinforcement learning of an environment
;;; model. Because they cause infinite loops, they must be detected.
(defun sink? (s M)
  (not (some #'(lambda (a)
                 (some #'(lambda (transition)
                           (not (equal s (transition-destination transition))))
                       (transitions a s M)))
             (actions s M))))
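;;; For example, under the toy representation sketched above, a state whose
;;; only action self-loops with probability 1 is a sink:
;;;   (setf (gethash 'c M)
;;;         (list (cons 'stay (list (make-transition :probability 1.0
;;;                                                  :destination 'c)))))
;;;   (sink? 'c M)  =>  T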
;;; Given an initial policy P and initial utilities U, calculate the optimal
;;; policy by alternating value determination with policy update.
(defun policy-iteration (mdp &optional (U (copy-hash-table
                                           (mdp-rewards mdp) #'identity))
                             &aux (M (mdp-model mdp))
                                  (R (mdp-rewards mdp))
                                  (P (optimal-policy U M R))
                                  (unchanged nil) new)
  (do ()
      (unchanged P)
    (setq unchanged t)
    (setq U (value-determination P U M R))
    (maphash #'(lambda (s aplist)
                 (declare (ignore aplist))
                 (setq new (dmax-choice s U M R))
                 (when (> (q-value new s U M R)
                          (q-value (caar (gethash s P)) s U M R))
                   (setq unchanged nil)
                   (setf (gethash s P) (list (list new 1.0)))))
             P)))
;;; Given a fixed policy and a model, calculate the value of each state.
;;; This version does it by an iterative process similar to value iteration.
;;; Basic equation is U(i) <- r(i) + sum_j M(P(i),i,j)U(j)
;;; where U(j) MUST be the old value not the new.
;;; A better alternative is to set up the value equations and solve them
;;; directly using matrix methods; a sketch of that approach follows the
;;; function below.
(defun value-determination (P Uold M R
                            &key (epsilon 0.000001)
                            &aux Unew
                                 (max-delta infinity))
  (setf Unew (copy-hash-table Uold #'identity))
  (do ()
      ((< max-delta epsilon) Unew)
    (setq max-delta 0)
    (rotatef Uold Unew)
    (maphash
     #'(lambda (s u)
         (unless (sink? s M)
           (setf (gethash s Unew)
                 (+ (gethash s R)
                    (if (gethash s M)
                        (q-value (caar (gethash s P)) s Uold M R)
                        0))))
         (setq max-delta (max max-delta (abs (- (gethash s Unew) u)))))
     Uold)))
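;;; The matrix alternative mentioned above, as a rough sketch (not part of
;;; the original code).  For a fixed policy the equations are linear in U,
;;; so one linear solve replaces the iteration.  States the iterative
;;; version never updates (sinks and states absent from the model) keep
;;; their old values and are folded into the right-hand side as constants.
;;; Plain Gauss-Jordan elimination without pivoting is used for brevity; a
;;; serious implementation would pivot and exploit sparsity.
(defun value-determination-by-matrix (P Uold M R &aux states n A b)
  ;; Collect the states whose values are actually determined by the policy.
  (maphash #'(lambda (s u) (declare (ignore u))
               (when (and (gethash s M) (not (sink? s M)))
                 (push s states)))
           Uold)
  (setq n (length states)
        A (make-array (list n n) :initial-element 0.0d0)
        b (make-array n :initial-element 0.0d0))
  ;; Row i encodes U(i) - sum_j M(P(i),i,j) U(j) = R(i) + fixed terms.
  (loop for s in states and i from 0 do
    (setf (aref A i i) 1.0d0
          (aref b i) (coerce (gethash s R) 'double-float))
    (dolist (tr (transitions (caar (gethash s P)) s M))
      (let ((j (position (transition-destination tr) states :test #'equal))
            (pr (transition-probability tr)))
        (if j
            (decf (aref A i j) pr)  ; destination value is unknown
            (incf (aref b i)        ; destination value is fixed
                  (* pr (gethash (transition-destination tr) Uold)))))))
  ;; Reduce A to the identity; b then holds the solution.
  (dotimes (k n)
    (let ((pivot (aref A k k)))
      (loop for j from k below n do (setf (aref A k j) (/ (aref A k j) pivot)))
      (setf (aref b k) (/ (aref b k) pivot)))
    (dotimes (i n)
      (unless (= i k)
        (let ((f (aref A i k)))
          (unless (zerop f)
            (loop for j from k below n do
              (decf (aref A i j) (* f (aref A k j))))
            (decf (aref b i) (* f (aref b k))))))))
  ;; Copy the old values, then overwrite the solved states.
  (let ((Unew (copy-hash-table Uold #'identity)))
    (loop for s in states and i from 0 do
      (setf (gethash s Unew) (aref b i)))
    Unew))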
;;; Compute optimal policy given U and M
(defun optimal-policy (U M R &aux (P (make-hash-table :test #'equal)))
  (maphash #'(lambda (s md)
               (declare (ignore md))
               (setf (gethash s P) (list (list (max-choice s U M R) 1.0))))
           M)
  P)
;;; The following functions select actions in particular states.
;;; Pick an action stochastically, according to the policy's
;;; action-probability list for the state.
(defun policy-choice (state P &aux (aplist (gethash state P))
                                   (r (random 1.0)))
  (dolist (a-p aplist)
    (decf r (second a-p))
    (unless (plusp r) (return (first a-p)))))
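;;; For example, with aplist ((up 0.8) (down 0.2)) and r = 0.5, the DECF by
;;; 0.8 leaves r = -0.3, which fails PLUSP, so UP is returned; UP is chosen
;;; whenever the initial r <= 0.8, i.e. with probability 0.8.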
;;; Pick a random action uniformly (note the hard-wired action set)
(defun random-choice (state U M R)
  (declare (ignore state U M R))
  (random-element '(left right up down)))
;;; Pick a currently best action, breaking ties at random
(defun max-choice (state U M R)
  (car (the-biggest-random-tie
        #'(lambda (ants) (q-value (car ants) state U M R))
        (gethash state M))))
;;; Simply pick a currently best action deterministically
(defun dmax-choice (state U M R)
  (car (the-biggest
        #'(lambda (ants) (q-value (car ants) state U M R))
        (gethash state M))))
;;; Q(a,s) is the value of doing a in s, computed as the expected utility
;;; over the possible outcome states.  Used in several update equations.
;;; R is accepted only to keep the calling convention uniform; it is unused.
(defun q-value (action state U M R &aux (v 0))
  (declare (ignore R))
  (dolist (transition (transitions action state M) v)
    (incf v (* (transition-probability transition)
               (gethash (transition-destination transition) U)))))
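;;; For example, with the toy model sketched after VALUE-ITERATION above
;;; and U(A) = 0.5, U(B) = 1.0, (q-value 'go 'a U M R) returns
;;; 0.9*1.0 + 0.1*0.5 = 0.95.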