;;; learning/agents/exploring-adp-learner.lisp
;;; Reinforcement learning agent that uses dynamic
;;; programming to solve the Markov decision process
;;; that it learns from its experience.  Thus, the
;;; main job is to update the model over time.
;;; Unlike the active-adp-learner, this agent uses
;;; an "intelligent" exploration policy to make sure it
;;; explores the state space reasonably quickly.

(defvar *R+* 2)
(defvar *Ne* 5)

(defun exploration-function (u n)
  (if (< n *Ne*) *R+* u))

(defun make-exploring-adp-learner (actions)
  (let* ((percepts nil)
         (last-action nil)
         (U (make-hash-table :test #'equal))
         (N (make-hash-table :test #'equal))
         (M (make-hash-table :test #'equal))
         (R (make-hash-table :test #'equal))
         (mdp (make-mdp :model M :rewards R)))
    #'(lambda (e)
        (push e percepts)
        (let ((s (mdp-percept-state e)))
          (unless (gethash s N)          ;;; make entries for new state
            (setf (gethash s N) 0
                  (gethash s U) 0
                  (gethash s M) (mapcar #'(lambda (a)
                                            (cons a (make-mdp-action-model)))
                                        actions)
                  (gethash s R) (mdp-percept-reward e)))
          (incf (gethash s N))
          (update-active-model mdp percepts last-action)
          (when (mdp-terminal-states mdp)
            (setq U (exploratory-value-iteration mdp U)))
          (when (mdp-percept-terminalp e)
            (setq percepts nil))
          (setq last-action (exploration-choice s U M R))))))

;;; Given an environment model M, determine the values of states U.
;;; Use value iteration, with initial values given by U itself.
;;; Basic equation is
;;;   U(i) <- r(i) + max_a f(sum_j M(a,i,j) U(j), N(a,i))
;;; where f is the exploration function.  Does not apply to terminal states.

(defun exploratory-value-iteration
    (mdp &optional (Uold (copy-hash-table (mdp-rewards mdp) #'identity))
         &key (epsilon 0.000001)
         &aux (Unew (copy-hash-table Uold #'identity))
              (max-delta infinity)
              (M (mdp-model mdp))
              (R (mdp-rewards mdp)))
  (do ()
      ((< max-delta epsilon) Unew)
    (setq max-delta 0)
    (rotatef Uold Unew)   ;;; switch contents; then we will overwrite Unew
    (maphash
     #'(lambda (s u)
         (unless (sink? s M)
           (setf (gethash s Unew)
                 (+ (gethash s R)
                    (if (gethash s M)
                        (apply #'max
                               (mapcar
                                #'(lambda (a)
                                    (if (member s (mdp-terminal-states mdp)
                                                :test #'equal)
                                        (gethash s R)
                                        (exploration-function
                                         (q-value a s Uold M R)
                                         (mdp-action-model-times-executed
                                          (action-model a s M)))))
                                (actions s M)))
                        0))))
         (setq max-delta (max max-delta
                              (abs (- (gethash s Unew) u)))))
     Uold)))

(defun exploration-choice (s U M R)
  (the-biggest-random-tie
   #'(lambda (a)
       (exploration-function
        (q-value a s U M R)
        (mdp-action-model-times-executed (action-model a s M))))
   (actions s M)))
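
;;; A minimal sketch (not part of the original agent) illustrating the
;;; exploration function f(u, n) defined above.  An action executed fewer
;;; than *Ne* times in a state is optimistically valued at *R+*, which
;;; drives the agent toward under-explored actions; once it has been tried
;;; *Ne* or more times, the learned utility estimate u is used unchanged.
;;; The numeric arguments below are made up for illustration only.
#|
(exploration-function 0.7 2)    ;; => 2    (n < *Ne*, so return the optimistic *R+*)
(exploration-function 0.7 10)   ;; => 0.7  (n >= *Ne*, so return the estimate u)
|#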