;;; learning/agents/active-adp-learner.lisp
;;; Reinforcement learning agent that uses dynamic
;;; programming to solve the Markov decision process
;;; that it learns from its experience. Thus, the
;;; main job is to update the model over time.

(defun make-random-adp-learner (actions)
  (make-active-adp-learner actions #'random-choice))
(defun make-maximizing-adp-learner (actions)
  (make-active-adp-learner actions #'max-choice))

(defun make-active-adp-learner (actions choice-function)
  (let* ((percepts nil)
	 (last-action nil)
	 (U (make-hash-table :test #'equal))
	 (N (make-hash-table :test #'equal))
	 (M (make-hash-table :test #'equal))
	 (R (make-hash-table :test #'equal))
	 (mdp (make-mdp :model M :rewards R)))
    #'(lambda (e)
	(push e percepts)
	(let ((s (mdp-percept-state e)))
	  (unless (gethash s N)               ;;; make entries for new state
		  (setf (gethash s N) 0
			(gethash s U) 0
			(gethash s M) (mapcar 
				       #'(lambda (a)
					   (cons a (make-mdp-action-model)))
				       actions)
			(gethash s R) (mdp-percept-reward e)))
	  (incf (gethash s N))
	  (update-active-model mdp percepts last-action)
	  (when (mdp-terminal-states mdp)     ;;; make sure DP alg. terminates
	    (setq U (value-iteration mdp U)))
	  (when (mdp-percept-terminalp e)
		(setq percepts nil))
	  (setq last-action (funcall choice-function s U M R))))))

;;; Update current model to reflect the evidence from the most recent action

(defun update-active-model (mdp        ;;; current description of envt.
			    percepts   ;;; in reverse chronological order
			    action     ;;; last action taken
			    &aux (M (mdp-model mdp)) transition)
  (when (length>1 percepts)
    (let* ((e (first percepts)) (j (mdp-percept-state (first percepts)))
	   (e2 (second percepts)) (i (mdp-percept-state e2))
	   (action-model (action-model action i M))
	   (transitions (mdp-action-model-transitions action-model)))
      (when (mdp-percept-terminalp e)
	(unless (member j (mdp-terminal-states mdp) :test #'equal)
	  (push j (mdp-terminal-states mdp))))
      (incf (mdp-action-model-times-executed action-model))
      (unless (setq transition 
		    (find j transitions :test #'equal 
			  :key #'transition-destination))
        (push (setq transition (make-transition :destination j))
	      (mdp-action-model-transitions action-model)))
      (incf (transition-times-achieved transition))
      (dolist (trans (mdp-action-model-transitions action-model))
	(setf (transition-probability trans) 
	      (float (/ (transition-times-achieved trans)
			(mdp-action-model-times-executed action-model))))))))