;;; learning/agents/passive-td-learner.lisp
;;; Passive temporal-difference learning agent.
;;; After each transition, update the utility of the
;;; source state i to make it agree more closely with that
;;; of the destination state j.
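;;;
;;; On each nonterminal transition i -> j, the update applied is the
;;; standard passive TD rule
;;;     U(i) <- U(i) + alpha(N(i)) * (R(i) + U(j) - U(i)),
;;; where R(i) is the reward attached to state i and alpha is a
;;; learning rate that decays with N(i), the number of visits to i.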

(defvar *alpha* 1.0)     ;;; initial learning rate parameter

(defun make-passive-td-learner ()
  (let ((percepts nil)                        ;;; percepts of the current trial, most recent first
        (U (make-hash-table :test #'equal))   ;;; utility estimate for each state
        (N (make-hash-table :test #'equal)))  ;;; number of visits to each state
    #'(lambda (e)
        (push e percepts)
        (let ((s (mdp-percept-state e)))
          (unless (gethash s N)  ;;; make entries for new state
            (setf (gethash s N) 0
                  (gethash s U) 0))
          (incf (gethash s N))
          (td-update U e percepts N)
          (when (mdp-percept-terminalp e)  ;;; trial over; start the next one afresh
            (setq percepts nil)))
        'no-op)))  ;;; a passive learner takes no action
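
;;; A minimal usage sketch, commented out since it is not part of the
;;; agent.  It assumes mdp-percept is a structure with state, reward,
;;; and terminalp slots, as the accessor names used in this file
;;; suggest; in the full system the percepts would come from
;;; simulating an MDP environment.
#|
(let ((agent (make-passive-td-learner)))
  (dolist (p (list (make-mdp-percept :state '(1 1) :reward -0.04 :terminalp nil)
                   (make-mdp-percept :state '(2 1) :reward -0.04 :terminalp nil)
                   (make-mdp-percept :state '(3 1) :reward  1.00 :terminalp t)))
    (funcall agent p)))  ; each call performs one TD update
|#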

(defun td-update (U e percepts N &aux (terminalp (mdp-percept-terminalp e))
                                      (j (mdp-percept-state e))
                                      (r (mdp-percept-reward e)))
  (cond (terminalp
         ;;; fold the terminal reward directly into U[j] as a running average
         (setf (gethash j U)
               (running-average (gethash j U) r (gethash j N))))
        ((length>1 percepts)
         ;;; nudge U[i] toward R(i) + U[j]; note that both the reward
         ;;; and the learning-rate count belong to the source state i,
         ;;; since i is the state whose utility is being updated
         (let* ((e2 (second percepts))
                (i (mdp-percept-state e2))
                (ri (mdp-percept-reward e2)))
           (incf (gethash i U)
                 (* (current-alpha (gethash i N))
                    (+ ri (- (gethash j U) (gethash i U)))))))))
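
;;; The two helpers used above normally come from the shared AIMA
;;; utilities.  Sketch definitions are reproduced here, as assumptions
;;; about their behavior, so this file can be read on its own.

(defun running-average (avg new n)
  "Return the average of n samples, given AVG, the average of the
  first n-1 samples, and the n-th sample NEW."
  (/ (+ new (* avg (- n 1))) n))

(defun length>1 (list)
  "Is this a list of 2 or more elements?"
  (and (consp list) (cdr list)))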

(defun current-alpha (n)
  ;;; decaying learning rate: equals *alpha* on a state's first visit
  ;;; and falls off as O(1/n) with the number of visits n
  (/ (* 60 *alpha*)
     (+ 59 n)))
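
;;; For example, with *alpha* = 1.0: alpha(1) = 60/60 = 1.0,
;;; alpha(60) = 60/119 ~ 0.50, and alpha(541) = 60/600 = 0.10.
;;; These rates sum to infinity while their squares do not, the usual
;;; conditions for the utility estimates to converge.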