;; -*- Mode: Lisp; Syntax: Common-Lisp; -*-

;;;; Definitions for Markov Decision Problems and Reinforcement Learning

(defstructure (mdp-environment (:include environment))
  "An MDP-environment is driven by an MDP (Markov Decision Process),
  which (probabilistically) says what state to transition to for each action."
  ;;; To make an MDP into an environment, we basically just keep track of the
  ;;; current state, and then ask the MDP model to determine the new state.
  ;;; This makes sense for the case of a single agent in the environment.
  (mdp (make-mdp))
  (epochs-left 1))
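
;;; A minimal usage sketch (hedged: MAKE-MDP-AGENT is the constructor
;;; implied by the MDP-AGENT structure referenced below, RUN-ENVIRONMENT
;;; is the generic simulation driver defined elsewhere in this codebase,
;;; and a realistic call would pass a concrete MDP rather than the
;;; empty default):
;;;   (run-environment
;;;    (make-mdp-environment :mdp (make-mdp) :epochs-left 5
;;;                          :agents (list (make-mdp-agent))))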

(defstruct (mdp-percept (:type list))
  "A percept gives the current state, the reward received, and whether it is
  a terminal state."
  state reward terminalp)
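
;;; Because of (:TYPE LIST), a percept is just an ordinary list.  For
;;; example (the state and reward values here are illustrative only):
;;;   (make-mdp-percept :state '(1 1) :reward -0.04 :terminalp nil)
;;;   => ((1 1) -0.04 NIL)
;;;   (mdp-percept-reward '((1 1) -0.04 NIL))  =>  -0.04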

;;;; Generic Functions for MDP-Environments

(defmethod initialize ((env mdp-environment))
  ;; Set the initial state, and make sure there is one agent.
  (setf (environment-state env) (mdp-initial-state (mdp-environment-mdp env)))
  (call-next-method)
  (assert (= 1 (length (environment-agents env)))))

(defmethod get-percept ((env mdp-environment) agent)
  "The percept is the current state, the reward, and whether this is terminal."
  (declare (ignore agent))
  (let* ((mdp (mdp-environment-mdp env))
	 (state (environment-state env))
	 (state-key (funcall (mdp-hash-key mdp) state)))
    (make-mdp-percept
     :state state
     :reward (gethash state-key (mdp-rewards mdp))
     :terminalp (not (null 
		      (member state (mdp-terminal-states mdp) 
			      :test #'equal))))))

(defmethod update-fn ((env mdp-environment))
  "We update by transitioning to a new state.  When we hit a terminal state,
  we restart in the initial state (until there are no more epochs left)."
  (let ((mdp (mdp-environment-mdp env))
	(agent (first (environment-agents env))))
    (incf (mdp-agent-total-reward agent) 
	  (mdp-percept-reward (agent-percept agent)))
    (cond ((member (environment-state env) (mdp-terminal-states mdp) 
		   :test #'equal)
	   ;; Start over when we reach a terminal state
	   (decf (mdp-environment-epochs-left env))
	   (setf (environment-state env) (mdp-initial-state mdp)))
	  (t (setf (environment-state env)
		   (mdp-next-state (agent-action agent)
				   (environment-state env)
				   mdp))))))
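
;;; The per-step cycle, spelled out as a hand-driven sketch (MY-ENV and
;;; MY-AGENT are hypothetical; the real loop lives in the generic
;;; environment simulator):
;;;   (setf (agent-percept my-agent) (get-percept my-env my-agent))
;;;   (setf (agent-action my-agent)         ; agent program picks a move
;;;         (funcall (agent-program my-agent) (agent-percept my-agent)))
;;;   (update-fn my-env)                    ; bank reward; move or restart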

(defmethod performance-measure ((env mdp-environment) agent)
  "Return a number saying how well this agent is doing."
  ;; Here it is the total reward the agent has received so far.
  (mdp-agent-total-reward agent))

(defmethod termination? ((env mdp-environment))
  "The environment terminates when all the epochs have been used up."
  (= 0 (mdp-environment-epochs-left env)))

;;;; Utility Functions

(defun mdp-next-state (action state mdp)
  "Sample a successor state for doing ACTION in STATE, according to the
  MDP's probabilistic transition model."
  (let ((state-key (funcall (mdp-hash-key mdp) state)))
    (random-transition
     (mdp-transitions action (gethash state-key (mdp-model mdp))))))

(defun mdp-transitions (action state-model)
  "Look up the list of transitions for ACTION in STATE-MODEL, an alist
  of (action . action-model) entries for a single state."
  (mdp-action-model-transitions
   (cdr (assoc action state-model :test #'equal))))
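
;;; A hedged sketch of one state's entry in (MDP-MODEL MDP), using the
;;; default DEFSTRUCT constructors implied by the accessors above (the
;;; real structure definitions live elsewhere):
;;;   (list (cons 'up (make-mdp-action-model
;;;                    :transitions
;;;                    (list (make-transition :probability 0.8
;;;                                           :destination '(1 2))
;;;                          (make-transition :probability 0.2
;;;                                           :destination '(1 1))))))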
                       
(defun random-transition (transitions &aux (r (random 1.0)))
  "Pick a destination at random, weighting each transition by its probability."
  (dolist (transition transitions
           ;; Round-off guard: if the probabilities sum to slightly less
           ;; than 1.0 and R survives the whole loop, take the last one.
           (transition-destination (first (last transitions))))
    (decf r (transition-probability transition))
    (unless (plusp r) (return (transition-destination transition)))))
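
;;; Worked example: with transitions of probability 0.8 -> A, 0.1 -> B,
;;; and 0.1 -> C, a draw of R = 0.85 becomes 0.05 after A and -0.05
;;; after B, so B is returned.  Each destination is thus selected with
;;; exactly its transition probability (inverse-CDF sampling).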