%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% BEGIN
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Reinforcement Learning.
% 7/11/01 JV Stone, Psychology, Sheffield University. j.v.stone@shef.ac.uk
%
% This matlab code implements an example given in Sutton's 1988 paper on
% temporal difference learning and
% in his recent book. The example consists of a Markov chain of 5 nodes plus
% two absorbing states (A and G):
%
% A B C D E F G
%
% with corresponding stateid numbers
%
% 0 1 2 3 4 5 6
%
% The 'reward' for states A and G are 0 and 1, resp.
% The transition probabilities P of moving to the right (ie p(1)) are:
%
% P = [] 1/2 1/2 1/2 1/2 1/2 []
%
% The probability p of ending in state G from each state is:
%
% T = [] 1/6 2/6 3/6 4/6 5/6 []
%
% Thus p(G|B)=1/6.
% The problem consists of learning the transition probabilities T.
%
% T is learned over a series of random walks (nwalks).
% T is estimated as a weight vector w.
% The problem of temporal credit assignment arises here because
% the reward is only given at the end (state A or G) of each walk,
% so that the final reward associated with moving into each state must be
% estimated long after that state has been left.
% Typical output for parameter values in this file:
%
% Initial w: w =
% 0.6507 0.8875 0.5692 0.8798 0.6063
% Final w: w =
% 0.1544 0.3235 0.4915 0.6373 0.8246
% Required solution: T =
% 0.1667 0.3333 0.5000 0.6667 0.8333
% Rms error = 0.035
clear all;
nstates = 5; % B to F
T = [1:nstates]/(nstates+1);
P = ones(nstates,1)/2;
states = diag(ones(nstates,1)); % each col vec is a state
w = rand(1,nstates);
dw = zeros(size(w)); % change in w.
elast = zeros(size(w)); % error at t-1 (sum in eq 4).
lambda = 0.3; % exponential decay - 0.3 better than 1
alpha = 0.05; % learning rate
nwalks = 1000; % number of random walks
intervalw = 10; % number of walks between weight update.
% Anneal value of learning rate.
a=ones(nwalks,1)*0.999;b=[1:nwalks]';
alphas=(a.^b)/10; figure(2); plot(alphas); title('Learning rate');
% Dont anneal
alphas = ones(nwalks,1)*alpha;
fprintf('Initial w: '); w
for i=1:nwalks
dw = dw*0;
walking = 1;
alpha = alphas(i);
% Init params for walk.
stateid = 3;
xt = states(:,stateid);
Pt = w*xt;
elast = 0;
while walking
% Move left or right
% next_stateid=get_next_stateid(P,stateid);
pright = P(stateid);
if rand < pright % move right
next_stateid = stateid+1;
else % move left
next_stateid = stateid-1;
end;
% Find Pnext
if next_stateid==0
Pnext = 0;
walking = 0;
elseif next_stateid==nstates+1
Pnext = 1;
walking = 0;
else
xnext = states(:,next_stateid);
Pnext = w*xnext;
end;
% Find change in w.
et = lambda*elast + xt'; % error at time t
dP = Pnext-Pt; % Change in predicted final reward made at time t+1.
dw = dw + dP*et;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DEBUG
db=0;
if db
[stateid next_stateid Pnext Pt dP ]
dP*elast
pr;
end;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Update variables.
Pt = Pnext;
elast = et;
xt = xnext;
stateid = next_stateid;
end; % walking
% Update w
if rem(i,intervalw)==0
w = w + alpha*dw;
end;
end; % nwalks
fprintf('Final w: '); w
fprintf('Required solution: '); T
fprintf('Rms error = %.3f\n', norm(w-T));
f(1);
xx=1:nstates;plot(xx,T,'k',xx,w,'r');
title('RL: Optimal (black), estimated (red)');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% END
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%