# RL Sutton & Barto Ch.3

```chapter 3 - example 3.8 & 3.12
bug fixed on 2015.12.18
#####################################
#     Ch3.7 Value function
#####################################
# example 3.8: Gridworld
# initializing value vector
V=rep(0,25);

# defining probability matrix
P=matrix(1/4,nrow=25,ncol=4) #

# defining deterministic transition matrix
north=c(2:25,25)
north[ c(5,10,15,20,25)]=c(5,10,15,20,25)
east=c(6:25,21:25)
west=c(1:5,1:20)
south=c(1,1:24)
south[ c(1,6,11,16,21)]=c(1,6,11,16,21)
trM=cbind(north,east,south,west)
trM[10,]=6
trM[20,]=18

# defining reward matrix
R=matrix(0,nrow=25,ncol=4)
R[which(trM==1:25)]=-1
R[10,]=10
R[20,]=5

# calculating state-values iteratively
# fixed number of iterations
nRep=1000; gamma=0.9;
for (i_rep in 1:nRep) {
V.old=V
for (i_state in 1:25) {
V[i_state]=sum(P[i_state,]*(R[i_state,]+gamma*V.old[trM[i_state,]]))
}
}

# result
> matrix(V,nrow=5)[5:1,]
[,1]       [,2]       [,3]       [,4]       [,5]
[1,]  3.30899634  8.7892919  4.4276192  5.3223676  1.4921788
[2,]  1.52158807  2.9923179  2.2501400  1.9075717  0.5474027
[3,]  0.05082249  0.7381706  0.6731133  0.3581862 -0.4031411
[4,] -0.97359230 -0.4354954 -0.3548823 -0.5856051 -1.1830751
[5,] -1.85770055 -1.3452313 -1.2292673 -1.4229181 -1.9751790

#####################################
#  ch3.8 optimal value function
#####################################
# example 3.12
V.star=V; # V calculated as in exp3.8
iter=0;maxItr=1000;delta=1;tol=1e-5;
while(iter < maxItr & delta > tol) {
delta=0;iter=iter+1
V.star.old=V.star
for (i_state in 1:25) {
v=V.star[i_state]
V.star[i_state]=max(R[i_state,]+gamma*V.star.old[trM[i_state,]])
delta=max(delta,abs(v-V.star[i_state]))
}
}
# result - ex3.12
> matrix(V.star,nrow=5)[5:1,]
[,1]     [,2]     [,3]     [,4]     [,5]
[1,] 21.97746 24.41940 21.97746 19.41941 17.47747
[2,] 19.77971 21.97746 19.77971 17.80174 16.02157
[3,] 17.80173 19.77971 17.80174 16.02157 14.41941
[4,] 16.02156 17.80173 16.02156 14.41940 12.97746
[5,] 14.41940 16.02156 14.41940 12.97746 11.67972
```