RL Sutton & Barto Ch.4

#######################################
#   ch4.1 policy evaluation 
########################################
# example 4.1 - bug fixed on 2017.03.21
# defining deterministic trans. matrix
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)

# defining Reward & trans. prob.
R=-1;P=0.25;

# iterative policy evaluation
delta=1; gamma=1; tol=1e-8; V=rep(0,15);
while (delta>tol) {
 delta=0;
  for (i_state in 1:14) {
    v=V[i_state]
    V[i_state]=sum(P*(R+gamma*V[trM[i_state,]]))
    delta=max(delta,abs(v-V[i_state]));
  }
}

# result
> matrix(c(0,V),nrow=4)
     [,1] [,2] [,3] [,4]
[1,]    0  -14  -20  -22
[2,]  -14  -18  -20  -20
[3,]  -20  -20  -18  -14
[4,]  -22  -20  -14    0

#####################################
#   ch4.3 Policy Improvement
#   easier one 
#####################################
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)
R=-1;P=0.25;V=rep(0,15);
delta=1; gamma=1; tol=1e-10; 
bestP=sample(1:4,14,replace=T)
stable=F;counter=0;
while (stable==F){
  counter=counter+1
  # iterative policy evaluation
  while (delta>tol) {
    delta=0;
    for (i_state in 1:14) {
      v=V[i_state]
      V[i_state]=sum(P*(R+gamma*V[trM[i_state,]]))
      delta=max(delta,abs(v-V[i_state]))
    }
  }
  # policy improvement
  stable=F
  for (i_state in 1:14) {
    b=bestP[i_state]
    bestP[i_state]=which.max(V[trM[i_state,]])
    ifelse((bestP[i_state]==b),stable<-T,stable<-F)
  }
}

# with softmax
R=-1;V=rep(0,15);
bestP=sample(1:4,14,replace=T)
stable=F;counter=0
while (stable==F){
  counter=counter+1
  Vmat=matrix(V[trM],ncol=4)
  Ppolicy=t(apply(Vmat,1,function(x) exp(10*x)/sum(exp(10*x))))
# iterative policy evaluation
  while (delta>tol) {
    delta=0;
    for (i_state in 1:14) {
      v=V[i_state]
      V[i_state]=sum(Ppolicy[i_state]*(R+gamma*V[trM[i_state,]]))
      delta=max(delta,abs(v-V[i_state]))
    }
  }
# policy improvement
  stable=F
  for (i_state in 1:14) {
    b=bestP[i_state]
    bestP[i_state]=which.max(V[trM[i_state,]])
    ifelse((bestP[i_state]==b),stable<-T,stable<-F)
  }
}


#####################################
#   ch4.4 Value Iteration 
#####################################
# example 4.3
V=c(rep(0,100),1);V.hist=c()
p=c(0.4,0.6);
gamma=1;delta=1; tol=1e-20
max.a=rep(0,101)
while (delta>tol) {
  delta=0;
  for (i_state in 1:99) {
    v=V[i_state+1]
    temp=matrix(0,nrow=1,ncol=i_state)
    for (i_action in 1:i_state) {
       temp[i_action]=sum(p*(gamma*c(V[(min(i_state+i_action,100)+1)],
         V[(max(i_state-i_action,0)+1)])))
     }
    V[i_state+1]=max(temp)
    max.a[i_state+1]=which.max(round(temp,8))
    delta=max(delta,abs(v-V[i_state+1]))
  }
  V.hist=rbind(V.hist,V)
}
# plotting results
par(mfrow=c(1,2))
plot(V.hist[1,],type='l',lwd=2,xlab="Capital",ylab="Value Estimates",col='red')
lines(V.hist[2,],lwd=2,col='blue')
lines(V.hist[3,],lwd=2,col='green')
lines(V.hist[32,],lwd=2,col='black')
legend("topleft",c("sweep 1","sweep 2","sweep 3", "sweep 32"),
 col=c("red","blue","green","black"),lwd=2)
barplot(max.a,xlab="Capital",ylab="Final Policy",col="white")

sutton4-6

RL Sutton & Barto Ch.3

chapter 3 - example 3.8 & 3.12
bug fixed on 2015.12.18
#####################################
#     Ch3.7 Value function
#####################################
# example 3.8: Gridworld
# initializing value vector
V=rep(0,25);                

# defining probability matrix
P=matrix(1/4,nrow=25,ncol=4) # 

# defining deterministic transition matrix
north=c(2:25,25)
north[ c(5,10,15,20,25)]=c(5,10,15,20,25)
east=c(6:25,21:25)
west=c(1:5,1:20)
south=c(1,1:24)
south[ c(1,6,11,16,21)]=c(1,6,11,16,21)
trM=cbind(north,east,south,west)
trM[10,]=6
trM[20,]=18

# defining reward matrix
R=matrix(0,nrow=25,ncol=4)
R[which(trM==1:25)]=-1
R[10,]=10
R[20,]=5

# calculating state-values iteratively
# fixed number of iterations 
nRep=1000; gamma=0.9;
for (i_rep in 1:nRep) {
  V.old=V
  for (i_state in 1:25) {
    V[i_state]=sum(P[i_state,]*(R[i_state,]+gamma*V.old[trM[i_state,]]))
  }
}

# result
> matrix(V,nrow=5)[5:1,]
            [,1]       [,2]       [,3]       [,4]       [,5]
[1,]  3.30899634  8.7892919  4.4276192  5.3223676  1.4921788
[2,]  1.52158807  2.9923179  2.2501400  1.9075717  0.5474027
[3,]  0.05082249  0.7381706  0.6731133  0.3581862 -0.4031411
[4,] -0.97359230 -0.4354954 -0.3548823 -0.5856051 -1.1830751
[5,] -1.85770055 -1.3452313 -1.2292673 -1.4229181 -1.9751790

#####################################
#  ch3.8 optimal value function
#####################################
# example 3.12
V.star=V; # V calculated as in exp3.8
iter=0;maxItr=1000;delta=1;tol=1e-5;
while(iter < maxItr & delta > tol) {
  delta=0;iter=iter+1
  V.star.old=V.star
  for (i_state in 1:25) {
    v=V.star[i_state]
    V.star[i_state]=max(R[i_state,]+gamma*V.star.old[trM[i_state,]])
    delta=max(delta,abs(v-V.star[i_state]))
  }
}
# result - ex3.12
> matrix(V.star,nrow=5)[5:1,]
         [,1]     [,2]     [,3]     [,4]     [,5]
[1,] 21.97746 24.41940 21.97746 19.41941 17.47747
[2,] 19.77971 21.97746 19.77971 17.80174 16.02157
[3,] 17.80173 19.77971 17.80174 16.02157 14.41941
[4,] 16.02156 17.80173 16.02156 14.41940 12.97746
[5,] 14.41940 16.02156 14.41940 12.97746 11.67972