December | 2015 | CourseLog

#######################################
#   ch4.1 policy evaluation 
########################################
# example 4.1 - bug fixed on 2017.03.21
# defining deterministic trans. matrix
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)

# defining Reward & trans. prob.
R=-1;P=0.25;

# iterative policy evaluation
delta=1; gamma=1; tol=1e-8; V=rep(0,15);
while (delta>tol) {
 delta=0;
  for (i_state in 1:14) {
    v=V[i_state]
    V[i_state]=sum(P*(R+gamma*V[trM[i_state,]]))
    delta=max(delta,abs(v-V[i_state]));
  }
}

# result
> matrix(c(0,V),nrow=4)
     [,1] [,2] [,3] [,4]
[1,]    0  -14  -20  -22
[2,]  -14  -18  -20  -20
[3,]  -20  -20  -18  -14
[4,]  -22  -20  -14    0

#####################################
#   ch4.3 Policy Improvement
#   easier one 
#####################################
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)
R=-1;P=0.25;V=rep(0,15);
delta=1; gamma=1; tol=1e-10; 
bestP=sample(1:4,14,replace=T)
stable=F;counter=0;
while (stable==F){
  counter=counter+1
  # iterative policy evaluation
  while (delta>tol) {
    delta=0;
    for (i_state in 1:14) {
      v=V[i_state]
      V[i_state]=sum(P*(R+gamma*V[trM[i_state,]]))
      delta=max(delta,abs(v-V[i_state]))
    }
  }
  # policy improvement
  stable=F
  for (i_state in 1:14) {
    b=bestP[i_state]
    bestP[i_state]=which.max(V[trM[i_state,]])
    ifelse((bestP[i_state]==b),stable<-T,stable<-F)
  }
}

# with softmax
R=-1;V=rep(0,15);
bestP=sample(1:4,14,replace=T)
stable=F;counter=0
while (stable==F){
  counter=counter+1
  Vmat=matrix(V[trM],ncol=4)
  Ppolicy=t(apply(Vmat,1,function(x) exp(10*x)/sum(exp(10*x))))
# iterative policy evaluation
  while (delta>tol) {
    delta=0;
    for (i_state in 1:14) {
      v=V[i_state]
      V[i_state]=sum(Ppolicy[i_state]*(R+gamma*V[trM[i_state,]]))
      delta=max(delta,abs(v-V[i_state]))
    }
  }
# policy improvement
  stable=F
  for (i_state in 1:14) {
    b=bestP[i_state]
    bestP[i_state]=which.max(V[trM[i_state,]])
    ifelse((bestP[i_state]==b),stable<-T,stable<-F)
  }
}


#####################################
#   ch4.4 Value Iteration 
#####################################
# example 4.3
V=c(rep(0,100),1);V.hist=c()
p=c(0.4,0.6);
gamma=1;delta=1; tol=1e-20
max.a=rep(0,101)
while (delta>tol) {
  delta=0;
  for (i_state in 1:99) {
    v=V[i_state+1]
    temp=matrix(0,nrow=1,ncol=i_state)
    for (i_action in 1:i_state) {
       temp[i_action]=sum(p*(gamma*c(V[(min(i_state+i_action,100)+1)],
         V[(max(i_state-i_action,0)+1)])))
     }
    V[i_state+1]=max(temp)
    max.a[i_state+1]=which.max(round(temp,8))
    delta=max(delta,abs(v-V[i_state+1]))
  }
  V.hist=rbind(V.hist,V)
}
# plotting results
par(mfrow=c(1,2))
plot(V.hist[1,],type='l',lwd=2,xlab="Capital",ylab="Value Estimates",col='red')
lines(V.hist[2,],lwd=2,col='blue')
lines(V.hist[3,],lwd=2,col='green')
lines(V.hist[32,],lwd=2,col='black')
legend("topleft",c("sweep 1","sweep 2","sweep 3", "sweep 32"),
 col=c("red","blue","green","black"),lwd=2)
barplot(max.a,xlab="Capital",ylab="Final Policy",col="white")

chapter 3 - example 3.8 & 3.12 bug fixed on 2015.12.18 ##################################### # Ch3.7 Value function ##################################### # example 3.8: Gridworld # initializing value vector V=rep(0,25); # defining probability matrix P=matrix(1/4,nrow=25,ncol=4) # # defining deterministic transition matrix north=c(2:25,25) north[ c(5,10,15,20,25)]=c(5,10,15,20,25) east=c(6:25,21:25) west=c(1:5,1:20) south=c(1,1:24) south[ c(1,6,11,16,21)]=c(1,6,11,16,21) trM=cbind(north,east,south,west) trM[10,]=6 trM[20,]=18 # defining reward matrix R=matrix(0,nrow=25,ncol=4) R[which(trM==1:25)]=-1 R[10,]=10 R[20,]=5 # calculating state-values iteratively # fixed number of iterations nRep=1000; gamma=0.9; for (i_rep in 1:nRep) { V.old=V for (i_state in 1:25) { V[i_state]=sum(P[i_state,]*(R[i_state,]+gamma*V.old[trM[i_state,]])) } } # result > matrix(V,nrow=5)[5:1,] [,1] [,2] [,3] [,4] [,5] [1,] 3.30899634 8.7892919 4.4276192 5.3223676 1.4921788 [2,] 1.52158807 2.9923179 2.2501400 1.9075717 0.5474027 [3,] 0.05082249 0.7381706 0.6731133 0.3581862 -0.4031411 [4,] -0.97359230 -0.4354954 -0.3548823 -0.5856051 -1.1830751 [5,] -1.85770055 -1.3452313 -1.2292673 -1.4229181 -1.9751790 ##################################### # ch3.8 optimal value function ##################################### # example 3.12 V.star=V; # V calculated as in exp3.8 iter=0;maxItr=1000;delta=1;tol=1e-5; while(iter < maxItr & delta > tol) { delta=0;iter=iter+1 V.star.old=V.star for (i_state in 1:25) { v=V.star[i_state] V.star[i_state]=max(R[i_state,]+gamma*V.star.old[trM[i_state,]]) delta=max(delta,abs(v-V.star[i_state])) } } # result - ex3.12 > matrix(V.star,nrow=5)[5:1,] [,1] [,2] [,3] [,4] [,5] [1,] 21.97746 24.41940 21.97746 19.41941 17.47747 [2,] 19.77971 21.97746 19.77971 17.80174 16.02157 [3,] 17.80173 19.77971 17.80174 16.02157 14.41941 [4,] 16.02156 17.80173 16.02156 14.41940 12.97746 [5,] 14.41940 16.02156 14.41940 12.97746 11.67972

CourseLog

Dept Cognitive & Info Science

Monthly Archives: December 2015

RL Sutton & Barto Ch.4

RL Sutton & Barto Ch.3