V=rep(0,25);
# defining probability matrix
P=matrix(1/4,nrow=25,ncol=4) #
# defining deterministic transition matrix
north=c(2:25,25)
north[ c(5,10,15,20,25)]=c(5,10,15,20,25)
east=c(6:25,21:25)
west=c(1:5,1:20)
south=c(1,1:24)
south[ c(1,6,11,16,21)]=c(1,6,11,16,21)
trM=cbind(north,east,south,west)
trM[10,]=6
trM[20,]=18
# defining reward matrix
R=matrix(0,nrow=25,ncol=4)
R[which(trM==1:25)]=-1
R[10,]=10
R[20,]=5
delta=1; gamma=0.9; tol=1e-10;
bestP=sample(1:4,25,replace=T)
stable=F;counter=0;
while (stable==F){
counter=counter+1
# iterative policy evaluation
while (delta>tol) {
delta=0;
V.old=V
for (i_state in 1:25) {
v=V[i_state]
V[i_state]=sum(P[i_state,]*(R[i_state,]+gamma*V.old[trM[i_state,]]))
delta=max(delta,abs(v-V[i_state]))
}
}
# policy improvement
stable=F
for (i_state in 1:25) {
b=bestP[i_state]
bestP[i_state]=which.max(V[trM[i_state,]])
ifelse((bestP[i_state]==b),stable<-T,stable<-F)
}
}
apply(matrix(bestP,nrow=5),2,rev)
bestP.mat = apply(matrix(as.character(bestP),nrow=5),2,rev)
bestP.mat[which(bestP.mat=="1")] = "N"
bestP.mat[which(bestP.mat=="2")] = "E"
bestP.mat[which(bestP.mat=="3")] = "S"
bestP.mat[which(bestP.mat=="4")] = "W"
print(bestP.mat)
Related