認知情報解析学演習a 課題03

V=rep(0,25);                

# defining probability matrix
P=matrix(1/4,nrow=25,ncol=4) # 

# defining deterministic transition matrix
north=c(2:25,25)
north[ c(5,10,15,20,25)]=c(5,10,15,20,25)
east=c(6:25,21:25)
west=c(1:5,1:20)
south=c(1,1:24)
south[ c(1,6,11,16,21)]=c(1,6,11,16,21)
trM=cbind(north,east,south,west)
trM[10,]=6
trM[20,]=18

# defining reward matrix
R=matrix(0,nrow=25,ncol=4)
R[which(trM==1:25)]=-1
R[10,]=10
R[20,]=5

delta=1; gamma=0.9; tol=1e-10; 
bestP=sample(1:4,25,replace=T)
stable=F;counter=0;
while (stable==F){
  counter=counter+1
  # iterative policy evaluation
  while (delta>tol) {
    delta=0;
    V.old=V
    for (i_state in 1:25) {
      v=V[i_state]
      V[i_state]=sum(P[i_state,]*(R[i_state,]+gamma*V.old[trM[i_state,]]))
      delta=max(delta,abs(v-V[i_state]))
    }
  }
  # policy improvement
  stable=F
  for (i_state in 1:25) {
    b=bestP[i_state]
    bestP[i_state]=which.max(V[trM[i_state,]])
    ifelse((bestP[i_state]==b),stable<-T,stable<-F)
  }
}

apply(matrix(bestP,nrow=5),2,rev)
bestP.mat = apply(matrix(as.character(bestP),nrow=5),2,rev)
bestP.mat[which(bestP.mat=="1")] = "N"
bestP.mat[which(bestP.mat=="2")] = "E"
bestP.mat[which(bestP.mat=="3")] = "S"
bestP.mat[which(bestP.mat=="4")] = "W"
print(bestP.mat)