# 認知情報解析演習　強化学習２

```# example 3.8: Gridworld
# initializing value vector
V=rep(0,25);

# defining probability matrix
P=matrix(1/4,nrow=25,ncol=4) #

# defining deterministic transition matrix
north=c(2:25,25)
north[ c(5,10,15,20,25)]=c(5,10,15,20,25)
east=c(6:25,21:25)
west=c(1:5,1:20)
south=c(1,1:24)
south[ c(1,6,11,16,21)]=c(1,6,11,16,21)
trM=cbind(north,east,south,west)
trM[10,]=6
trM[20,]=18

# defining reward matrix
R=matrix(0,nrow=25,ncol=4)
R[which(trM==1:25)]=-1
R[10,]=10
R[20,]=5

# calculating state-values iteratively
# fixed number of iterations
nRep=1000; gamma=0.9;
for (i_rep in 1:nRep) {
V.old=V
for (i_state in 1:25) {
V[i_state]=sum(P[i_state,]*(R[i_state,]+gamma*V.old[trM[i_state,]]))
}
}

### example 2
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)
###

V.star=V; # V calculated as in exp3.8
iter=0;maxItr=1000;delta=1;tol=1e-5;
while(iter < maxItr & delta > tol) {
delta=0;iter=iter+1
V.star.old=V.star
for (i_state in 1:25) {
v=V.star[i_state]
V.star[i_state]=max(R[i_state,]+gamma*V.star.old[trM[i_state,]])
delta=max(delta,abs(v-V.star[i_state]))
}
}

#######################################
#   ch4.1 policy evaluation
########################################
# example 4.1 - bug fixed on 2017.03.21
# defining deterministic trans. matrix
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)

# defining Reward & trans. prob.
R=-1;P=0.25;

# iterative policy evaluation
delta=1; gamma=1; tol=1e-8; V=rep(0,15);
while (delta>tol) {
delta=0;
for (i_state in 1:14) {
v=V[i_state]
V[i_state]=sum(P*(R+gamma*V[trM[i_state,]]))
delta=max(delta,abs(v-V[i_state]));
}
}

#####################################
#   ch4.3 Policy Improvement
#   easier one
#####################################
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)
R=-1;V=rep(0,15);
delta=1; gamma=1; tol=1e-5;
bestP=sample(1:4,14,replace=T)
stable=F;counter=0
while (stable==F){
counter=counter+1
Vmat=matrix(V[trM],ncol=4)
Ppolicy=t(apply(Vmat,1,function(x) exp(10*x)/sum(exp(10*x))))
# iterative policy evaluation
while (delta>tol) {
delta=0;
for (i_state in 1:14) {
v=V[i_state]
V[i_state]=sum(Ppolicy[i_state]*(R+gamma*V[trM[i_state,]]))
delta=max(delta,abs(v-V[i_state]))
}
}
# policy improvement
stable=F
for (i_state in 1:14) {
b=bestP[i_state]
bestP[i_state]=which.max(V[trM[i_state,]])
ifelse((bestP[i_state]==b),stable<-T,stable<-F)
}
}

#####################################
#   ch4.4 Value Iteration
#####################################
# example 4.3
V=c(rep(0,100),1);V.hist=c()
p=c(0.4,0.6);
gamma=1;delta=1; tol=1e-20
max.a=rep(0,101)
while (delta>tol) {
delta=0;
for (i_state in 1:99) {
v=V[i_state+1]
temp=matrix(0,nrow=1,ncol=i_state)
for (i_action in 1:i_state) {
temp[i_action]=sum(p*(gamma*c(V[(min(i_state+i_action,100)+1)],
V[(max(i_state-i_action,0)+1)])))
}
V[i_state+1]=max(temp)
max.a[i_state+1]=which.max(round(temp,8))
delta=max(delta,abs(v-V[i_state+1]))
}
V.hist=rbind(V.hist,V)
}
# plotting results
par(mfrow=c(1,2))
plot(V.hist[1,],type='l',lwd=2,xlab="Capital",ylab="Value Estimates",col='red')
lines(V.hist[2,],lwd=2,col='blue')
lines(V.hist[3,],lwd=2,col='green')
lines(V.hist[32,],lwd=2,col='black')
legend("topleft",c("sweep 1","sweep 2","sweep 3", "sweep 32"),
col=c("red","blue","green","black"),lwd=2)
barplot(max.a,xlab="Capital",ylab="Final Policy",col="white")

```