dynamic programming 実装例

V=c(rep(0,100),1);V.hist=c()
p=c(0.4,0.6);
gamma=1;delta=1; tol=1e-20
max.a=rep(0,101)
while (delta>tol) {
  delta=0;
  for (i_state in 1:99) {
    v=V[i_state+1]
    temp=matrix(0,nrow=1,ncol=i_state)
    for (i_action in 1:i_state) {
       temp[i_action]=sum(p*(gamma*c(V[(min(i_state+i_action,100)+1)],
         V[(max(i_state-i_action,0)+1)])))
     }
    V[i_state+1]=max(temp)
    max.a[i_state+1]=which.max(round(temp,8))
    delta=max(delta,abs(v-V[i_state+1]))
  }
  V.hist=rbind(V.hist,V)
}
# plotting results
par(mfrow=c(1,2))
plot(V.hist[1,],type='l',lwd=2,xlab="Capital",ylab="Value Estimates",col='red')
lines(V.hist[2,],lwd=2,col='blue')
lines(V.hist[3,],lwd=2,col='green')
lines(V.hist[32,],lwd=2,col='black')
legend("topleft",c("sweep 1","sweep 2","sweep 3", "sweep 32"),
 col=c("red","blue","green","black"),lwd=2)
barplot(max.a,xlab="Capital",ylab="Final Policy",col="white")