#################################################
# ch6.3 Temporal Difference
# TD0 & constant step-size MC
#################################################

# TD(0) on the 5-state random walk (states A-E, terminals at both ends)
TD0.ex1<-function(maxItr,alpha,gamma) {
  V=c(0,rep(0.5,5),0)                  # V[1] & V[7] are the terminal states
  V.hist=matrix(0,nrow=maxItr+1,ncol=5)
  V.hist[1,]=V[2:6]
  P.act=matrix(0.5,ncol=7,nrow=2)      # random policy: 50/50 left/right
  for (i_rep in 1:maxItr) {
    state=4                            # every episode starts at the center state, C
    while (state!=1 & state!=7) {
      action=sample(c(-1,1),1,prob=P.act[,state])
      state.old=state
      state=state+action
      r=ifelse(state==7,1,0)           # reward 1 only at the right terminal
      # TD(0) update: V(s) <- V(s) + alpha*(r + gamma*V(s') - V(s))
      V[state.old]=V[state.old]+alpha*(r+gamma*V[state]-V[state.old])
    }
    V.hist[(i_rep+1),]=V[2:6]
  }
  return(V.hist)
}

# (re)creating Fig 6.6
true.V=1:5*(1/6)
res=TD0.ex1(1000,0.1,1)
plot(true.V,type='o',pch=15,ylim=c(0,1),ylab="Value",xaxt="n",
     xlab="State",xlim=c(0.5,5.5),cex=2,lwd=2)
axis(1,at=1:5,labels=c("A","B","C","D","E"))
cols=c('red','blue','green','cyan','magenta')
ns=c(1,2,11,101,1001)                  # estimates after 0, 1, 10, 100, 1000 episodes
for (i_lines in 1:5) {
  lines(res[ns[i_lines],],type='o',pch=15+i_lines,cex=2,lwd=2,col=cols[i_lines])
}
legend('topleft',c('True value','t=0','t=1','t=10','t=100','t=1000'),
       col=c('black',cols),pch=15:20,lwd=1.5)
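# Side check (a minimal sketch, not in the original example): with gamma=1 and
# the 50/50 random policy, the true values solve the linear Bellman system
# V = r + P V over the nonterminal states A-E, so they can be computed directly.
# P, r, and V.true below are names introduced here for illustration.
P=matrix(0,5,5)                        # transition probs among nonterminal states
for (s in 1:5) {
  if (s>1) P[s,s-1]=0.5                # step left
  if (s<5) P[s,s+1]=0.5                # step right
}
r=c(rep(0,4),0.5)                      # expected reward: 0.5*1 when leaving E rightward
V.true=solve(diag(5)-P,r)              # (I - P) V = r
print(V.true)                          # 1/6, 2/6, ..., 5/6 -- matches true.V above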
# constant step-size Monte Carlo on the same random walk
constMC.ex1<-function(maxItr,alpha) {
  V=c(0,rep(0.5,5),0)
  V.hist=matrix(0,nrow=maxItr+1,ncol=5)
  V.hist[1,]=V[2:6]
  P.act=matrix(0.5,ncol=7,nrow=2)      # random policy: 50/50 left/right
  for (i_rep in 1:maxItr) {
    state=4                            # every episode starts at the center state, C
    state.hist=state
    while (state!=1 & state!=7) {
      action=sample(c(-1,1),1,prob=P.act[,state])
      state=state+action
      state.hist=c(state.hist,state)
    }
    R=ifelse(state==7,1,0)             # return of the whole episode
    n.state=length(state.hist)
    # every-visit MC update toward the episode return:
    # V(s) <- V(s) + alpha*(R - V(s))
    for (i_state in 1:(n.state-1)) {
      V[state.hist[i_state]]=V[state.hist[i_state]]+
        alpha*(R-V[state.hist[i_state]])
    }
    V.hist[(i_rep+1),]=V[2:6]
  }
  return(V.hist)
}

# (re)creating Fig 6.7: RMS error, averaged over states and 100 runs
alphaTD=c(0.05,0.075,0.1,0.15)
alphaMC=c(0.01,0.02,0.03,0.04)
n.alphas=length(alphaTD)
pchs=0:(n.alphas-1)
true.V=1:5*(1/6)
n_rep=100
sqs=seq(1,101,2)                       # thin out the plotted points for readability
plot(0,0,type='n',xlim=c(0,100),ylim=c(0,0.25),
     xlab="Walks / Episodes",ylab="RMS error, averaged over states")
for (i_alpha in 1:n.alphas) {
  rmsTD=matrix(0,101,n_rep)
  rmsMC=matrix(0,101,n_rep)
  for (i_rep in 1:n_rep) {
    resTD=TD0.ex1(100,alphaTD[i_alpha],1)
    resMC=constMC.ex1(100,alphaMC[i_alpha])
    for (i_gen in 1:101) {
      rmsTD[i_gen,i_rep]=sqrt(mean((resTD[i_gen,]-true.V)^2))
      rmsMC[i_gen,i_rep]=sqrt(mean((resMC[i_gen,]-true.V)^2))
    }
  }
  mTD=rowMeans(rmsTD)
  mMC=rowMeans(rmsMC)
  lines(mTD,col='red')
  lines(mMC,col='blue')
  points(sqs,mTD[sqs],col='red',pch=pchs[i_alpha])
  points(sqs,mMC[sqs],col='blue',pch=pchs[i_alpha])
}
labs=c("MC, alpha=0.01","MC, alpha=0.02","MC, alpha=0.03","MC, alpha=0.04",
       "TD, alpha=0.05","TD, alpha=0.075","TD, alpha=0.10","TD, alpha=0.15")
legend('topright',labs,col=c(rep('blue',4),rep('red',4)),pch=rep(0:3,2),lwd=1.5)
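# Quick numerical check (a sketch, not part of Fig 6.7): with a constant step
# size neither method converges exactly; the RMS error plateaus at a level set
# by alpha. The episode counts, alphas, and the seed below are arbitrary choices.
set.seed(1)
resTD=TD0.ex1(2000,0.05,1)
resMC=constMC.ex1(2000,0.01)
cat("final RMS, TD(0):",sqrt(mean((resTD[2001,]-true.V)^2)),"\n")
cat("final RMS, MC   :",sqrt(mean((resMC[2001,]-true.V)^2)),"\n")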
#################################################
# ch6.4 On-policy TD, Sarsa
#################################################
sarsa.ex6.5<-function(maxItr,alpha,gamma,epsilon) {
  # windy gridworld: 7 rows x 10 columns
  # horizontal move -> COLUMN; vertical move -> ROW
  # the wind pushes the agent upward (adds to ROW)
  # actions: 1-up, 2-right, 3-down, 4-left
  act.V=matrix(c(1,0, 0,1, -1,0, 0,-1),nrow=4,byrow=T)
  wind=matrix(c(0,0, 0,0, 0,0, 1,0, 1,0, 1,0, 2,0, 2,0, 1,0, 0,0),byrow=T,nrow=10)
  goal=c(4,8)
  Qs=array(0,dim=c(7,10,4))            # action values Q(row, column, action)
  for (i_rep in 1:maxItr) {
    state=c(4,1)                       # start
    # epsilon-greedy choice of the initial action
    if (runif(1) > epsilon) {
      move=which.max(Qs[state[1],state[2],])
    } else { move=sample(1:4,1) }
    while (!all(state==goal)) {
      st.old=state
      mv.old=move
      state=state+act.V[move,]+wind[state[2],]   # wind uses the OLD column
      # keep the agent inside the field
      if (state[1]<1) {state[1]=1}
      if (state[1]>7) {state[1]=7}
      if (state[2]<1) {state[2]=1}
      if (state[2]>10) {state[2]=10}
      # epsilon-greedy choice of the NEXT action (the "a'" in Sarsa)
      if (runif(1) > epsilon) {
        move=which.max(Qs[state[1],state[2],])
      } else { move=sample(1:4,1) }
      rew=ifelse(all(state==goal),0,-1)          # -1 per step until the goal
      # Sarsa update: Q(s,a) <- Q(s,a) + alpha*(r + gamma*Q(s',a') - Q(s,a))
      Qs[st.old[1],st.old[2],mv.old]=Qs[st.old[1],st.old[2],mv.old]+
        alpha*(rew+gamma*Qs[state[1],state[2],move]-Qs[st.old[1],st.old[2],mv.old])
    }
  }
  return(Qs)
}

# running example (5e6 episodes; this takes a while)
Qs=sarsa.ex6.5(5e6,0.1,1,0.1)

# sim optimal (greedy) actions
# act.V & wind are local to sarsa.ex6.5, so they must be redefined here
act.V=matrix(c(1,0, 0,1, -1,0, 0,-1),nrow=4,byrow=T)
wind=matrix(c(0,0, 0,0, 0,0, 1,0, 1,0, 1,0, 2,0, 2,0, 1,0, 0,0),byrow=T,nrow=10)
state=c(4,1); goal=c(4,8)
state.hist=state
while (!all(state==goal)) {
  moveID=which.max(Qs[state[1],state[2],])
  state=state+act.V[moveID,]+wind[state[2],]
  if (state[1]<1) {state[1]=1}
  if (state[1]>7) {state[1]=7}
  if (state[2]<1) {state[2]=1}
  if (state[2]>10) {state[2]=10}
  state.hist=rbind(state.hist,state)
}

# plotting results
plot(0,0,type='n',xlim=c(0,11),ylim=c(0,8),xlab="column",ylab="row",
     main="Learned policies -- Sarsa")
points(1,4,pch=19,col='red',cex=2)     # start
points(8,4,pch=19,col='red',cex=2)     # goal
dirs=c("up","right","down","left")
for (i_row in 1:7) {
  for (i_col in 1:10) {
    best.move=dirs[which.max(Qs[i_row,i_col,])]
    text(i_col,i_row,best.move)
  }
}
lines(state.hist[,2],state.hist[,1],col="red",lwd=2)
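# Aside (a hypothetical refactoring, not used above): the epsilon-greedy choice
# appears twice inside sarsa.ex6.5; pulling it into a helper makes the behavior
# policy explicit and easy to swap out. eps.greedy is a name introduced here.
eps.greedy<-function(Q,state,epsilon,n.act=4) {
  if (runif(1) > epsilon) {
    which.max(Q[state[1],state[2],])   # exploit: current greedy action
  } else {
    sample(1:n.act,1)                  # explore: uniform random action
  }
}
# usage inside the loops would be: move=eps.greedy(Qs,state,epsilon)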
#################################################
# ch6.5 Off-policy TD, Q-learning
#################################################
Qlearn.ex6.5<-function(maxItr,alpha,gamma,epsilon) {
  # windy gridworld: 7 rows x 10 columns (same setup as the Sarsa example)
  # horizontal move -> COLUMN; vertical move -> ROW
  # the wind pushes the agent upward (adds to ROW)
  # actions: 1-up, 2-right, 3-down, 4-left
  act.V=matrix(c(1,0, 0,1, -1,0, 0,-1),nrow=4,byrow=T)
  wind=matrix(c(0,0, 0,0, 0,0, 1,0, 1,0, 1,0, 2,0, 2,0, 1,0, 0,0),byrow=T,nrow=10)
  goal=c(4,8)
  Qs=array(0,dim=c(7,10,4))            # action values Q(row, column, action)
  for (i_rep in 1:maxItr) {
    state=c(4,1)                       # start
    while (!all(state==goal)) {
      # epsilon-greedy behavior policy
      if (runif(1) > epsilon) {
        move=which.max(Qs[state[1],state[2],])
      } else { move=sample(1:4,1) }
      sIDX=state
      state=state+act.V[move,]+wind[state[2],]   # wind uses the OLD column
      # keep the agent inside the field
      if (state[1]<1) {state[1]=1}
      if (state[1]>7) {state[1]=7}
      if (state[2]<1) {state[2]=1}
      if (state[2]>10) {state[2]=10}
      max.Q=max(Qs[state[1],state[2],])
      rew=ifelse(all(state==goal),0,-1)          # -1 per step until the goal
      # Q-learning update: Q(s,a) <- Q(s,a) + alpha*(r + gamma*max_a' Q(s',a') - Q(s,a))
      Qs[sIDX[1],sIDX[2],move]=Qs[sIDX[1],sIDX[2],move]+
        alpha*(rew+gamma*max.Q-Qs[sIDX[1],sIDX[2],move])
    }
  }
  return(Qs)
}

# running example
Qs=Qlearn.ex6.5(1e6,0.05,1,0.1)

# sim optimal (greedy) actions
# act.V & wind are local to Qlearn.ex6.5, so they must be redefined here
act.V=matrix(c(1,0, 0,1, -1,0, 0,-1),nrow=4,byrow=T)
wind=matrix(c(0,0, 0,0, 0,0, 1,0, 1,0, 1,0, 2,0, 2,0, 1,0, 0,0),byrow=T,nrow=10)
state=c(4,1); goal=c(4,8)
state.hist=state
while (!all(state==goal)) {
  moveID=which.max(Qs[state[1],state[2],])
  state=state+act.V[moveID,]+wind[state[2],]
  if (state[1]<1) {state[1]=1}
  if (state[1]>7) {state[1]=7}
  if (state[2]<1) {state[2]=1}
  if (state[2]>10) {state[2]=10}
  state.hist=rbind(state.hist,state)
}

# plotting results
plot(0,0,type='n',xlim=c(0,11),ylim=c(0,8),xlab="column",ylab="row",
     main="Learned policies -- Q-learning")
points(1,4,pch=19,col='red',cex=2)     # start
points(8,4,pch=19,col='red',cex=2)     # goal
dirs=c("up","right","down","left")
for (i_row in 1:7) {
  for (i_col in 1:10) {
    best.move=dirs[which.max(Qs[i_row,i_col,])]
    text(i_col,i_row,best.move)
  }
}
lines(state.hist[,2],state.hist[,1],col="red",lwd=2)
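# Follow-up sketch (not in the original): counting the greedy path's steps is a
# simple way to compare the Sarsa and Q-learning solutions; the shortest path in
# this windy gridworld is known to be 15 steps. greedy.steps is a name introduced
# here; it reuses the act.V and wind matrices defined in the global scope above.
greedy.steps<-function(Qs,act.V,wind,start=c(4,1),goal=c(4,8),max.steps=1000) {
  state=start
  n=0
  while (!all(state==goal) & n<max.steps) {
    move=which.max(Qs[state[1],state[2],])
    state=state+act.V[move,]+wind[state[2],]
    state[1]=min(max(state[1],1),7)    # clip row to the 7x10 field
    state[2]=min(max(state[2],1),10)   # clip column
    n=n+1
  }
  return(n)
}
# e.g., greedy.steps(Qs,act.V,wind)    # ~15 if the greedy policy is optimal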