# Epsilon-greedy action selection on a stationary Gaussian bandit testbed.
#
# For each of `nRep` independent bandit problems, the true action values are
# drawn from N(0, 1) and the agent plays `nTrial` times. With probability
# `epsilon` it picks an arm uniformly at random, otherwise it picks the arm with
# the highest sample-average estimate (ties go to the lowest index, as
# which.max() does). Rewards are the true value plus N(0, 1) noise.
#
# Args:
#   nTrial:  number of plays per bandit problem.
#   nRep:    number of independent bandit problems to average over.
#   epsilon: exploration probability in [0, 1].
#   nArm:    number of arms (defaults to the 10-armed testbed).
#
# Returns:
#   A data.frame with one row per play and two columns:
#     opt - proportion of repetitions that chose the optimal arm on that play
#     ret - reward averaged over repetitions on that play
epGreedy <- function(nTrial, nRep, epsilon, nArm = 10) {
  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  for (i_rep in seq_len(nRep)) {
    q_true <- rnorm(nArm)
    q_est <- rep(0, nArm)
    q_cum <- rep(0, nArm)
    act_count <- rep(0, nArm)
    opt_id <- which.max(q_true)
    for (i_trial in seq_len(nTrial)) {
      if (runif(1) < epsilon) {
        action <- sample.int(nArm, 1)
      } else {
        action <- which.max(q_est)
      }
      reward <- rnorm(1) + q_true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt_id
      # Sample-average estimate: cumulative reward / number of pulls.
      act_count[action] <- act_count[action] + 1
      q_cum[action] <- q_cum[action] + reward
      q_est[action] <- q_cum[action] / act_count[action]
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}

# Compare greedy (epsilon = 0) with two epsilon-greedy settings.
type1 <- epGreedy(1000, 2000, 0.0)
type2 <- epGreedy(1000, 2000, 0.01)
type3 <- epGreedy(1000, 2000, 0.1)

# Line colours: black = type3 (epsilon 0.10), red = type2 (epsilon 0.01),
# green = type1 (epsilon 0.00). The legend labels must follow that order.
par(mfrow = c(2, 1))
plot(type3$ret, type = 'l', xlab = "Play", ylab = "average reward")
lines(type2$ret, type = 'l', col = 'red')
lines(type1$ret, type = 'l', col = 'green')
legend("bottomright", c("epsilon=0.10", "epsilon=0.01", "epsilon=0.00"),
       col = c("black", "red", "green"), lty = c(1, 1, 1))
plot(type3$opt, type = 'l', xlab = "Play", ylab = "% optimal action")
lines(type2$opt, type = 'l', col = 'red')
lines(type1$opt, type = 'l', col = 'green')
legend("bottomright", c("epsilon=0.10", "epsilon=0.01", "epsilon=0.00"),
       col = c("black", "red", "green"), lty = c(1, 1, 1))

#####################################
# Generalised Version - ch2.5 & 2.7
#####################################
# Generalised epsilon-greedy: incremental value updates with either a
# sample-average step size (1 / number of pulls) or a constant step size, and a
# configurable initial estimate (e.g. optimistic initial values).
#
# Args:
#   nTrial:     number of plays per bandit problem.
#   nRep:       number of independent bandit problems to average over.
#   epsilon:    exploration probability in [0, 1].
#   LR:         constant step size; only required when a constant learning
#               rate is requested, ignored otherwise.
#   constantLR: "F" (default), "FALSE" or logical FALSE selects the
#               sample-average step size; any other value uses the constant
#               step size `LR`.
#   initialQ:   initial value estimate assigned to every arm.
#   nArm:       number of arms (defaults to the 10-armed testbed).
#
# Returns:
#   A data.frame with columns `opt` (proportion of repetitions choosing the
#   optimal arm) and `ret` (mean reward), one row per play.
epGreedyG <- function(nTrial, nRep, epsilon, LR, constantLR = "F",
                      initialQ = 0, nArm = 10) {
  # Accept logical FALSE as well as the historical "F" string; previously a
  # logical FALSE silently fell through to the constant-step-size branch.
  sample_average <- identical(constantLR, FALSE) ||
    (is.character(constantLR) && length(constantLR) == 1L &&
       constantLR %in% c("F", "FALSE"))
  if (!sample_average && missing(LR)) {
    stop("LR must be supplied when a constant learning rate is requested",
         call. = FALSE)
  }

  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  for (i_rep in seq_len(nRep)) {
    q_true <- rnorm(nArm)
    q_est <- rep(initialQ, nArm)
    act_count <- rep(0, nArm)
    opt_id <- which.max(q_true)
    for (i_trial in seq_len(nTrial)) {
      if (runif(1) < epsilon) {
        action <- sample.int(nArm, 1)
      } else {
        action <- which.max(q_est)
      }
      reward <- rnorm(1) + q_true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt_id
      act_count[action] <- act_count[action] + 1
      step <- if (sample_average) 1 / act_count[action] else LR
      # Incremental update: move the estimate towards the observed reward.
      q_est[action] <- q_est[action] + step * (reward - q_est[action])
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}
#################################
# ch2.4 softmax
#################################
# Softmax (Gibbs) action selection with an annealed temperature.
#
# Each arm is chosen with probability proportional to exp(Q_est / t). The
# temperature starts at `temp` for every repetition and is multiplied by 0.995
# after each play, with a floor of 0.001. Estimates are sample averages.
#
# Args:
#   nTrial: number of plays per bandit problem.
#   nRep:   number of independent bandit problems to average over.
#   temp:   initial (positive) softmax temperature.
#   nArm:   number of arms (defaults to the 10-armed testbed).
#
# Returns:
#   A data.frame with columns `opt` (proportion of repetitions choosing the
#   optimal arm) and `ret` (mean reward), one row per play.
RLsoftmax <- function(nTrial, nRep, temp, nArm = 10) {
  stopifnot(is.numeric(temp), length(temp) == 1L, temp > 0)
  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  for (i_rep in seq_len(nRep)) {
    q_true <- rnorm(nArm)
    q_est <- rep(0, nArm)
    q_cum <- rep(0, nArm)
    act_count <- rep(0, nArm)
    opt_id <- which.max(q_true)
    temp_now <- temp
    for (i_trial in seq_len(nTrial)) {
      # Subtract the maximum before exponentiating: the probabilities are
      # mathematically unchanged, but exp() can no longer overflow to Inf
      # (which produced NaN probabilities once the temperature became small).
      weights <- exp((q_est - max(q_est)) / temp_now)
      action <- sample.int(nArm, 1, prob = weights / sum(weights))
      reward <- rnorm(1) + q_true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt_id
      act_count[action] <- act_count[action] + 1
      q_cum[action] <- q_cum[action] + reward
      q_est[action] <- q_cum[action] / act_count[action]
      # Anneal towards greedy behaviour, but never below the floor.
      temp_now <- max(0.001, 0.995 * temp_now)
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}

#################################
# ch2.8 reinforcement comparison
#################################
# Reinforcement comparison: action preferences are adjusted by how much the
# reward exceeds a running reference reward, and actions are drawn from the
# softmax of the preferences.
#
# Args:
#   nTrial: number of plays per bandit problem.
#   nRep:   number of independent bandit problems to average over.
#   alpha:  step size for the reference reward update.
#   beta:   step size for the preference update.
#   nArm:   number of arms (defaults to the 10-armed testbed).
#
# Returns:
#   A data.frame with columns `opt` (proportion of repetitions choosing the
#   optimal arm) and `ret` (mean reward), one row per play.
reinfComp <- function(nTrial, nRep, alpha, beta, nArm = 10) {
  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  for (i_rep in seq_len(nRep)) {
    q_true <- rnorm(nArm)
    pref <- rep(0, nArm)
    r_ave <- 0
    opt_id <- which.max(q_true)
    for (i_trial in seq_len(nTrial)) {
      # Max-shifted softmax keeps exp() finite for large preferences.
      weights <- exp(pref - max(pref))
      action <- sample.int(nArm, 1, prob = weights / sum(weights))
      reward <- rnorm(1) + q_true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt_id
      # The preference uses the reference reward from before this play; the
      # reference is updated afterwards.
      pref[action] <- pref[action] + beta * (reward - r_ave)
      r_ave <- r_ave + alpha * (reward - r_ave)
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}

#################################
# ch2.9 pursuit method
#################################
# Pursuit method: selection probabilities start uniform and, before each play,
# are moved towards the currently greedy arm (its probability moves towards 1
# by a fraction `beta`, all others shrink towards 0 by the same fraction). The
# action is then sampled from those probabilities and the sample-average
# estimate of the chosen arm is updated incrementally.
#
# Args:
#   nTrial: number of plays per bandit problem.
#   nRep:   number of independent bandit problems to average over.
#   beta:   step size for the probability update.
#   nArm:   number of arms (defaults to the 10-armed testbed).
#
# Returns:
#   A data.frame with columns `opt` (proportion of repetitions choosing the
#   optimal arm) and `ret` (mean reward), one row per play.
pursuit <- function(nTrial, nRep, beta, nArm = 10) {
  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  for (i_rep in seq_len(nRep)) {
    q_true <- rnorm(nArm)
    p <- rep(1 / nArm, nArm)
    act_count <- rep(0, nArm)
    q_est <- rep(0, nArm)
    opt_id <- which.max(q_true)
    for (i_trial in seq_len(nTrial)) {
      greedy <- which.max(q_est)
      others <- seq_len(nArm)[-greedy]
      p[greedy] <- p[greedy] + beta * (1 - p[greedy])
      p[others] <- p[others] + beta * (-p[others])
      action <- sample.int(nArm, 1, prob = p)
      reward <- rnorm(1) + q_true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt_id
      act_count[action] <- act_count[action] + 1
      q_est[action] <- q_est[action] +
        1 / act_count[action] * (reward - q_est[action])
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}