# RL Sutton & Barto, Chapter 2: 10-armed bandit testbed

# epsilon-greedy action selection with sample-average value estimates (ch2.2)
epGreedy = function(nTrial, nRep, epsilon) {
  ret.hist = matrix(0, nrow = nTrial, ncol = nRep)  # reward on each play
  opt.hist = ret.hist                               # 1 if the optimal action was chosen
  for (i_rep in 1:nRep) {
    Q.true = rnorm(10)        # true action values, q*(a) ~ N(0,1)
    Q.est = rep(0, 10)        # estimated action values
    Q.cum = rep(0, 10)        # cumulative reward per action
    act.count = rep(0, 10)    # times each action was taken
    opt.ID = which.max(Q.true)
    for (i_trial in 1:nTrial) {
      if (runif(1) < epsilon) {
        action = sample(1:10, 1)     # explore: uniformly random action
      } else {
        action = which.max(Q.est)    # exploit: greedy action
      }
      ret.hist[i_trial, i_rep] = rnorm(1) + Q.true[action]  # reward ~ N(q*(a),1)
      opt.hist[i_trial, i_rep] = action == opt.ID
      act.count[action] = act.count[action] + 1
      Q.cum[action] = Q.cum[action] + ret.hist[i_trial, i_rep]
      Q.est[action] = Q.cum[action] / act.count[action]     # sample average
    }
  }
  return(data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist)))
}
type1=epGreedy(1000,2000,0.0)
type2=epGreedy(1000,2000,0.01)
type3=epGreedy(1000,2000,0.1)

par(mfrow = c(2, 1))
# black = type3 (epsilon=0.10), red = type2 (epsilon=0.01), green = type1 (epsilon=0.00)
plot(type3$ret, type = 'l', xlab = "Play", ylab = "average reward")
lines(type2$ret, col = 'red')
lines(type1$ret, col = 'green')
legend("bottomright", c("epsilon=0.10", "epsilon=0.01", "epsilon=0.00"),
  col = c("black", "red", "green"), lty = 1)
plot(type3$opt, type = 'l', xlab = "Play", ylab = "proportion optimal action")
lines(type2$opt, col = 'red')
lines(type1$opt, col = 'green')
legend("bottomright", c("epsilon=0.10", "epsilon=0.01", "epsilon=0.00"),
  col = c("black", "red", "green"), lty = 1)

#####################################
#  Generalised Version - ch2.5 & 2.7
#####################################
# generalized epsilon-greedy (ch2.5 & 2.7) with the incremental update
#   Q(a) <- Q(a) + LR * (r - Q(a))
# constantLR = TRUE  keeps a fixed step size (useful for nonstationary problems)
# constantLR = FALSE sets LR = 1/k(a), recovering the sample average
# initialQ sets the starting estimates; optimistic values encourage exploration
epGreedyG = function(nTrial, nRep, epsilon, LR = 0.1, constantLR = FALSE, initialQ = 0) {
  ret.hist = matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist = ret.hist
  for (i_rep in 1:nRep) {
    Q.true = rnorm(10)
    Q.est = rep(initialQ, 10)
    act.count = rep(0, 10)
    opt.ID = which.max(Q.true)
    for (i_trial in 1:nTrial) {
      if (runif(1) < epsilon) {
        action = sample(1:10, 1)
      } else {
        action = which.max(Q.est)
      }
      ret.hist[i_trial, i_rep] = rnorm(1) + Q.true[action]
      opt.hist[i_trial, i_rep] = action == opt.ID
      act.count[action] = act.count[action] + 1
      if (!constantLR) { LR = 1/act.count[action] }  # sample-average step size
      Q.est[action] = Q.est[action] + LR * (ret.hist[i_trial, i_rep] - Q.est[action])
    }
  }
  return(data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist)))
}
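
# Example run (a sketch, not from the original script; parameter values are
# assumed, loosely after the optimistic-initial-values comparison in ch2.7):
# greedy with optimistic Q0 = 5 vs epsilon-greedy with realistic Q0 = 0.
optimQ = epGreedyG(1000, 2000, 0.0, LR = 0.1, constantLR = TRUE, initialQ = 5)
realQ  = epGreedyG(1000, 2000, 0.1, LR = 0.1, constantLR = TRUE, initialQ = 0)
plot(optimQ$opt, type = 'l', xlab = "Play", ylab = "proportion optimal action")
lines(realQ$opt, col = 'red')
legend("bottomright", c("optimistic Q0=5, eps=0", "realistic Q0=0, eps=0.1"),
  col = c("black", "red"), lty = 1)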


#################################
#  ch2.4 softmax
#################################
# softmax (Gibbs/Boltzmann) action selection with sample-average estimates.
# The temperature is annealed multiplicatively (0.995 per play, floored at
# 0.001), so selection becomes increasingly greedy over time.
RLsoftmax = function(nTrial, nRep, temp) {
  ret.hist = matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist = ret.hist
  for (i_rep in 1:nRep) {
    Q.true = rnorm(10)
    Q.est = rep(0, 10)
    Q.cum = rep(0, 10)
    act.count = rep(0, 10)
    opt.ID = which.max(Q.true)
    tau = temp
    for (i_trial in 1:nTrial) {
      # subtract max(Q.est) before exponentiating so exp() cannot overflow
      # at low temperatures; the resulting probabilities are unchanged
      w = exp((Q.est - max(Q.est)) / tau)
      action = sample(1:10, 1, prob = w / sum(w))
      ret.hist[i_trial, i_rep] = rnorm(1) + Q.true[action]
      opt.hist[i_trial, i_rep] = action == opt.ID
      act.count[action] = act.count[action] + 1
      Q.cum[action] = Q.cum[action] + ret.hist[i_trial, i_rep]
      Q.est[action] = Q.cum[action] / act.count[action]
      tau = max(0.001, 0.995 * tau)   # anneal the temperature
    }
  }
  return(data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist)))
}
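
# Example run (a sketch; the initial temperature of 1 is an assumed value,
# not taken from the book):
soft1 = RLsoftmax(1000, 2000, 1)
plot(soft1$opt, type = 'l', xlab = "Play", ylab = "proportion optimal action")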

#################################
# ch2.8 reinforcement comparison
################################# 
# reinforcement comparison (ch2.8): action preferences p are updated relative
# to a reference (running average) reward, and actions are drawn by softmax
# over the preferences rather than over value estimates.
reinfComp = function(nTrial, nRep, alpha, beta) {
  ret.hist = matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist = ret.hist
  for (i_rep in 1:nRep) {
    Q.true = rnorm(10)
    p = rep(0, 10)   # action preferences
    r.ave = 0        # reference reward
    opt.ID = which.max(Q.true)
    for (i_trial in 1:nTrial) {
      w = exp(p - max(p))   # numerically stable softmax over preferences
      action = sample(1:10, 1, prob = w / sum(w))
      ret.hist[i_trial, i_rep] = rnorm(1) + Q.true[action]
      opt.hist[i_trial, i_rep] = action == opt.ID
      # raise the preference if reward beat the reference, lower it otherwise
      p[action] = p[action] + beta * (ret.hist[i_trial, i_rep] - r.ave)
      r.ave = r.ave + alpha * (ret.hist[i_trial, i_rep] - r.ave)
    }
  }
  return(data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist)))
}
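
# Example run (a sketch; alpha = 0.1 and beta = 0.1 are assumed values):
comp1 = reinfComp(1000, 2000, 0.1, 0.1)
plot(comp1$opt, type = 'l', xlab = "Play", ylab = "proportion optimal action")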

#################################
# ch2.9 pursuit method
#################################
# pursuit method (ch2.9): the action probabilities p are continually pushed
# toward the current greedy action while Q is estimated by sample averages.
pursuit = function(nTrial, nRep, beta) {
  ret.hist = matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist = ret.hist
  for (i_rep in 1:nRep) {
    Q.true = rnorm(10)
    p = rep(1/10, 10)        # action probabilities, start uniform
    act.count = rep(0, 10)
    Q.est = rep(0, 10)
    opt.ID = which.max(Q.true)
    for (i_trial in 1:nTrial) {
      actTemp = which.max(Q.est)                          # current greedy action
      p[actTemp] = p[actTemp] + beta * (1 - p[actTemp])   # pursue the greedy action
      nonA = (1:10)[-actTemp]
      p[nonA] = p[nonA] + beta * (-p[nonA])               # shrink the rest; sum(p) stays 1
      action = sample(1:10, 1, prob = p)
      ret.hist[i_trial, i_rep] = rnorm(1) + Q.true[action]
      opt.hist[i_trial, i_rep] = action == opt.ID
      act.count[action] = act.count[action] + 1
      # sample-average update written in incremental form
      Q.est[action] = Q.est[action] + (ret.hist[i_trial, i_rep] - Q.est[action]) / act.count[action]
    }
  }
  return(data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist)))
}
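
# Example run (a sketch; beta = 0.01 is an assumed value):
purs1 = pursuit(1000, 2000, 0.01)
plot(purs1$opt, type = 'l', xlab = "Play", ylab = "proportion optimal action")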