# Simulate an epsilon-greedy agent on a 10-armed Gaussian bandit.
#
# Each run draws 10 true action values from N(0, 1); every play returns the
# chosen arm's true value plus N(0, 1) noise. Action values are estimated by
# the sample average of the rewards received from each arm.
#
# Args:
#   nTrial:  number of plays per run.
#   nRep:    number of independent runs; a new bandit is drawn for each run.
#   epsilon: probability of choosing an arm uniformly at random on a play;
#            otherwise the arm with the highest current estimate is chosen.
#
# Returns:
#   A data.frame with one row per play and two columns averaged across runs:
#   `opt` (fraction of runs that chose the truly best arm on that play) and
#   `ret` (mean reward obtained on that play).
epGreedy <- function(nTrial, nRep, epsilon) {
  n.arm <- 10
  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  # seq_len() instead of 1:n: with a count of 0, 1:n iterates over c(1, 0)
  # and the history matrices are indexed out of bounds.
  for (i_rep in seq_len(nRep)) {
    Q.true <- rnorm(n.arm)
    Q.est <- rep(0, n.arm)
    Q.cum <- rep(0, n.arm)
    act.count <- rep(0, n.arm)
    opt.ID <- which.max(Q.true)
    for (i_trial in seq_len(nTrial)) {
      if (runif(1) < epsilon) {
        # Exploratory play: uniform over all arms.
        action <- sample.int(n.arm, 1)
      } else {
        # which.max() returns the first maximum, so ties go to the lowest index.
        action <- which.max(Q.est)
      }
      reward <- rnorm(1) + Q.true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt.ID
      # Sample-average estimate: cumulative reward / number of pulls.
      act.count[action] <- act.count[action] + 1
      Q.cum[action] <- Q.cum[action] + reward
      Q.est[action] <- Q.cum[action] / act.count[action]
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}
# Compare a purely greedy agent (epsilon = 0) with two epsilon-greedy agents,
# each averaged over 2000 runs of 1000 plays.
type1 <- epGreedy(1000, 2000, 0.0)
type2 <- epGreedy(1000, 2000, 0.01)
type3 <- epGreedy(1000, 2000, 0.1)

# Legend entries are listed in the order epsilon = 0.00, 0.01, 0.10, so the
# colours must follow the same order: type1 (0.00) is drawn in green,
# type2 (0.01) in red and type3 (0.10) in black.
eps.labels <- c("epsilon=0.00", "epsilon=0.01", "epsilon=0.10")
eps.cols <- c("green", "red", "black")

# Two stacked panels; remember the previous layout so it can be restored.
old.par <- par(mfrow = c(2, 1))

# The y-range covers all three series so that none of the added lines is
# clipped by the axis limits of the first one drawn.
plot(type3$ret, type = 'l', xlab = "Play", ylab = "average reward",
     ylim = range(type1$ret, type2$ret, type3$ret))
lines(type2$ret, type = 'l', col = 'red')
lines(type1$ret, type = 'l', col = 'green')
legend("bottomright", eps.labels, col = eps.cols, lty = c(1, 1, 1))

plot(type3$opt, type = 'l', xlab = "Play", ylab = "% optimal action",
     ylim = range(type1$opt, type2$opt, type3$opt))
lines(type2$opt, type = 'l', col = 'red')
lines(type1$opt, type = 'l', col = 'green')
legend("bottomright", eps.labels, col = eps.cols, lty = c(1, 1, 1))

par(old.par)
#####################################
# Generalised Version - ch2.5 & 2.7
#####################################
# Generalized epsilon-greedy agent on a 10-armed Gaussian bandit.
#
# Extends the basic agent with a configurable initial value estimate and a
# choice between sample-average and constant step-size updates.
#
# Args:
#   nTrial:     number of plays per run.
#   nRep:       number of independent runs; a new bandit is drawn for each.
#   epsilon:    probability of choosing an arm uniformly at random on a play.
#   LR:         step size used when a constant learning rate is requested;
#               ignored (and may be omitted) in sample-average mode.
#   constantLR: "F" (default), "FALSE" or logical FALSE selects the
#               sample-average step 1/n; any other value selects the constant
#               step `LR`.
#   initialQ:   initial value estimate assigned to every arm.
#
# Returns:
#   A data.frame with one row per play and two columns averaged across runs:
#   `opt` (fraction of runs that chose the truly best arm) and `ret` (mean
#   reward).
epGreedyG <- function(nTrial, nRep, epsilon, LR, constantLR = "F", initialQ) {
  n.arm <- 10
  # Accept the logical FALSE as well as the historical string flag; comparing
  # FALSE with "F" via == is FALSE, which used to flip the mode silently.
  sample.average <- identical(constantLR, "F") ||
    identical(constantLR, "FALSE") ||
    identical(constantLR, FALSE)
  if (!sample.average && missing(LR)) {
    stop("LR must be supplied when a constant learning rate is requested",
         call. = FALSE)
  }
  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  # seq_len() so that zero runs or zero plays skip the loop instead of
  # iterating over c(1, 0).
  for (i_rep in seq_len(nRep)) {
    Q.true <- rnorm(n.arm)
    Q.est <- rep(initialQ, n.arm)
    act.count <- rep(0, n.arm)
    opt.ID <- which.max(Q.true)
    for (i_trial in seq_len(nTrial)) {
      if (runif(1) < epsilon) {
        action <- sample.int(n.arm, 1)
      } else {
        # Ties are resolved in favour of the lowest-indexed arm.
        action <- which.max(Q.est)
      }
      reward <- rnorm(1) + Q.true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt.ID
      act.count[action] <- act.count[action] + 1
      step <- if (sample.average) 1 / act.count[action] else LR
      # Incremental update: move the estimate towards the observed reward.
      Q.est[action] <- Q.est[action] + step * (reward - Q.est[action])
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}
#################################
# ch2.4 softmax
#################################
# Softmax (Boltzmann) action selection with an annealed temperature on a
# 10-armed Gaussian bandit, using sample-average value estimates.
#
# Args:
#   nTrial:  number of plays per run.
#   nRep:    number of independent runs; a new bandit is drawn for each.
#   temp:    temperature used on the first play of every run.
#   decay:   factor the temperature is multiplied by after each play.
#   minTemp: lower bound applied to the temperature after each decay step.
#
# Returns:
#   A data.frame with one row per play and two columns averaged across runs:
#   `opt` (fraction of runs that chose the truly best arm) and `ret` (mean
#   reward).
RLsoftmax <- function(nTrial, nRep, temp, decay = 0.995, minTemp = 0.001) {
  n.arm <- 10
  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  # seq_len() so that zero runs or zero plays skip the loop instead of
  # iterating over c(1, 0).
  for (i_rep in seq_len(nRep)) {
    Q.true <- rnorm(n.arm)
    Q.est <- rep(0, n.arm)
    Q.cum <- rep(0, n.arm)
    act.count <- rep(0, n.arm)
    opt.ID <- which.max(Q.true)
    temperature <- temp
    for (i_trial in seq_len(nTrial)) {
      # Subtract the largest estimate before exponentiating. The selection
      # probabilities are mathematically unchanged, but exp() can no longer
      # overflow to Inf (and produce NaN probabilities) once the temperature
      # has been annealed to a small value.
      weights <- exp((Q.est - max(Q.est)) / temperature)
      action <- sample.int(n.arm, 1, prob = weights / sum(weights))
      reward <- rnorm(1) + Q.true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt.ID
      act.count[action] <- act.count[action] + 1
      Q.cum[action] <- Q.cum[action] + reward
      Q.est[action] <- Q.cum[action] / act.count[action]
      # Anneal towards greedy selection, but never below the floor.
      temperature <- max(minTemp, decay * temperature)
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}
#################################
# ch2.8 reinforcement comparison
#################################
# Reinforcement-comparison agent on a 10-armed Gaussian bandit.
#
# The agent keeps a preference per arm and a running reference reward. Arms
# are chosen by a softmax over the preferences; the chosen arm's preference is
# moved by `beta` times the difference between the reward and the reference,
# and the reference is then moved towards the reward with step `alpha`.
#
# Args:
#   nTrial: number of plays per run.
#   nRep:   number of independent runs; a new bandit is drawn for each.
#   alpha:  step size for the reference-reward update.
#   beta:   step size for the preference update.
#
# Returns:
#   A data.frame with one row per play and two columns averaged across runs:
#   `opt` (fraction of runs that chose the truly best arm) and `ret` (mean
#   reward).
reinfComp <- function(nTrial, nRep, alpha, beta) {
  n.arm <- 10
  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  # seq_len() so that zero runs or zero plays skip the loop instead of
  # iterating over c(1, 0).
  for (i_rep in seq_len(nRep)) {
    Q.true <- rnorm(n.arm)
    p <- rep(0, n.arm)
    r.ave <- 0
    opt.ID <- which.max(Q.true)
    for (i_trial in seq_len(nTrial)) {
      # Preferences are unbounded, so shift by the maximum before calling
      # exp(); the probabilities are unchanged but cannot become Inf/Inf.
      weights <- exp(p - max(p))
      action <- sample.int(n.arm, 1, prob = weights / sum(weights))
      reward <- rnorm(1) + Q.true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt.ID
      # The preference update uses the reference reward from before this play.
      p[action] <- p[action] + beta * (reward - r.ave)
      r.ave <- r.ave + alpha * (reward - r.ave)
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}
#################################
# ch2.9 pursuit method
#################################
# Pursuit agent on a 10-armed Gaussian bandit.
#
# The agent keeps explicit selection probabilities. At the start of every play
# the probability of the arm with the highest current estimate is moved a
# fraction `beta` of the way towards 1, and all other probabilities are shrunk
# by the same fraction; an arm is then sampled from those probabilities and
# its sample-average estimate is updated with the observed reward.
#
# Args:
#   nTrial: number of plays per run.
#   nRep:   number of independent runs; a new bandit is drawn for each.
#   beta:   pursuit rate; should lie in [0, 1] so the probabilities stay
#           non-negative.
#
# Returns:
#   A data.frame with one row per play and two columns averaged across runs:
#   `opt` (fraction of runs that chose the truly best arm) and `ret` (mean
#   reward).
pursuit <- function(nTrial, nRep, beta) {
  n.arm <- 10
  ret.hist <- matrix(0, nrow = nTrial, ncol = nRep)
  opt.hist <- ret.hist
  # seq_len() so that zero runs or zero plays skip the loop instead of
  # iterating over c(1, 0).
  for (i_rep in seq_len(nRep)) {
    Q.true <- rnorm(n.arm)
    p <- rep(1 / n.arm, n.arm)
    act.count <- rep(0, n.arm)
    Q.est <- rep(0, n.arm)
    opt.ID <- which.max(Q.true)
    for (i_trial in seq_len(nTrial)) {
      # Greedy arm under the current estimates (first index wins ties).
      greedy <- which.max(Q.est)
      p[greedy] <- p[greedy] + beta * (1 - p[greedy])
      # Negative index selects every arm except the greedy one.
      p[-greedy] <- p[-greedy] - beta * p[-greedy]
      action <- sample.int(n.arm, 1, prob = p)
      reward <- rnorm(1) + Q.true[action]
      ret.hist[i_trial, i_rep] <- reward
      opt.hist[i_trial, i_rep] <- action == opt.ID
      act.count[action] <- act.count[action] + 1
      # Incremental form of the sample average.
      Q.est[action] <- Q.est[action] +
        1 / act.count[action] * (reward - Q.est[action])
    }
  }
  data.frame(opt = rowMeans(opt.hist), ret = rowMeans(ret.hist))
}
# Related