funZ<-function(x) {
x[1]^4-16*x[1]^2-5*x[1]+x[2]^4-16*x[2]^2-5*x[2]
}
n.theta = 10;
n.iter = 1000;
Wp = 0.1;
Wg = 0.1;
theta = matrix(rnorm(n.theta*2), nrow=n.theta)
v = matrix(rnorm(n.theta*2), nrow=n.theta)
theta.hist = array(0,c(n.theta,2,n.iter))
theta.hist[,,1]=theta
p.b.v <- apply(theta,1,funZ)
p.best = theta
g.b.v = min(p.b.v)
g.b.idx = which.min(p.b.v)
g.best <- theta[g.b.idx,]
v = v + Wp*runif(n.theta)*(p.best - theta)+ Wg*runif(n.theta)*t(g.best - t(theta))
theta = theta + v
for (i.iter in 2:n.iter){
v = v + Wp*runif(n.theta)*(p.best - theta)+ Wg*runif(n.theta)*t(g.best - t(theta))
theta = theta + v
fitG = apply(theta,1,funZ)
if (min(fitG) < g.b.v){
g.best = theta[which.min(fitG),]
g.b.v = min(fitG)
}
idx = which(fitG < p.b.v)
p.best[idx,] = theta[idx,]
p.b.v= fitG
theta.hist[,,i.iter]=theta
}
funZ2<-function(x,y) {x^4-16*x^2-5*x+y^4-16*y^2-5*y}
xmin=-5;xmax=5;n_len=100;
x<-seq(xmin, xmax,length.out=n_len);
y<-x;
z<-outer(x,y,funZ2)
contour(x,y,z,nlevels= 50, ,lwd = 0.3,drawlabels = F,col='blue')
for (i.agent in 1:n.theta){
lines(theta.hist[i.agent,1,],theta.hist[i.agent,2,],col=i.agent)
points(theta[1,1],theta[1,2],pch=20,cex=2,col=i.agent)
}
n.trial = 1000; n.rep = 100
Q.true = rnorm(10);
opt.act = which.max(Q.true)
r.hist = matrix(0,n.rep, n.trial)
oa.hist = matrix(0,n.rep, n.trial)
for (i.rep in 1:n.rep){
Q.est = rep(0,10);R.cum = rep(0,10)
act.count = rep(0,10)
for (i.trial in 1:n.trial){
max.id = which(Q.est == max(Q.est))
if (length(max.id)>1){
action = sample(max.id,1)
} else {
action = max.id
}
if (action == opt.act){
oa.hist[i.rep, i.trial] = 1
}
r = rnorm(1, Q.true[action],1)
r.hist[i.rep, i.trial] = r
R.cum[action] = R.cum[action] + r
act.count[action] = act.count[action] + 1
Q.est = R.cum / (act.count + 0.01)
}
}
plot(colMeans(r.hist),type='l')
plot(colMeans(oa.hist),type='l',ylim=c(0,1))
### epsilon greedy
n.trial = 1000; n.rep = 1000
r.hist = matrix(0,n.rep, n.trial)
oa.hist = matrix(0,n.rep, n.trial)
epsilon = 0.01
for (i.rep in 1:n.rep){
Q.est = rep(0,10);R.cum = rep(0,10)
act.count = rep(0,10)
Q.true = rnorm(10);
opt.act = which.max(Q.true)
for (i.trial in 1:n.trial){
if (runif(1)>epsilon){
max.id = which(Q.est == max(Q.est))
if (length(max.id)>1){
action = sample(max.id,1)
} else {
action = max.id
}
} else {
action = sample(10,1)
}
if (action == opt.act){
oa.hist[i.rep, i.trial] = 1
}
r = rnorm(1, Q.true[action],1)
r.hist[i.rep, i.trial] = r
R.cum[action] = R.cum[action] + r
act.count[action] = act.count[action] + 1
Q.est[action] = R.cum[action] / act.count[action]
}
}
### softmax
n.trial = 1000; n.rep = 1000
r.hist = matrix(0,n.rep, n.trial)
oa.hist = matrix(0,n.rep, n.trial)
gamma = 3
for (i.rep in 1:n.rep){
Q.est = rep(0,10);R.cum = rep(0,10)
act.count = rep(0,10)
Q.true = rnorm(10);
opt.act = which.max(Q.true)
for (i.trial in 1:n.trial){
action = sample(10, 1, prob=exp(gamma*Q.est)/sum(exp(gamma*Q.est)))
if (action == opt.act){
oa.hist[i.rep, i.trial] = 1
}
r = rnorm(1, Q.true[action],1)
r.hist[i.rep, i.trial] = r
R.cum[action] = R.cum[action] + r
act.count[action] = act.count[action] + 1
Q.est[action] = R.cum[action] / act.count[action]
}
Related