# 強化学習　方策の比較1

```# Qが最大のactionを選択
max.Q <- function(Q){
max.a = max(Q)
max.idx = which(Q == max.a)
if (length(max.idx)>1){
max.idx = sample(max.idx, 1)
}
return(max.idx)
}

# greedy方策
greedy <- function(n.trial, Q.star, N){
Q = Q.cum = count = rep(0, N)
rew.earned = rep(0, n.trial)
for (i.trial in 1:n.trial){
act.idx = max.Q(Q)
r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
Q.cum[act.idx] = Q.cum[act.idx] + r.t
count[act.idx] = count[act.idx] + 1
Q[act.idx] = Q.cum[act.idx] / count[act.idx]
rew.earned[i.trial] = r.t
}
return(list(Q = Q, rew.earned = rew.earned))
}

# epsilon greedy方策
# epsilon = 0の場合はgreedy方策と同等
eps.greedy <- function(n.trial, Q.star, N, epsilon){
Q = Q.cum = count = rep(0, N)
rew.earned = rep(0, n.trial)
for (i.trial in 1:n.trial){
if (runif(1) < epsilon){
act.idx = sample(1:N, 1)
} else {
act.idx = max.Q(Q)
}
r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
Q.cum[act.idx] = Q.cum[act.idx] + r.t
count[act.idx] = count[act.idx] + 1
Q[act.idx] = Q.cum[act.idx] / count[act.idx]
rew.earned[i.trial] = r.t
}
return(list(Q = Q, rew.earned = rew.earned))
}

# n.rep回繰り返す関数
comp.eps.greedy <- function(n.trial, n.rep, N, epsilon){
rew.history = matrix(0, nrow = n.trial, ncol = n.rep)
for (i.rep in 1:n.rep){
Q.star = rnorm(N, mean = 0, sd = 1);
res = eps.greedy(n.trial, Q.star, N, epsilon)
rew.history[ , i.rep] = res\$rew.earned
}
return(rew.history)
}

# 実行
EG.000 = comp.eps.greedy(1000, 1000, 10, 0.000)
EG.001 = comp.eps.greedy(1000, 1000, 10, 0.001)
EG.010 = comp.eps.greedy(1000, 1000, 10, 0.010)
EG.100 = comp.eps.greedy(1000, 1000, 10, 0.100)
EG.150 = comp.eps.greedy(1000, 1000, 10, 0.150)

# 結果の可視化
plot(rowMeans(EG.000), type="l", ylab="Average Reward", xlab="Trial",
ylim = c(0,2))
lines(rowMeans(EG.001),col=2)
lines(rowMeans(EG.010),col=3)
lines(rowMeans(EG.100),col=4)
lines(rowMeans(EG.150),col=5)
legend("bottomright",
c("epsilon = 0.000",
"epsilon = 0.001",
"epsilon = 0.010",
"epsilon = 0.100",
"epsilon = 0.150"),
col=1:5,lwd=2 )

# softmax
softmax <- function(n.trial, Q.star, N, tau){
Q = Q.cum = count = rep(0, N)
rew.earned = rep(0, n.trial)
for (i.trial in 1:n.trial){
p = exp(Q*tau)/sum(exp(Q*tau))
act.idx = sample(1:N, 1, prob = p)
r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
Q.cum[act.idx] = Q.cum[act.idx] + r.t
count[act.idx] = count[act.idx] + 1
Q[act.idx] = Q.cum[act.idx] / count[act.idx]
rew.earned[i.trial] = r.t
}
return(list(Q = Q, rew.earned = rew.earned))
}

comp.softmax <- function(n.trial, n.rep, N, tau){
rew.history = matrix(0, nrow = n.trial, ncol = n.rep)
for (i.rep in 1:n.rep){
Q.star = rnorm(N, mean = 0, sd = 1);
res = softmax(n.trial, Q.star, N, tau)
rew.history[ , i.rep] = res\$rew.earned
}
return(rew.history)
}

# 実行
EG.000 = comp.eps.greedy(1000, 1000, 10, 0.000)
EG.010 = comp.eps.greedy(1000, 1000, 10, 0.010)
EG.100 = comp.eps.greedy(1000, 1000, 10, 0.100)
SM.10 = comp.softmax(1000,1000,10,10)
SM.03 = comp.softmax(1000,1000,10,3)
# 結果の可視化
plot(rowMeans(EG.000), type="l", ylab="Average Reward", xlab="Trial",
ylim = c(0,2))
lines(rowMeans(EG.010),col=2)
lines(rowMeans(EG.100),col=3)
lines(rowMeans(SM.10),col=4)
lines(rowMeans(SM.03),col=5)
legend("bottomright",
c("epsilon = 0.000",
"epsilon = 0.010",
"epsilon = 0.100",
"tau = 10",
"tau = 3"),
col=1:5,lwd=2 )

# epsilon greedy (2nd version)
eps.greedy2 <- function(n.trial, Q.star, N, epsilon, lr, init.Q){
Q = rep(init.Q, N)
rew.earned = rep(0, n.trial)
for (i.trial in 1:n.trial){
if (runif(1) < epsilon){
act.idx = sample(1:N, 1)
} else {
act.idx = max.Q(Q)
}
r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
Q[act.idx] = Q[act.idx] + lr*(r.t - Q[act.idx])
rew.earned[i.trial] = r.t
}
return(list(Q = Q, rew.earned = rew.earned))
}
```