Reinforcement Learning: Policy Comparison, Part 1

# Select the action with the maximal Q value (ties broken at random)
max.Q <- function(Q){
  max.a = max(Q)
  max.idx = which(Q == max.a)
  if (length(max.idx)>1){
    max.idx = sample(max.idx, 1)
  }
  return(max.idx)
}
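
# Quick illustrative check (not part of the original script): ties among the
# maximal Q values are broken at random, so max.Q(c(1, 3, 3, 2)) returns
# 2 or 3, each with probability 1/2.
# table(replicate(1000, max.Q(c(1, 3, 3, 2))))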

# Greedy policy: always choose the action with the highest estimated Q
greedy <- function(n.trial, Q.star, N){
  Q = Q.cum = count = rep(0, N)
  rew.earned = rep(0, n.trial)
  for (i.trial in 1:n.trial){
    act.idx = max.Q(Q)
    r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)  # reward drawn from N(Q.star, 1)
    Q.cum[act.idx] = Q.cum[act.idx] + r.t           # cumulative reward for this action
    count[act.idx] = count[act.idx] + 1             # number of times this action was chosen
    Q[act.idx] = Q.cum[act.idx] / count[act.idx]    # sample-average estimate of Q
    rew.earned[i.trial] = r.t
  }
  return(list(Q = Q, rew.earned = rew.earned))
}

# Epsilon-greedy policy
# With epsilon = 0 this reduces to the greedy policy above
eps.greedy <- function(n.trial, Q.star, N, epsilon){
  Q = Q.cum = count = rep(0, N)
  rew.earned = rep(0, n.trial)
  for (i.trial in 1:n.trial){
    if (runif(1) < epsilon){
      act.idx = sample(1:N, 1)   # explore: random action with probability epsilon
    } else {
      act.idx = max.Q(Q)         # exploit: current best action
    }
    r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
    Q.cum[act.idx] = Q.cum[act.idx] + r.t
    count[act.idx] = count[act.idx] + 1
    Q[act.idx] = Q.cum[act.idx] / count[act.idx]
    rew.earned[i.trial] = r.t
  }
  return(list(Q = Q, rew.earned = rew.earned))
}

# Repeat the epsilon-greedy simulation n.rep times (a fresh bandit Q.star per repetition)
comp.eps.greedy <- function(n.trial, n.rep, N, epsilon){
  rew.history = matrix(0, nrow = n.trial, ncol = n.rep)
  for (i.rep in 1:n.rep){
    Q.star = rnorm(N, mean = 0, sd = 1)
    res = eps.greedy(n.trial, Q.star, N, epsilon)
    rew.history[ , i.rep] = res$rew.earned
  }
  return(rew.history)
}

# Run: 1000 trials x 1000 repetitions, N = 10 arms, varying epsilon
EG.000 = comp.eps.greedy(1000, 1000, 10, 0.000)
EG.001 = comp.eps.greedy(1000, 1000, 10, 0.001)
EG.010 = comp.eps.greedy(1000, 1000, 10, 0.010)
EG.100 = comp.eps.greedy(1000, 1000, 10, 0.100)
EG.150 = comp.eps.greedy(1000, 1000, 10, 0.150)

# Visualize the results: average reward per trial across repetitions
plot(rowMeans(EG.000), type="l", ylab="Average Reward", xlab="Trial",
     ylim = c(0,2))
lines(rowMeans(EG.001),col=2)
lines(rowMeans(EG.010),col=3)
lines(rowMeans(EG.100),col=4)
lines(rowMeans(EG.150),col=5)
legend("bottomright",
       c("epsilon = 0.000",
         "epsilon = 0.001",
         "epsilon = 0.010",
         "epsilon = 0.100",
         "epsilon = 0.150"),
       col=1:5,lwd=2 )

# Softmax policy: tau acts as an inverse temperature (larger tau -> closer to greedy)
softmax <- function(n.trial, Q.star, N, tau){
  Q = Q.cum = count = rep(0, N)
  rew.earned = rep(0, n.trial)
  for (i.trial in 1:n.trial){
    p = exp(Q*tau)/sum(exp(Q*tau))       # P(a) proportional to exp(tau * Q[a])
    act.idx = sample(1:N, 1, prob = p)   # sample an action from the softmax distribution
    r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
    Q.cum[act.idx] = Q.cum[act.idx] + r.t
    count[act.idx] = count[act.idx] + 1
    Q[act.idx] = Q.cum[act.idx] / count[act.idx]
    rew.earned[i.trial] = r.t
  }
  return(list(Q = Q, rew.earned = rew.earned))
}
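
# Illustrative numeric check (not in the original script): with Q = c(0, 1) and
# tau = 3 the choice probabilities are exp(3 * Q) / sum(exp(3 * Q)),
# i.e. roughly c(0.05, 0.95); a larger tau concentrates choices on the action
# with the higher Q, while tau close to 0 makes the choice nearly uniform.
# round(exp(3 * c(0, 1)) / sum(exp(3 * c(0, 1))), 2)  # 0.05 0.95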

# Repeat the softmax simulation n.rep times (a fresh bandit Q.star per repetition)
comp.softmax <- function(n.trial, n.rep, N, tau){
  rew.history = matrix(0, nrow = n.trial, ncol = n.rep)
  for (i.rep in 1:n.rep){
    Q.star = rnorm(N, mean = 0, sd = 1)
    res = softmax(n.trial, Q.star, N, tau)
    rew.history[ , i.rep] = res$rew.earned
  }
  return(rew.history)
}

# Run: epsilon-greedy (recomputed so this block is self-contained) vs. softmax
EG.000 = comp.eps.greedy(1000, 1000, 10, 0.000)
EG.010 = comp.eps.greedy(1000, 1000, 10, 0.010)
EG.100 = comp.eps.greedy(1000, 1000, 10, 0.100)
SM.10 = comp.softmax(1000,1000,10,10)
SM.03 = comp.softmax(1000,1000,10,3)
# Visualize the results: average reward per trial across repetitions
plot(rowMeans(EG.000), type="l", ylab="Average Reward", xlab="Trial",
     ylim = c(0,2))
lines(rowMeans(EG.010),col=2)
lines(rowMeans(EG.100),col=3)
lines(rowMeans(SM.10),col=4)
lines(rowMeans(SM.03),col=5)
legend("bottomright",
       c("epsilon = 0.000",
         "epsilon = 0.010",
         "epsilon = 0.100",
         "tau = 10",
         "tau = 3"),
       col=1:5,lwd=2 )

# Epsilon-greedy policy (2nd version):
# constant learning rate lr and arbitrary initial value init.Q
# (e.g., optimistic initialization)
eps.greedy2 <- function(n.trial, Q.star, N, epsilon, lr, init.Q){
  Q = rep(init.Q, N)
  rew.earned = rep(0, n.trial)
  for (i.trial in 1:n.trial){
    if (runif(1) < epsilon){
      act.idx = sample(1:N, 1)
    } else {
      act.idx = max.Q(Q)
    }
    r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
    # incremental update with a constant step size instead of a sample average
    Q[act.idx] = Q[act.idx] + lr*(r.t - Q[act.idx])
    rew.earned[i.trial] = r.t
  }
  return(list(Q = Q, rew.earned = rew.earned))
}
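
# A minimal usage sketch (not in the original script): a wrapper analogous to
# comp.eps.greedy, here given the hypothetical name comp.eps.greedy2, that
# repeats the eps.greedy2 simulation n.rep times so the effect of lr and
# init.Q (e.g., optimistic initialization) can be compared with the policies above.
comp.eps.greedy2 <- function(n.trial, n.rep, N, epsilon, lr, init.Q){
  rew.history = matrix(0, nrow = n.trial, ncol = n.rep)
  for (i.rep in 1:n.rep){
    Q.star = rnorm(N, mean = 0, sd = 1)
    res = eps.greedy2(n.trial, Q.star, N, epsilon, lr, init.Q)
    rew.history[ , i.rep] = res$rew.earned
  }
  return(rew.history)
}

# Example run (illustrative parameter values): greedy with optimistic initial
# values vs. epsilon-greedy with zero initial values, both with lr = 0.1.
# EG2.opt  = comp.eps.greedy2(1000, 1000, 10, 0.0, 0.1, 5)
# EG2.real = comp.eps.greedy2(1000, 1000, 10, 0.1, 0.1, 0)
# plot(rowMeans(EG2.opt), type = "l", ylab = "Average Reward", xlab = "Trial",
#      ylim = c(0, 2))
# lines(rowMeans(EG2.real), col = 2)
# legend("bottomright", c("optimistic (init.Q = 5)", "realistic (init.Q = 0)"),
#        col = 1:2, lwd = 2)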