広域システム特別講義II 課題1

課題1.1 勾配法を用いた最小化

$z= \frac{1}{20}x^2+y^2$
を最小化するxとyを慣性を含む勾配法を用いて求めてください。

fun01 = function(x,y){
  z = 1/20*x^2 + y^2
  return(z)
}

結果の可視化は以下のようなものでcontourを作図して:

install.packages("plot3D")
library(plot3D)
x = seq(-10,10,0.2)
y = seq(-5,5,0.2)
M = mesh(x,y)
Z = as.vector(1/20*M$x^2)+as.vector(M$y^2) 
Z.mesh = matrix(Z,nrow(M$x))
contour(x, y, Z.mesh, drawlabels = F, nlevels=40)

その後linesで更新履歴を追加してみると良いかもしれません。

# gdは勾配法でのx、yの更新履歴
lines(gd, type='o', col = 'green', pch=20)

# gdmは慣性ありの勾配法でのx、yの更新履歴
lines(gdm, type='o', col = 'blue', pch=20)

実装例

fun01 = function(x,y){
  z = 1/20*x^2 + y^2
  return(z)
}

fun01.grad <- function(x,y){
  grad.x = 1/10*x
  grad.y = 2*y
  return(list(gx=grad.x, gy = grad.y))
}

# script
n.loop = 100; lambda = 0.9
x = y = rep(0,n.loop)
x[1] = -7; y[1] = 2
for (i.loop in 2:n.loop){
  g = fun01.grad(x[i.loop - 1],y[i.loop - 1])
  x[i.loop] = x[i.loop - 1] - lambda*g$gx
  y[i.loop] = y[i.loop - 1] - lambda*g$gy
}

# function
min.fun01 <- function(init.x, init.y, n.loop, labmda){
  x = y = rep(0,n.loop)
  x[1] = init.x; y[1] = init.y
  for (i.loop in 2:n.loop){
    g = fun01.grad(x[i.loop - 1],y[i.loop - 1])
    x[i.loop] = x[i.loop - 1] - lambda*g$gx
    y[i.loop] = y[i.loop - 1] - lambda*g$gy
  }
  return(list(x = x, y = y))
}

res <- min.fun01(-7, 2, 100, 0.9)
temp.x = seq(-10,10,0.2);temp.y = seq(-5,5,0.2)
M = mesh(temp.x,temp.y)
Z = as.vector(1/20*M$x^2)+as.vector(M$y^2)
Z.mesh = matrix(Z,nrow(M$x))
contour(temp.x, temp.y, Z.mesh, drawlabels = F, nlevels=40)
lines(res$x, res$y, type='o', col="green")

min.fun01M <- function(init.x, init.y, n.loop, labmda, gamma){
  x = y = rep(0,n.loop)
  x[1] = init.x; y[1] = init.y; v = rep(0,2)
  for (i.loop in 2:n.loop){
    g = fun01.grad(x[i.loop - 1],y[i.loop - 1])
    v = gamma*v - c(lambda*g$gx, lambda*g$gy)
    x[i.loop] = x[i.loop - 1] + v[1]
    y[i.loop] = y[i.loop - 1] + v[2]
  }
  return(list(x = x, y = y))
}

resM <- min.fun01M(-7, 2, 100, 0.9, 0.4)
lines(resM$x, resM$y, type='o', col="blue")

### with ggplot ###
ES <- tibble(x = as.vector(M$x),
             y = as.vector(M$y),
             z = Z)

res <- tibble(x = c(res$x, resM$x),
              y = c(res$y, resM$y),
              type = c(rep("Std",100), rep("moment.",100)))

ggplot(ES,aes(x = x, y = y, z =z )) + stat_contour(bins = 10) +
  geom_line(aes(x = x, y = y, color = type), data = res)

課題1.2 multi-armed bandit problem
wikipediaの記事
何種類かバージョンはありますが、この課題では各スロットマシーンの真のリワードは、mean = 0、sd = 1の正規分布に従うこととしてください。ただし、ある特定の回で実際に得られるリワードは真のリワード、プラス正規分布にしたがうノイズがあるとします（ここでも、mean = 0、sd = 1としましょう）。
真のリワードは不明で、1000回スロットマシーンを引いた時の総リワードを最大化することを目的とします。
1つ目の方策はgreedy法で、リワードの推定値が最も高いスロットマシーンを選択するといったものです。
2つ目の方策は基本的にはgreedy法で、ある確率epsilonでランダムにスロットマシーンを選択するものです。
各方策につき複数回検証しましょう（M回）。

# スロットマシーンの数を５としました（実際には幾つでも結構です）
N = 5

# スロットマシーンを引く回数を1000としました（実際には幾つでも結構です）
nTrial = 1000

# 真のリワードを生成します。
reward.true = rnorm(N, mean=0, sd=1)
> reward.true
[1] -0.2822860  0.5645874 -0.1968128  0.5430834 -0.3696859

# リワードの推定値を初期化します。
reward.est = rep(0, N)

# リワードの累積和を初期化します。
reward.cum = rep(0, N)

# 各スロットマシーンを引いた回数を初期化します。
sm.count = rep(0, N)

# リワードの履歴を初期化します。
reward.hist = rep(0, nTrial)

# greedy法で、どのスロットマシーンを引くか選択します。
# reward.estの最大値が複数個ある場合は、それらから1つをランダムで選択します。
max.est = which(max(reward.est) == reward.est)
if (length(max.est) > 1){
  selected = sample(max.est, 1)
} else {selected = max.est}

# 今回は5が選択されました。
> selected
[1] 5

# スロットマシーン5を引いてみます。
# 真のリワードは
# > reward.true[selected]
# [1] -0.3696859
# ですが、今回実際に得られるのはこれにノイズが乗ります。
reward = reward.true[selected] + rnorm(1, mean = 0, sd =1)
> reward
[1] -1.61256

reward.hist[1] = reward
# 繰り返す場合は、reward.hist[i.trial] = reward など 


# リワードの推定値をアップデートします
reward.cum[selected] = reward.cum[selected] + reward
sm.count[selected] = sm.count[selected] + 1
reward.est[selected] = reward.cum[selected] / sm.count[selected]
> reward.est
[1]  0.00000  0.00000  0.00000  0.00000 -1.61256

# 2回目
max.est = which(max(reward.est) == reward.est)
if (length(max.est) > 1){
   selected = sample(max.est, 1)
 } else {selected = max.est}
> selected 
[1] 2

reward = reward.true[selected] + rnorm(1, mean = 0, sd =1)
> reward
[1] 1.497099

reward.cum[selected] = reward.cum[selected] + reward
sm.count[selected] = sm.count[selected] + 1
reward.est[selected] = reward.cum[selected] / sm.count[selected]

> reward.est
[1]  0.000000  1.497099  0.000000  0.000000 -1.612560

# これをnTrial分繰り返します。

# 2つの方策の良さの検証は、特定のreward.trueの値に依存するべきではないので、
# reward.trueの値を変えてみます。これをM回繰り返してみましょう。

実装例：

## n-armed bandit
nTrial = 1000; epsilon = 0.0;
Q.true = rnorm(10); opt.ID = which.max(Q.true)
Q.est = Q.cum = act.count = rep(0, 10);
rew.hist = opt.hist = rep(0, nTrial);
for (i_trial in 1:nTrial) {
  if (runif(1) < epsilon) {
    action = sample(1:10, 1)
  } else {
    action = which.max(Q.est)
  }
  rew.hist[i_trial] = rnorm(1) + Q.true[action]
  opt.hist[i_trial] = action == opt.ID
  act.count[action] = act.count[action] + 1
  Q.cum[action] = Q.cum[action] + rew.hist[i_trial]
  Q.est[action] = Q.cum[action] / act.count[action]
}
plot(rew.hist, type='l')

epGreedy = function(nTrial,nRep,epsilon) {
  rew.hist = opt.hist = matrix(0,nrow=nTrial,ncol=nRep)
  for (i_rep in 1:nRep){
    Q.true=rnorm(10); opt.ID=which.max(Q.true)
    Q.est = Q.cum = act.count=rep(0,10);
    for (i_trial in 1:nTrial) {
      if (runif(1) < epsilon) {
        action=sample(1:10,1)
      } else {
        action=which.max(Q.est)
      }
      rew.hist[i_trial,i_rep]=rnorm(1)+Q.true[action]
      opt.hist[i_trial,i_rep]=action==opt.ID
      act.count[action]=act.count[action]+1
      Q.cum[action]=Q.cum[action]+rew.hist[i_trial,i_rep]
      Q.est[action]=Q.cum[action]/act.count[action]
    }
  }
  return(list(opt = opt.hist, rew = rew.hist))
}
n.Trial = 1000; n.Rep = 2000
type1=epGreedy(n.Trial, n.Rep, 0.0)
type2=epGreedy(n.Trial, n.Rep, 0.01)
type3=epGreedy(n.Trial, n.Rep, 0.1)

par(mfrow=c(2,1))
plot(colMeans(type3$rew),type='l',xlab="Play",ylab="average reward")
lines(colMeans(type2$rew),type='l',col='red')
lines(colMeans(type1$rew),type='l',col='green')
legend("bottomright",c("epsilon=0.00","epsilon=0.01","epsilon=0.10"),
       col=c("black","red","green"),lty=c(1,1,1))
plot(colMeans(type3$opt),type='l',xlab="Play",ylab="% optimal action")
lines(colMeans(type2$opt),type='l',col='red')
lines(colMeans(type1$opt),type='l',col='green')
legend("bottomright",c("epsilon=0.00","epsilon=0.01","epsilon=0.10"),
       col=c("black","red","green"),lty=c(1,1,1))

### with ggplot ###
res.all = tibble(trial = rep(1:nTrial, n.Rep * 3),
                 rew = c(as.vector(type1$rew), 
                         as.vector(type2$rew),
                         as.vector(type3$rew)),
                 opt = c(as.vector(type1$opt), 
                         as.vector(type2$opt),
                         as.vector(type3$opt)),
                 condition = c(rep("0.00", nTrial * n.Rep),
                               rep("0.01", nTrial * n.Rep),
                               rep("0.10", nTrial * n.Rep)))

p1 <- ggplot(res.all) +
  geom_smooth(aes(x = trial, y = rew, color = condition)) +
  ylab("Average Reward")

p2 <- ggplot(res.all) +
  geom_smooth(aes(x = trial, y = opt, color = condition)) +
  ylab("Prop. Optimal Action")

library(gridExtra)
grid.arrange(p1, p2, nrow =2)
###

CourseLog

Dept Cognitive & Info Science

広域システム特別講義II 課題1

Related

Share this:

Related