Cognitive Information Analysis Exercise b

library(keras)
library(tensorflow)
tf$compat$v1$disable_eager_execution()


# rescale a tensor so that it can be displayed as an image in [0, 1]
deprocess_image <- function(x) {
  dms <- dim(x)
  x <- x - mean(x)          # center at 0
  x <- x / (sd(x) + 1e-5)   # unit standard deviation
  x <- x * 0.1              # shrink to sd = 0.1
  x <- x + 0.5              # shift to mean 0.5
  x <- pmax(0, pmin(x, 1))  # clip to [0, 1]
  array(x, dim = dms)
}

# build an input image that maximizes the mean activation of one filter
# in the given layer, via gradient ascent in input space
generate_pattern <- function(layer_name, filter_index, size = 150) {
  
  # loss: mean activation of the target filter
  layer_output <- model$get_layer(layer_name)$output
  loss <- k_mean(layer_output[,,,filter_index]) 
  # gradient of the loss with respect to the input image
  grads <- k_gradients(loss, model$input)[[1]]
  
  # gradient-normalization trick
  grads <- grads / (k_sqrt(k_mean(k_square(grads))) + 1e-5)
  
  # function mapping an input image to the loss and the gradient
  iterate <- k_function(list(model$input), list(loss, grads))
  
  # start from a gray image with a little noise
  input_img_data <- 
    array(runif(size * size * 3), dim = c(1, size, size, 3)) * 20 + 128
  
  # run 40 steps of gradient ascent
  step <- 1
  for (i in 1:40) {
    c(loss_value, grads_value) %<-% iterate(list(input_img_data))
    input_img_data <- input_img_data + (grads_value * step) 
  }
  
  img <- input_img_data[1,,,]
  deprocess_image(img) 
}


library(grid)

library(keras)
model <- application_vgg16(
  weights = "imagenet", 
  include_top = FALSE
)
layer_name <- "block3_conv1"
filter_index <- 1
grid.raster(generate_pattern(layer_name, filter_index))
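As an optional extension (not part of the original code), here is a minimal sketch that tiles the patterns for the first four filters of block3_conv1 into a 2 x 2 grid, assuming model and generate_pattern above are already in memory:

library(grid)
grid.newpage()
pushViewport(viewport(layout = grid.layout(2, 2)))
for (i in 1:4) {
  pattern <- generate_pattern("block3_conv1", i)
  row <- ceiling(i / 2)
  col <- (i - 1) %% 2 + 1
  pushViewport(viewport(layout.pos.row = row, layout.pos.col = col))
  grid.raster(pattern)   # draw the pattern in its grid cell
  popViewport()
}
popViewport()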

Cognitive Information Analysis Exercise B

input_chiba <- layer_input(shape = c(28,28,1))

# four parallel branches (Inception-style); each yields a 14 x 14 x 32 output
branch_a = input_chiba %>% 
  layer_conv_2d(filters = 32, kernel_size = 1,
                activation = "relu", strides = 2)

branch_b = input_chiba %>% 
  layer_conv_2d(filters = 32, kernel_size = 1,
                activation = "relu") %>% 
  layer_conv_2d(filters = 32, kernel_size = 2,
                activation = "relu", strides = 2)

branch_c = input_chiba %>% 
  layer_average_pooling_2d(pool_size = 2, strides = 2) %>% 
  layer_conv_2d(filters = 32, kernel_size = 1,
                activation = "relu")

branch_d = input_chiba %>% 
  layer_conv_2d(filters = 32, kernel_size = 1,
                activation = "relu") %>% 
  layer_conv_2d(filters = 32, kernel_size = 1,
                activation = "relu") %>% 
  layer_conv_2d(filters = 32, kernel_size = 2,
                activation = "relu", strides = 2)

# concatenate the branch outputs along the channel axis
concat = layer_concatenate(list(branch_a, branch_b, branch_c, branch_d))

output_chiba = concat %>% layer_flatten() %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")

chiba_model <- keras_model(input_chiba, output_chiba)

mnist <- dataset_mnist()
c(c(train_images, train_labels),c(test_images,test_labels)) %<-% mnist

train_images <- array_reshape(train_images,c(60000,28,28,1))
test_images <- array_reshape(test_images,c(10000,28,28,1))

train_images = train_images/255
test_images = test_images/255

train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

chiba_model %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

history <- chiba_model %>% fit(
  train_images,
  train_labels,
  epochs = 15,
  batch_size =64,
  validation_data = list(test_images,test_labels)
)
plot(history)
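After training, test-set performance can be checked directly; a short sketch using the fitted chiba_model and the arrays above (note that the same test set was already used as validation data during training):

chiba_model %>% evaluate(test_images, test_labels)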

Cognitive Information Analysis Exercise: viz convnet

base_dir <- "/home/shiga/R_data/cat_and_dog/small_set"
train_dir <- file.path(base_dir, "train")
validation_dir <- file.path(base_dir, "validation")
test_dir <- file.path(base_dir, "test")

library(keras)
model <- keras_model_sequential() %>% 
  layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = "relu",
                input_shape = c(150, 150, 3)) %>% 
  layer_max_pooling_2d(pool_size = c(2, 2)) %>% 
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = "relu") %>% 
  layer_max_pooling_2d(pool_size = c(2, 2)) %>% 
  layer_conv_2d(filters = 128, kernel_size = c(3, 3), activation = "relu") %>% 
  layer_max_pooling_2d(pool_size = c(2, 2)) %>% 
  layer_conv_2d(filters = 128, kernel_size = c(3, 3), activation = "relu") %>% 
  layer_max_pooling_2d(pool_size = c(2, 2)) %>% 
  layer_flatten() %>% 
  layer_dropout(rate = 0.5) %>% 
  layer_dense(units = 512, activation = "relu") %>% 
  layer_dense(units = 1, activation = "sigmoid")  

model %>% compile(
  loss = "binary_crossentropy",
  optimizer = optimizer_rmsprop(lr = 1e-4),
  metrics = c("acc")
)

datagen <- image_data_generator(
  rescale = 1/255,
  rotation_range = 40,
  width_shift_range = 0.2,
  height_shift_range = 0.2,
  shear_range = 0.2,
  zoom_range = 0.2,
  horizontal_flip = TRUE
)

test_datagen <- image_data_generator(rescale = 1/255)

train_generator <- flow_images_from_directory(
  train_dir,
  datagen,
  target_size = c(150, 150),
  batch_size = 32,
  class_mode = "binary"
)

validation_generator <- flow_images_from_directory(
  validation_dir,
  test_datagen,
  target_size = c(150, 150),
  batch_size = 32,
  class_mode = "binary"
)

history <- model %>% fit_generator(
  train_generator,
  steps_per_epoch = 100,
  epochs = 100,
  validation_data = validation_generator,
  validation_steps = 50
)
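Optionally, the trained model can be saved so that the visualization code below can be rerun later without retraining (the file name below is an arbitrary choice):

model %>% save_model_hdf5("cats_and_dogs_small_augmented.h5")
# model <- load_model_hdf5("cats_and_dogs_small_augmented.h5")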


###
img_path <- "/home/shiga/R_data/cat_and_dog/small_set/test/cat/cat.1700.jpg"
img <- image_load(img_path, target_size = c(150, 150))
img_tensor <- image_to_array(img)
img_tensor <- array_reshape(img_tensor, c(1, 150, 150, 3))
img_tensor <- img_tensor / 255
plot(as.raster(img_tensor[1,,,]))

layer_outputs <- lapply(model$layers[1:8], function(layer) layer$output)
# Creates a model that will return these outputs, given the model input:
activation_model <- keras_model(inputs = model$input, outputs = layer_outputs)
activations <- activation_model %>% predict(img_tensor)
first_layer_activation <- activations[[1]]
dim(first_layer_activation)

plot_channel <- function(channel) {
  rotate <- function(x) t(apply(x, 2, rev))
  image(rotate(channel), axes = FALSE, asp = 1, 
        col = terrain.colors(12))
}

plot_channel(first_layer_activation[1,,,5])

fifth_layer_activation <- activations[[5]]
plot_channel(fifth_layer_activation[1,,,1])
plot_channel(fifth_layer_activation[1,,,3])
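To look at a whole layer rather than a single channel, every channel of the first layer (32 filters) can be drawn in one figure with base graphics; a minimal sketch using the plot_channel helper above:

op <- par(mfrow = c(4, 8), mar = c(0.1, 0.1, 0.1, 0.1))
n_channels <- dim(first_layer_activation)[4]
for (i in 1:n_channels) {
  plot_channel(first_layer_activation[1, , , i])
}
par(op)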

Cognitive Information Analysis Exercise: stone-taking game (with bugs)

# enumerate the possible shapes of each column (0 to 4 stones):
# column 1 codes a stone as 1 and empty as 0, column 2 uses 11/10, column 3 uses 101/100
col1 = matrix(c(rep(0,4),c(1,0,0,0),c(1,1,0,0),c(1,1,1,0),rep(1,4)),nrow=4,byrow=T)
col2 = matrix(c(rep(10,4),c(11,10,10,10),c(11,11,10,10),c(11,11,11,10),rep(11,4)),nrow=4,byrow=T)
col3 = matrix(c(rep(100,4),c(101,100,100,100),c(101,101,100,100),c(101,101,101,100),rep(101,4)),nrow=4,byrow=T)
act.list = list()
state.temp = list()
counter = 0
Q1 = list()
Q2 = list()
for (i.c1 in 1:5){
  if (sum(col1[,i.c1])==0){
    act1 = c()
  } else {
    act1 = seq(1,sum(col1[,i.c1]),1)
  }
  for (i.c2 in 1:5){
    if (sum(col2[,i.c2])==40){
      act2 = c()
    } else {
      act2 = seq(11,sum(col2[,i.c2]==11)*11,11)
    }
    for (i.c3 in 1:5){
      if (sum(col3[,i.c3])==400){
        act3 = c()
      } else {
        act3 = seq(101,sum(col3[,i.c3]==101)*101,101)
      }
      counter = counter + 1
      state.temp[[counter]] = cbind(col1[,i.c1],col2[,i.c2],col3[,i.c3])
      act.list[[counter]] = c(act1,act2,act3)
      Q1[[counter]] = rep(0, length(c(act1,act2,act3)))
      Q2[[counter]] = rep(0, length(c(act1,act2,act3)))
    }
  }
}
rm.stone <- function(act, st.shape){
  # remove stones according to the coded action; act == -99 means "no action"
  if (act == -99){return(st.shape)}
  if (act > 100){
    # column 3: act is 101, 202, ...; act %% 100 gives the number of stones to remove
    n.remove = act%%100
    n.stone = length(which(st.shape[,3]==101))
    start = (5-n.stone)
    st.shape[(start:(start+n.remove-1)),3] = 100
  } else {
    if (act > 10){
      # column 2: act is 11, 22, ...; act %% 10 gives the number of stones to remove
      n.remove = act%%10
      n.stone = length(which(st.shape[,2]==11))
      start = (5-n.stone)
      st.shape[(start:(start+n.remove-1)),2] = 10
    } else {
      # column 1: act itself is the number of stones to remove
      n.remove = act
      n.stone = length(which(st.shape[,1]==1))
      start = (5-n.stone)
      st.shape[(start:(start+n.remove-1)),1] = 0
    }
  }
  return(st.shape)
}

id.state <- function(st.shape, state.temp){
  for (i.st in 1:125){
    if  (all(st.shape == state.temp[[i.st]])){
      state.idx = i.st 
      break
    }
  }
  return(state.idx)
}

ck.act <- function(Q, act.vec, eta){
  # no legal action left (terminal state)
  if (is.null(act.vec)){
    return(list(act = -99, act.idx = -99))
  }
  if (length(act.vec)==1){
    act = act.vec
  } else {
    # softmax action selection (relies on the global state.idx set by the caller)
    p = exp(eta*Q[[state.idx]])/sum(exp(eta*Q[[state.idx]]))
    act = sample(act.vec, 1, prob = p)
  }
  act.idx = which(act.vec==act)
  return(list(act = act, act.idx = act.idx))
}


gamma=1; alpha=0.1; eta=1; n.rep=10000  # eta was not set in the original; 1 is assumed
for (i.rep in 1:n.rep){
  # first action
  state.idx = 125; counter = 1
  st.shape = state.temp[[state.idx]]
  res.act = ck.act(Q1,act.list[[state.idx]],eta)
  act = res.act$act;act.idx = res.act$act.idx
  state.old = state.idx
  act.old = act.idx
  
  # 2nd to last
  while (state.idx != 1) {
    counter = counter + 1
    st.shape <- rm.stone(act, st.shape)
    state.idx <- id.state(st.shape, state.temp)
    if (counter%%2==1) {
      res.act = ck.act(Q1,act.list[[state.idx]],eta)
    } else {
      res.act = ck.act(Q2,act.list[[state.idx]],eta)
    }
    act = res.act$act; act.idx = res.act$act.idx
    if (state.idx == 1){
      # terminal state: assign the final rewards and update both Q tables
      if (counter%%2==1){rew1 = -1; rew2 = 1;} else {rew1 = 1; rew2 = -1;}
      Q1[[state.old]][act.old] = Q1[[state.old]][act.old] +
        alpha*(rew1 - Q1[[state.old]][act.old])
      Q2[[state.old]][act.old] = Q2[[state.old]][act.old] +
        alpha*(rew2 - Q2[[state.old]][act.old])
    } else {
      rew1 = 0
      rew2 = 0
      if (counter%%2==1){
        Q1[[state.old]][act.old] = Q1[[state.old]][act.old] +
          alpha*(rew1 + gamma*Q1[[state.idx]][act.idx] - Q1[[state.old]][act.old])
      } else {
        Q2[[state.old]][act.old] = Q2[[state.old]][act.old] +
          alpha*(rew2 + gamma*Q2[[state.idx]][act.idx] - Q2[[state.old]][act.old])
      }
    }
    
    state.old = state.idx
    act.old = act.idx
  }
}  
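After training, one way to inspect what was learned is to play a single game greedily with the learned Q tables. This is a minimal sketch (not part of the original code), assuming the objects defined above (state.temp, act.list, Q1, Q2, rm.stone, id.state) are still in memory:

# greedy playout from the full board (state 125)
st.shape <- state.temp[[125]]
state.idx <- 125
turn <- 1
while (state.idx != 1) {
  Q.now <- if (turn %% 2 == 1) Q1 else Q2    # player 1 uses Q1, player 2 uses Q2
  acts <- act.list[[state.idx]]
  act <- acts[which.max(Q.now[[state.idx]])] # greedy choice (first action on ties)
  cat("player", 2 - turn %% 2, "takes action", act, "\n")
  st.shape <- rm.stone(act, st.shape)
  state.idx <- id.state(st.shape, state.temp)
  turn <- turn + 1
}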

Cognitive Information Analysis: practice problem

# fixed
# If this takes too long, reduce the number of repetitions.
# The code is probably not very efficient, so feel free to modify it.

temp.state = expand.grid(loc1 = 0:2, loc2 = 0:2, loc3 = 0:2,
                         loc4 = 0:2, loc5 = 0:2, loc6 = 0:2,
                         loc7 = 0:2, loc8 = 0:2, loc9 = 0:2)
# equivalent, more compact form:
# temp.state = expand.grid(rep(list(0:2), 9))

# keep only boards reachable when player 1 (marks coded as 1) moves first
n.ones = rowSums(temp.state == 1 )
n.twos = rowSums(temp.state == 2 )
omitTwo = which(n.ones < n.twos)
omitOne = which((n.ones-1 ) > n.twos)
omitUniq = unique(c(omitOne, omitTwo))
state = temp.state[-omitUniq,]
poss.act = apply(state, 1, function(x) which(x==0))

temp.win = matrix(1:9,3)
win.idx = matrix(c(temp.win[1,],temp.win[2,],temp.win[3,],
                   temp.win[,1],temp.win[,2],temp.win[,3],
                   diag(temp.win),
                   diag(temp.win[3:1,])),ncol=3,byrow=T)

idx1 = c()
idx2 = c()
idxCM = c()
for (i.win in 1:nrow(win.idx)){
  idx.temp = apply(state, 1, function(x) sum(x[win.idx[i.win,]]==1)==3)
  idx1 = c(idx1, which(idx.temp))
  idxCM.temp = apply(state, 1, function(x) sum(x[win.idx[i.win,]]==1)==2)
  idxCM = c(idxCM, which(idxCM.temp))
  idx.temp = apply(state, 1, function(x) sum(x[win.idx[i.win,]]==2)==3)
  idx2 = c(idx2, which(idx.temp))
}
n0=apply(state,1,function(x) length((which(x==0))))
tie = which(n0==0)

Q = list()
V = list()
rew.sum = list()
rew.count = list()
policy = list()
for (i.state in 1:nrow(state)){
  Q[[i.state]] =  rep(0,length(poss.act[[i.state]]))
  V[[i.state]] = rep(0,length(poss.act[[i.state]]))
  rew.sum[[i.state]] = rep(0,length(poss.act[[i.state]]))
  rew.count[[i.state]] = rep(0,length(poss.act[[i.state]]))
  policy[[i.state]] = rep(1/length(poss.act[[i.state]]),length(poss.act[[i.state]]))
}

R.W  = 10
R.T  = 5
R.L = -10
gamma = 1
epsilon = 0.05
eta = 1

ck.result <- function(st.idx, idx1, idx2, tie){
  term = F
  rew = 0
  result = "not terminal"
  if (match(st.idx ,idx1, nomatch = 0)!=0){
    rew = R.W
    term = T
    result = "win"
  } else if (match(st.idx ,idx2, nomatch = 0)!=0){
    rew = R.L
    term = T
    result = "lose"
  } else if (match(st.idx ,tie, nomatch = 0)!=0){
    rew = R.T
    term = T
    result = "tie"
  }
  return(list(rew = rew, term = term, result = result))
}

n.rep = 10000
game.hist = rep(0,n.rep)

# main loop
for (i.rep in 1:n.rep){
  st.idx = 1
  term = F
  state.temp = rep(0,9)
  state.hist1 = c()
  state.hist2 = c()
  repeat {
    # playing game
    if (length(poss.act[[st.idx]])==1){
      act1 = poss.act[[st.idx]]
    } else{
      p.act = exp(eta*Q[[st.idx]])/sum(exp(eta*Q[[st.idx]]))
      act1 = sample(poss.act[[st.idx]],1, prob = p.act)
    }
    state.hist1 = rbind(state.hist1,c(st.idx, act1))
    state.temp[act1] = 1
    st.idx = which(apply(state, 1, function(x) sum(x==state.temp) )==9)
    res = ck.result(st.idx, idx1, idx2, tie)
    if (res$term == T){
      rew = res$rew
      break
    }
    # opponent (player 2) moves according to its fixed uniform policy
    act2 = sample(poss.act[[st.idx]],1, prob = policy[[st.idx]])
    state.hist2 = rbind(state.hist2,c(st.idx, act2))
    state.temp[act2] = 2
    st.idx = which(apply(state, 1, function(x) sum(x==state.temp) )==9)
    res = ck.result(st.idx, idx1, idx2, tie)
    if (res$term == T){
      rew = res$rew
      break
    }
  }
  # update Q & policy
  game.hist[i.rep] = res$result!="lose"
  n.st = nrow(state.hist1)
  #print(res$result)
  if (i.rep%%100==0){print(i.rep)}
  for (i.st in 1:n.st){
    act.idx = which(poss.act[[state.hist1[i.st,1]]]==state.hist1[i.st,2])
    rew.sum[[state.hist1[i.st,1]]][act.idx] = rew.sum[[state.hist1[i.st,1]]][act.idx] + rew
    rew.count[[state.hist1[i.st,1]]][act.idx] = rew.count[[state.hist1[i.st,1]]][act.idx] + 1
    Q[[state.hist1[i.st,1]]][act.idx] = rew.sum[[state.hist1[i.st,1]]][act.idx] / rew.count[[state.hist1[i.st,1]]][act.idx]
  }
}

# plotting results
library(pracma)
game.hist.smooth = movavg(game.hist, 400, type="s")
plot(game.hist.smooth,type='l')
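As a quick numeric summary to go with the moving-average plot, the proportion of non-losing games over the final stretch of training can be printed directly (the 1000-episode window below is an arbitrary choice):

mean(game.hist[(n.rep - 999):n.rep])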

Cognitive Information Analysis Exercise a: Monte Carlo 01

# deterministic transitions on the 4 x 4 gridworld (state 15 is the terminal state)
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)

r=-1; P=rep(0.25,4); V=rep(0,14); max.iter = 10000
state.count=rep(0,15)
for (i.iter in 1:max.iter){
  state = sample(1:14,1)
  state.seq = state
  while(state!=15){
    action = sample(1:4,1,prob = P)
    state.seq = c(state.seq,trM[state,action])
    state = trM[state,action]  
  }
  uniq.seq = unique(state.seq)
  for (i.uniq in 1:(length(uniq.seq)-1)){
    first.visit = which(state.seq == uniq.seq[i.uniq])[1]
    # return from the first visit: one reward of r per remaining transition
    V[uniq.seq[i.uniq]] = V[uniq.seq[i.uniq]] + r*(length(state.seq)-first.visit)
  }
  state.count[uniq.seq] = state.count[uniq.seq] + 1
}
V = matrix(c(0,V/state.count[1:14],0),nrow=4)

Cognitive Information Analysis: exercise problem from 06/05

The program written during the exercise session had no problems.
Below, I added commands to visualize the results:

# initialization
p.win =0.4; p.lose = 0.6; P = c(p.lose, p.win);
R = c(rep(0,100), 1); V = rep(0,101);
gamma = 1; tol = 1e-10; counter=0
cols = c("red","skyblue",'orange','black')
par(mfrow=c(1,2))

# value iteration
repeat{
  counter = counter+1
  delta = 0
  for (i.state in 2:100) {
    v <- V[i.state]
    temp.v = rep(0, (i.state - 1))
    for (i.bet in 1:(i.state - 1)) {
      lower.B = i.state - i.bet
      upper.B = min(i.state + i.bet, 101)
      temp.v[i.bet] = sum(P * (R[c(lower.B, upper.B)] + gamma * V[c(lower.B, upper.B)]))
      V[i.state] = max(temp.v)
    }
    delta <- max(abs(v-V[i.state]), delta)
  }
  # plotting results
  if (counter==1){
    plot(V[2:100], type='l', col=cols[1], lwd=2, xlab="capital", ylab="value")
  } else {
    if (counter<4){
      lines(V[2:100], type='l', col=cols[counter], lwd=2)
    }
  }
  if (delta < tol){break}
}
# end of value iteration

lines(V[2:100],type='l', col=cols[4], lwd=2)
legend("topleft", c("1st sweep","2nd sweep", "3rd sweep","32nd sweep") ,col=cols, lwd=2)

# identifying optimal action
policy = rep(0,101)
for (i.state in 2:100) {
  temp.v = rep(0, (i.state - 1))
  for (i.bet in 1:(i.state - 1)) {
    lower.B = i.state - i.bet
    upper.B = min(i.state + i.bet, 101)
    temp.v[i.bet] = sum(P * (R[c(lower.B, upper.B)] + gamma * V[c(lower.B, upper.B)]))
    policy[i.state] = which.max(round(temp.v,4))
  }
}
barplot(policy,xlab="capital",ylab="Optimal action")

Cognitive Information Analysis Exercise a: Assignment 03

V=rep(0,25);                

# defining probability matrix
P=matrix(1/4,nrow=25,ncol=4) # 

# defining deterministic transition matrix
north=c(2:25,25)
north[ c(5,10,15,20,25)]=c(5,10,15,20,25)
east=c(6:25,21:25)
west=c(1:5,1:20)
south=c(1,1:24)
south[ c(1,6,11,16,21)]=c(1,6,11,16,21)
trM=cbind(north,east,south,west)
trM[10,]=6
trM[20,]=18

# defining reward matrix
R=matrix(0,nrow=25,ncol=4)
R[which(trM==1:25)]=-1
R[10,]=10
R[20,]=5

delta=1; gamma=0.9; tol=1e-10; 
bestP=sample(1:4,25,replace=T)
stable=F;counter=0;
while (stable==F){
  counter=counter+1
  # iterative policy evaluation (of the equiprobable policy P)
  delta=1
  while (delta>tol) {
    delta=0;
    V.old=V
    for (i_state in 1:25) {
      v=V[i_state]
      V[i_state]=sum(P[i_state,]*(R[i_state,]+gamma*V.old[trM[i_state,]]))
      delta=max(delta,abs(v-V[i_state]))
    }
  }
  # policy improvement: act greedily with respect to the one-step lookahead
  stable=T
  for (i_state in 1:25) {
    b=bestP[i_state]
    bestP[i_state]=which.max(R[i_state,]+gamma*V[trM[i_state,]])
    if (bestP[i_state]!=b){stable=F}
  }
}

apply(matrix(bestP,nrow=5),2,rev)
bestP.mat = apply(matrix(as.character(bestP),nrow=5),2,rev)
bestP.mat[which(bestP.mat=="1")] = "N"
bestP.mat[which(bestP.mat=="2")] = "E"
bestP.mat[which(bestP.mat=="3")] = "S"
bestP.mat[which(bestP.mat=="4")] = "W"
print(bestP.mat)
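It can also help to look at the value function alongside the policy; a minimal sketch that arranges the converged state values on the 5 x 5 grid in the same orientation as bestP.mat:

round(apply(matrix(V, nrow = 5), 2, rev), 1)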

Cognitive Information Analysis: Assignment 2

# initializing Q matrix
Q = P = matrix(1/4,nrow=25,ncol=4) # 

# defining deterministic transition matrix
north=c(2:25,25)
north[ c(5,10,15,20,25)]=c(5,10,15,20,25)
east=c(6:25,21:25)
west=c(1:5,1:20)
south=c(1,1:24)
south[ c(1,6,11,16,21)]=c(1,6,11,16,21)
trM=cbind(north,east,south,west)
trM[10,]=6
trM[20,]=18

R=matrix(0,nrow=25,ncol=4)
R[which(trM==1:25)]=-1
R[10,]=10
R[20,]=5

nRep=1000; gamma=0.9; P = 0.25
for (i_rep in 1:nRep) {
  Q.old = Q
  for (i_state in 1:25) {
    for (i_act in 1:4){
      # expected Q of the next state under the equiprobable policy (P = 0.25 per action)
      Q[i_state, i_act] = R[i_state, i_act] + gamma * sum(P * Q.old[trM[i_state, i_act], ])
    }
  }
}
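To read off the policy implied by the converged Q, the greedy action in each state can be displayed on the 5 x 5 grid, mirroring the bestP.mat display from the previous assignment (a minimal sketch, not part of the original code):

greedyP = apply(Q, 1, which.max)   # column order of trM: 1 = N, 2 = E, 3 = S, 4 = W
greedyP.mat = apply(matrix(as.character(greedyP), nrow = 5), 2, rev)
greedyP.mat[greedyP.mat == "1"] = "N"
greedyP.mat[greedyP.mat == "2"] = "E"
greedyP.mat[greedyP.mat == "3"] = "S"
greedyP.mat[greedyP.mat == "4"] = "W"
print(greedyP.mat)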

Reinforcement Learning: comparing policies 1

# select the action with the largest Q (ties broken at random)
max.Q <- function(Q){
  max.a = max(Q)
  max.idx = which(Q == max.a)
  if (length(max.idx)>1){
    max.idx = sample(max.idx, 1)
  }
  return(max.idx)
}

# greedy policy
greedy <- function(n.trial, Q.star, N){
  Q = Q.cum = count = rep(0, N)
  rew.earned = rep(0, n.trial)
  for (i.trial in 1:n.trial){
    act.idx = max.Q(Q)
    r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
    Q.cum[act.idx] = Q.cum[act.idx] + r.t
    count[act.idx] = count[act.idx] + 1
    Q[act.idx] = Q.cum[act.idx] / count[act.idx]
    rew.earned[i.trial] = r.t
  }
  return(list(Q = Q, rew.earned = rew.earned))
}

# epsilon-greedy policy 
# with epsilon = 0 this is equivalent to the greedy policy
eps.greedy <- function(n.trial, Q.star, N, epsilon){
  Q = Q.cum = count = rep(0, N)
  rew.earned = rep(0, n.trial)
  for (i.trial in 1:n.trial){
    if (runif(1) < epsilon){
      act.idx = sample(1:N, 1)
    } else {
      act.idx = max.Q(Q)
    }
    r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
    Q.cum[act.idx] = Q.cum[act.idx] + r.t
    count[act.idx] = count[act.idx] + 1
    Q[act.idx] = Q.cum[act.idx] / count[act.idx]
    rew.earned[i.trial] = r.t
  }
  return(list(Q = Q, rew.earned = rew.earned))
}

# function that repeats the simulation n.rep times
comp.eps.greedy <- function(n.trial, n.rep, N, epsilon){
  rew.history = matrix(0, nrow = n.trial, ncol = n.rep)
  for (i.rep in 1:n.rep){
    Q.star = rnorm(N, mean = 0, sd = 1);
    res = eps.greedy(n.trial, Q.star, N, epsilon)
    rew.history[ , i.rep] = res$rew.earned
  }
  return(rew.history)
}

# run
EG.000 = comp.eps.greedy(1000, 1000, 10, 0.000)
EG.001 = comp.eps.greedy(1000, 1000, 10, 0.001)
EG.010 = comp.eps.greedy(1000, 1000, 10, 0.010)
EG.100 = comp.eps.greedy(1000, 1000, 10, 0.100)
EG.150 = comp.eps.greedy(1000, 1000, 10, 0.150)

# visualize the results
plot(rowMeans(EG.000), type="l", ylab="Average Reward", xlab="Trial",
     ylim = c(0,2))
lines(rowMeans(EG.001),col=2)
lines(rowMeans(EG.010),col=3)
lines(rowMeans(EG.100),col=4)
lines(rowMeans(EG.150),col=5)
legend("bottomright",
       c("epsilon = 0.000",
         "epsilon = 0.001",
         "epsilon = 0.010",
         "epsilon = 0.100",
         "epsilon = 0.150"),
       col=1:5,lwd=2 )

# softmax
softmax <- function(n.trial, Q.star, N, tau){
  Q = Q.cum = count = rep(0, N)
  rew.earned = rep(0, n.trial)
  for (i.trial in 1:n.trial){
    p = exp(Q*tau)/sum(exp(Q*tau))
    act.idx = sample(1:N, 1, prob = p)
    r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
    Q.cum[act.idx] = Q.cum[act.idx] + r.t
    count[act.idx] = count[act.idx] + 1
    Q[act.idx] = Q.cum[act.idx] / count[act.idx]
    rew.earned[i.trial] = r.t
  }
  return(list(Q = Q, rew.earned = rew.earned))
}

comp.softmax <- function(n.trial, n.rep, N, tau){
  rew.history = matrix(0, nrow = n.trial, ncol = n.rep)
  for (i.rep in 1:n.rep){
    Q.star = rnorm(N, mean = 0, sd = 1);
    res = softmax(n.trial, Q.star, N, tau)
    rew.history[ , i.rep] = res$rew.earned
  }
  return(rew.history)
}

# run
EG.000 = comp.eps.greedy(1000, 1000, 10, 0.000)
EG.010 = comp.eps.greedy(1000, 1000, 10, 0.010)
EG.100 = comp.eps.greedy(1000, 1000, 10, 0.100)
SM.10 = comp.softmax(1000,1000,10,10)
SM.03 = comp.softmax(1000,1000,10,3)
# visualize the results
plot(rowMeans(EG.000), type="l", ylab="Average Reward", xlab="Trial",
     ylim = c(0,2))
lines(rowMeans(EG.010),col=2)
lines(rowMeans(EG.100),col=3)
lines(rowMeans(SM.10),col=4)
lines(rowMeans(SM.03),col=5)
legend("bottomright",
       c("epsilon = 0.000",
         "epsilon = 0.010",
         "epsilon = 0.100",
         "tau = 10",
         "tau = 3"),
       col=1:5,lwd=2 )

# epsilon greedy (2nd version)
eps.greedy2 <- function(n.trial, Q.star, N, epsilon, lr, init.Q){
  Q = rep(init.Q, N)
  rew.earned = rep(0, n.trial)
  for (i.trial in 1:n.trial){
    if (runif(1) < epsilon){
      act.idx = sample(1:N, 1)
    } else {
      act.idx = max.Q(Q)
    }
    r.t = rnorm(1, mean = Q.star[act.idx], sd = 1)
    Q[act.idx] = Q[act.idx] + lr*(r.t - Q[act.idx])
    rew.earned[i.trial] = r.t
  }
  return(list(Q = Q, rew.earned = rew.earned))
}
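
No wrapper like comp.eps.greedy is given for this second version, so here is a minimal sketch of one, plus a comparison of optimistic (init.Q = 5) versus realistic (init.Q = 0) initial values; the specific parameter values are arbitrary choices:

comp.eps.greedy2 <- function(n.trial, n.rep, N, epsilon, lr, init.Q){
  rew.history = matrix(0, nrow = n.trial, ncol = n.rep)
  for (i.rep in 1:n.rep){
    Q.star = rnorm(N, mean = 0, sd = 1)
    res = eps.greedy2(n.trial, Q.star, N, epsilon, lr, init.Q)
    rew.history[ , i.rep] = res$rew.earned
  }
  return(rew.history)
}

OPT.00 = comp.eps.greedy2(1000, 1000, 10, 0.0, 0.1, 5)   # optimistic, greedy
REA.10 = comp.eps.greedy2(1000, 1000, 10, 0.1, 0.1, 0)   # realistic, epsilon = 0.1
plot(rowMeans(OPT.00), type = "l", ylab = "Average Reward", xlab = "Trial",
     ylim = c(0, 2))
lines(rowMeans(REA.10), col = 2)
legend("bottomright",
       c("init.Q = 5, epsilon = 0.0", "init.Q = 0, epsilon = 0.1"),
       col = 1:2, lwd = 2)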