広域システム特別講義II: Reinforcement Learning 1B

mk_MC_seq <- function(){
  # transition matrix & action probabilities for the 4x4 gridworld
  # states 1-14 are non-terminal; both terminal corners are coded as state 15
  north=c(1:3,15,1:10)
  east=2:15; east[c(3,7,11)]=c(3,7,11)     # right-edge states stay in place
  south=c(5:15,12:14)
  west=c(15,1:13); west[c(4,8,12)]=c(4,8,12)  # left-edge states stay in place
  trM=cbind(north,east,south,west)
  P=rep(0.25,4)   # equiprobable random policy
  # generate one episode under the random policy
  state = sample(1:14,1)
  state.seq = state
  while(state!=15){
    action = sample(1:4,1,prob = P)
    state.seq = c(state.seq,trM[state,action])
    state = trM[state,action]
  }
  return(state.seq)
}
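
# usage example (illustrative): one episode starts at a random non-terminal
# state and ends at the terminal state 15
ep = mk_MC_seq()
head(ep); tail(ep)   # the last element is always 15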

# first-visit MC: accumulate returns and visit counts for one episode

cal.cumV.MC <- function(cumV, state.seq, state.count){
  r = -1   # reward of -1 on every transition
  uniq.seq = unique(state.seq)
  # the last unique state is the terminal state (15); it is skipped
  for (i.uniq in 1:(length(uniq.seq)-1)){
    first.visit = which(state.seq == uniq.seq[i.uniq])[1]
    # return from the first visit = -(number of remaining transitions)
    cumV[uniq.seq[i.uniq]] = cumV[uniq.seq[i.uniq]] + r*(length(state.seq)-first.visit)
  }
  state.count[uniq.seq] = state.count[uniq.seq] + 1
  return(list(cumV = cumV, state.count = state.count))
}


# main script: evaluate the random policy by first-visit MC
max.iter = 10000
state.count = rep(0,15); cumV = rep(0,14)
for (i.iter in 1:max.iter){
  state.seq = mk_MC_seq()
  updates = cal.cumV.MC(cumV, state.seq, state.count)
  state.count = updates$state.count
  cumV = updates$cumV
}
# average return per state, arranged as the 4x4 grid (row-wise); terminal corners fixed at 0
V = matrix(c(0,cumV/state.count[1:14],0),nrow=4,byrow=T)
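
# for reference, dynamic programming (Sutton & Barto, Fig. 4.1) gives the exact
# values of the random policy as
#    0 -14 -20 -22
#  -14 -18 -20 -20
#  -20 -20 -18 -14
#  -22 -20 -14   0
# the MC estimate above should be close to these after 10000 episodes
round(V,1)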


# function to calc. hand value (an ace is counted as 11 when it does not bust)
card.value<-function(adj.cards) {
  sum.cards=sum(adj.cards)
  if (any(adj.cards==1) & sum.cards<=11) {
    sum.cards=sum.cards+10
    usableA=1          # usable ace: TRUE
  } else {usableA=2}   # usable ace: FALSE
  return(c(sum.cards,usableA))
}
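
# usage examples (illustrative)
card.value(c(1,6))      # returns c(17,1): soft 17, usable ace
card.value(c(10,6,5))   # returns c(21,2): hard 21, no usable ace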

# function to calc. reward (+1 win, -1 loss, 0 draw)
calc.reward<-function(p.val,d.val) {
  if (p.val>21) {           # player busts
    reward=-1
  } else if (d.val>21) {    # dealer busts
    reward=1
  } else if (p.val==d.val) {
    reward=0
  } else {
    reward=ifelse(p.val>d.val,1,-1)
  }
  return(reward)
}
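
# usage examples (illustrative)
calc.reward(20,18)   #  1: player wins
calc.reward(22,18)   # -1: player busts
calc.reward(19,19)   #  0: draw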

# playing a single game
play.BJ <- function(policy){
  cards=sample(rep(1:13,4))   # shuffled 52-card deck (ranks 1-13)
  player=cards[1:2];  adj.player=pmin(player,10)
  dealer=cards[3:4];  adj.dealer=pmin(dealer,10)
  cards=cards[-(1:4)]
  d.val=card.value(adj.dealer)
  p.val=card.value(adj.player)
  state.hist=c(adj.dealer[1],p.val[1],p.val[2])
  while (p.val[1] < policy) {   # player hits until the fixed threshold
    player=c(player,cards[1]); adj.player=pmin(player,10)
    cards=cards[-1]
    p.val=card.value(adj.player)
    state.hist=rbind(state.hist,c(adj.dealer[1],p.val[1],p.val[2]))
  }
  while (d.val[1] < 17) {       # dealer hits until 17 or more
    dealer=c(dealer,cards[1]); adj.dealer=pmin(dealer,10)
    cards=cards[-1]
    d.val=card.value(adj.dealer)
  }
  return(list(p.val = p.val, d.val = d.val, state.hist = state.hist))
}

# main function
BJ_MC_fixedPolicy<-function(policy=20,maxIter=1e6){
  rew.sum=array(0,dim=c(10,10,2))
  rew.count=array(0,dim=c(10,10,2))
  for (i_play in 1:maxIter) {
    result <- play.BJ(policy)
    p.val = result$p.val
    d.val = result$d.val
    state.hist = result$state.hist
    rew=calc.reward(p.val[1],d.val[1])
    n.state=nrow(state.hist)
    if (is.null(n.state)) {   # a single-state history comes back as a vector, not a matrix
      n.state=1
      state.hist=t(as.matrix(state.hist))
    }
    for (i_state in 1:n.state) {
      # only states with player sums 12-21 are tabulated
      if (state.hist[i_state,2] > 11 & state.hist[i_state,2] < 22) {
        ind = state.hist[i_state,] - c(0,11,0)   # (dealer card, player sum-11, usable ace)
        rew.sum[ind[1],ind[2],ind[3]] = rew.sum[ind[1],ind[2],ind[3]] + rew
        rew.count[ind[1],ind[2],ind[3]] = rew.count[ind[1],ind[2],ind[3]] + 1
      }
    }
  }
  return(rew.sum/rew.count)
}
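
# example run (illustrative; 1e5 games give a rough picture much faster than 1e6)
V.fixed = BJ_MC_fixedPolicy(policy=20, maxIter=1e5)
round(V.fixed[,,2],2)   # rows: dealer showing (ace=1), cols: player sum 12-21, no usable ace
image(1:10, 12:21, V.fixed[,,2], xlab="dealer showing", ylab="player sum",
      main="State values, stick-on-20 policy (no usable ace)")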

play.BJ2 <- function(policy){
  cards=sample(c(rep(1:10,4),rep(10,12)))   # deck with 10/J/Q/K all counted as 10
  player=cards[1:2]; dealer=cards[3:4]; cards=cards[-(1:4)]
  d.val=card.value(dealer); p.val=card.value(player)
  
  while( p.val[1] < 12 ) {   # sums below 12: always hit (hitting cannot bust)
    player=c(player,cards[1])
    cards=cards[-1]
    p.val=card.value(player)
  }
  # initial random action
  action=sample(0:1,1)
  state.hist=c(dealer[1],p.val[1],p.val[2],(action+1))
  
  # player's action: keep hitting while the policy says hit
  while (action==1 & p.val[1]<22) {
    player=c(player,cards[1])
    cards=cards[-1]
    p.val=card.value(player)
    if (p.val[1]<22) {
      # choose the next action for the new state and record the state
      # together with the action actually taken in it
      action=policy[dealer[1],(p.val[1]-11),p.val[2]]
      state.hist=rbind(state.hist,c(dealer[1],p.val[1],p.val[2],(action+1)))
    }
  }
  
  # dealer's action
  while (d.val[1]<17) {
    dealer=c(dealer,cards[1])
    cards=cards[-1]
    d.val=card.value(dealer)
  }
  return(list(p.val = p.val, d.val = d.val, state.hist = state.hist))
}
  

BJ_MC<-function(maxIter=1e6){
  rew.sum=array(0,dim=c(10,10,2,2))
  rew.count=array(1,dim=c(10,10,2,2))   # initialized at 1 to avoid division by zero
  Q=array(0,dim=c(10,10,2,2))
  V=array(0,dim=c(10,10,2))
  # policy: 1 = hit, 0 = stay (initialized at random)
  policy=array(sample(0:1,10*10*2,replace=T),dim=c(10,10,2))
  for (i_play in 1:maxIter) {
    result = play.BJ2(policy)
    p.val = result$p.val
    d.val = result$d.val
    state.hist = result$state.hist
    rew=calc.reward(p.val[1],d.val[1])
    n.state=nrow(state.hist)
    if (is.null(n.state)) {
      n.state=1
      state.hist=t(as.matrix(state.hist))
    }
    for (i_state in 1:n.state) {
      if (state.hist[i_state,2]>11 & state.hist[i_state,2]<22) {
        ind=state.hist[i_state,]-c(0,11,0,0)   # (dealer, player sum-11, usable ace, action)
        rew.sum[ind[1],ind[2],ind[3],ind[4]]= rew.sum[ind[1],ind[2],ind[3],ind[4]]+rew
        rew.count[ind[1],ind[2],ind[3],ind[4]]=rew.count[ind[1],ind[2],ind[3],ind[4]]+1
      }
    }
    # policy improvement after each episode: greedy w.r.t. the current Q
    Q=rew.sum/rew.count
    policy[,,1]=Q[,,1,1] < Q[,,1,2]
    policy[,,2]=Q[,,2,1] < Q[,,2,2]
  }
  V[,,1]=(rew.sum[,,1,1]+rew.sum[,,1,2])/(rew.count[,,1,1]+rew.count[,,1,2])
  V[,,2]=(rew.sum[,,2,1]+rew.sum[,,2,2])/(rew.count[,,2,1]+rew.count[,,2,2])
  return(list(policy = policy, V = V, Q = Q))
}
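
# example run (illustrative): inspect the learned policy and values;
# policy entries are 1 = hit, 0 = stay (rows: dealer showing, cols: player sum 12-21)
res = BJ_MC(maxIter=1e5)
res$policy[,,2]   # no usable ace
round(res$V[,,2],2)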

# TD random walk
TD0.ex1<-function(maxItr,alpha,gamma) {
  V=c(0,rep(0.5,5),0)
  V.hist=matrix(0,nrow=maxItr+1,ncol=5)
  V.hist[1,]=V[2:6]
  P.act=matrix(0.5,ncol=7,nrow=2)
  for (i_rep in 1:maxItr) {
    state=4   # start at the center state (C)
    while (state!=1 & state!=7) {
      action=sample(c(-1,1),1,prob=P.act[,state])
      state.old=state
      state=state+action
      r=ifelse(state==7,1,0)
      V[state.old]=V[state.old]+alpha*(r+gamma*V[state]-V[state.old])
    }
    V.hist[(i_rep+1),]=V[2:6]
  }
  return(V.hist)  
}

# constant step-size Monte Carlo
constMC.ex1<-function(maxItr,alpha) {
  V=c(0,rep(0.5,5),0)
  V.hist=matrix(0,nrow=maxItr+1,ncol=5)
  V.hist[1,]=V[2:6]
  P.act=matrix(0.5,ncol=7,nrow=2)
  for (i_rep in 1:maxItr) {
    state=4   # start at the center state (C)
    state.hist=state
    while (state!=1 & state!=7) {
      action=sample(c(-1,1),1,prob=P.act[,state])
      state=state+action
      state.hist=cbind(state.hist,state)
    }
    R=ifelse(state==7,1,0)
    n.state=length(state.hist)
    for (i_state in 1:(n.state-1)) {
      V[state.hist[i_state]]=V[state.hist[i_state]]+
        alpha*(R-V[state.hist[i_state]])
    }
    V.hist[(i_rep+1),]=V[2:6]
  }
  return(V.hist)  
}

# (re)creating Fig 6.7
alphaTD=c(0.05,0.075,0.1,0.15)
alphaMC=c(0.01,0.02,0.03,0.04)
n.alphas=length(alphaTD)
pchs=0:(n.alphas-1)
true.V=1:5*(1/6)
n_rep=100
sqs=seq(1,101,2)
plot(0,0,type='n',xlim=c(0,100),ylim=c(0,0.25),
     xlab="Episodes",ylab="RMS error (averaged over states)")
for (i_alpha in 1:n.alphas) {
  rmsTD=matrix(0,101,n_rep)
  rmsMC=matrix(0,101,n_rep)
  for (i_rep in 1:n_rep) {
    resTD=TD0.ex1(100,alphaTD[i_alpha],1)
    resMC=constMC.ex1(100,alphaMC[i_alpha])
    for (i_gen in 1:101) {
      rmsTD[i_gen,i_rep]=sqrt(mean((resTD[i_gen,]-true.V)^2))
      rmsMC[i_gen,i_rep]=sqrt(mean((resMC[i_gen,]-true.V)^2))
    }
  }
  mTD=rowMeans(rmsTD)
  mMC=rowMeans(rmsMC)
  lines(mTD,col='red')
  lines(mMC,col='blue')
  lines(sqs,mTD[sqs],col='red',pch=pchs[i_alpha],type='p')
  lines(sqs,mMC[sqs],col='blue',pch=pchs[i_alpha],type='p')
}
labs=c("MC, alpha=0.01",
       "MC, alpha=0.02", 
       "MC, alpha=0.03",
       "MC, alpha=0.04",
       "TD, alpha=0.05",
       "TD, alpha=0.075",
       "TD, alpha=0.10",
       "TD, alpha=0.15")  
legend('topright',labs,col=c(rep('blue',4),rep('red',4)),pch=rep(0:3,2),lwd=1.5)


sarsa.ex6.5<-function(maxItr,alpha,gamma,epsilon) {
  # field size: 7 rows x 10 columns
  # horizontal move -> COLUMN
  # vertical move   -> ROW
  # effect of wind  -> ROW
  # actions: 1-up, 2-right, 3-down, 4-left
  act.V=matrix(c(1,0,0,1,-1,0,0,-1),nrow=4,byrow=T)
  wind=matrix(c(0,0,0,0,0,0,1,0,1,0,1,0,2,0,2,0,1,0,0,0),byrow=T,nrow=10)
  goal=c(4,8)
  Qs=array(0,dim=c(7,10,4))
  for (i_rep in 1:maxItr) {
    state=c(4,1) # start
    if (runif(1) > epsilon) {
      move=which.max(Qs[state[1],state[2],])
    } else { move=sample(1:4,1)}
    while (!all(state==goal)) {
      st.old=state
      mv.old=move
      state=state+act.V[move,]+wind[state[2],]
      if (state[1]<1) {state[1]=1}
      if (state[1]>7) {state[1]=7}
      if (state[2]<1) {state[2]=1}
      if (state[2]>10) {state[2]=10}
      if (runif(1) > epsilon) {
        move=which.max(Qs[state[1],state[2],])
      } else { move=sample(1:4,1)}
      rew=ifelse(all(state==goal),0,-1)
      # Sarsa update
      Qs[st.old[1],st.old[2],mv.old]=Qs[st.old[1],st.old[2],mv.old]+
        alpha*(rew+gamma*Qs[state[1],state[2],move]-Qs[st.old[1],st.old[2],mv.old])
    }
  }
  return(Qs)
}    

# running example
Qs=sarsa.ex6.5(5e6,0.1,1,0.1)
# sim optimal actions (act.V & wind as defined inside sarsa.ex6.5)
act.V=matrix(c(1,0,0,1,-1,0,0,-1),nrow=4,byrow=T)
wind=matrix(c(0,0,0,0,0,0,1,0,1,0,1,0,2,0,2,0,1,0,0,0),byrow=T,nrow=10)
state=c(4,1);goal=c(4,8)
state.hist=state
while (!all(state==goal)) {
  moveID=which.max(Qs[state[1],state[2],])
  state=state+act.V[moveID,]+wind[state[2],]
  if (state[1]<1) {state[1]=1}
  if (state[1]>7) {state[1]=7}
  if (state[2]<1) {state[2]=1}
  if (state[2]>10) {state[2]=10}
  state.hist=rbind(state.hist,state)
}
# plotting results
plot(0,0,type='n',xlim=c(0,11),ylim=c(0,8),xlab="",ylab="",
     main="Learned policies -- Sarsa")
lines(1,4,type='p',pch=19,col='red',cex=2)   # start (row 4, column 1)
lines(8,4,type='p',pch=19,col='red',cex=2)   # goal (row 4, column 8)
dirs=c("up","right","down","left" )
for (i_row in 1:7) {
  for (i_col in 1:10) {
    best.move=dirs[which.max(Qs[i_row,i_col,])]
    text(i_col,i_row,best.move)
  }
}
lines(state.hist[,2],state.hist[,1],col="red",lwd=2)


Qlearn.ex6.5<-function(maxItr,alpha,gamma,epsilon) {
  # field size: 7 rows x 10 columns
  # horizontal move -> COLUMN
  # vertical move   -> ROW
  # effect of wind  -> ROW
  # actions: 1-up, 2-right, 3-down, 4-left
  act.V=matrix(c(1,0,0,1,-1,0,0,-1),nrow=4,byrow=T)
  wind=matrix(c(0,0,0,0,0,0,1,0,1,0,1,0,2,0,2,0,1,0,0,0),byrow=T,nrow=10)
  goal=c(4,8)
  Qs=array(0,dim=c(7,10,4))
  for (i_rep in 1:maxItr) {
    state=c(4,1) # start
    while (!all(state==goal)) {
      if (runif(1) > epsilon) {
        move=which.max(Qs[state[1],state[2],])
      } else { move=sample(1:4,1)}
      sIDX=state
      state=state+act.V[move,]+wind[state[2],]
      if (state[1]<1) {state[1]=1}
      if (state[1]>7) {state[1]=7}
      if (state[2]<1) {state[2]=1}
      if (state[2]>10) {state[2]=10}   
      max.Q=max(Qs[state[1],state[2],])
      rew=ifelse(all(state==goal),0,-1)
      # Q-learning update
      Qs[sIDX[1],sIDX[2],move]=Qs[sIDX[1],sIDX[2],move]+
        alpha*(rew+gamma*max.Q-Qs[sIDX[1],sIDX[2],move])
    }
  }
  return(Qs)
}

Qs=Qlearn.ex6.5(1e6,0.05,1,0.1)
# sim optimal actions (act.V & wind as defined above)
state=c(4,1);goal=c(4,8)
state.hist=state
while (!all(state==goal)) {
  moveID=which.max(Qs[state[1],state[2],])
  state=state+act.V[moveID,]+wind[state[2],]
  if (state[1]<1) {state[1]=1}
  if (state[1]>7) {state[1]=7}
  if (state[2]<1) {state[2]=1}
  if (state[2]>10) {state[2]=10}
  state.hist=rbind(state.hist,state)
}
# plotting results
plot(0,0,type='n',xlim=c(0,11),ylim=c(0,8),xlab="",ylab="",
     main="Learned policies -- Q-learning")
lines(1,4,type='p',pch=19,col='red',cex=2)   # start (row 4, column 1)
lines(8,4,type='p',pch=19,col='red',cex=2)   # goal (row 4, column 8)
dirs=c("up","right","down","left" )
for (i_row in 1:7) {
  for (i_col in 1:10) {
    best.move=dirs[which.max(Qs[i_row,i_col,])]
    text(i_col,i_row,best.move)
  }
}
lines(state.hist[,2],state.hist[,1],col="red",lwd=2)
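
# quick check (illustrative): length of the greedy path found above;
# the minimum for this windy gridworld is 15 moves (Sutton & Barto, Example 6.5)
nrow(state.hist)-1   # number of moves from start to goal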