RL Sutton & Barto Ch.6

#################################################
#  ch6.3 Temporal Difference 
#   TD0 & constant step-size MC
#################################################
# TD0 model
TD0.ex1<-function(maxItr,alpha,gamma) {
  V=c(0,rep(0.5,5),0)
  V.hist=matrix(0,nrow=maxItr+1,ncol=5)
  V.hist[1,]=V[2:6]
  P.act=matrix(0.5,ncol=7,nrow=2)
  for (i_rep in 1:maxItr) {
    state=5
    while (state!=1 & state!=7) {
      action=sample(c(-1,1),1,prob=P.act[,state])
      state.old=state
      state=state+action
      r=ifelse(state==7,1,0)
      V[state.old]=V[state.old]+alpha*(r+gamma*V[state]-V[state.old])
    }
    V.hist[(i_rep+1),]=V[2:6]
  }
  return(V.hist)  
}
 
# (re)creating Fig 6.6
true.V=1:5*(1/6)
res=TD0.ex1(1000,0.1,1)
plot(true.V,type='o',pch=15,ylim=c(0,1),ylab="Value",xaxt="n",
  xlab="State",xlim=c(0.5,5.5),cex=2,lwd=2)
axis(1,at=1:5,labels=c("A","B","C","D","E"))
cols=c('red','blue','green','cyan','magenta')
ns=c(1,2,11,101,1001)
for (i_lines in 1:5) {
  lines(res[ns[i_lines],],type='o',pch=15+i_lines,cex=2,lwd=2,col=cols[i_lines])
}
legend('topleft',c('True value','t=0','t=1','t=10','t=100','t=1000'),
 col=c('black',cols),pch=15:20,lwd=1.5)
# constant step-size Monte Carlo
constMC.ex1<-function(maxItr,alpha) {
  V=c(0,rep(0.5,5),0)
  V.hist=matrix(0,nrow=maxItr+1,5)
  V.hist[1,]=V[2:6]
  P.act=matrix(0.5,ncol=7,nrow=2)
  for (i_rep in 1:maxItr) {
  state=5;
  state.hist=state
  while (state!=1 & state!=7) {
    action=sample(c(-1,1),1,prob=P.act[,state])
    state=state+action
    state.hist=cbind(state.hist,state)
  }
  R=ifelse(state==7,1,0)
  n.state=length(state.hist)
  for (i_state in 1:(n.state-1)) {
    V[state.hist[i_state]]=V[state.hist[i_state]]+
     alpha*(R-V[state.hist[i_state]])
  }
  V.hist[(i_rep+1),]=V[2:6]
 }
 return(V.hist)  
}

# (re)creating Fig 6.7
alphaTD=c(0.05,0.075,0.1,0.15)
alphaMC=c(0.01,0.02,0.03,0.04)
n.alphas=length(alphaTD)
pchs=0:(0+n.alphas)
true.V=1:5*(1/6)
n_rep=100
sqs=seq(1,101,2)
plot(0,0,type='n',xlim=c(0,100),ylim=c(0,0.25))
for (i_alpha in 1:n.alphas) {
  rmsTD=matrix(0,101,n_rep)
  rmsMC=matrix(0,101,n_rep)
  for (i_rep in 1:n_rep) {
     resTD=TD0.ex1(100,alphaTD[i_alpha],1)
     resMC=constMC.ex1(100,alphaMC[i_alpha])
     for (i_gen in 1:101) {
       rmsTD[i_gen,i_rep]=sqrt(mean((resTD[i_gen,]-true.V)^2))
       rmsMC[i_gen,i_rep]=sqrt(mean((resMC[i_gen,]-true.V)^2))
     }
  }
  mTD=rowMeans(rmsTD)
  mMC=rowMeans(rmsMC)
  lines(mTD,col='red')
  lines(mMC,col='blue')
  lines(sqs,mTD[sqs],col='red',pch=pchs[i_alpha],type='p')
  lines(sqs,mMC[sqs],col='blue',pch=pchs[i_alpha],type='p')
}
labs=c("MC, alpha=0.01",
       "MC, alpha=0.02", 
       "MC, alpha=0.03",
       "MC, alpha=0.04",
       "TD, alpha=0.05",
       "TD, alpha=0.075",
       "TD, alpha=0.10",
       "TD, alpha=0.15")  
legend('topright',labs,col=c(rep('blue',4),rep('red',4)),pch=rep(0:3,2),lwd=1.5)
#################################################
#  ch6.4 On-policy TD, Sarsa 
#################################################
sarsa.ex6.5<-function(maxItr,alpha,gamma,epsilon) {
# field size: 7row x 10column
# horizontal move ->  COLUMN 
# vertical move     ->  ROW 
# effect of wind     ->  ROW
# actions: 1-up, 2-right, 3-down, 4-left 
act.V=matrix(c(1,0,0,1,-1,0,0,-1),nrow=4,byrow=T)
wind=matrix(c(0,0,0,0,0,0,1,0,1,0,1,0,2,0,2,0,1,0,0,0),byrow=T,nrow=10)
goal=c(4,8)
Qs=array(0,dim=c(7,10,4))
for (i_rep in 1:maxItr) {
  state=c(4,1) # start
  if (runif(1) > epsilon) {
    move=which.max(Qs[state[1],state[2],])
  } else { move=sample(1:4,1)}
  while (!all(state==goal)) {
    st.old=state
    mv.old=move
    state=state+act.V[move,]+wind[state[2],]
    if (state[1]<1) {state[1]=1}
    if (state[1]>7) {state[1]=7}
    if (state[2]<1) {state[2]=1}
    if (state[2]>10) {state[2]=10}
    if (runif(1) > epsilon) {
      move=which.max(Qs[state[1],state[2],])
    } else { move=sample(1:4,1)}
     rew=ifelse(all(state==goal),0,-1)
    Qs[st.old[1],st.old[2],mv.old]=Qs[st.old[1],st.old[2],mv.old]
     +alpha*(rew+gamma* Qs[state[1],state[2],move]
     -Qs[st.old[1],st.old[2],mv.old])
  }
}
return(Qs)
}    
# running example
Qs=sarsa.ex6.5(5e6,0.1,1,0.1)
# sim optimal actions
state=c(4,1);goal=c(4,8);
state.hist=state
while (!all(state==goal)) {
  moveID=which.max(Qs[state[1],state[2],])
  state=state+act.V[moveID,]+wind[state[2],]
  if (state[1]<1) {state[1]=1}
  if (state[1]>7) {state[1]=7}
  if (state[2]<1) {state[2]=1}
  if (state[2]>10) {state[2]=10}
  state.hist=rbind(state.hist,state)
}
# plotting results
plot(0,0,type='n',xlim=c(0,11),ylim=c(0,8),xlab="",ylab="",
  main="Learned policies -- Sarsa")
lines(1,4,type='p',pch=19,col='red',cex=2)
lines(8,4,type='p',pch=19,col='red',cex=2)
dirs=c("up","right","down","left" )
for (i_row in 1:7) {
   for (i_col in 1:10) {
     best.move=dirs[which.max(Qs[i_row,i_col,])]
     text(i_col,i_row,best.move)
   }
}
lines(state.hist[,2],state.hist[,1],col="red",lwd=2)
#################################################
#  ch6.5 Off-policy TD, Q-learning
#################################################
Qlearn.ex6.5<-function(maxItr,alpha,gamma,epsilon) {
# field size: 7row x 10column
# horizontal move ->  COLUMN 
# vertical move     ->  ROW 
# effect of wind     ->  ROW
# actions: 1-up, 2-right, 3-down, 4-left 
act.V=matrix(c(1,0,0,1,-1,0,0,-1),nrow=4,byrow=T)
wind=matrix(c(0,0,0,0,0,0,1,0,1,0,1,0,2,0,2,0,1,0,0,0),byrow=T,nrow=10)
goal=c(4,8)
Qs=array(0,dim=c(7,10,4))
for (i_rep in 1:maxItr) {
  state=c(4,1) # start
  while (!all(state==goal)) {
    if (runif(1) > epsilon) {
      move=which.max(Qs[state[1],state[2],])
    } else { move=sample(1:4,1)}
    sIDX=state
    state=state+act.V[move,]+wind[state[2],]
    if (state[1]<1) {state[1]=1}
    if (state[1]>7) {state[1]=7}
    if (state[2]<1) {state[2]=1}
    if (state[2]>10) {state[2]=10}   
    max.Q=max(Qs[state[1],state[2],])
    rew=ifelse(all(state==goal),0,-1)
    Qs[sIDX[1],sIDX[2],move]=Qs[sIDX[1],sIDX[2],move]
     +alpha*(rew+gamma* max.Q-Qs[sIDX[1],sIDX[2],move])
  }
}
return(Qs)
}

Qs=Qlearn.ex6.5(1e6,0.05,1,0.1)
# sim optimal actions
state=c(4,1);goal=c(4,8);
state.hist=state
while (!all(state==goal)) {
  moveID=which.max(Qs[state[1],state[2],])
  state=state+act.V[moveID,]+wind[state[2],]
  if (state[1]<1) {state[1]=1}
  if (state[1]>7) {state[1]=7}
  if (state[2]<1) {state[2]=1}
  if (state[2]>10) {state[2]=10}
  state.hist=rbind(state.hist,state)
}
# plotting results
plot(0,0,type='n',xlim=c(0,11),ylim=c(0,8),xlab="",ylab="",
 main="Learned policies -- Q-learning")
lines(1,4,type='p',pch=19,col='red',cex=2)
lines(8,4,type='p',pch=19,col='red',cex=2)
dirs=c("up","right","down","left" )
for (i_row in 1:7) {
   for (i_col in 1:10) {
     best.move=dirs[which.max(Qs[i_row,i_col,])]
     text(i_col,i_row,best.move)
   }
}
lines(state.hist[,2],state.hist[,1],col="red",lwd=2) 

認知情報解析B ALCOVE

# function to initialize ALCOVE's parameter set
ALC.init<-function() {
  stimType='nom' # stimType 
  c=2            # gain
  phi=5	         # decision strength
  lrA=0.1        # learning rate for attentions
  lrW=0.1        # learning rate for associations
  return(data.frame(stimType,lrA,lrW,c,phi))
}

# main function
alcove<-function(parmSet,inp,targ,iter) {
# ----ALCOVE for R ---
# input arguments
#  parmSet - parameter set
# inp - input matrix (n_sample x n_dimension)
# targ - target matrix (n_sample x n_category)
# iter - # of training epochs 
# ----ALCOVE for R ---
  # initialization
  inpSize=dim(inp)
  targSize=dim(targ)
  alpha=matrix(1,nrow=1,ncol=inpSize[2])/inpSize[2]
  w=matrix(rnorm(inpSize[1]*targSize[2])*0.1,ncol=inpSize[1])             
  accu=rep(0,nrow=iter+1)
  attn=matrix(0,nrow=iter+1,ncol=inpSize[2])
  attn[1,]=alpha
  # end initialization

  for (i_iter in 1:iter) {
    ro=sample(1:inpSize[1],inpSize[1])
    prob_temp=0;
    for (i_tr in 1:inpSize[1]) {
      if (parmSet$stimType=='nom') {
        diff= matrix(as.numeric(matrix(1,inpSize[1],1)%*%inp[ro[i_tr],]!=inp),nrow=inpSize[1])
      } else { diff=inp[ro[i_tr],]-inp }
      exemp=exp(-parmSet$c*(diff%*%t(alpha)))
      out=w%*%exemp
      err=targ[ro[i_tr],]-out
      delta_W=parmSet$lrW*exemp%*%t(err)
      delta_A=-parmSet$lrA*(t(err)%*%w*t(exemp))%*%diff*parmSet$c
      w=w+t(delta_W)
      alpha=alpha+delta_A
      alpha[which(alpha<0)]=0
      pT=(exp(parmSet$phi*out)/sum(exp(parmSet$phi*out)))*targ[ro[i_tr],]
      prob_temp=prob_temp+pT[which(!pT==0)]
    }
    accu[i_iter+1]=prob_temp/targSize[1]
    attn[i_iter+1,]=alpha
  }
  attnN=attn/apply(attn,1, sum)
  out=matrix(0,nrow=targSize[2],ncol=inpSize[1])
  for (i_tr in 1:inpSize[1]) {
    if (parmSet$stimType=='nom') {
      diff= matrix(as.numeric(matrix(1,inpSize[1],1)%*%inp[i_tr,]!=inp),nrow=inpSize[1])
    } else { diff=inp[i_tr,]-inp}
    exemp=exp(-parmSet$c*(diff%*%t(alpha)))
    out[,i_tr]=w%*%exemp
  }
 return(list(alpha=alpha,w=w,attn=attn,attnN=attnN,accu=accu,out=out,ps=parmSet))
}

##################################################
#  example
# カテゴリー学習モデル:ALCOVE
# medin & schaffer 1978 のシミュレーション
##################################################
inp=matrix(c(1, 1, 1, 0,
             1, 0, 1, 0,
             1, 0, 1, 1,
             1, 1, 0, 1,
             0, 1, 1, 1,
             1, 1, 0, 0,
             0, 1, 1 ,0,
             0, 0, 0, 1,
             0, 0, 0, 0),nrow=9,byrow=T)
targ=matrix(c(1, 0,
              1, 0,
              1, 0,
              1, 0,
              1, 0,
              0, 1,
              0, 1,
              0, 1,
              0, 1),nrow=9,byrow=T)  
parmSet<-ALC.init()
result<-alcove(parmSet,inp,targ,30)
plot(result$accu, type="o", col="black",ylim=c(0,1.1), 
 xlab="training", ylab="Proportion", cex.lab=1.4)
lines(result$attnN[,1], type="o", pch=22, lty=2, col="blue")
lines(result$attnN[,2], type="o", pch=23, lty=3, col="red")
lines(result$attnN[,3], type="o", pch=24, lty=4, col="green")
lines(result$attnN[,4], type="o", pch=25, lty=5, col="cyan")
title(main="ALOCVE: Learning MS (1978) 5-4 Stimulus Set ")
legend("topleft", c("Accuracy","AttnD1","AttnD2","AttnD3","AttnD4"),
 cex=1,col=c("black","blue","red","green","cyan"), pch=c(21:25), lty=c(1:5));

RL Sutton & Barto Ch.5

########################################
#   ch5.1 Monte Carlo policy evaluation 
########################################
#
# reference: http://waxworksmath.com
#
########################################
# function to calc. value of cards
card.value<-function(adj.cards) {
 sum.cards=sum(adj.cards)
 if (any(adj.cards==1) & sum.cards<=11) {
   sum.cards=sum.cards+10;
   usableA=1          #true
  } else {usableA=2}  #false
 return(c(sum.cards,usableA))
}

# function to calc. reward
calc.reward<-function(p.val,d.val) {
  if (p.val>21) { reward=-1
  } else {if (d.val>21) { reward=1
    } else {if (p.val==d.val) {reward=0
      } else{ reward=ifelse(p.val>d.val,1,-1)}
}}}

# main function
BJ_MC_fixedPolicy<-function(policy=20,maxIter=1e6){
  rew.sum=array(0,dim=c(10,10,2))
  rew.count=array(0,dim=c(10,10,2))
  for (i_play in 1:maxIter) {
    cards=sample(rep(1:13,4))
    player=cards[1:2];  adj.player=pmin(player,10)
    dealer=cards[3:4];  adj.dealer=pmin(dealer,10)
    cards=cards[-(1:4)]
    d.val=card.value(adj.dealer)
    p.val=card.value(adj.player)
    state.hist=c(adj.dealer[1],p.val[1],p.val[2])
    while (p.val[1] < policy) {
      player=c(player,cards[1]); adj.player=pmin(player,10)
      cards=cards[-1]
      p.val=card.value(adj.player)
      state.hist=rbind(state.hist,c(adj.dealer[1],p.val[1],p.val[2]))
    }
    while (d.val[1] < 17) {
      dealer=c(dealer,cards[1]); adj.dealer=pmin(dealer,10)
      cards=cards[-1]
      d.val=card.value(adj.dealer)
    }
    rew=calc.reward(p.val[1],d.val[1])
    n.state=nrow(state.hist)
    if (is.null(n.state)) {
      n.state=1
      state.hist=t(as.matrix(state.hist))
    }
    for (i_state in 1:n.state) {
      if (state.hist[i_state,2] > 11 & state.hist[i_state,2] < 22) {
        rew.sum[state.hist[i_state,1],(state.hist[i_state,2]-11),state.hist[i_state,3]]
          = rew.sum[state.hist[i_state,1],(state.hist[i_state,2]-11),state.hist[i_state,3]]+rew
        rew.count[state.hist[i_state,1],(state.hist[i_state,2]-11),state.hist[i_state,3]]
          =rew.count[state.hist[i_state,1],(state.hist[i_state,2]-11),state.hist[i_state,3]]+1
      }
    }
  }
  return(rew.sum/rew.count)
}

# function 2 plot results
plot.BJ_MC<-function(V){
 par(mfrow=c(1,2))
 image(V[,,1],main="with usable Ace",xaxt='n',yaxt='n',
   xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
 image(V[,,2],main="without usable Ace",xaxt='n',yaxt='n',
   xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
}

> V=BJ_MC(17)
> plot.BJ_MC(V)

RL_exp5_1

########################################
#   ch5.3 Monte Carlo exploring starts
########################################
# function to calc. value of cards
card.value<-function(adj.cards) {
 sum.cards=sum(adj.cards)
 if (any(adj.cards==1) & sum.cards<=11) {
   sum.cards=sum.cards+10;
   usableA=1             #true
  } else {usableA=2}     #false
 return(c(sum.cards,usableA))
}

# function to calc. reward
calc.reward<-function(p.val,d.val) {
  if (p.val>21) { reward=-1
  } else {if (d.val>21) { reward=1
    } else {if (p.val==d.val) {reward=0
      } else{ reward=ifelse(p.val>d.val,1,-1)}
}}}

# main function
BJ_MC<-function(maxIter=1e6){
  rew.sum=array(0,dim=c(10,10,2,2))
  rew.count=array(1,dim=c(10,10,2,2))
  Q=array(0,dim=c(10,10,2))
  V=array(0,dim=c(10,10,2))
  policy=array(sample(0:1,10*10*2,replace=T),dim=c(10,10,2))
  # policy: 1 = hit, 0 = stay
  for (i_play in 1:maxIter) {
    # initial draw
    cards=sample(c(rep(1:10,4),rep(10,12)))
    player=cards[1:2]
    dealer=cards[3:4]
    cards=cards[-(1:4)]
    d.val=card.value(dealer)
    p.val=card.value(player)
     
    while( p.val[1] < 12 ) {
      player=c(player,cards[1])
      cards=cards[-1]
      p.val=card.value(player)
    }
    action=sample(0:1,1)
    state.hist=c(dealer[1],p.val[1],p.val[2],(action+1))
    
    # player's action
    while (action==1 & p.val[1]<22) {
      player=c(player,cards[1])
      cards=cards[-1]
      p.val=card.value(player)
      state.hist=rbind(state.hist,c(dealer[1],p.val[1],p.val[2],(action+1)))
      if (p.val[1]<22) {
        action=policy[dealer[1],(p.val[1]-11),p.val[2]]
      }
    }
    
    # dealer's action
    while (d.val[1]<17) {
      dealer=c(dealer,cards[1])
      cards=cards[-1]
      d.val=card.value(dealer)
    }
    rew=calc.reward(p.val[1],d.val[1])
    n.state=nrow(state.hist)
    if (is.null(n.state)) {
      n.state=1
      state.hist=t(as.matrix(state.hist))
    }
    for (i_state in 1:n.state) {
      if (state.hist[i_state,2]>11 & state.hist[i_state,2]<22) {
        ind=state.hist[i_state,]-c(0,11,0,0)
        rew.sum[ind[1],ind[2],ind[3],ind[4]]= rew.sum[ind[1],ind[2],ind[3],ind[4]]+rew
        rew.count[ind[1],ind[2],ind[3],ind[4]]=rew.count[ind[1],ind[2],ind[3],ind[4]]+1
        Q=rew.sum/rew.count;
        policy[,,1]=Q[,,1,1] < Q[,,1,2]
        policy[,,2]=Q[,,2,1] < Q[,,2,2]
      }
    }
  }
  V[,,1]=(rew.sum[,,1,1]+rew.sum[,,1,2])/(rew.count[,,1,1]+rew.count[,,1,2])
  V[,,2]=(rew.sum[,,2,1]+rew.sum[,,2,2])/(rew.count[,,2,1]+rew.count[,,2,2])
  return(list(policy,V,Q))
}


# function 2 plot results
plot.BJ_MC2<-function(V){
 par(mfrow=c(2,2))
 image(V[[2]][,,1],main="Utility with usable Ace",xaxt='n',yaxt='n',
  xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
 image(V[[2]][,,2],main="Utility without usable Ace",xaxt='n',yaxt='n',
  xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
 image(V[[1]][,,1],main="Policy with usable Ace",xaxt='n',yaxt='n',
  xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
 image(V[[1]][,,2],main="Policy without usable Ace",xaxt='n',yaxt='n',
  xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
}

V=BJ_MC(5e6)
plot.BJ_MC2(V)

RL_exp5_3

数理社会学: なぜ宣伝しなくても流行がおこるのか

vogue2015plot

社会を<モデル>でみる:数理社会学への招待
18章:なぜ宣伝しなくても流行がおこるのか。
モデルの説明:
x:流行に影響されていない人
y:流行に影響されている人
z:以前流行に影響されてが、もう影響されていない
t:時間
\delta x = -\alpha * x[t] * y[t] * \delta t
\delta y = (\alpha * x[t] * y[t] – \beta * y[t]) * \delta t
\delta z = \beta * y[t] * \delta t

# initialization
vogue<-function(alpha,beta,x.init,y.init,z.init,max_time){
  dt=0.01
  x=c(x.init,rep(0,max_time))
  y=c(y.init,rep(0,max_time))
  z=c(z.init,rep(0,max_time))
 # main loop
  for (t in 1:max_time) {
    x[(t+1)]=x[t]-alpha*x[t]*y[t]*dt
    y[(t+1)]=y[t]+(alpha*x[t]*y[t]-beta*y[t])*dt
    z[(t+1)]=z[t]+beta*y[t]*dt
  }
 return(data.frame(x,y,z))
}
vogue.plot<-function(dat) {
 plot(dat$x,type="l",lwd=4,col="blue", main="Typical Time Series of \"Vogue\" ",
  xlab="time", ylab="Number of People",cex.lab=1.3,ylim=c(0,dat$x[1]))
 lines(dat$y,type="l",lwd=4,col="red")
 lines(dat$z,type="l",lwd=4,col="green")
 legend("right", c("affected","not affected", "no longer affected"), 
  col=c("red","blue","green"),lty=rep(1,3),lwd=4)
}

# running functions
res<-vogue(0.0001,0.1,10000,100,0,5000)
vogue.plot(res)