RL Sutton & Barto Ch.6

#################################################
#  ch6.3 Temporal Difference 
#   TD0 & constant step-size MC
#################################################
# TD0 model
TD0.ex1<-function(maxItr,alpha,gamma) {
  V=c(0,rep(0.5,5),0)
  V.hist=matrix(0,nrow=maxItr+1,ncol=5)
  V.hist[1,]=V[2:6]
  P.act=matrix(0.5,ncol=7,nrow=2)
  for (i_rep in 1:maxItr) {
    state=5
    while (state!=1 & state!=7) {
      action=sample(c(-1,1),1,prob=P.act[,state])
      state.old=state
      state=state+action
      r=ifelse(state==7,1,0)
      V[state.old]=V[state.old]+alpha*(r+gamma*V[state]-V[state.old])
    }
    V.hist[(i_rep+1),]=V[2:6]
  }
  return(V.hist)  
}
 
# (re)creating Fig 6.6
true.V=1:5*(1/6)
res=TD0.ex1(1000,0.1,1)
plot(true.V,type='o',pch=15,ylim=c(0,1),ylab="Value",xaxt="n",
  xlab="State",xlim=c(0.5,5.5),cex=2,lwd=2)
axis(1,at=1:5,labels=c("A","B","C","D","E"))
cols=c('red','blue','green','cyan','magenta')
ns=c(1,2,11,101,1001)
for (i_lines in 1:5) {
  lines(res[ns[i_lines],],type='o',pch=15+i_lines,cex=2,lwd=2,col=cols[i_lines])
}
legend('topleft',c('True value','t=0','t=1','t=10','t=100','t=1000'),
 col=c('black',cols),pch=15:20,lwd=1.5)
# constant step-size Monte Carlo
constMC.ex1<-function(maxItr,alpha) {
  V=c(0,rep(0.5,5),0)
  V.hist=matrix(0,nrow=maxItr+1,5)
  V.hist[1,]=V[2:6]
  P.act=matrix(0.5,ncol=7,nrow=2)
  for (i_rep in 1:maxItr) {
  state=5;
  state.hist=state
  while (state!=1 & state!=7) {
    action=sample(c(-1,1),1,prob=P.act[,state])
    state=state+action
    state.hist=cbind(state.hist,state)
  }
  R=ifelse(state==7,1,0)
  n.state=length(state.hist)
  for (i_state in 1:(n.state-1)) {
    V[state.hist[i_state]]=V[state.hist[i_state]]+
     alpha*(R-V[state.hist[i_state]])
  }
  V.hist[(i_rep+1),]=V[2:6]
 }
 return(V.hist)  
}

# (re)creating Fig 6.7
alphaTD=c(0.05,0.075,0.1,0.15)
alphaMC=c(0.01,0.02,0.03,0.04)
n.alphas=length(alphaTD)
pchs=0:(0+n.alphas)
true.V=1:5*(1/6)
n_rep=100
sqs=seq(1,101,2)
plot(0,0,type='n',xlim=c(0,100),ylim=c(0,0.25),xlab="Episodes",ylab="RMS error")
for (i_alpha in 1:n.alphas) {
  rmsTD=matrix(0,101,n_rep)
  rmsMC=matrix(0,101,n_rep)
  for (i_rep in 1:n_rep) {
     resTD=TD0.ex1(100,alphaTD[i_alpha],1)
     resMC=constMC.ex1(100,alphaMC[i_alpha])
     for (i_gen in 1:101) {
       rmsTD[i_gen,i_rep]=sqrt(mean((resTD[i_gen,]-true.V)^2))
       rmsMC[i_gen,i_rep]=sqrt(mean((resMC[i_gen,]-true.V)^2))
     }
  }
  mTD=rowMeans(rmsTD)
  mMC=rowMeans(rmsMC)
  lines(mTD,col='red')
  lines(mMC,col='blue')
  lines(sqs,mTD[sqs],col='red',pch=pchs[i_alpha],type='p')
  lines(sqs,mMC[sqs],col='blue',pch=pchs[i_alpha],type='p')
}
labs=c("MC, alpha=0.01",
       "MC, alpha=0.02", 
       "MC, alpha=0.03",
       "MC, alpha=0.04",
       "TD, alpha=0.05",
       "TD, alpha=0.075",
       "TD, alpha=0.10",
       "TD, alpha=0.15")  
legend('topright',labs,col=c(rep('blue',4),rep('red',4)),pch=rep(0:3,2),lwd=1.5)
#################################################
#  ch6.4 On-policy TD, Sarsa 
#################################################
sarsa.ex6.5<-function(maxItr,alpha,gamma,epsilon) {
# field size: 7row x 10column
# horizontal move ->  COLUMN 
# vertical move     ->  ROW 
# effect of wind     ->  ROW
# actions: 1-up, 2-right, 3-down, 4-left 
act.V=matrix(c(1,0,0,1,-1,0,0,-1),nrow=4,byrow=T)
wind=matrix(c(0,0,0,0,0,0,1,0,1,0,1,0,2,0,2,0,1,0,0,0),byrow=T,nrow=10)
goal=c(4,8)
Qs=array(0,dim=c(7,10,4))
for (i_rep in 1:maxItr) {
  state=c(4,1) # start
  if (runif(1) > epsilon) {
    move=which.max(Qs[state[1],state[2],])
  } else { move=sample(1:4,1)}
  while (!all(state==goal)) {
    st.old=state
    mv.old=move
    state=state+act.V[move,]+wind[state[2],]
    if (state[1]<1) {state[1]=1}
    if (state[1]>7) {state[1]=7}
    if (state[2]<1) {state[2]=1}
    if (state[2]>10) {state[2]=10}
    if (runif(1) > epsilon) {
      move=which.max(Qs[state[1],state[2],])
    } else { move=sample(1:4,1)}
    rew=ifelse(all(state==goal),0,-1)
    Qs[st.old[1],st.old[2],mv.old]=Qs[st.old[1],st.old[2],mv.old]+
      alpha*(rew+gamma*Qs[state[1],state[2],move]-
             Qs[st.old[1],st.old[2],mv.old])
  }
}
return(Qs)
}    
# running example
Qs=sarsa.ex6.5(5e6,0.1,1,0.1)
# sim optimal actions
# (act.V and wind are local to sarsa.ex6.5, so re-define them here for the simulation)
act.V=matrix(c(1,0,0,1,-1,0,0,-1),nrow=4,byrow=T)
wind=matrix(c(0,0,0,0,0,0,1,0,1,0,1,0,2,0,2,0,1,0,0,0),byrow=T,nrow=10)
state=c(4,1);goal=c(4,8);
state.hist=state
while (!all(state==goal)) {
  moveID=which.max(Qs[state[1],state[2],])
  state=state+act.V[moveID,]+wind[state[2],]
  if (state[1]<1) {state[1]=1}
  if (state[1]>7) {state[1]=7}
  if (state[2]<1) {state[2]=1}
  if (state[2]>10) {state[2]=10}
  state.hist=rbind(state.hist,state)
}
# plotting results
plot(0,0,type='n',xlim=c(0,11),ylim=c(0,8),xlab="",ylab="",
  main="Learned policies -- Sarsa")
lines(1,4,type='p',pch=19,col='red',cex=2)
lines(8,4,type='p',pch=19,col='red',cex=2)
dirs=c("up","right","down","left" )
for (i_row in 1:7) {
   for (i_col in 1:10) {
     best.move=dirs[which.max(Qs[i_row,i_col,])]
     text(i_col,i_row,best.move)
   }
}
lines(state.hist[,2],state.hist[,1],col="red",lwd=2)
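# quick check (not in the original): length of the greedy path simulated above;
# the shortest possible path for this task is 15 moves, so the count should be close to that
nrow(state.hist)-1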
#################################################
#  ch6.5 Off-policy TD, Q-learning
#################################################
Qlearn.ex6.5<-function(maxItr,alpha,gamma,epsilon) {
# field size: 7row x 10column
# horizontal move ->  COLUMN 
# vertical move     ->  ROW 
# effect of wind     ->  ROW
# actions: 1-up, 2-right, 3-down, 4-left 
act.V=matrix(c(1,0,0,1,-1,0,0,-1),nrow=4,byrow=T)
wind=matrix(c(0,0,0,0,0,0,1,0,1,0,1,0,2,0,2,0,1,0,0,0),byrow=T,nrow=10)
goal=c(4,8)
Qs=array(0,dim=c(7,10,4))
for (i_rep in 1:maxItr) {
  state=c(4,1) # start
  while (!all(state==goal)) {
    if (runif(1) > epsilon) {
      move=which.max(Qs[state[1],state[2],])
    } else { move=sample(1:4,1)}
    sIDX=state
    state=state+act.V[move,]+wind[state[2],]
    if (state[1]<1) {state[1]=1}
    if (state[1]>7) {state[1]=7}
    if (state[2]<1) {state[2]=1}
    if (state[2]>10) {state[2]=10}   
    max.Q=max(Qs[state[1],state[2],])
    rew=ifelse(all(state==goal),0,-1)
    Qs[sIDX[1],sIDX[2],move]=Qs[sIDX[1],sIDX[2],move]+
      alpha*(rew+gamma*max.Q-Qs[sIDX[1],sIDX[2],move])
  }
}
return(Qs)
}

Qs=Qlearn.ex6.5(1e6,0.05,1,0.1)
# sim optimal actions
# (act.V and wind are local to Qlearn.ex6.5, so re-define them here for the simulation)
act.V=matrix(c(1,0,0,1,-1,0,0,-1),nrow=4,byrow=T)
wind=matrix(c(0,0,0,0,0,0,1,0,1,0,1,0,2,0,2,0,1,0,0,0),byrow=T,nrow=10)
state=c(4,1);goal=c(4,8);
state.hist=state
while (!all(state==goal)) {
  moveID=which.max(Qs[state[1],state[2],])
  state=state+act.V[moveID,]+wind[state[2],]
  if (state[1]<1) {state[1]=1}
  if (state[1]>7) {state[1]=7}
  if (state[2]<1) {state[2]=1}
  if (state[2]>10) {state[2]=10}
  state.hist=rbind(state.hist,state)
}
# plotting results
plot(0,0,type='n',xlim=c(0,11),ylim=c(0,8),xlab="",ylab="",
 main="Learned policies -- Q-learning")
lines(1,4,type='p',pch=19,col='red',cex=2)
lines(8,4,type='p',pch=19,col='red',cex=2)
dirs=c("up","right","down","left" )
for (i_row in 1:7) {
   for (i_col in 1:10) {
     best.move=dirs[which.max(Qs[i_row,i_col,])]
     text(i_col,i_row,best.move)
   }
}
lines(state.hist[,2],state.hist[,1],col="red",lwd=2) 

RL Sutton & Barto Ch.5

########################################
#   ch5.1 Monte Carlo policy evaluation 
########################################
#
# reference: http://waxworksmath.com
#
########################################
# function to calc. value of cards
card.value<-function(adj.cards) {
 sum.cards=sum(adj.cards)
 if (any(adj.cards==1) & sum.cards<=11) {
   sum.cards=sum.cards+10;
   usableA=1          #true
  } else {usableA=2}  #false
 return(c(sum.cards,usableA))
}
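# illustrative calls (not in the original):
card.value(c(1,6))     # returns c(17,1): the ace counts as 11, so it is a "usable ace"
card.value(c(10,6,9))  # returns c(25,2): no usable ace, and the hand is bust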

# function to calc. reward
calc.reward<-function(p.val,d.val) {
  if (p.val>21) { reward=-1
  } else {if (d.val>21) { reward=1
    } else {if (p.val==d.val) {reward=0
      } else{ reward=ifelse(p.val>d.val,1,-1)}
}}}
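# illustrative calls (not in the original); the value is returned invisibly, so assign it
r.win  = calc.reward(20,19)   # player 20 beats dealer 19 ->  1
r.bust = calc.reward(22,18)   # player busts              -> -1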

# main function
BJ_MC_fixedPolicy<-function(policy=20,maxIter=1e6){
  rew.sum=array(0,dim=c(10,10,2))
  rew.count=array(0,dim=c(10,10,2))
  for (i_play in 1:maxIter) {
    cards=sample(rep(1:13,4))
    player=cards[1:2];  adj.player=pmin(player,10)
    dealer=cards[3:4];  adj.dealer=pmin(dealer,10)
    cards=cards[-(1:4)]
    d.val=card.value(adj.dealer)
    p.val=card.value(adj.player)
    state.hist=c(adj.dealer[1],p.val[1],p.val[2])
    while (p.val[1] < policy) {
      player=c(player,cards[1]); adj.player=pmin(player,10)
      cards=cards[-1]
      p.val=card.value(adj.player)
      state.hist=rbind(state.hist,c(adj.dealer[1],p.val[1],p.val[2]))
    }
    while (d.val[1] < 17) {
      dealer=c(dealer,cards[1]); adj.dealer=pmin(dealer,10)
      cards=cards[-1]
      d.val=card.value(adj.dealer)
    }
    rew=calc.reward(p.val[1],d.val[1])
    n.state=nrow(state.hist)
    if (is.null(n.state)) {
      n.state=1
      state.hist=t(as.matrix(state.hist))
    }
    for (i_state in 1:n.state) {
      if (state.hist[i_state,2] > 11 & state.hist[i_state,2] < 22) {
        rew.sum[state.hist[i_state,1],(state.hist[i_state,2]-11),state.hist[i_state,3]]=
          rew.sum[state.hist[i_state,1],(state.hist[i_state,2]-11),state.hist[i_state,3]]+rew
        rew.count[state.hist[i_state,1],(state.hist[i_state,2]-11),state.hist[i_state,3]]=
          rew.count[state.hist[i_state,1],(state.hist[i_state,2]-11),state.hist[i_state,3]]+1
      }
    }
  }
  return(rew.sum/rew.count)
}

# function 2 plot results
plot.BJ_MC<-function(V){
 par(mfrow=c(1,2))
 image(V[,,1],main="with usable Ace",xaxt='n',yaxt='n',
   xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
 image(V[,,2],main="without usable Ace",xaxt='n',yaxt='n',
   xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
}

V=BJ_MC_fixedPolicy(17)
plot.BJ_MC(V)

(figure: RL_exp5_1)

########################################
#   ch5.3 Monte Carlo exploring starts
########################################
# function to calc. value of cards
card.value<-function(adj.cards) {
 sum.cards=sum(adj.cards)
 if (any(adj.cards==1) & sum.cards<=11) {
   sum.cards=sum.cards+10;
   usableA=1             #true
  } else {usableA=2}     #false
 return(c(sum.cards,usableA))
}

# function to calc. reward
calc.reward<-function(p.val,d.val) {
  if (p.val>21) { reward=-1
  } else {if (d.val>21) { reward=1
    } else {if (p.val==d.val) {reward=0
      } else{ reward=ifelse(p.val>d.val,1,-1)}
}}}

# main function
BJ_MC<-function(maxIter=1e6){
  rew.sum=array(0,dim=c(10,10,2,2))
  rew.count=array(1,dim=c(10,10,2,2))
  Q=array(0,dim=c(10,10,2))
  V=array(0,dim=c(10,10,2))
  policy=array(sample(0:1,10*10*2,replace=T),dim=c(10,10,2))
  # policy: 1 = hit, 0 = stay
  for (i_play in 1:maxIter) {
    # initial draw
    cards=sample(c(rep(1:10,4),rep(10,12)))
    player=cards[1:2]
    dealer=cards[3:4]
    cards=cards[-(1:4)]
    d.val=card.value(dealer)
    p.val=card.value(player)
     
    while( p.val[1] < 12 ) {
      player=c(player,cards[1])
      cards=cards[-1]
      p.val=card.value(player)
    }
    action=sample(0:1,1)
    state.hist=c(dealer[1],p.val[1],p.val[2],(action+1))
    
    # player's action
    while (action==1 & p.val[1]<22) {
      player=c(player,cards[1])
      cards=cards[-1]
      p.val=card.value(player)
      state.hist=rbind(state.hist,c(dealer[1],p.val[1],p.val[2],(action+1)))
      if (p.val[1]<22) {
        action=policy[dealer[1],(p.val[1]-11),p.val[2]]
      }
    }
    
    # dealer's action
    while (d.val[1]<17) {
      dealer=c(dealer,cards[1])
      cards=cards[-1]
      d.val=card.value(dealer)
    }
    rew=calc.reward(p.val[1],d.val[1])
    n.state=nrow(state.hist)
    if (is.null(n.state)) {
      n.state=1
      state.hist=t(as.matrix(state.hist))
    }
    for (i_state in 1:n.state) {
      if (state.hist[i_state,2]>11 & state.hist[i_state,2]<22) {
        ind=state.hist[i_state,]-c(0,11,0,0)
        rew.sum[ind[1],ind[2],ind[3],ind[4]]= rew.sum[ind[1],ind[2],ind[3],ind[4]]+rew
        rew.count[ind[1],ind[2],ind[3],ind[4]]=rew.count[ind[1],ind[2],ind[3],ind[4]]+1
        Q=rew.sum/rew.count;
        policy[,,1]=Q[,,1,1] < Q[,,1,2]
        policy[,,2]=Q[,,2,1] < Q[,,2,2]
      }
    }
  }
  V[,,1]=(rew.sum[,,1,1]+rew.sum[,,1,2])/(rew.count[,,1,1]+rew.count[,,1,2])
  V[,,2]=(rew.sum[,,2,1]+rew.sum[,,2,2])/(rew.count[,,2,1]+rew.count[,,2,2])
  return(list(policy,V,Q))
}


# function 2 plot results
plot.BJ_MC2<-function(V){
 par(mfrow=c(2,2))
 image(V[[2]][,,1],main="Utility with usable Ace",xaxt='n',yaxt='n',
  xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
 image(V[[2]][,,2],main="Utility without usable Ace",xaxt='n',yaxt='n',
  xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
 image(V[[1]][,,1],main="Policy with usable Ace",xaxt='n',yaxt='n',
  xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
 image(V[[1]][,,2],main="Policy without usable Ace",xaxt='n',yaxt='n',
  xlab="Dealer showing",ylab="Player sum")
 axis(1,at=seq(0,1,length.out=10),label=c("A",paste(2:10)))
 axis(2,at=seq(0,1,length.out=10),label=12:21)
}

V=BJ_MC(5e6)
plot.BJ_MC2(V)

(figure: RL_exp5_3)

Mathematical sociology: why do fads arise even without advertising?

(figure: vogue2015plot)

From "社会を<モデル>でみる:数理社会学への招待" (Viewing Society through Models: An Invitation to Mathematical Sociology)
Chapter 18: Why do fads arise even without advertising?
Model description:
x: people not yet affected by the fad
y: people currently affected by the fad
z: people who were affected by the fad but no longer are
t: time
\Delta x = -\alpha x_t y_t \Delta t
\Delta y = (\alpha x_t y_t - \beta y_t) \Delta t
\Delta z = \beta y_t \Delta t

# initialization
vogue<-function(alpha, beta, inits, max_time){
  dt=0.01
  max_timeR = max_time - 1
  x=c(inits[1], rep(0, max_timeR))
  y=c(inits[2], rep(0, max_timeR))
  z=c(inits[3], rep(0, max_timeR))
  # main loop
  for (t in 1:max_timeR) {
    x[(t+1)] = x[t] - alpha*x[t]*y[t]*dt
    y[(t+1)] = y[t] + (alpha*x[t]*y[t] - beta*y[t])*dt
    z[(t+1)] = z[t] + beta*y[t]*dt
  }
  return(data.frame(x,y,z))
}

vogue.plot<-function(dat) {
  plot(dat$x,type="l",lwd=4,col="blue", main="Typical Time Series of \"Vogue\" ",
       xlab="time", ylab="Number of People",cex.lab=1.3,ylim=c(0,dat$x[1]))
  lines(dat$y,type="l",lwd=4,col="red")
  lines(dat$z,type="l",lwd=4,col="green")
  legend("right", c("affected","not affected", "no longer affected"), 
         col=c("red","blue","green"),lty=rep(1,3),lwd=4)
}

# running functions
res<-vogue(0.0001, 0.1, c(10000, 100, 0), 5000)
vogue.plot(res)
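# quick check (not in the original): from dy=(alpha*x-beta)*y*dt, the fad grows only
# while x > beta/alpha; with the parameters above that threshold is 0.1/0.0001 = 1000
res$x[which.max(res$y)]   # x at the peak of y -- should be near the 1000 threshold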

RL Sutton & Barto Ch.4

#######################################
#   ch4.1 policy evaluation 
########################################
# example 4.1 - bug fixed on 2017.03.21
# defining deterministic trans. matrix
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)

# defining Reward & trans. prob.
R=-1;P=0.25;

# iterative policy evaluation
delta=1; gamma=1; tol=1e-8; V=rep(0,15);
while (delta>tol) {
 delta=0;
  for (i_state in 1:14) {
    v=V[i_state]
    V[i_state]=sum(P*(R+gamma*V[trM[i_state,]]))
    delta=max(delta,abs(v-V[i_state]));
  }
}

# result
> matrix(c(0,V),nrow=4)
     [,1] [,2] [,3] [,4]
[1,]    0  -14  -20  -22
[2,]  -14  -18  -20  -20
[3,]  -20  -20  -18  -14
[4,]  -22  -20  -14    0
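# cross-check (not in the original): with the equiprobable policy the values also solve
# the linear system (I - gamma*P_pi) V = R_pi, where transitions into the terminal
# state (15) contribute nothing; P_pi and V_direct below are illustrative names
P_pi=matrix(0,14,14)
for (s in 1:14) for (a in 1:4) {s2=trM[s,a]; if (s2<=14) P_pi[s,s2]=P_pi[s,s2]+0.25}
V_direct=solve(diag(14)-gamma*P_pi,rep(-1,14))
max(abs(V_direct-V[1:14]))   # should be tiny (within tol)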

#####################################
#   ch4.3 Policy Improvement
#   easier one 
#####################################
north=c(1:3,15,1:10)
east=2:15;east[ c(3,7,11)]=c(3,7,11)
south=c(5:15,12:14)
west=c(15,1:13);west[ c(4,8,12)]=c(4,8,12)
trM=cbind(north,east,south,west)
R=-1;P=0.25;V=rep(0,15);
delta=1; gamma=1; tol=1e-10; 
bestP=sample(1:4,14,replace=T)
stable=F;counter=0;
while (stable==F){
  counter=counter+1
  # iterative policy evaluation (reset delta so evaluation runs on every pass)
  delta=1
  while (delta>tol) {
    delta=0;
    for (i_state in 1:14) {
      v=V[i_state]
      V[i_state]=sum(P*(R+gamma*V[trM[i_state,]]))
      delta=max(delta,abs(v-V[i_state]))
    }
  }
  # policy improvement (the policy is stable only if no state changes its greedy action)
  stable=T
  for (i_state in 1:14) {
    b=bestP[i_state]
    bestP[i_state]=which.max(V[trM[i_state,]])
    if (bestP[i_state]!=b) {stable=F}
  }
}
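# viewing the learned policy (illustrative, not in the original): map the greedy action
# indices onto the 4x4 grid, laid out as in matrix(c(0,V),nrow=4) above ("*" = terminal)
dirs=c("N","E","S","W")
matrix(c("*",dirs[bestP],"*"),nrow=4)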

# with softmax
R=-1;V=rep(0,15);
bestP=sample(1:4,14,replace=T)
stable=F;counter=0
while (stable==F){
  counter=counter+1
  Vmat=matrix(V[trM],ncol=4)
  Ppolicy=t(apply(Vmat,1,function(x) exp(10*x)/sum(exp(10*x))))
# iterative policy evaluation (reset delta so evaluation runs on every pass)
  delta=1
  while (delta>tol) {
    delta=0;
    for (i_state in 1:14) {
      v=V[i_state]
      V[i_state]=sum(Ppolicy[i_state,]*(R+gamma*V[trM[i_state,]]))
      delta=max(delta,abs(v-V[i_state]))
    }
  }
# policy improvement (the policy is stable only if no state changes its greedy action)
  stable=T
  for (i_state in 1:14) {
    b=bestP[i_state]
    bestP[i_state]=which.max(V[trM[i_state,]])
    if (bestP[i_state]!=b) {stable=F}
  }
}


#####################################
#   ch4.4 Value Iteration 
#####################################
# example 4.3
V=c(rep(0,100),1);V.hist=c()
p=c(0.4,0.6);
gamma=1;delta=1; tol=1e-20
max.a=rep(0,101)
while (delta>tol) {
  delta=0;
  for (i_state in 1:99) {
    v=V[i_state+1]
    temp=matrix(0,nrow=1,ncol=i_state)
    for (i_action in 1:i_state) {
       temp[i_action]=sum(p*(gamma*c(V[(min(i_state+i_action,100)+1)],
         V[(max(i_state-i_action,0)+1)])))
     }
    V[i_state+1]=max(temp)
    max.a[i_state+1]=which.max(round(temp,8))
    delta=max(delta,abs(v-V[i_state+1]))
  }
  V.hist=rbind(V.hist,V)
}
# plotting results
par(mfrow=c(1,2))
plot(V.hist[1,],type='l',lwd=2,xlab="Capital",ylab="Value Estimates",col='red')
lines(V.hist[2,],lwd=2,col='blue')
lines(V.hist[3,],lwd=2,col='green')
lines(V.hist[32,],lwd=2,col='black')
legend("topleft",c("sweep 1","sweep 2","sweep 3", "sweep 32"),
 col=c("red","blue","green","black"),lwd=2)
barplot(max.a,xlab="Capital",ylab="Final Policy",col="white")

(figure: sutton4-6)

RL Sutton & Barto Ch.3

chapter 3 - example 3.8 & 3.12
bug fixed on 2015.12.18
#####################################
#     Ch3.7 Value function
#####################################
# example 3.8: Gridworld
# initializing value vector
V=rep(0,25);                

# defining probability matrix
P=matrix(1/4,nrow=25,ncol=4) # 

# defining deterministic transition matrix
north=c(2:25,25)
north[ c(5,10,15,20,25)]=c(5,10,15,20,25)
east=c(6:25,21:25)
west=c(1:5,1:20)
south=c(1,1:24)
south[ c(1,6,11,16,21)]=c(1,6,11,16,21)
trM=cbind(north,east,south,west)
trM[10,]=6
trM[20,]=18

# defining reward matrix
R=matrix(0,nrow=25,ncol=4)
R[which(trM==1:25)]=-1
R[10,]=10
R[20,]=5

# calculating state-values iteratively
# fixed number of iterations 
nRep=1000; gamma=0.9;
for (i_rep in 1:nRep) {
  V.old=V
  for (i_state in 1:25) {
    V[i_state]=sum(P[i_state,]*(R[i_state,]+gamma*V.old[trM[i_state,]]))
  }
}

# result
> matrix(V,nrow=5)[5:1,]
            [,1]       [,2]       [,3]       [,4]       [,5]
[1,]  3.30899634  8.7892919  4.4276192  5.3223676  1.4921788
[2,]  1.52158807  2.9923179  2.2501400  1.9075717  0.5474027
[3,]  0.05082249  0.7381706  0.6731133  0.3581862 -0.4031411
[4,] -0.97359230 -0.4354954 -0.3548823 -0.5856051 -1.1830751
[5,] -1.85770055 -1.3452313 -1.2292673 -1.4229181 -1.9751790

#####################################
#  ch3.8 optimal value function
#####################################
# example 3.12
V.star=V; # V calculated as in exp3.8
iter=0;maxItr=1000;delta=1;tol=1e-5;
while(iter < maxItr & delta > tol) {
  delta=0;iter=iter+1
  V.star.old=V.star
  for (i_state in 1:25) {
    v=V.star[i_state]
    V.star[i_state]=max(R[i_state,]+gamma*V.star.old[trM[i_state,]])
    delta=max(delta,abs(v-V.star[i_state]))
  }
}
# result - ex3.12
> matrix(V.star,nrow=5)[5:1,]
         [,1]     [,2]     [,3]     [,4]     [,5]
[1,] 21.97746 24.41940 21.97746 19.41941 17.47747
[2,] 19.77971 21.97746 19.77971 17.80174 16.02157
[3,] 17.80173 19.77971 17.80174 16.02157 14.41941
[4,] 16.02156 17.80173 16.02156 14.41940 12.97746
[5,] 14.41940 16.02156 14.41940 12.97746 11.67972
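# extracting a greedy policy from V.star (illustrative, not in the original):
dirs=c("north","east","south","west")
greedy=apply(R+gamma*matrix(V.star[trM],ncol=4),1,which.max)
matrix(dirs[greedy],nrow=5)[5:1,]   # oriented like the value matrices above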

RL Sutton & Barto Ch2

epGreedy = function(nTrial,nRep,epsilon) {
ret.hist=matrix(0,nrow=nTrial,ncol=nRep)
opt.hist=ret.hist
for (i_rep in 1:nRep){
  Q.true=rnorm(10);Q.est=rep(0,10);
  Q.cum=rep(0,10);act.count=rep(0,10);
  opt.ID=which.max(Q.true)
  for (i_trial in 1:nTrial) {
    if (runif(1) < epsilon) {
      action=sample(1:10,1)
    } else {
      action=which.max(Q.est)
    }
    ret.hist[i_trial,i_rep]=rnorm(1)+Q.true[action]
    opt.hist[i_trial,i_rep]=action==opt.ID
    act.count[action]=act.count[action]+1
    Q.cum[action]=Q.cum[action]+ret.hist[i_trial,i_rep]
    Q.est[action]=Q.cum[action]/act.count[action]
  }
}
return(data.frame(opt=rowMeans(opt.hist),ret=rowMeans(ret.hist)))
}
type1=epGreedy(1000,2000,0.0)
type2=epGreedy(1000,2000,0.01)
type3=epGreedy(1000,2000,0.1)

par(mfrow=c(2,1))
plot(type3$ret,type='l',xlab="Play",ylab="average reward")
lines(type2$ret,type='l',col='red')
lines(type1$ret,type='l',col='green')
legend("bottomright",c("epsilon=0.00","epsilon=0.01","epsilon=0.10"),
  col=c("black","red","green"),lty=c(1,1,1))
plot(type3$opt,type='l',xlab="Play",ylab="% optimal action")
lines(type2$opt,type='l',col='red')
lines(type1$opt,type='l',col='green')
legend("bottomright",c("epsilon=0.00","epsilon=0.01","epsilon=0.10"),
  col=c("black","red","green"),lty=c(1,1,1))

#####################################
#  Generalised Version - ch2.5 & 2.7
#####################################
epGreedyG = function(nTrial,nRep,epsilon,LR,constantLR="F",initialQ) {
# generalized version of epsilon-greedy 
ret.hist=matrix(0,nrow=nTrial,ncol=nRep)
opt.hist=ret.hist
for (i_rep in 1:nRep){
  Q.true=rnorm(10);Q.est=rep(initialQ,10);act.count=rep(0,10);
  opt.ID=which.max(Q.true)
  for (i_trial in 1:nTrial) {
    if (runif(1) < epsilon) {
      action=sample(1:10,1)
    } else {
      action=which.max(Q.est)
    }
    ret.hist[i_trial,i_rep]=rnorm(1)+Q.true[action]
    opt.hist[i_trial,i_rep]=action==opt.ID
    act.count[action]=act.count[action]+1
    if (constantLR=="F"){LR=1/act.count[action]}
    Q.est[action]=Q.est[action]+LR*(ret.hist[i_trial,i_rep]-Q.est[action])
  }
}
return(data.frame(opt=rowMeans(opt.hist),ret=rowMeans(ret.hist)))
}
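# illustrative runs (not in the original): optimistic initial values (Q0=5, greedy) vs.
# a realistic epsilon-greedy learner, both with constant step size alpha=0.1 (cf. ch2.7)
opt.init =epGreedyG(1000,2000,0.0,0.1,constantLR="T",initialQ=5)
real.init=epGreedyG(1000,2000,0.1,0.1,constantLR="T",initialQ=0)
plot(opt.init$opt,type='l',xlab="Play",ylab="% optimal action")
lines(real.init$opt,col='red')
legend("bottomright",c("optimistic Q0=5, eps=0","realistic Q0=0, eps=0.1"),
  col=c("black","red"),lty=1)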

(figure: epGreedy)

#################################
#  ch2.4 softmax
#################################
RLsoftmax = function(nTrial,nRep,temp) {
ret.hist=matrix(0,nrow=nTrial,ncol=nRep)
opt.hist=ret.hist
for (i_rep in 1:nRep){
  Q.true=rnorm(10);Q.est=rep(0,10);
  Q.cum=rep(0,10);act.count=rep(0,10);
  opt.ID=which.max(Q.true)
  t=temp
  for (i_trial in 1:nTrial) {
    action=sample(1:10,1,prob=exp(Q.est/t)/sum(exp(Q.est/t)))
    ret.hist[i_trial,i_rep]=rnorm(1)+Q.true[action]
    opt.hist[i_trial,i_rep]=action==opt.ID
    act.count[action]=act.count[action]+1
    Q.cum[action]=Q.cum[action]+ret.hist[i_trial,i_rep]
    Q.est[action]=Q.cum[action]/act.count[action]
    t=max(0.001,0.995*t)
  }
}
return(data.frame(opt=rowMeans(opt.hist),ret=rowMeans(ret.hist)))
}

#################################
# ch2.8 reinforcement comparison
################################# 
reinfComp = function(nTrial,nRep,alpha,beta) {
ret.hist=matrix(0,nrow=nTrial,ncol=nRep)
opt.hist=ret.hist
for (i_rep in 1:nRep){
  Q.true=rnorm(10);p=rep(0,10);r.ave=0;
  opt.ID=which.max(Q.true)
  for (i_trial in 1:nTrial) {
    action=sample(1:10,1,prob=exp(p)/sum(exp(p)))
    ret.hist[i_trial,i_rep]=rnorm(1)+Q.true[action]
    opt.hist[i_trial,i_rep]=action==opt.ID
    p[action]=p[action]+beta*(ret.hist[i_trial,i_rep]-r.ave)
    r.ave=r.ave+alpha*(ret.hist[i_trial,i_rep]-r.ave)
  }
}
return(data.frame(opt=rowMeans(opt.hist),ret=rowMeans(ret.hist)))
}

#################################
# ch2.9 pursuit method
#################################
pursuit= function(nTrial,nRep,beta) {
ret.hist=matrix(0,nrow=nTrial,ncol=nRep)
opt.hist=ret.hist
for (i_rep in 1:nRep){
  Q.true=rnorm(10);p=rep(1/10,10);act.count=rep(0,10);Q.est=rep(0,10)
  opt.ID=which.max(Q.true)
  for (i_trial in 1:nTrial) {
    actTemp=which.max(Q.est)
    p[actTemp]=p[actTemp]+beta*(1-p[actTemp])
    nonA=(1:10)[-actTemp]
    p[nonA]=p[nonA]+beta*(-p[nonA])
    action=sample(1:10,1,prob=p)
    ret.hist[i_trial,i_rep]=rnorm(1)+Q.true[action]
    opt.hist[i_trial,i_rep]=action==opt.ID
    act.count[action]=act.count[action]+1
    Q.est[action]=Q.est[action]+1/act.count[action]*(ret.hist[i_trial,i_rep]-Q.est[action])
  }
}
return(data.frame(opt=rowMeans(opt.hist),ret=rowMeans(ret.hist)))
}
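# illustrative runs (not in the original): the three ch2 variants on the same 10-armed testbed
smx=RLsoftmax(1000,2000,10)        # softmax with a decaying temperature
rcp=reinfComp(1000,2000,0.1,0.1)   # reinforcement comparison
pur=pursuit(1000,2000,0.01)        # pursuit method
plot(pur$opt,type='l',xlab="Play",ylab="% optimal action")
lines(rcp$opt,col='red');lines(smx$opt,col='blue')
legend("bottomright",c("pursuit","reinf. comparison","softmax"),
  col=c("black","red","blue"),lty=1)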

Mathematical sociology: why do ethnic enclaves form even without discrimination?

From "社会を<モデル>でみる:数理社会学への招待" (Viewing Society through Models: An Invitation to Mathematical Sociology)
Chapter 29: Why do ethnic enclaves form even without discrimination?
There are two groups, ○ and ♯, and an agent moves to a randomly chosen empty cell under the following conditions:
○: if it has 2 or fewer ○ neighbors
♯: if fewer than 1/3 of its neighbors are ♯

# version in which an agent may move several times within one epoch
FUNmigration<-function(field_size=6, Nsharp=10, Ncircle=10) {
  
Nempty=(field_size^2-Nsharp-Ncircle)
society<-matrix(sample(c(rep(0,Nempty),rep(1,Ncircle),rep(2,Nsharp))),ncol=field_size)
# plotting initial config.
par(mfrow=c(1,2))
idx1<-which(society==1,arr.ind=T);idx2<-which(society==2,arr.ind=T)
plot(idx1[,1],idx1[,2],type="n",xlim=c(0.5,field_size+0.5),
  ylim=c(0.5,field_size+0.5),main="Initial Arrangement",
  xlab="location @ x", ylab="location @ y")
text(idx1[,1],idx1[,2],"o",cex=4,col="red");text(idx2[,1],idx2[,2],"#",cex=4,col="blue")

# main 
moved=1;counter=0
while (moved>0&counter<1000) {
  counter=counter+1;moved=0
  for (i_row in 1:field_size) {
    for (i_col in 1:field_size) {
      # checking sharps
      if (society[i_row,i_col]==2) {
        neR_IDX=max((i_row-1),1):min((i_row+1),field_size);
        neC_IDX=max((i_col-1),1):min((i_col+1),field_size);
        n_sharp=sum(society[neR_IDX,neC_IDX]==2)-1;
        n_circle=sum(society[neR_IDX,neC_IDX]==1);
        if (n_sharp+n_circle==0 | n_sharp/(n_sharp+n_circle) < (1/3)) {
          moved=moved+1;
          loc_mov=sample(which(society==0),1)
          society[i_row,i_col]=0
          society[loc_mov]=2
        }
      }
      # checking circles
     if (society[i_row,i_col]==1) {
       neR_IDX=max((i_row-1),1):min((i_row+1),field_size);
       neC_IDX=max((i_col-1),1):min((i_col+1),field_size);
       n_circle=sum(society[neR_IDX,neC_IDX]==1)-1;
       if (n_circle < 3) {
         moved=moved+1;
         loc_mov=sample(which(society==0),1)
         society[i_row,i_col]=0
         society[loc_mov]=1
        }
      }
    }
  }
}
# plotting final config.
idx1<-which(society==1,arr.ind=T)
idx2<-which(society==2,arr.ind=T)
plot(idx1[,1],idx1[,2],type="n",xlim=c(0.5,field_size+0.5),ylim=c(0.5,field_size+0.5),
  main="Arragement After Migration",,xlab="location @ x", ylab="location @ y")
text(idx1[,1],idx1[,2],"o",cex=4,col="red")
text(idx2[,1],idx2[,2],"#",cex=4,col="blue")
}
# version in which an agent moves at most once per epoch
FUNmigration <- function(soc.size = 6, n.circle = 10, n.sharp = 10) {
  r.sample = sample(soc.size ^ 2)
  society = matrix(0, soc.size, soc.size)
  society[r.sample[1:n.circle]] = 1
  society[r.sample[(n.circle + 1):(n.circle + n.sharp)]] = 2
  
  # plotting initial config.
  par(mfrow = c(1, 2))
  idx1 <- which(society == 1, arr.ind = T)
  idx2 <- which(society == 2, arr.ind = T)
  plot(idx1[, 1], idx1[, 2], type = "n", xlim = c(0.5, soc.size + 0.5),
    ylim = c(0.5, soc.size + 0.5), main = "Initial Arrangement",
    xlab = "location @ x", ylab = "location @ y")
  text(idx1[, 1], idx1[, 2], "o", cex = 3, col = "red")
  text(idx2[, 1], idx2[, 2], "#", cex = 3, col = "blue")
  move = 1
  while (move != 0 ) {
    # circles
    move = 0
    c.pos = which(society == 1, arr.ind = T)
    for (i.c in 1:n.circle) {
      r.idx = c(max(1, c.pos[i.c, 1] - 1), min(soc.size, c.pos[i.c, 1] + 1))
      c.idx = c(max(1, c.pos[i.c, 2] - 1), min(soc.size, c.pos[i.c, 2] + 1))
      neighbor  = society[r.idx[1]:r.idx[2], c.idx[1]:c.idx[2]]
      neighbor.c = sum(neighbor == 1) - 1
      if (neighbor.c < 3) {
        move = 1
        move.idx = which(society == 0)
        society[sample(move.idx, 1)] = 1
        society[c.pos[i.c, 1], c.pos[i.c, 2]] = 0
      }
    }
    # sharps
    s.pos = which(society == 2, arr.ind = T)
    for (i.s in 1:n.sharp) {
      r.idx = c(max(1, s.pos[i.s, 1] - 1), min(soc.size, s.pos[i.s, 1] + 1))
      c.idx = c(max(1, s.pos[i.s, 2] - 1), min(soc.size, s.pos[i.s, 2] + 1))
      neighbor  = society[r.idx[1]:r.idx[2], c.idx[1]:c.idx[2]]
      neighbor.s = sum(neighbor == 2) - 1
      neighbor.all = sum(neighbor != 0) - 1
      prop.s = max(0, neighbor.s / neighbor.all, na.rm = T)
      if (prop.s < 1 / 3) {
        move = 1
        move.idx = which(society == 0)
        society[sample(move.idx, 1)] = 2
        society[s.pos[i.s, 1], s.pos[i.s, 2]] = 0
      }
    }
  }
  idx1 <- which(society == 1, arr.ind = T)
  idx2 <- which(society == 2, arr.ind = T)
  plot(idx1[, 1], idx1[, 2], type = "n",
    xlim = c(0.5, soc.size + 0.5), ylim = c(0.5, soc.size + 0.5),
    main = "Arragement After Migration", xlab = "location @ x", ylab = "location @ y")
  text(idx1[, 1], idx1[, 2], "o", cex = 3, col = "red")
  text(idx2[, 1], idx2[, 2], "#", cex = 3, col = "blue")
}
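# example run (not in the original): a 6x6 society with 10 circles and 10 sharps
# (note that this second FUNmigration definition overrides the first one above)
set.seed(123)   # for a reproducible initial arrangement
FUNmigration(soc.size=6, n.circle=10, n.sharp=10)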

Mathematical sociology, Chapter 10: the couple who fight but never break up

(figure: mathsoc_ch11)
From "社会を<モデル>でみる:数理社会学への招待" (Viewing Society through Models: An Invitation to Mathematical Sociology)
Chapter 10: Why do couples fight even though they love each other?
x: x's action; positive values are good for both, negative values are bad for both or for y
y: y's action; positive values are good for both, negative values are bad for both or for x
x(t+1) = x(t) + (a*x(t) + b*y(t))*Δt
y(t+1) = y(t) + (c*x(t) + d*y(t))*Δt

# Why do couples fight even though they love each other?
fighting_couple<-function(a,b,c,d) {
  timeSep=0.05;ts=seq(1,50,timeSep);n_ts=length(ts)
  x=matrix(0,nrow=n_ts,ncol=1)
  y=matrix(0,nrow=n_ts,ncol=1)
  initX=c(rep(-40,5),rep(-20,5),rep(0,5),rep(20,5),rep(40,5))
  initY=c(rep(c(-40,-20,0,20,40),5))
  initX=initX[-13]
  initY=initY[-13]
  lengthINI=length(initX)
  for (i_ini in 1:lengthINI) {  
    x[1]=initX[i_ini];y[1]=initY[i_ini];
    for (i_gen in 2:n_ts) {
       x[i_gen]=x[i_gen-1]+(a*x[i_gen-1]+b*y[i_gen-1])*timeSep
       y[i_gen]=y[i_gen-1]+(c*x[i_gen-1]+d*y[i_gen-1])*timeSep
    }  
    if (i_ini==1) {  
       plot(x,y,xlim=c(-50,50),ylim=c(-50,50),col=4,type='l',lwd=2,
         xlab="X's Action",ylab="Y's Action")
       arrows(x[2],y[2],x[3],y[3],col=4,lwd=2,length=0.15)} else {  
       lines(x, y, col=4, lwd=2)
       arrows(x[2], y[2], x[3], y[3], col=4,lwd=2, length=0.15)
    }  
  }  
}  

par(mfrow=c(2,3))
fighting_couple(-1,0.0,0.5,-1)
fighting_couple(-1,2,-1,-1)
fighting_couple(0,-1,1,0)
fighting_couple(1,-2,2,0)
fighting_couple(1,0,0.5,1)
fighting_couple(1,-4,-4,0)
fun1<-function(a,b,c,d,N,initX,initY) {
  dt=0.05;
  x=matrix(0,nrow=N,ncol=1);y=matrix(0,nrow=N,ncol=1);
  x[1]=initX;y[1]=initY;
  for (i_rep in 2:N) {
    x[i_rep]=x[i_rep-1]+(a*x[i_rep-1]+b*y[i_rep-1])*dt
    y[i_rep]=y[i_rep-1]+(c*x[i_rep-1]+d*y[i_rep-1])*dt
  }
  return(data.frame(x,y))
}
fun2<-function(a,b,c,d,N,initX, initY) { 
  res<-fun1(a,b,c,d,N,initX[1],initY[1]);
  nXYs=length(initX)
  plot(res$x,res$y,xlim=c(-50,100),ylim=c(-50,100),type='l')
  for (i_loop in 2:nXYs){
    res<-fun1(a,b,c,d,N,initX[i_loop],initY[i_loop]);
    lines(res$x,res$y,type='l')
  }
}
# example run
initX=rep(seq(10,50,10),5)
initY=sort(rep(seq(10,50,10),5))
fun2(0,-1,1,0,500,initX,initY) 

Population ecology: a model of population dynamics

Population dynamics model (the logistic map)
N_{t+1} = r * N_t * (1 - N_t)

FUNpop_t<-function(r=2.5,n0=0.5,gen=100) {
  nt=n0;x=seq(0, 1, 0.01);y=r*x*(1-x)
  maxY=max(y,1);i_gen=1
  plot(x,y,type='l',xlim=c(-0.1, 1.1),ylim=c(0,maxY),xlab="Prop. @ time t", 
   ylab="Prop @ time t+1");
  lines(x,x,type='l')
  while ((i_gen < gen) & (nt>0)) {
    nt1=r*nt*(1-nt);
    lines(c(nt,nt),c(nt, nt1),type='l',col='red')
    lines(c(nt,nt1),c(nt1,nt1),type='l',col='red')
    nt=nt1;
    i_gen=i_gen+1;
  }
}
# example
par(mfrow=c(2,2))
FUNpop_t(r=2.7,n0=0.15,gen=100)
FUNpop_t(r=3.1,n0=0.15,gen=100)
FUNpop_t(r=3.8,n0=0.15,gen=100)
FUNpop_t(r=4.2,n0=0.15,gen=100)
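# quick note (not in the original): the non-zero fixed point of the map is N* = 1 - 1/r,
# stable for r < 3, which is why only the first panel spirals into a single point
r=2.7; 1-1/r   # about 0.63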

(figure: population01)

Checking for periodicity (bifurcation diagram)

FUNpop_t2<-function(gen=100,digit=5,n0=0.5){
  r=seq(2.5,4,0.001)
  res=matrix(0,ncol=2,nrow=1)
  r.length=length(r)
  for (i_r in 1:r.length) {
    nt=matrix(0,nrow=gen,ncol=1)
    nt[1]=n0;
    for (i_loop in 2:gen)
      {nt[i_loop]=r[i_r]*nt[i_loop-1]*(1-nt[i_loop-1])}
    bunnki<-unique(round(nt[duplicated(round(nt,digits=digit))],digits=digit))
    bunnki.length<-length(bunnki);
    res=rbind(res,cbind(rep(r[i_r],bunnki.length),bunnki))
  }
return(res[-1, ])
}
# example
res<-FUNpop_t2(gen=10000, digit=5)
plot(res[,1],res[,2],pch=19,cex=0.1,col="red",xlab="r values",ylab="cyclic points")

(figure: population02)

Mathematical sociology: why is society such a small world?

Examples of how to build networks and how to measure distances on them
Regular networks, small-world networks

mkRegG=function(n_node,n_edge){
# input
# n_node: number of nodes
# n_edge: number of edges / 2  
  M=matrix(0,n_node,n_node) 
  for (i_loop in 1:n_edge){
    M=M+diag(1,n_node,n_node)[, c((i_loop+1):n_node,1:i_loop)]
  }
  return(M+t(M))
}
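# quick check (not in the original): every node of the regular ring lattice
# should have degree 2*n_edge
G0=mkRegG(100,2)
table(rowSums(G0))   # expect every node to have degree 4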

# small-world network
G=mkRegG(100,2);
n_node=ncol(G);
prob=0.05;
for (i_node in 1:n_node) {
  node=G[i_node,]
  linked=which(node==1) 
  woLinked=which(node==0) 
  woLinked=woLinked[woLinked!=i_node]
  rwVec=linked[which(runif(length(linked)) < prob)]
  nRW=length(rwVec)
  if (nRW>0) {
    newLink=sample(woLinked,nRW)
    G[i_node,newLink]=1;G[newLink,i_node]=1
    G[i_node,rwVec]=0;G[rwVec,i_node]=0
  }
}  

# cal. shortest path
dijkstra2<-function(G,nodeID){
n_node=nrow(G)
G[which(G==0)]=Inf;diag(G)=0
d=rep(Inf,n_node);d[nodeID]=0
M=1:n_node;M=M[-nodeID]
while(length(M)>0) {
  for (j in 1:n_node) 
    {
      d[j]=min(d[j],d[nodeID]+G[nodeID,j])
    }
    nodeID=M[which(d[M]==min(d[M]))]
    n_remove=length(nodeID)
    for (i_remove in 1:n_remove){
      M=M[-which(M==nodeID[i_remove])]
    }
  }
 return(d)
}
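# illustrative use (not in the original): characteristic path length of the rewired
# network G above, assuming it is still connected (otherwise some distances are Inf)
D=sapply(1:ncol(G),function(i) dijkstra2(G,i))
mean(D[upper.tri(D)])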

An older version (probably)

# regular network 
mkRegG=function(n_node,n_edge){
# input
# n_node: number of nodes
# n_edge: number of edges / 2  
  M=matrix(0,n_node,n_node)	
  for (i_loop in 1:n_edge){
    M=M+diag(1,n_node,n_node)[, c((i_loop+1):n_node,1:i_loop)]
  }
  return(M+t(M))
}
# WS model (small-world)
mkWSG=function(regG,prob){
# input
# regG: regular network
# prob: probability of rewiring / 2
n_node=ncol(regG)
M=regG;
  for (i_node in 1:n_node){
    edge=which(M[i_node,]==1)
    rwVec=edge[which(runif(length(edge)) < prob)]
    nRW=length(rwVec);
    if (nRW>0) {
      newEdge=sample(seq(n_node)[-i_node],nRW);
      while (any(M[i_node,newEdge]==1) & any(M[i_node,rwVec]==1)){
        newEdge=sample(seq(n_node)[-i_node],nRW);
    }
    M[i_node,newEdge]=1;M[newEdge,i_node]=1;
    M[i_node,rwVec]=0;M[rwVec,i_node]=0;	
    }
  }
  return(M);
}
# scale-free network
mkFSG=function(n_node,n_edge) {
# input
# n_node: number of nodes
# n_edge: minimum number of edges
  M=matrix(1,n_edge+1,n_edge+1)-diag(n_edge+1)
  for (i_node in (n_edge+2):n_node) {
    Pnode=rowSums(M)/sum(M)	
    cumPnode=c(0,cumsum(Pnode))
    vec=matrix(0,1,i_node-1)
      while (sum(vec) < n_edge) {
        vec[max(which(cumPnode<=runif(1)))]=1
      }
    M=rbind(M,vec);
    M=cbind(M,c(vec,0));
  }
  return(M)
}
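# quick look (not in the original): the degree distribution of a generated network
# should be heavy-tailed, unlike the regular and WS networks above
Gsf=mkFSG(300,2)
hist(rowSums(Gsf),breaks=30,main="Scale-free network",xlab="degree")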