Cognitive Information Analysis Exercise W10

n.var = 10
n.child = 10
child = matrix(sample(0:1,n.var*n.child, replace =T),nrow =n.child)

GA_mutate <- function(child, p){
  mut.mat = matrix(sample(0:1,length(child),replace = T, 
                          prob = c(1-p, p)),nrow=nrow(child))
  return(abs(child-mut.mat))
}

GA_crossover<-function(parent){
  n.parent = nrow(parent)
  rand.order = sample(1:n.parent)
  p1.idx= rand.order[1:(n.parent/2)]
  p2.idx= rand.order[(n.parent/2+1):n.parent]
  co.idx = sample(1:(length(parent)/2),length(parent)/4)
  c1 = parent[p1.idx,]
  c1.copy = c1
  c2 = parent[p2.idx,]
  c1[co.idx] = c2[co.idx]
  c2[co.idx] = c1.copy[co.idx]
  return(child = rbind(c1,c2))
}


GA_survive<-function(parent, child, fitP, fitC){
  nPop=nrow(parent);
  fitT=c(fitP,fitC);
  fitMax=sort(fitT,index.return=TRUE,decreasing = TRUE)
  tempX=rbind(parent,child);
  fittest=tempX[fitMax$ix[1:nPop],]
  return(fittest)
}

GA<-function(parent, nGen,p,X,Y){
  optHist=matrix(0,nrow=nGen,ncol=1)
  fittest.hist = matrix(0,nrow=nGen,ncol = ncol(parent))
  nPop = nrow(parent)
  nVar = ncol(parent)
  fitP = rep(0,nPop)
  fitC = fitP
  for (i.pop in 1:nPop){
    dat.temp = X[,which(parent[i.pop,]==1)]
    fitP[i.pop] = summary(lm(Y~dat.temp))$adj.r.squared
  }
  optHist[1]=max(c(fitP))
  fittest.hist[1,] = parent[which.max(fitP),]
  for (i_gen in 2:nGen) {
    child<-GA_crossover(parent)
    child<-GA_mutate(child,p)
    # assumes the parent and child populations are the same size
    for (i.pop in 1:nPop){
      dat.temp = X[,which(parent[i.pop,]==1)]
      fitP[i.pop] = summary(lm(Y~dat.temp))$adj.r.squared
      if (sum(child[i.pop,])==0){
        child[i.pop,sample(1:ncol(child),sample(1:nVar,1))]=1
      }
      dat.temp = X[,which(child[i.pop,]==1)]
      fitC[i.pop] = summary(lm(Y~dat.temp))$adj.r.squared
    }
    parent<-GA_survive(parent, child, fitP, fitC)
    optHist[i_gen]=max(c(fitP,fitC))
    fittest.hist[i_gen, ] = parent[1,]
  }
  return(list(fittest = parent, optHist = optHist, fittest.hist = fittest.hist))
}

set.seed(19)
nVar = 10
nData = 50
X=matrix(rnorm(nVar*nData,mean=10,sd=5),ncol=nVar);
y=X%*%c(-2,0,2,0,2,0,-3,0,-2,0)+rnorm(nData,mean=0,sd=8);
summary(lm(y~X[,seq(1,9,2)]))
summary(lm(y~X))
parent = matrix(sample(0:1,300,replace = T),nrow=30)
res<-GA(parent,100,0.1,X,y)
head(res$fittest)
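The returned optimization history can be plotted to check convergence (a usage sketch):

plot(res$optHist, type = 'l', xlab = "generation", ylab = "adjusted R^2")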

Toward more complex models

C = combn(3,2)
paste("X",C,sep="")
temp.lab =toString(paste("X",C[ , 1], sep = ""))
gsub(",", ":", temp.lab) 

n.comb = ncol(C)
var.labels=c()
for (i.comb in 1: n.comb){
  temp.label = toString(paste("X",C[ , i.comb], sep = ""))
  var.labels = c(var.labels, gsub(",", ":", temp.label) )
}


mk.labels <- function(C){
  var.labels = c()
  n.comb = ncol(C)
  for (i.comb in 1: n.comb){
    temp.label = toString(paste("X",C[ , i.comb], sep = ""))
    var.labels = c(var.labels, gsub(",", ":", temp.label) )
  }
  return(var.labels)
}
                                    
n.var = 3
max.interaction = 3
# single variable
var.labels = paste("X", 1:n.var, sep = "")


# interaction terms
for (i.interaction in 2:max.interaction ){
  combination = combn(n.var, i.interaction)
  var.labels = c(var.labels, mk.labels(combination))
}

model.def = paste("Y ~", gsub(",", "+",   toString(var.labels[c(1,1,1,1,1,0,0) == 1]))) 
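With this indicator vector, the resulting formula string is:

# model.def == "Y ~ X1+ X2+ X3+ X1:X2+ X1:X3"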

set.seed(20)
nVar = 3
nData = 50
X=matrix(rnorm(nVar*nData,mean=10,sd=5),ncol=nVar);
y=X%*%c(1,1,-2)+rnorm(nData,mean=0,sd=3);
dat = data.frame(Y=y,X1=X[,1],X2=X[,2],X3=X[,3])

# example
model.def1 = paste("Y ~", gsub(",", "+",  toString(var.labels[c(1,1,1,0,0,0,0) == 1]))) 
model.lm = lm(model.def1, dat)

Evolution Strategies (ES)

n.parent = 10
n.theta = 10
parent = list(theta = matrix(rnorm(n.parent*n.theta),nrow=n.parent),
              sigma = matrix(runif(n.parent*n.theta)+1,nrow = n.parent))

ES_crossover<-function(parent){
  n.parent = nrow(parent$theta)
  rand.order = sample(1:n.parent)
  p1.idx= rand.order[1:(n.parent/2)]
  p2.idx= rand.order[(n.parent/2+1):n.parent]
  co.idx = sample(1:(length(parent$theta)/2),length(parent$theta)/4)
  c1 = parent$theta[p1.idx,]
  c1.copy = c1
  c2 = parent$theta[p2.idx,]
  c1[co.idx] = c2[co.idx]
  c2[co.idx] = c1.copy[co.idx]
  sigma = (parent$sigma[p1.idx,]+parent$sigma[p2.idx,])/2
  return(child = list(theta = rbind(c1,c2), sigma = rbind(sigma,sigma)))
}

ES_mutate<-function(child, tau){
  child$sigma = child$sigma*
    exp(tau*matrix(rnorm(length(child$sigma)),nrow=nrow(child$sigma)))
  child$theta = child$theta + 
    matrix(rnorm(length(child$sigma),0,child$sigma),nrow=nrow(child$sigma))
  return(child)
}

set.seed(20); nData = 100
X=matrix(rnorm(9*nData,mean=10,sd=2),ncol=9);X=cbind(rep(1,nData),X)
y=X%*%c(10,2,5,-3,-5,0,0,0,0,0)+rnorm(nData,mean=0,sd=2);
#summary(lm(y~X[,2:10]))

# fitness (error) function
reg.error<-function(b,X,y){
  yhat<-X%*%b
  return(sum((y-yhat)^2))
}
# example: evaluating a generation of children
child = ES_mutate(ES_crossover(parent), tau = 0.1)  # tau = 0.1 as an illustrative value
fitC = apply(child$theta, 1, reg.error, X, y)
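A full generation loop is not spelled out in these notes; below is a minimal sketch, assuming error is minimized, survivors are the n.parent lowest-error individuals, and tau = 1/sqrt(2*n.theta) as a conventional default.

ES_survive <- function(parent, child, fitP, fitC){
  n.parent = nrow(parent$theta)
  # ascending sort: keep the lowest-error individuals
  best = sort(c(fitP, fitC), index.return = TRUE)$ix[1:n.parent]
  return(list(theta = rbind(parent$theta, child$theta)[best, ],
              sigma = rbind(parent$sigma, child$sigma)[best, ]))
}

ES <- function(parent, nGen, tau, X, y){
  optHist = rep(0, nGen)
  fitP = apply(parent$theta, 1, reg.error, X, y)
  optHist[1] = min(fitP)
  for (i.gen in 2:nGen){
    child = ES_mutate(ES_crossover(parent), tau)
    fitC = apply(child$theta, 1, reg.error, X, y)
    parent = ES_survive(parent, child, fitP, fitC)
    fitP = apply(parent$theta, 1, reg.error, X, y)
    optHist[i.gen] = min(fitP)
  }
  return(list(fittest = parent, optHist = optHist))
}
res.es = ES(parent, 500, 1/sqrt(2*n.theta), X, y)
plot(res.es$optHist, type = 'l', xlab = "generation", ylab = "SSE")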

Cognitive Information Analysis Exercise 08

  # genetic algorithm - identify the best linear model
  GA_recomb<-function(G) {
    nPop=nrow(G);
    nVar=ncol(G);
    child = G;
    G.permuted = G[sample(1:nPop),]
    recomb.idx=which(matrix(sample(0:1,nPop*nVar,replace=T),nrow=nPop)==1)
    child[recomb.idx] = G.permuted[recomb.idx]
    return(child)
  }
  
  GA_mutate = function(child, p){
    n.pop = nrow(child)
    n.var = ncol(child)
    mut.mat= matrix((runif(n.pop*n.var) < p), nrow = n.pop)
    child = abs(child-mut.mat)
    return(child)
  }
  
  GA_survive<-function(G, child, fitG, fitC){
    nPop=nrow(G);
    fitT=c(fitG,fitC);
    fitMax=sort(fitT,index.return=TRUE,decreasing = TRUE)
    tempX=rbind(G,child);
    G=tempX[fitMax$ix[1:nPop],]
    return(G)
  }
  
  GA<-function(G, nGen,p,X,Y){
    optHist=matrix(0,nrow=nGen,ncol=1)
    G.hist = matrix(0,nrow=nGen,ncol = ncol(G))
    nPop = nrow(G)
    nVar = ncol(G)
    fitG = rep(0,nPop)
    fitC = fitG
    for (i.pop in 1:nPop){
      dat.temp = X[,which(G[i.pop,]==1)]
      fitG[i.pop] = summary(lm(Y~dat.temp))$adj.r.squared
    }
    optHist[1]=max(c(fitG))
    G.hist[1,] = G[which.max(fitG),]
    for (i_gen in 2:nGen) {
      child<-GA_recomb(G)
      child<-GA_mutate(child,p)
      # assumes the parent and child populations are the same size
      for (i.pop in 1:nPop){
        dat.temp = X[,which(G[i.pop,]==1)]
        fitG[i.pop] = summary(lm(Y~dat.temp))$adj.r.squared
        if (sum(child[i.pop,])==0){
        child[i.pop,sample(1:ncol(child),sample(1:nVar,1))]=1
        }
        dat.temp = X[,which(child[i.pop,]==1)]
        fitC[i.pop] = summary(lm(Y~dat.temp))$adj.r.squared
      }
      G<-GA_survive(G, child, fitG, fitC)
      optHist[i_gen]=max(c(fitG,fitC))
      G.hist[i_gen, ] = G[1,]
    }
    return(list(G = G, optHist = optHist, G.hist = G.hist))
  }
  
  set.seed(19)
  nVar = 10
  nData = 50
  X=matrix(rnorm(nVar*nData,mean=10,sd=5),ncol=nVar);
  y=X%*%c(-2,0,2,0,2,0,-3,0,-2,0)+rnorm(nData,mean=0,sd=8);
  summary(lm(y~X[,seq(1,9,2)]))
  summary(lm(y~X))
  G = matrix(sample(0:1,300,replace = T),nrow=30)
  res<-GA(G,100,0.1,X,y)
  head(res$G)
  
  # GA - challenging task
  C = combn(3,2)
  temp.lab =toString(paste("X",C[ , 1], sep = ""))
  gsub(",", "*", temp.lab) 
  
  n.comb = ncol(C)
  var.labels = c()
  for (i.comb in 1: n.comb){
    temp.label = toString(paste("X",C[ , i.comb], sep = ""))
    var.labels = c(var.labels, gsub(",", "*", temp.label) )
  }
  
  mk.labels <- function(C){
    var.labels = c()
    n.comb = ncol(C)
    for (i.comb in 1: n.comb){
      temp.label = toString(paste("X",C[ , i.comb], sep = ""))
      var.labels = c(var.labels, gsub(",", "*", temp.label) )
    }
    return(var.labels)
  }
  n.var = 3; max.interaction = 3
  var.labels = paste("X", 1:n.var, sep = "")
  
  # interaction terms
  for (i.interaction  in 2:max.interaction ){
    combination = combn(n.var, i.interaction)
    var.labels = c(var.labels, mk.labels(combination))
  }
  model.def = paste("Y ~", gsub(",", "+", toString(var.labels[c(1,1,1,1,1,0,0) == 1]))) 
  nVar = 3
  X=matrix(rnorm(nVar*nData,mean=10,sd=5),ncol=nVar);
  y=X%*%c(1,1,-2)+rnorm(nData,mean=0,sd=3);
  dat = data.frame(Y=y,X1=X[,1],X2=X[,2],X3=X[,3])
  
  model.lm = lm(model.def, dat)
  

Cognitive Information Analysis Exercise 06

Traveling Salesman Problem

TSP.inv <- function(x, n){
  x.length = length(x)
  start.idx = sample(1:x.length,1)
  end.idx = min(x.length, (start.idx + n -1))
  x[start.idx:end.idx]=x[end.idx:start.idx]
  return(x)
}

TSP.switch <- function(x){
  x.length = length(x)
  switch.idx = sample(1:x.length,2)
  x[switch.idx] = x[rev(switch.idx)]
  return(x)
}

TSP.trans <-function(x){
  x.length = length(x)
  trans.idx = sample(1:x.length,2)
  new.x = x[-trans.idx[1]]
  new.x = append(new.x, x[trans.idx[1]],after = (trans.idx[2]-1))
  return(new.x)
}
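Quick sanity checks of the three moves (example calls):

x = 1:10
TSP.inv(x, 4)     # reverse a random stretch of at most 4 cities
TSP.switch(x)     # swap two randomly chosen cities
TSP.trans(x)      # move one city to a random new position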

TSP.init<-function(n_city=10) {
  return(matrix(runif(n_city*2,1,100),nrow=n_city,ncol=2))
}

cities = TSP.init(10)

TSP.totDist<-function(route, cities) {
  route=c(route,route[1]);
  n.cities=nrow(cities);
  totDist=0;
  for (i.cities in 1:n.cities) {
    totDist = totDist + dist(cities[route[i.cities:(i.cities + 1)],])
  }
  return(totDist)
}

route = 1:nrow(cities)
TSP.totDist(route, cities)

TSP.demo<-function(n.city=20, maxItr=1000, parm.set = c(1,0.99, 50)) {
  loc=TSP.init(n.city);
  route=1:n.city
  ## param. for simulated annealing - change values if necessary 
  C=parm.set[1];
  eta=parm.set[2];
  TEMP=parm.set[3]; 
  ##
  optDist=TSP.totDist(route,loc)
  optHist=matrix(0,nrow=maxItr,ncol=(length(route)+1))
  optHist[1,]=c(route,optDist)
  for (i_loop in 2:maxItr) {
    rand.op=sample(c('inv','sw','trans'),1,prob=c(0.75,0.125,0.125))
    if (rand.op=='inv') {
      new.route=TSP.inv(route,sample(2:n.city,1))
    } else if (rand.op=='sw') {
      new.route=TSP.switch(route)
    } else {
      new.route=TSP.trans(route)
    }
    new.dist=TSP.totDist(new.route,loc)
    delta=new.dist-optDist
    prob=1/(1+exp(delta/(C*TEMP)))
    if (runif(1) < prob) {
      route=new.route;optDist=new.dist;
    } 
    optHist[i_loop,]=c(route,optDist);
    TEMP=max(TEMP*eta,0.1)  # cool down, but keep a temperature floor
  }
  par(mfrow=c(1,2)) 
  plot(rbind(loc,loc[1,]),type='o',pch=20,cex=2.5, col='red',
       xlab='location @X',ylab='location @Y',main='Initial route')
  plot(loc[optHist[maxItr,c(1:n.city,1)],],type='o',pch=20, col='magenta',
       cex=2.5,xlab='location @X',ylab='location @Y',main='Optimized route')
  return(optHist)
}
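An example run (a usage sketch; the last column of the returned history holds the route length):

res = TSP.demo(20, 1000)
par(mfrow=c(1,1))
plot(res[, ncol(res)], type = 'l', xlab = "iteration", ylab = "total distance")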
# regression with SimAnneal.
reg.ERR<-function(b,X,y){
  yhat<-X%*%b
  return(sum((y-yhat)^2))
}
##
SA.reg <- function(maxItr,parm.set=c(C=1,eta=0.995,TEMP=100,sigma=100)){
  set.seed(20)
  nData = 100
  x=matrix(rnorm(9*nData,mean=10,sd=2),ncol=9);X=cbind(rep(1,nData),x)
  y=X%*%c(10,2,5,-3,-5,0,0,0,0,0)+rnorm(nData,mean=0,sd=2);
  lm.sum = summary(lm(y~X[,2:10]))
  C = parm.set[1]
  eta = parm.set[2]
  TEMP= parm.set[3]
  sigma = parm.set[4]
  b = rnorm(10,0,100)
  optERR=reg.ERR(b,X,y)
  optHist=matrix(0,nrow=maxItr,ncol=(10+1))
  optHist[1,]=c(b,optERR)
  for (i_loop in 2:maxItr) {
    new.b = b + rnorm(10,0,sigma*TEMP)
    new.ERR=reg.ERR(new.b, X,y)
    delta=new.ERR-optERR
    prob=1/(1+exp(delta/(C*TEMP)))
    if (runif(1) < prob) {
      b=new.b;
      optERR=new.ERR;
    } 
    optHist[i_loop,]=c(b,optERR);
    TEMP=max(TEMP*eta,0.01)
  }
  print(optHist[maxItr,])
  print(coef(lm.sum)[,1])
  return(list(optHist=optHist, gt = coef(lm.sum)))
}
res=SA.reg(1e5,c(80,0.999,1,1))

Cognitive Information Analysis Exercise 04

fun01 <- function(x1,x2){
  return(x1^2 - x1*x2 -4*x1 +x2^2 -x2 )
}

x1 = seq(-10,10,0.05); x2 = seq(-10,10,0.05)
v <- outer(x1,x2,function(x1,x2) fun01(x1,x2))
contour(x1,x2,v,nlevels= 100, drawlabels = F,col='blue')

a = runif(2,0,10); b = runif(2,0,10);c = runif(2,0,10)

sortABC <-  function(a,b,c,func){
  fA = do.call(func,list(a[1],a[2]))
  fB = do.call(func,list(b[1],b[2]))
  fC = do.call(func,list(c[1],c[2]))
  sort.F = sort(c(fA,fB,fC),index.return=T)
  temp.set = rbind(a,b,c)
  a = temp.set[sort.F$ix[1],]; fA = sort.F$x[1]
  b = temp.set[sort.F$ix[2],]; fB = sort.F$x[2]
  c = temp.set[sort.F$ix[3],]; fC = sort.F$x[3]
  return(list(abc = rbind(a,b,c), fs = c(fA,fB,fC)))
}

nelder.mead01 <- function(a,b,c, func, tol){
# not in the original form
# expansion -> reflection -> contraction -> shrink
# minimizes f with 2 variables 
  repeat {
    sort.abc = sortABC(a,b,c,func)
    a = sort.abc$abc[1,];b = sort.abc$abc[2,];c = sort.abc$abc[3,]
    fA = sort.abc$fs[1];fB = sort.abc$fs[2];fC = sort.abc$fs[3];
    max.diffs = max(abs(outer(c(fA,fB,fC),c(fA,fB,fC),'-')))
    if (max.diffs < tol){
      return(list(abc=rbind(a,b,c), fs = c(fA,fB,fC)))
    }
    points(a[1],a[2],pch = 20, col ='red')
    points(b[1],b[2],pch = 20, col ='cyan')
    points(c[1],c[2],pch = 20, col ='green')
    m = (a + b)/2
    e = m + 2*(m - c)
    fE = do.call(func,list(e[1],e[2]))
    if (fE < fB){
      c = e; fC = fE
    } else {
      r = 2*m -c
      fR = do.call(func,list(r[1],r[2]))
      if (fR < fC){
        c = r; fC = fR
      }
      if (fR >= fB){
        s = (c + m)/2; fS = do.call(func,list(s[1],s[2]))
        if (fS < fC){
          c = s; fC =fS
        } else {
          b = m; fB = do.call(func,list(m[1],m[2]))
          c = (a + c)/2; fC = do.call(func,list(c[1],c[2]))
        }
      }
    }
  }
}
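An example call on fun01 defined above (a sketch; it assumes the contour plot is still the active device, since the function draws points on it):

res.nm = nelder.mead01(a, b, c, "fun01", 1e-7)
res.nm$abc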
# simulated annealing
# version 1
SimAneal01<-function(func, initXY, maxItr=1000, C=1, eta=0.99, width=10) {
  x=initXY
  opt.value = do.call(func,list(x))
  n.var = length(x)
  opt.hist=matrix(0, nrow=maxItr, ncol=2*n.var+3)  # x, candidate x, value, candidate value, accept
  opt.hist[1,]=c(x,x,opt.value,opt.value,0)
  for (i_loop in 2:maxItr) {
    accept = 0
    temp.x = x + rnorm(n.var, mean = 0, sd=width)
    temp.value= do.call(func,list(temp.x))
    delta=temp.value-opt.value;
    prob=1/(1+exp(delta/(C*width)))
    if (runif(1) < prob) {
      x = temp.x;
      opt.value = temp.value;
      accept = 1
    } 
    opt.hist[i_loop,]=c(x, temp.x, opt.value, temp.value, accept);
    width=width*eta
  }
  return(data.frame(opt.hist))
}

set.seed(50)
n.iter =500
fun01<-function(x){x^2+2*x}
res <- SimAneal01(fun01, 3, n.iter, 1, 0.985, 1)
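Each row of opt.hist stores (x, candidate x, accepted value, candidate value, accept flag), so column 3 of the returned data frame traces the accepted objective value:

plot(res$X3, type = 'l', xlab = "iteration", ylab = "accepted value")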

# version 2
funZ<-function(x,y) {x^4-16*x^2-5*x+y^4-16*y^2-5*y}
minSA<-function(initXY,maxItr=1000,C=1,eta=0.99,width) {
  x=initXY[1];y=initXY[2];z=funZ(x,y); 
  opHist=matrix(0,nrow=maxItr,ncol=3)
  opHist[1,]=c(x,y,z)
  for (i_loop in 2:maxItr) {
    xTemp=x+rnorm(1,mean=0,sd=width)
    yTemp=y+rnorm(1,mean=0,sd=width)
    zTemp=funZ(xTemp, yTemp);
    delta=zTemp-z;
    prob=1/(1+exp(delta/(C*width)))
    if (runif(1) < prob) {
      x=xTemp;y=yTemp;z=zTemp;
    } 
    opHist[i_loop,]=c(x,y,z);
    width=width*eta
  }
  return(opHist)
}

funZ<-function(x,y) {x^4-16*x^2-5*x+y^4-16*y^2-5*y}
xmin=-5;xmax=5;n_len=100;
x<-seq(xmin, xmax,length.out=n_len);
y<-x;z<-outer(x,y,funZ)

contour(x,y,z,drawlabels = F,nlevels=40)
res<-minSA(c(0,0),1000,1,0.99,10)
lines(res[,1:2],type='o',lwd=2,col='green',pch=20)

Cognitive Information Analysis 03

# objective: find x that minimizes y: y = x^2 +2*x

# simple gradient descent (GD)
tol = 1e-7;lr = 0.1; 
x = 10; x.hist = x
repeat{
  grad = 2*x + 2
  if (abs(grad) <= tol) { break }
  x = x - lr*grad
  x.hist = c(x.hist,x)
}
x.temp = seq(-10,10,length.out = 100)
plot(x.temp, x.temp^2+2*x.temp,type='l',lwd = 3, ylim = c(-5,120),
     ylab = "y",xlab="x")
lines(x.hist,x.hist^2+2*x.hist,type='o',col='red',lwd=2,pch=19)
points(x.hist,rep(-4,length(x.hist)),col='red',pch="I")

# GD w/ momentum
tol = 1e-7;lr = 0.1; gamma = 0.36
x = 10; x.histM = x; v = 0;
repeat{
  grad = 2*x + 2
  if (abs(grad) <= tol) { break }
  v = gamma*v - lr*grad
  x = x + v
  x.histM = c(x.histM,x)
}
lines(x.histM,x.histM^2+2*x.histM,type='o',col='blue',pch=19)
points(x.histM,rep(-2,length(x.histM)),col='blue',pch="I")
legend("topleft",c("standard GD","GD w/ moment."), pch=1,lwd=2,col=c('red','blue'))
c(length(x.hist),length(x.histM))

Cognitive Information Analysis 02

# cake-division game
n.cake = 10 
# payoff matrix
pay.mat = matrix(0,(n.cake+1),(n.cake+1))
for (i.cake in 1:n.cake){
  pay.mat[(i.cake+1),1:(n.cake-i.cake+1)] =i.cake
}
# initialization (strategy probabilities, time grid, etc.)
p.cake = runif(n.cake+1)
p.cake = p.cake/sum(p.cake)

max.time = 50
dt = 0.01
t = seq(0,max.time,dt)
n.iter = length(t)
p.hist = matrix(0,nrow = n.iter, ncol = (n.cake+1))
p.hist[1,] = p.cake
 
# main loop: replicator dynamics, dp_i/dt = p_i * (W_i - W.ave)/W.ave
for (i.time in 2:n.iter){
  W = colSums(p.cake*t(pay.mat))             # expected payoff of each strategy
  W.ave = sum(outer(p.cake,p.cake)*pay.mat)  # population-average payoff
  p.cake = p.cake + p.cake*(W - W.ave)/W.ave * dt
  p.hist[i.time,] = p.cake
}
 
# visualize the results
plot(p.hist[,1],ylim=c(0,1),type='l',lwd=2,ylab = 'Proportion',xlab="time")  
for (i.strat in 2:(n.cake+1)){
  lines(p.hist[,i.strat],col=i.strat,lwd=2)
}
legend("topleft",paste("request = ",0:10),col=1:(n.cake+1),lwd =2,cex=0.75)
# central limit theorem experiment
ss.female = rnorm(6000,mean = 24, sd = 0.5)
ss.male = rnorm(6000,mean = 26, sd = 0.5)
pop = c(ss.female,ss.male)
parm.mu = mean(pop)
parm.sd = sd(pop)

n.per.one.sample = 10;
n.rep = 100000;

samp.data = matrix(sample(pop,n.per.one.sample*n.rep,replace = T),
  ncol = n.per.one.sample)
samp.mean = rowMeans(samp.data)


par(mfrow=c(1,3))
# fig 1
hist(pop,  col='skyblue',breaks = 50, probability = T, 
  main = "Population Dist.",xlab = "shoe size")
pop.dens<-density(pop)
lines(pop.dens,col='orange',lwd=4)

# fig 2
rand.p = sample(1:n.rep,300)
dat.dens = density(samp.data[rand.p[1],])
plot(dat.dens, col='gray',xlim=c(22,28),ylim=c(0,0.5), 
  main = "Sample Dist. (N = 10)",xlab = "shoe size")
for (i.d in 2:300){
  dat.dens = density(samp.data[rand.p[i.d],])
  lines(dat.dens, col='gray')
}

# fig 3
mean.dens<-density(samp.mean)
hist(samp.mean, col='skyblue',breaks = 50, probability = T, 
  main = 'Dist of Sample Means',xlab = "shoe size")
lines(mean.dens,col='red',lwd=4)
x = seq(22,27,0.01)
y = dnorm(x,parm.mu, parm.sd/sqrt(n.per.one.sample))
lines(x, y ,col='darkblue',lty=3,lwd=3)
legend("topright",c("Empirical", "Theoretical"),
  lty=c(1,3),col=c("red","blue"),lwd=4,cex=1)

Cognitive Information Analysis 01

# model of fads (SIR-style diffusion)
ryuko <- function(x0, y0, z0, alpha, beta, T){
  dt = 0.01
  total.p = x0+y0+z0
  x = rep(0,T);y = rep(0,T);z = rep(0,T)
  x[1]=x0;y[1]=y0;z[1]=z0;
  for (i.t in 1:(T-1)){
    x[i.t + 1] =x[i.t]-(alpha*x[i.t]*y[i.t])*dt
    y[i.t + 1] =y[i.t]+(alpha*x[i.t]*y[i.t]-beta*y[i.t])*dt
    z[i.t + 1] =z[i.t]+(beta*y[i.t])*dt
  }
  plot(1:T,x/total.p,type='l',col='red',xlab = "Time", 
       ylab = "Proportion", lwd = 2)
  lines(1:T, y/total.p, col='blue', lwd = 2)
  lines(1:T, z/total.p, col='green', lwd = 2)
  return(data.frame(x=x,y=y,z=z))
}
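An example call with hypothetical parameters: 999 susceptibles, one initial adopter, contact rate alpha = 0.001, and drop-out rate beta = 0.1:

res = ryuko(999, 1, 0, 0.001, 0.1, 3000)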
# simulations of the law of large numbers

# function tracking the running probability of rolling a 6 over N throws
die.6 <- function(N){
  die <- sample(1:6, N, replace=T)
  six.cumsum = cumsum(die==6)
  six.prob = six.cumsum/(1:N)
  return(six.prob=six.prob)
}

# run the function & visualize the result
N = 3000
result = die.6(N)
plot(1:N, result, ylim=c(0,1), col='red', lwd=3, type='l',
     xlab="Trial",ylab="Prob.")
abline(h = 1/6, lty=2, lwd=2)
legend("topright",c("Theoretical","Empirical"),lty=c(2,1),
       col=c("black","red"), lwd=2)

# script verifying the law of large numbers over repeated runs
N = 3000; n.rep = 500
result = matrix(0, nrow=n.rep, ncol=N)
plot(1,1,type='n',xlim = c(0,N), ylim=c(0,1),
  xlab="Trial", ylab="P(die = 6)")
for (i.rep in 1:n.rep){
  result[i.rep,] = die.6(N) 
  lines(result[i.rep,],col='gray')
}
lines(1:N, colMeans(result), col='red', lwd = 2)
abline(h = 1/6, lty=2, lwd=2)
legend("topright",c("Theoretical","Average","Indiv. trial"),lty=c(2,1,1),
       col=c("black","red","gray"), lwd=2)

# the above verification script wrapped in a function
die.6Mult <- function(N, n.rep) {
  plot(1,1,type='n',xlim = c(0,N), ylim=c(0,1), xlab="Trial", 
    ylab="P(die = 6)")
  die <- matrix(sample(1:6, N*n.rep, replace=T), nrow = n.rep)
  six.cumsum = apply(die,1, function(x){cumsum(x==6)})
  six.prob = six.cumsum/(1:N)
  for (i.rep in 1:n.rep){
    lines(six.prob[,i.rep],col='gray')
  }
  lines(1:N, rowMeans(six.prob), col='red', lwd = 2)
  abline(h = 1/6, lty=2, lwd=2)
  legend("topright",c("Theoretical","Average","Indiv. trial"),lty=c(2,1,1),
         col=c("black","red","gray"), lwd=2)
  return(six.prob=six.prob)
}
# example run of the above function
result <- die.6Mult(3000,500)

Cognitive Information Analysis: Deep Learning summary through ch. 5

# ReLU
relu.forwd <- function(x){
  return(pmax(x,0))
}
relu.bckwd <- function(z, dout){
  dout[which(z <= 0)] = 0
  return(dout)
}
# sigmoid
sigmoid.forwd <- function(x){
  return(1/(1+exp(-x)))
}
sigmoid.bckwd <- function(z, dout){
  return(dout*(1-z)*z)
}
# Affine
affine.forwd <- function(x, W, b){
  return(x%*%W + matrix(1, nrow = nrow(x), ncol = 1)%*%b)
}
affine.bckwd <- function(x, W, dout){
  dx = dout%*%t(W)
  dW = t(x)%*%dout
  db = colSums(dout)
  return(list(dx = dx, dW = dW, db = db))
}
# softmax with CE
softmax.forwd <- function(x, target){
  max.x = apply(x,1,max)
  C = ncol(x)
  x = x - max.x%*%matrix(1,nrow=1,ncol=C)
  y = exp(x)/rowSums(exp(x))
  delta = 1e-7;
  R = nrow(as.matrix(y))
  return(list(smx = y, ce = -sum(target*log(y + delta))/R))
}
softmax.bckwd <- function(smx, target){
  R = nrow(smx)
  return((smx-target)/R)
}

activation <- function(A, target, actFun){
  if (actFun == "sigmoid"){
    return(sigmoid.forwd(A))
  }
  if (actFun == "relu"){
    return(relu.forwd(A))
  }
  if (actFun == "softmax"){
    return(softmax.forwd(A, target))
  }
}

Dact <- function(z, smx, target, dout, actFun){
  if (actFun == "sigmoid"){
    return(sigmoid.bckwd(z, dout))
  }
  if (actFun == "relu"){
    return(relu.bckwd(z, dout))
  }
  if (actFun == "softmax"){
    return(softmax.bckwd(smx, target))
  }
}

# function for initialization 
init.network <- function(n.neurons, actFun){
  n.layer = length(n.neurons)
  W = list(); b = list()
  for (i.layer in 1:(n.layer-1)){
    W[[i.layer]] = matrix(rnorm(n.neurons[i.layer]*n.neurons[(i.layer+1)],sd = 0.1),
                          nrow=n.neurons[i.layer])
    b[[i.layer]] =  matrix(rnorm(n.neurons[(i.layer+1)],sd = 0.1), nrow = 1)
  }
  return(list(W = W, b = b, actFun = actFun))
}

cu.nnet = function(train.x, train.y, net, HP = c(10,1000,0.05)){
# HP: Hyperparameters
# HP[1] = batch size
# HP[2] = n iteration
# HP[3] = learning rate
  n.layer <- length(net$W)
  loss = rep(0,HP[2])
  A = list(); z = list(); Dact = list(); Daff = list()
  for (i.iter in 1:HP[2]){
    batch_mask = sample(1:nrow(train.x), HP[1])
    x.batch = train.x[batch_mask,]
    t.batch = train.y[batch_mask,]  
    for (i.layer in 1:n.layer){
      if (i.layer == 1){
        x = x.batch
      } else {
        x = z[[(i.layer - 1)]]
      }
      A[[i.layer]] = affine.forwd(x, net$W[[i.layer]], net$b[[i.layer]])
      z[[i.layer]] = activation(A[[i.layer]], t.batch, net$actFun[i.layer])
    }
    loss[i.iter] = z[[n.layer]]$ce
    smx = z[[n.layer]]$smx
    for (i.layerR in n.layer:1){
      if (i.layerR == n.layer){
        dout = 1
      } else {
        dout = Daff[[(i.layerR+1)]]$dx
      }
      Dact[[i.layerR]] = Dact(z[[i.layerR]], smx, t.batch, dout, net$actFun[i.layerR])
      if (i.layerR==1){
        x = x.batch
      } else {
        x = z[[(i.layerR-1)]]  # input to the affine layer is the previous activation
      }
      Daff[[i.layerR]] = affine.bckwd(x, net$W[[i.layerR]], Dact[[i.layerR]])
    }
    for (i.layer in 1:n.layer){
      net$W[[i.layer]] = net$W[[i.layer]] - HP[3]*Daff[[i.layer]]$dW
      net$b[[i.layer]] = net$b[[i.layer]] - HP[3]*Daff[[i.layer]]$db
    }
  }
  return(list(loss = loss, net = net))
}

# iris
train.x = as.matrix(iris[,1:4])
train.y.temp = as.numeric(iris[,5])
train.y = matrix(0,nrow = nrow(train.x), ncol =3)
train.y[which(train.y.temp==1), 1]=1
train.y[which(train.y.temp==2), 2]=1
train.y[which(train.y.temp==3), 3]=1
actF = c("relu","softmax")
network = init.network(c(4,15,3), actF)
res = cu.nnet(train.x, train.y, network, HP=c(15,2000,0.1))
plot(res$loss,type='l')

# MNIST
train <- read.csv('~/courses/CogMod/CMA2017/MNSTtrain.csv', header=TRUE)
test <- read.csv('~/courses/CogMod/CMA2017/MNSTtest.csv', header=TRUE)
train <- data.matrix(train)
test <- data.matrix(test)
train.x <- as.matrix(train[,-1]/255)
train.y.temp <- train[,1]
train.y = matrix(0,nrow = nrow(train.x), ncol = 10)
for (i in 1:nrow(train.x)){
  train.y[i,(train.y.temp[i]+1)]=1
}
actF = c("relu","relu","softmax")
network = init.network(c(784,100,50,10), actF)
res = cu.nnet(train.x, train.y, network,HP=c(100,2000,0.1))
plot(res$loss,type='l')

########################################################
#   with additional methods

init.network <- function(n.neurons, actFun, Opt, sdv){
  n.layer = length(n.neurons)
  W = list(); b = list(); 
  mW = list(); mb = list();    # momentum
  hW = list(); hb = list();    # adaGrad
  aMW = list(); aMb = list();  # adam M
  aVW = list(); aVb = list();  # adam V
  for (i.layer in 1:(n.layer-1)){
    if (nargs() == 3) {
      if (actFun[i.layer]=="sigmoid"){
        # Xavier
        sdv = 1/sqrt(n.neurons[i.layer])
      } else {
        # He - assumes ReLU
        sdv = sqrt(2/n.neurons[i.layer])
      }
    }
    W[[i.layer]] = matrix(rnorm(n.neurons[i.layer]*n.neurons[(i.layer+1)], sd = sdv),
                          nrow=n.neurons[i.layer])
    b[[i.layer]] =  matrix(rnorm(n.neurons[(i.layer+1)], sd = sdv), nrow = 1)
    if (Opt == "momentum"){
      mW[[i.layer]] = matrix(0, nrow=n.neurons[i.layer], ncol=n.neurons[(i.layer+1)])
      mb[[i.layer]] = matrix(0, nrow = 1, ncol=n.neurons[(i.layer+1)])
    } 
    if (Opt == "adagrad"){
      hW[[i.layer]] = matrix(0, nrow=n.neurons[i.layer], ncol=n.neurons[(i.layer+1)])
      hb[[i.layer]] = matrix(0, nrow = 1, ncol=n.neurons[(i.layer+1)])
    }
    if (Opt == "adam"){
      aMW[[i.layer]] = matrix(0, nrow=n.neurons[i.layer], ncol=n.neurons[(i.layer+1)])
      aMb[[i.layer]] = matrix(0, nrow = 1, ncol=n.neurons[(i.layer+1)])
      aVW[[i.layer]] = matrix(0, nrow=n.neurons[i.layer], ncol=n.neurons[(i.layer+1)])
      aVb[[i.layer]] = matrix(0, nrow = 1, ncol=n.neurons[(i.layer+1)])
    }
  }
  return(list(W = W, b = b, actFun = actFun, optimizer = Opt,
              mW=mW, mb=mb,
              hW=hW, hb=hb,
              aMW=aMW,aMb=aMb,aVW=aVW,aVb=aVb))
}

OPT<-function(net, Daff, HP){
  if (net$optimizer == "momentum"){
    return(Opt.mom(net, Daff, HP))
  }
  if (net$optimizer == "adagrad"){
    return(Opt.adagrad(net, Daff, HP))
  }
  if (net$optimizer == "adam"){
    return(Opt.adam(net, Daff, HP))
  }
}

Opt.mom <- function(net, Daff, HP){
# HP[3] = learning rate
# HP[4] = weight decay
# HP[5] = momentum
  n.layer <- length(net$W)
  for (i.layer in 1:n.layer){
    # the trailing operators must end each line; otherwise R treats the
    # first line as a complete statement and silently drops the rest
    net$mW[[i.layer]] = HP[5]*net$mW[[i.layer]] -
      HP[3]*Daff[[i.layer]]$dW - HP[4]*net$W[[i.layer]]
    net$mb[[i.layer]] = HP[5]*net$mb[[i.layer]] -
      HP[3]*Daff[[i.layer]]$db - HP[4]*net$b[[i.layer]]
    net$W[[i.layer]] = net$W[[i.layer]] + net$mW[[i.layer]]
    net$b[[i.layer]] = net$b[[i.layer]] + net$mb[[i.layer]] 
  }
  return(net=net)
}
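OPT also dispatches to Opt.adagrad and Opt.adam, which these notes never define. The sketches below are assumptions consistent with the accumulators prepared in init.network, reading HP[6] as beta1 and HP[7] as beta2 as in the cu.nnet comments; Adam's bias correction is omitted for brevity.

Opt.adagrad <- function(net, Daff, HP){
  # AdaGrad: accumulate squared gradients, scale each step elementwise
  n.layer <- length(net$W)
  eps = 1e-7
  for (i.layer in 1:n.layer){
    net$hW[[i.layer]] = net$hW[[i.layer]] + Daff[[i.layer]]$dW^2
    net$hb[[i.layer]] = net$hb[[i.layer]] + Daff[[i.layer]]$db^2
    net$W[[i.layer]] = net$W[[i.layer]] -
      HP[3]*Daff[[i.layer]]$dW/(sqrt(net$hW[[i.layer]])+eps)
    net$b[[i.layer]] = net$b[[i.layer]] -
      HP[3]*Daff[[i.layer]]$db/(sqrt(net$hb[[i.layer]])+eps)
  }
  return(net)
}

Opt.adam <- function(net, Daff, HP){
  # Adam: exponential moving averages of gradients and squared gradients
  n.layer <- length(net$W)
  eps = 1e-8
  for (i.layer in 1:n.layer){
    net$aMW[[i.layer]] = HP[6]*net$aMW[[i.layer]] + (1-HP[6])*Daff[[i.layer]]$dW
    net$aMb[[i.layer]] = HP[6]*net$aMb[[i.layer]] + (1-HP[6])*Daff[[i.layer]]$db
    net$aVW[[i.layer]] = HP[7]*net$aVW[[i.layer]] + (1-HP[7])*Daff[[i.layer]]$dW^2
    net$aVb[[i.layer]] = HP[7]*net$aVb[[i.layer]] + (1-HP[7])*Daff[[i.layer]]$db^2
    net$W[[i.layer]] = net$W[[i.layer]] -
      HP[3]*net$aMW[[i.layer]]/(sqrt(net$aVW[[i.layer]])+eps)
    net$b[[i.layer]] = net$b[[i.layer]] -
      HP[3]*net$aMb[[i.layer]]/(sqrt(net$aVb[[i.layer]])+eps)
  }
  return(net)
}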

cu.nnet = function(train.x, train.y, net, HP = c(10,1000,0.05,0.01,0.1,0.999,0.9)){
# HP: Hyperparameters
# HP[1] = batch size
# HP[2] = n iteration
# HP[3] = learning rate
# HP[4] = weight decay
# HP[5] = momentum
# HP[6] = beta1 (adam)
# HP[7] = beta2 (adam)
  n.layer <- length(net$W)
  loss = rep(0,HP[2])
  A = list(); z = list(); Dact = list(); Daff = list()
  for (i.iter in 1:HP[2]){
    batch_mask = sample(1:nrow(train.x), HP[1])
    x.batch = train.x[batch_mask,]
    t.batch = train.y[batch_mask,]
    for (i.layer in 1:n.layer){
      if (i.layer == 1){
        x = x.batch
      } else {
        x = z[[(i.layer - 1)]]
      }
      A[[i.layer]] = affine.forwd(x, net$W[[i.layer]], net$b[[i.layer]])
      z[[i.layer]] = activation(A[[i.layer]], t.batch, net$actFun[i.layer])
    }
    loss[i.iter] = z[[n.layer]]$ce
    smx = z[[n.layer]]$smx
    for (i.layerR in n.layer:1){
      if (i.layerR == n.layer){
        dout = 1
      } else {
        dout = Daff[[(i.layerR+1)]]$dx
      }
      Dact[[i.layerR]] = Dact(z[[i.layerR]], smx, t.batch, dout, net$actFun[i.layerR])
      if (i.layerR==1){
        x = x.batch
      } else {
        x = z[[(i.layerR-1)]]  # previous activation feeds this affine layer
      }
      Daff[[i.layerR]] = affine.bckwd(x, net$W[[i.layerR]], Dact[[i.layerR]])
    }
    net = OPT(net, Daff, HP)
  }
  return(list(loss = loss, net = net))
}

# iris
train.x = as.matrix(iris[,1:4])
train.y.temp = as.numeric(iris[,5])
train.y = matrix(0,nrow = nrow(train.x), ncol =3)
train.y[which(train.y.temp==1), 1]=1
train.y[which(train.y.temp==2), 2]=1
train.y[which(train.y.temp==3), 3]=1
actF = c("relu","softmax")
network = init.network(c(4,15,3), actF, "momentum")
res = cu.nnet(train.x, train.y, network, HP=c(15,1000,0.01,0.0001,0.9,0.999,0.9))
hist(res$net$W[[1]])
plot(res$loss,type='l')
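A rough accuracy check on the training set (a sketch; predict.net is a hypothetical helper that reuses the forward-pass functions defined earlier):

predict.net <- function(net, x, target){
  n.layer = length(net$W)
  z = x
  for (i.layer in 1:n.layer){
    A = affine.forwd(z, net$W[[i.layer]], net$b[[i.layer]])
    z = activation(A, target, net$actFun[i.layer])
  }
  return(apply(z$smx, 1, which.max))  # class with the largest softmax output
}
mean(predict.net(res$net, train.x, train.y) == train.y.temp)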

Cognitive Information Analysis ch06.1

func02R = function(x){
  return(1/20*x[1]^2 + x[2]^2)
}
numerical.grad <- function(func, x){
  h = 1e-4
  R = nrow(x)
  C = ncol(x)
  grad = matrix(0, R, C)
  for (i.col in 1:C){
    for (i.row in 1:R){
      temp.x = x[i.row,i.col]
      x[i.row, i.col] = temp.x + h
      plusH = do.call(func, list(x))
      x[i.row, i.col] = temp.x - h
      minusH = do.call(func,list(x))
      grad[i.row, i.col] = (plusH - minusH)/(2*h)
      x[i.row, i.col] = temp.x
    }
  }
  return(grad)
}

require(plot3D)
x = seq(-10,10,0.2)
y = seq(-10,10,0.2)
M = mesh(x,y)
Z = as.vector(1/20*M$x^2)+as.vector(M$y^2) 
Z.mesh = matrix(Z,nrow(M$x))
contour(x,y,Z.mesh,drawlabels = F,nlevels=40)

grad.desc <- function(func, init.x, lr, n.iter){
  x = init.x
  x.hist = init.x
  for (i.iter in 1:n.iter) {
    grad = numerical.grad(func, x)
    x = x - lr*grad
    x.hist = rbind(x.hist,x)
  }
  return(x.hist)
}
x.init = matrix(c(-7,2),nrow = 1)
gd = grad.desc("func02R",x.init,0.9,100)
lines(gd,type='o',col = 'green',pch=20)

Cognitive Information Analysis Ch05

multi.forwd <- function(x,y){
  return(x*y)
}
multi.bckwd <- function(x, y, dout){
  dx = dout * y
  dy = dout * x
  return(list(dx = dx, dy = dy))
}

apple = 100; n.apple = 2; tax = 1.1
apple.pre.tax = multi.forwd(apple, n.apple)
apple.post.tax = multi.forwd(apple.pre.tax, tax)

dprice = 1
d.apple.post.tax = multi.bckwd(apple.pre.tax, tax, dprice)
d.apple = multi.bckwd(apple, n.apple, d.apple.post.tax$dx)$dx
d.n.apple = multi.bckwd(apple, n.apple, d.apple.post.tax$dx)$dy

add.forwd <- function(x,y){
  return(x + y)
}
add.bckwd <- function(x, y, dout){
  dx = dout
  dy = dout
  return(list(dx = dx, dy = dy))
}

apple = 100; n.apple = 2; tax = 1.1
orange = 150; n.orange = 3;

apple.price = multi.forwd(apple, n.apple)
orange.price = multi.forwd(orange, n.orange)
all.price = add.forwd(apple.price, orange.price)
price = multi.forwd(all.price, tax)

dprice = 1
d.all.price = multi.bckwd(all.price, tax, dprice)
d.apple.price = add.bckwd(apple.price, orange.price, d.all.price$dx)$dx
d.orange.price = add.bckwd(apple.price, orange.price, d.all.price$dx)$dy
d.apple = multi.bckwd(apple, n.apple, d.apple.price)$dx
d.n.apple = multi.bckwd(apple, n.apple, d.apple.price)$dy
d.orange = multi.bckwd(orange, n.orange, d.orange.price)$dx
d.n.orange = multi.bckwd(orange, n.orange, d.orange.price)$dy
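For these inputs, the forward pass gives price = (100*2 + 150*3)*1.1 = 715, and the backward pass yields:

# d.apple = 2.2, d.n.apple = 110, d.orange = 3.3, d.n.orange = 165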

relu.forwd <- function(x){
  return(pmax(x,0))
}

relu.bckwd <- function(x, dout){
  dout[which(x <= 0)] = 0
  return(dout)
}

sigmoid.forwd <- function(x){
  return(1/(1+exp(-x)))
}

sigmoid.bckwd <- function(x, dout){
  y = sigmoid.forwd(x) 
  return(dout*(1-y)*y)
}

affine.forwd <- function(x, W, b){
  return(x%*%W + matrix(1, nrow = nrow(x), ncol = 1)%*%b)
}

affine.bckwd <- function(x, W, b, dout){
  dx = dout%*%t(W)
  dW = t(x)%*%dout
  db = colSums(dout)
  return(list(dx = dx, dW = dW, db = db))
}

softmax.forwd <- function(x, target){
  max.x = apply(x,1,max)
  C = ncol(x)
  x = x - max.x%*%matrix(1,nrow=1,ncol=C)
  y = exp(x)/rowSums(exp(x))
  delta = 1e-7;
  R = nrow(as.matrix(y))
  return(-sum(target*log(y + delta))/R)
}

softmax.bckwd <- function(x, target,  dout = 1){
  max.x = apply(x, 1, max)
  R = nrow(x)
  C = ncol(x)
  x = x - max.x%*%matrix(1,nrow=1,ncol=C)
  y = exp(x)/rowSums(exp(x))
  return((y-target)/R)
}

init.network <- function(n.neurons){
  n.layer = length(n.neurons)
  W = list(); b = list()
  for (i.layer in 1:(n.layer-1)){
    W[[i.layer]] = matrix(rnorm(n.neurons[i.layer]*n.neurons[(i.layer+1)],sd = 0.1),
      nrow=n.neurons[i.layer])
    b[[i.layer]] =  matrix(rnorm(n.neurons[(i.layer+1)],sd = 0.1), nrow = 1)
  }
  return(list(W = W,b = b))
}

sigmoid.func <- function(x){
  return(1/(1+exp(-x)))
}

relu.func <- function(x){
  y = apply(x,2,function(x) pmax(x,0))
  return(y)
}
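The activation() dispatcher below calls softmax(A), which is never defined in these notes; a minimal row-wise sketch in the style of softmax.forwd (numerically stabilized, no target needed):

softmax <- function(x){
  max.x = apply(x, 1, max)
  x = x - max.x%*%matrix(1, nrow = 1, ncol = ncol(x))
  return(exp(x)/rowSums(exp(x)))
}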

activation <- function(A, actFun){
  if (actFun == "sigmoid"){
    return(sigmoid.func(A))
  }
  if (actFun == "relu"){
    return(relu.func(A))
  }
  if (actFun == "softmax"){
    return(softmax(A))
  }
}

feedforward <- function(network, x, actFun) {
  n.layer <- length(network$W)
  batch.size = nrow(x)
  for (i.layer in 1:n.layer){
    A = x%*%network$W[[i.layer]] +  # `+` must end the line so R continues the expression
      matrix(1,nrow=batch.size,ncol = 1)%*%network$b[[i.layer]]
    x = activation(A, actFun[i.layer])
  }
  }
  return(x)
}

cross.entropy = function(y, target){
  delta = 1e-7;
  R = nrow(as.matrix(y))
  return(-sum(target*log(y + delta))/R)
}

loss.network = function(params, x, t, actFun){
  y = feedforward(params,x,actFun)
  return(cross.entropy(y, t))
}

Please try not to look at the code below until you have finished the assignment.
########################################################################
########################################################################
########################################################################
train.x = as.matrix(iris[,1:4])
train.y.temp = as.numeric(iris[,5])
train.y = matrix(0,nrow = nrow(train.x), ncol =3)
train.y[which(train.y.temp==1), 1]=1
train.y[which(train.y.temp==2), 2]=1
train.y[which(train.y.temp==3), 3]=1

params = init.network(c(4,15,3))
batch_size = 10; n.iter =5000; lambda =0.05
n.train = nrow(train.x)
loss = rep(0,n.iter)
for (i.iter in 1:n.iter){
  batch_mask = sample(1:n.train, batch_size)
  x.batch = train.x[batch_mask,]
  t.batch = train.y[batch_mask,]
  a1 = affine.forwd(x.batch,params$W[[1]],params$b[[1]])
  z1 = sigmoid.forwd(a1)
  a2 = affine.forwd(z1,params$W[[2]],params$b[[2]])
  z2 = softmax.forwd(a2,t.batch)
  dwSM = softmax.bckwd(a2, t.batch, 1)
  dwA2 = affine.bckwd(z1,params$W[[2]],params$b[[2]],dwSM)  # layer-2 input is z1, not a1
  dwSG = sigmoid.bckwd(a1,dwA2$dx)
  dwA1 = affine.bckwd(x.batch,params$W[[1]],params$b[[1]],dwSG)
  params$W[[2]] = params$W[[2]] - lambda*dwA2$dW
  params$b[[2]] = params$b[[2]] - lambda*dwA2$db
  params$W[[1]] = params$W[[1]] - lambda*dwA1$dW
  params$b[[1]] = params$b[[1]] - lambda*dwA1$db
  loss[i.iter] = loss.network(params,x.batch,t.batch,c("sigmoid","softmax"))
}
plot(loss,type='l', xlab = "trial")


# MNIST
train <- read.csv('~/courses/CogMod/CMA2017/MNSTtrain.csv', header=TRUE)
test <- read.csv('~/courses/CogMod/CMA2017/MNSTtest.csv', header=TRUE)
train <- data.matrix(train)
test <- data.matrix(test)
train.x <- as.matrix(train[,-1]/255)
train.y.temp <- train[,1]
train.y = matrix(0,nrow = nrow(train.x), ncol = 10)
for (i in 1:nrow(train.x)){
  train.y[i,(train.y.temp[i]+1)]=1
}

params = init.network(c(784,50,10))
batch_size = 100; n.iter =5000; lambda =0.05
n.train = nrow(train.x)
loss = rep(0,n.iter)
for (i.iter in 1:n.iter){
  batch_mask = sample(1:n.train, batch_size)
  x.batch = train.x[batch_mask,]
  t.batch = train.y[batch_mask,]
  a1 = affine.forwd(x.batch,params$W[[1]],params$b[[1]])
  z1 = sigmoid.forwd(a1)
  a2 = affine.forwd(z1,params$W[[2]],params$b[[2]])
  z2 = softmax.forwd(a2,t.batch)
  dwSM = softmax.bckwd(a2, t.batch, 1)
  dwA2 = affine.bckwd(z1,params$W[[2]],params$b[[2]],dwSM)  # layer-2 input is z1
  dwSG = sigmoid.bckwd(a1,dwA2$dx)
  dwA1 = affine.bckwd(x.batch,params$W[[1]],params$b[[1]],dwSG)
  params$W[[2]] = params$W[[2]] - lambda*dwA2$dW
  params$b[[2]] = params$b[[2]] - lambda*dwA2$db
  params$W[[1]] = params$W[[1]] - lambda*dwA1$dW
  params$b[[1]] = params$b[[1]] - lambda*dwA1$db
  loss[i.iter] = loss.network(params,x.batch,t.batch,c("sigmoid","softmax"))
}
plot(loss,type='l', xlab = "trial")
apply((feedforward(params, train.x[1:10,], c("sigmoid", "softmax"))),1,which.max)
train.y.temp[1:10]+1

### MNIST 3-layer
params = init.network(c(784,50,50,10))
batch_size = 100; n.iter =5000; lambda =0.05
n.train = nrow(train.x)
loss = rep(0,n.iter)
for (i.iter in 1:n.iter){
  batch_mask = sample(1:n.train, batch_size)
  x.batch = train.x[batch_mask,]
  t.batch = train.y[batch_mask,]
  a1 = affine.forwd(x.batch,params$W[[1]],params$b[[1]])
  z1 = sigmoid.forwd(a1)
  a2 = affine.forwd(z1,params$W[[2]],params$b[[2]])
  z2 = sigmoid.forwd(a2)
  a3 = affine.forwd(z2,params$W[[3]],params$b[[3]])
  z3 = softmax.forwd(a3,t.batch)
  dwSM = softmax.bckwd(a3, t.batch, 1)
  dwA3 = affine.bckwd(z2,params$W[[3]],params$b[[3]],dwSM)  # layer-3 input is z2
  dwSG2 = sigmoid.bckwd(a2,dwA3$dx)
  dwA2 = affine.bckwd(z1,params$W[[2]],params$b[[2]],dwSG2)
  dwSG1 = sigmoid.bckwd(a1,dwA2$dx)
  dwA1 = affine.bckwd(x.batch,params$W[[1]],params$b[[1]],dwSG1)
  params$W[[3]] = params$W[[3]] - lambda*dwA3$dW
  params$b[[3]] = params$b[[3]] - lambda*dwA3$db
  params$W[[2]] = params$W[[2]] - lambda*dwA2$dW
  params$b[[2]] = params$b[[2]] - lambda*dwA2$db
  params$W[[1]] = params$W[[1]] - lambda*dwA1$dW
  params$b[[1]] = params$b[[1]] - lambda*dwA1$db
  loss[i.iter] = loss.network(params,x.batch,t.batch,c("sigmoid","sigmoid","softmax"))
}
plot(loss,type='l', xlab = "trial")
pred<-apply((feedforward(params, train.x, c("sigmoid","sigmoid", "softmax"))),
  1, which.max)
table(pred, train.y.temp+1)