広域システム特別講義II 教師あり学習1a

# with THRESHOLD (theta)
AND.gate <- function(x1, x2){
  w1 = 0.5
  w2 = 0.5
  theta = 0.7
  y.temp = w1*x1 + w2*x2
  if (y.temp <= theta){
    y = 0
  } else {
    y = 1
  }
  return(y)
}

AND.gate <- function(x1, x2){
  w1 = 0.5; w2 = 0.5; theta = 0.7
  return(as.numeric(w1*x1 + w2*x2 > theta))
}

NAND.gate <- function(x1, x2){
  w1 = -0.5; w2 = -0.5; theta = -0.7
  return(as.numeric(w1*x1 + w2*x2 > theta))
}

OR.gate <- function(x1, x2){
  w1 = 0.5; w2 = 0.5; theta = 0.3
  return(as.numeric(w1*x1 + w2*x2 > theta))
}

# with BIAS (b)
AND.gate <- function(x1, x2){
  w1 = 0.5
  w2 = 0.5
  b = -0.7
  y.temp = w1*x1 + w2*x2 + b
  if (y.temp <= 0){
    y = 0
  } else {
    y = 1
  }
  return(y)
}

AND.gate <- function(x1, x2){
  w1 = 0.5; w2 = 0.5; b = -0.7
  return(as.numeric(w1*x1 + w2*x2 + b > 0))
}

NAND.gate <- function(x1, x2){
  w1 = -0.5; w2 = -0.5; b = 0.7
  return(as.numeric(w1*x1 + w2*x2 + b > 0))
}

OR.gate <- function(x1, x2){
  w1 = 0.5; w2 = 0.5; b = -0.3
  return(as.numeric(w1*x1 + w2*x2 + b > 0))
}

NOR.gate <- function(x1, x2){
  w1 = -0.5; w2 = -0.5; b = 0.3
  return(as.numeric(w1*x1 + w2*x2 + b > 0))
}

plot.logic <- function(logic.oper){
  x1 = c(0,0,1,1);
  x2 = c(0,1,0,1);
  if (logic.oper == "and") {
    w1 = 0.5; w2 = 0.5; theta = 0.7;
    true.point = AND.gate(x1,x2)
  } else if (logic.oper == "or") {
    w1 = 0.5; w2 = 0.5; theta = 0.3;
    true.point = OR.gate(x1,x2)
  } else if (logic.oper == "nand") {
    w1 = -0.5; w2 = -0.5; theta = -0.7;
    true.point = NAND.gate(x1,x2)
  } else if (logic.oper == "nor"){
    w1 = -0.5; w2 = -0.5; theta = -0.3;
    true.point = NOR.gate(x1,x2)
  } else {warning("incompatible operator");stop() }
  plot(c(0,0,1,1),c(0,1,0,1),xlim = c(-0.5, 1.5), ylim = c(-0.5, 1.5),
       pch = 20, cex= 2, col = true.point+1)
  abline(a = theta/w1, b = -w1/w2, lwd = 3)
}

XOR.gate <- function(x1, x2){
  gate1 <- NAND.gate(x1,x2)
  gate2 <- OR.gate(x1,x2)
  y <- AND.gate(gate1,gate2)
  return(y)
}

plot.XOR <- function(){
  x1 = c(0,0,1,1);
  x2 = c(0,1,0,1);
  w11 = -0.5; w21 = -0.5; theta1 = -0.7
  w12 = 0.5; w22 = 0.5; theta2 = 0.3
  true.point = XOR.gate(x1, x2)
  plot(c(0,0,1,1),c(0,1,0,1),xlim = c(-0.5, 1.5), ylim = c(-0.5, 1.5),
       pch = 20, cex= 2, col = true.point+1)
  abline(a = theta1/w11, b = -w11/w21, lwd = 3)
  abline(a = theta2/w12, b = -w12/w22, lwd = 3)
}


multi.forwd <- function(x,y){
  return(x*y)
}
multi.bckwd <- function(x, y, dout){
  dx = dout * y
  dy = dout * x
  return(list(dx = dx, dy = dy))
}

apple = 100; n.apple = 2; tax = 1.1
apple.pre.tax = multi.forwd(apple, n.apple)
apple.post.tax = multi.forwd(apple.pre.tax, tax)

dprice = 1
d.apple.post.tax = multi.bckwd(apple.pre.tax, tax, dprice)
d.apple = multi.bckwd(apple, n.apple, d.apple.post.tax$dx)$dx
d.n.apple = multi.bckwd(apple, n.apple, d.apple.post.tax$dx)$dy

add.forwd <- function(x,y){
  return(x + y)
}
add.bckwd <- function(x, y, dout){
  dx = dout
  dy = dout
  return(list(dx = dx, dy = dy))
}

# network
step.func <- function(x){
  return(as.numeric(x > 0))
}
x = seq(-5, 5, 0.1)
y = step.func(x)
plot(x,y, ylab = 'y', xlab = 'a', type ="l", lwd =2)

sigmoid.func <- function(x){
  return(1/(1+exp(-x)))
}

y = sigmoid.func(x)
plot(x,y, ylab = 'y', xlab = 'a', type ="l", lwd =2)

y.step = step.func(x)
y.sigm = sigmoid.func(x)
plot(x,y.step, ylab = 'y', xlab = 'a', type ="l", lwd =2)
lines(x,y.sigm, lwd =2, lty = 2)

relu.func <- function(x){
  return(pmax(0,x))
}

y.relu = relu.func(x)
plot(x,y.relu, ylab = 'y', xlab = 'a', type ="l", lwd =2)

A = matrix(1:4, nrow = 2, byrow = T)
B = matrix(5:8, nrow = 2, byrow = T)

A = matrix(1:6, nrow = 3, byrow = T)
B = matrix(7:8, nrow = 2, byrow = T)

x = c(1,0.5)
W1 = matrix((1:6)*0.1, nrow = 2)
B1 = (1:3)*0.1
A1 = x%*%W1 + B1
Z1 = sigmoid.func(A1)

W2 = matrix((1:6)*0.1, nrow = 3)
B2 = c(0.1, 0.2)
A2 = Z1%*%W2 + B2
Z2 = sigmoid.func(A2)

W3 = matrix((1:4)*0.1, nrow = 2)
B3 = c(0.1, 0.2)
A3 = Z2%*%W3+ B3
Z3 = A3

# function to initialize 3L network
init.3L.network <- function(){
  W1 = matrix((1:6)*0.1, nrow = 2)
  B1 = (1:3)*0.1
  W2 = matrix((1:6)*0.1, nrow = 3)
  B2 = c(0.1, 0.2)
  W3 = matrix((1:4)*0.1, nrow = 2)
  B3 = c(0.1, 0.2)
  return(list(W1 = W1, B1 = B1, W2 = W2, B2 = B2, W3 = W3, B3 = B3))
}
# feedforward process
forward.3L <- function(network, x){
  A1 = x%*%network$W1 + network$B1
  Z1 = sigmoid.func(A1)
  A2 = Z1%*%network$W2 + network$B2
  Z2 = sigmoid.func(A2)
  A3 = Z2%*%network$W3 + network$B3
  Z3 = sigmoid.func(A3)
  A3 = Z3
  return(A3)
}

network<-init.3L.network()
y = forward.3L(network, c(1, 0.5))

a = c(1010,1000,990)
exp(a)/sum(exp(a))

softmax.func <- function(x){
  max.x = max(x)
  return(exp(x-max.x)/sum(exp(x-max.x)))
}

train <- read.csv('http://www.Matsuka.info/univ/course_folder/MNSTtrain.csv',
                  header=TRUE)
train <- data.matrix(train)
train.x <- train[,-1]
train.y <- train[,1]
train.x <- t(train.x/255)
download.file("http://www.matsuka.info/univ/course_folder/trNetwork.Rdata",
              "trNetwork.Rdata")
load("trNetwork.Rdata")
network=trNetwork


n.train = ncol(train.x)
correct.cl = 0
conf.matrix = matrix(0,10,10)
for (i.loop in 1:n.train){
  y = forward.3L(network,train.x[,i.loop])
  max.y = max.col(y)
  conf.matrix[max.y, (train.y[i.loop]+1)] =
    conf.matrix[max.y, (train.y[i.loop]+1)] + 1
}
accuracy = sum(diag(conf.matrix))/n.train


# learning
apple = 100; n.apple = 2; tax = 1.1
orange = 150; n.orange = 3;

apple.price = multi.forwd(apple, n.apple)
orange.price = multi.forwd(orange, n.orange)
all.price = add.forwd(apple.price, orange.price)
price = multi.forwd(all.price, tax)

dprice = 1
d.all.price = multi.bckwd(all.price, tax, dprice)
d.apple.price = add.bckwd(apple.price, orange.price, d.all.price$dx)$dx
d.orange.price = add.bckwd(orange, n.orange.price, d.all.price$dx)$dy
d.apple = multi.bckwd(apple, n.apple, d.apple.price)$dx
d.n.apple = multi.bckwd(apple, n.apple, d.apple.price)$dy
d.orange = multi.bckwd(orange, n.orange, d.orange.price)$dx
d.n.orange = multi.bckwd(orange, n.orange, d.orange.price)$dy

relu.forwd <- function(x){
  return(pmax(x,0))
}

relu.bckwd <- function(x, dout){
  dout[which(x <= 0)] = 0
  return(dout)
}

sigmoid.forwd <- function(x){
  return(1/(1+exp(-x)))
}

sigmoid.bckwd <- function(x, dout){
  y = sigmoid.forwd(x)
  return(dout*(1-y)*y)
}

affine.forwd <- function(x, W, b){
  return(x%*%W + matrix(1, nrow = nrow(x), ncol = 1)%*%b)
}

affine.bckwd <- function(x, W, b, dout){
  dx = dout%*%t(W)
  dW = t(x)%*%dout
  db = colSums(dout)
  return(list(dx = dx, dW = dW, db = db))
}

softmax.forwd <- function(x, target){
  max.x = apply(x,1,max)
  C = ncol(x)
  x = x - max.x%*%matrix(1,nrow=1,ncol=C)
  y = exp(x)/rowSums(exp(x))
  delta = 1e-7;
  R = nrow(as.matrix(y))
  return(-sum(target*log(y + delta))/R)
}

softmax.bckwd <- function(x, target,  dout = 1){
  max.x = apply(x, 1, max)
  R = nrow(x)
  C = ncol(x)
  x = x - max.x%*%matrix(1,nrow=1,ncol=C)
  y = exp(x)/rowSums(exp(x))
  return((y-target)/R)
}

init.network <- function(n.neurons){
  n.layer = length(n.neurons)
  W = list(); b = list()
  for (i.layer in 1:(n.layer-1)){
    W[[i.layer]] = matrix(rnorm(n.neurons[i.layer]*n.neurons[(i.layer+1)],sd = 0.1),
                          nrow=n.neurons[i.layer])
    b[[i.layer]] =  matrix(rnorm(n.neurons[(i.layer+1)],sd = 0.1), nrow = 1)
  }
  return(list(W = W,b = b))
}

sigmoid.func <- function(x){
  return(1/(1+exp(-x)))
}

relu.func <- function(x){
  y = apply(x,2,function(x) pmax(x,0))
  return(y)
}

activation <- function(A, actFun){
  if (actFun == "sigmoid"){
    return(sigmoid.func(A))
  }
  if (actFun == "relu"){
    return(relu.func(A))
  }
  if (actFun == "softmax"){
    return(softmax(A))
  }
}

feedforward <- function(network, x, actFun) {
  n.layer <- length(network$W)
  batch.size = nrow(x)
  for (i.layer in 1:n.layer){
    A = x%*%network$W[[i.layer]]
    + matrix(1,nrow=batch.size,ncol = 1)%*%network$b[[i.layer]]
    x = activation(A, actFun[i.layer])
  }
  return(x)
}

cross.entropy = function(y, target){
  delta = 1e-7;
  R = nrow(as.matrix(y))
  return(-sum(target*log(y + delta))/R)
}

loss.network = function(params, x, t, actFun){
  y = feedforward(params,x,actFun)
  return(cross.entropy(y, t))
}

softmax <- function(x, target){
  max.x = apply(x,1,max)
  C = ncol(x)
  x = x - max.x%*%matrix(1,nrow=1,ncol=C)
  y = exp(x)/rowSums(exp(x))
  return(y)
}

train.x = as.matrix(iris[,1:4])
train.y.temp = as.numeric(iris[,5])
train.y = matrix(0,nrow = nrow(train.x), ncol =3)
train.y[which(train.y.temp==1), 1]=1
train.y[which(train.y.temp==2), 2]=1
train.y[which(train.y.temp==3), 3]=1

params = init.network(c(4,15,3))
batch_size = 10; n.iter =5000; lambda =0.05
n.train = nrow(train.x)
params = init.network(c(4,30,3))
batch_size = 10; n.iter =5000; lambda =0.01
n.train = nrow(train.x)
loss = rep(0,n.iter)
for (i.iter in 1:n.iter){
  batch_mask = sample(1:n.train, batch_size)
  x.batch = train.x[batch_mask,]
  t.batch = train.y[batch_mask,]
  a1 = affine.forwd(x.batch,params$W[[1]],params$b[[1]])
  z1 = sigmoid.forwd(a1)
  a2 = affine.forwd(z1,params$W[[2]],params$b[[2]])
  z2 = softmax.forwd(a2,t.batch)
  dwSM = softmax.bckwd(a2, t.batch, 1)
  dwA2 = affine.bckwd(a1,params$W[[2]],params$b[[2]],dwSM)
  dwSG = sigmoid.bckwd(a1,dwA2$dx)
  dwA1 = affine.bckwd(x.batch,params$W[[1]],params$b[[1]],dwSG)
  params$W[[2]] = params$W[[2]] - lambda*dwA2$dW
  params$b[[2]] = params$b[[2]] - lambda*dwA2$db
  params$W[[1]] = params$W[[1]] - lambda*dwA1$dW
  params$b[[1]] = params$b[[1]] - lambda*dwA1$db
  loss[i.iter] = loss.network(params,x.batch,t.batch,c("sigmoid","softmax"))
}
plot(loss,type='l', xlab = "trial")