train.x = as.matrix(iris[,1:4])
train.y.temp = as.numeric(iris[,5])
train.y = matrix(0,nrow = nrow(train.x), ncol =3)
train.y[which(train.y.temp==1), 1]=1
train.y[which(train.y.temp==2), 2]=1
train.y[which(train.y.temp==3), 3]=1
init.network <- function(n.neurons){
n.layer = length(n.neurons)
W = list(); b = list()
for (i.layer in 1:(n.layer-1)){
W[[i.layer]] =
matrix(rnorm(n.neurons[i.layer]*n.neurons[(i.layer+1)],sd = 0.1),nrow=n.neurons[i.layer])
b[[i.layer]] = matrix(rnorm(n.neurons[(i.layer+1)],sd = 0.1), nrow = 1)
}
return(list(W = W,b = b))
}
sigmoid.forwd <- function(x){
return(1/(1+exp(-x)))
}
sigmoid.bckwd <- function(x, dout){
y = sigmoid.forwd(x)
return(dout*(1-y)*y)
}
affine.forwd <- function(x, W, b){
return(x%*%W + matrix(1, nrow = nrow(x), ncol = 1)%*%b)
}
affine.bckwd <- function(x, W, b, dout){
dx = dout%*%t(W)
dW = t(x)%*%dout
db = colSums(dout)
return(list(dx = dx, dW = dW, db = db))
}
softmax.forwd <- function(x, target){
max.x = apply(x,1,max)
C = ncol(x)
x = x - max.x%*%matrix(1,nrow=1,ncol=C)
y = exp(x)/rowSums(exp(x))
delta = 1e-7;
R = nrow(as.matrix(y))
return(-sum(target*log(y + delta))/R)
}
softmax.bckwd <- function(x, target, dout = 1){
max.x = apply(x, 1, max)
R = nrow(x)
C = ncol(x)
x = x - max.x%*%matrix(1,nrow=1,ncol=C)
y = exp(x)/rowSums(exp(x))
return((y-target)/R)
}
params = init.network(c(4,15,3))
batch_size = 50; n.iter =5000; lambda =0.05
n.train = nrow(train.x)
loss = rep(0,n.iter)
for (i.iter in 1:n.iter){
batch_mask = sample(1:n.train, batch_size)
x.batch = train.x[batch_mask,]
t.batch = train.y[batch_mask,]
a1 = affine.forwd(x.batch,params$W[[1]],params$b[[1]])
z1 = sigmoid.forwd(a1)
a2 = affine.forwd(z1,params$W[[2]],params$b[[2]])
z2 = softmax.forwd(a2,t.batch)
loss[i.iter] = z2
dwSM = softmax.bckwd(a2, t.batch, 1)
dwA2 = affine.bckwd(a1,params$W[[2]],params$b[[2]],dwSM)
dwSG = sigmoid.bckwd(a1,dwA2$dx)
dwA1 = affine.bckwd(x.batch,params$W[[1]],params$b[[1]],dwSG)
params$W[[2]] = params$W[[2]] - lambda*dwA2$dW
params$b[[2]] = params$b[[2]] - lambda*dwA2$db
params$W[[1]] = params$W[[1]] - lambda*dwA1$dW
params$b[[1]] = params$b[[1]] - lambda*dwA1$db
}
loss.dat = data.frame(trial = 1:length(loss), loss = loss)
ggplot(loss.dat,aes(x = trial, y = loss)) +
geom_smooth(se=F)
### methods to improve effeciency
func02R = function(x){
return(1/20*x[1]^2 + x[2]^2)
}
numerical.grad <- function(func, x){
h = 1e-4
R = nrow(x)
C = ncol(x)
grad = matrix(0, R, C)
for (i.col in 1:C){
for (i.row in 1:R){
temp.x = x[i.row,i.col]
x[i.row, i.col] = temp.x + h
plusH = do.call(func, list(x))
x[i.row, i.col] = temp.x - h
minusH = do.call(func,list(x))
grad[i.row, i.col] = (plusH - minusH)/(2*h)
x[i.row, i.col] = temp.x
}
}
return(grad)
}
require(plot3D)
x = seq(-10,10,0.5)
y = seq(-10,10,0.5)
M = mesh(x,y)
R = nrow(M$x)
C = nrow(M$x)
scaling = 0.05
plot(c(),c(),xlim = c(-10,10),ylim=c(-10,10))
for (i.col in 1:C){
for (i.row in 1:R){
ng = numerical.grad("func02R",matrix(c(M$x[i.row,i.col],M$y[i.row,i.col]),nrow=1))
arrows(M$x[i.row,i.col],M$y[i.row,i.col],
(M$x[i.row,i.col]-ng[1]*scaling),(M$y[i.row,i.col]-ng[2]*scaling),
length = 0.05)
}
}
x = seq(-10,10,0.2)
y = seq(-10,10,0.2)
M = mesh(x,y)
Z = as.vector(1/20*M$x^2)+as.vector(M$y^2)
Z.mesh = matrix(Z,nrow(M$x))
contour(x,y,Z.mesh,drawlabels = F,nlevels=40)
grad.desc <- function(func, init.x, lr, n.iter){
x = init.x
x.hist = init.x
for (i.iter in 1:n.iter) {
grad = numerical.grad(func, x)
x = x - lr*grad
x.hist = rbind(x.hist,x)
}
return(x.hist)
}
x.init = matrix(c(-7,2),nrow = 1)
gd = grad.desc("func02R",x.init,0.9,100)
lines(gd,type='o',col = 'green',pch=20)
gd.moment <- function(func, init.x, lr, moment, n.iter){
x = init.x
x.hist = init.x
grad.hist = rep(0,length(x))
for (i.iter in 1:n.iter) {
grad = numerical.grad(func, x)
x = x - lr*grad+moment*grad.hist
x.hist = rbind(x.hist,x)
grad.hist = -lr*grad
}
return(x.hist)
}
x.init = matrix(c(-7,2),nrow = 1)
gdm = gd.moment("func02R",x.init,0.9,0.3,100)
lines(gdm,type='o',col = 'blue',pch=20)
adaGrad <- function(func, init.x, eta, n.iter){
x = init.x
x.hist = init.x
h = rep(0,length(x))
for (i.iter in 1:n.iter) {
grad = numerical.grad(func, x)
h = h + grad*grad
x = x - eta*(1/(sqrt(h)+1e-7))*grad
x.hist = rbind(x.hist,x)
}
return(x.hist)
}
x.init = matrix(c(-7,2),nrow = 1)
adaGrad = adaGrad("func02R",x.init,0.9,100)
contour(x,y,Z.mesh,drawlabels = F,nlevels=40)
lines(adaGrad,type='o',col = 'red',pch=20)
adam <- function(func, init.x, eta, beta1, beta2, epsilon, n.iter){
x = init.x
x.hist = init.x
m = rep(0,length(x))
v = rep(0,length(x))
for (i.iter in 1:n.iter) {
grad = numerical.grad(func, x)
m = beta1*m + (1-beta1)*grad
v = beta2*v + (1-beta2)*grad*grad
m.hat = m/(1-beta1)
v.hat = v/(1-beta2)
x = x - eta/((sqrt(v.hat)+epsilon))*m.hat
x.hist = rbind(x.hist,x)
}
return(x.hist)
}
x.init = matrix(c(-7,2),nrow = 1)
adam = adam("func02R",x.init,0.2,0.9,0.999,1e-8,100)
contour(x,y,Z.mesh,drawlabels = F,nlevels=40)
lines(adam,type='o',col = 'red',pch=20)
### w/ functions
relu.forwd <- function(x){
return(pmax(x,0))
}
relu.bckwd <- function(z, dout){
dout[which(z <= 0)] = 0
return(dout)
}
# sigmoid
sigmoid.forwd <- function(x){
return(1/(1+exp(-x)))
}
sigmoid.bckwd <- function(z, dout){
return(dout*(1-z)*z)
}
# Affine
affine.forwd <- function(x, W, b){
return(x%*%W + matrix(1, nrow = nrow(x), ncol = 1)%*%b)
}
affine.bckwd <- function(x, W, dout){
dx = dout%*%t(W)
dW = t(x)%*%dout
db = colSums(dout)
return(list(dx = dx, dW = dW, db = db))
}
# softmax with CE
softmax.forwd <- function(x, target){
max.x = apply(x,1,max)
C = ncol(x)
x = x - max.x%*%matrix(1,nrow=1,ncol=C)
y = exp(x)/rowSums(exp(x))
delta = 1e-7;
R = nrow(as.matrix(y))
return(list(smx = y, ce = -sum(target*log(y + delta))/R))
}
softmax.bckwd <- function(smx, target){
R = nrow(smx)
return((smx-target)/R)
}
activation <- function(A, target, actFun){
if (actFun == "sigmoid"){
return(sigmoid.forwd(A))
}
if (actFun == "relu"){
return(relu.forwd(A))
}
if (actFun == "softmax"){
return(softmax.forwd(A, target))
}
}
Dact <- function(z, smx, target, dout, actFun){
if (actFun == "sigmoid"){
return(sigmoid.bckwd(z, dout))
}
if (actFun == "relu"){
return(relu.bckwd(z, dout))
}
if (actFun == "softmax"){
return(softmax.bckwd(smx, target))
}
}
# function for initialization
init.network <- function(n.neurons, actFun, Opt, sdv){
n.layer = length(n.neurons)
W = list(); b = list();
mW = list(); mb = list(); # momentum
hW = list(); hb = list(); # adaGrad
aMW = list(); aMb = list(); # adam M
aVW = list(); aVb = list(); # adam V
for (i.layer in 1:(n.layer-1)){
if (nargs() == 3) {
if (actFun[i.layer]=="sigmoid"){
# Xavier
sdv = 1/sqrt(n.neurons[i.layer])
} else {
# He - assumes ReLU
sdv = sqrt(2/n.neurons[i.layer])
}
}
W[[i.layer]] = matrix(rnorm(n.neurons[i.layer]*n.neurons[(i.layer+1)], sd = sdv),
nrow=n.neurons[i.layer])
b[[i.layer]] = matrix(rnorm(n.neurons[(i.layer+1)], sd = sdv), nrow = 1)
if (Opt == "momentum"){
mW[[i.layer]] = matrix(0, nrow=n.neurons[i.layer], ncol=n.neurons[(i.layer+1)])
mb[[i.layer]] = matrix(0, nrow = 1, ncol=n.neurons[(i.layer+1)])
}
if (Opt == "adagrad"){
hW[[i.layer]] = matrix(0, nrow=n.neurons[i.layer], ncol=n.neurons[(i.layer+1)])
hb[[i.layer]] = matrix(0, nrow = 1, ncol=n.neurons[(i.layer+1)])
}
if (Opt == "adam"){
aMW[[i.layer]] = matrix(0, nrow=n.neurons[i.layer], ncol=n.neurons[(i.layer+1)])
aMb[[i.layer]] = matrix(0, nrow = 1, ncol=n.neurons[(i.layer+1)])
aVW[[i.layer]] = matrix(0, nrow=n.neurons[i.layer], ncol=n.neurons[(i.layer+1)])
aVb[[i.layer]] = matrix(0, nrow = 1, ncol=n.neurons[(i.layer+1)])
}
}
return(list(W = W, b = b, actFun = actFun, optimizer = Opt,
mW=mW, mb=mb,
hW=hW, hb=hb,
aMW=aMW,aMb=aMb,aVW=aVW))
}
OPT<-function(net, Daff, HP){
if (net$optimizer == "momentum"){
return(Opt.mom(net, Daff, HP))
}
if (net$optimizer == "adagrad"){
return(Opt.adagrad(net, Daff, HP))
}
if (net$optimizer == "adam"){
return(Opt.adam(net, Daff, HP))
}
}
Opt.mom <- function(net, Daff, HP) {
# HP[3] = learning rate
# HP[4] = weight decay
# HP[5] = momentum
n.layer <- length(net$W)
for (i.layer in 1:n.layer) {
net$mW[[i.layer]] = HP[5]*net$mW[[i.layer]] - HP[3]*Daff[[i.layer]]$dW - HP[4]*net$W[[i.layer]]
net$mb[[i.layer]] = HP[5]*net$mb[[i.layer]] - HP[3]*Daff[[i.layer]]$db - HP[4]*net$b[[i.layer]]
net$W[[i.layer]] = net$W[[i.layer]] + net$mW[[i.layer]]
net$b[[i.layer]] = net$b[[i.layer]] + net$mb[[i.layer]]
}
return(net=net)
}
Opt.adagrad <- function(net, Daff, HP) {
# HP[3] = learning rate
# HP[4] = weight decay
n.layer <- length(net$W)
for (i.layer in 1:n.layer) {
net$hW[[i.layer]] = net$hW[[i.layer]] + Daff[[i.layer]]$dW*Daff[[i.layer]]$dW
net$hb[[i.layer]] = net$hb[[i.layer]] + Daff[[i.layer]]$db*Daff[[i.layer]]$db
net$W[[i.layer]] = net$W[[i.layer]]-HP[3]/(sqrt(net$hW[[i.layer]])+1e-7)*Daff[[i.layer]]$dW - HP[4]*net$W[[i.layer]]
net$b[[i.layer]] = net$b[[i.layer]]-HP[3]/(sqrt(net$hb[[i.layer]])++1e-7)*Daff[[i.layer]]$db - HP[4]*net$b[[i.layer]]
}
return(net=net)
}
cu.nnet = function(train.x, train.y, net, HP = c(10,1000,0.05,0.01,0.1,0.999,0.9)){
# HP: Hyperparameters
# HP[1] = batch size
# HP[2] = n iteration
# HP[3] = learning rate
# HP[4] = weight decay
# HP[5] = momentum
# HP[6] = beta1 (adam)
# HP[7] = beta2 (adam)
n.layer <- length(net$W)
loss = rep(0,HP[2])
A = list(); z = list(); Dact = list(); Daff = list()
for (i.iter in 1:HP[2]){
batch_mask = sample(1:nrow(train.x), HP[1])
x.batch = train.x[batch_mask,]
t.batch = train.y[batch_mask,]
for (i.layer in 1:n.layer){
if (i.layer == 1){
x = x.batch
} else {
x = z[[(i.layer - 1)]]
}
A[[i.layer]] = affine.forwd(x, net$W[[i.layer]], net$b[[i.layer]])
z[[i.layer]] = activation(A[[i.layer]], t.batch, net$actFun[i.layer])
}
loss[i.iter] = z[[i.layer]]$ce
smx = z[[i.layer]]$smx
for (i.layerR in n.layer:1){
if (i.layerR == n.layer){
dout = 1
} else {
dout = Daff[[(i.layerR+1)]]$dx
}
Dact[[i.layerR]] = Dact(z[[i.layerR]], smx, t.batch, dout, net$actFun[i.layerR])
if (i.layerR==1){
x = x.batch
} else {
x = A[[(i.layerR-1)]]
}
Daff[[i.layerR]] = affine.bckwd(x, net$W[[i.layerR]], Dact[[i.layerR]])
}
net = OPT(net, Daff, HP)
}
return(list(loss = loss, net = net))
}
train.x = as.matrix(iris[,1:4])
train.y.temp = as.numeric(iris[,5])
train.y = matrix(0,nrow = nrow(train.x), ncol =3)
train.y[which(train.y.temp==1), 1]=1
train.y[which(train.y.temp==2), 2]=1
train.y[which(train.y.temp==3), 3]=1
actF = c("relu","softmax")
network = init.network(c(4,15,3), actF, "momentum")
res = cu.nnet(train.x, train.y, network, HP=c(15,1000,0.01,0.0001,0.9,0.999,0.9))
hist(res$net$W[[1]])
plot(res$loss,type='l')
MNIST.dat<-read.csv("http://www.matsuka.info/univ/course_folder/MNSTtrain.csv")
train <- data.matrix(MNIST.dat)
train.x <- as.matrix(train[,-1]/255)
train.y.temp <- train[,1]
train.y = matrix(0,nrow = nrow(train), ncol = 10)
for (i in 1:nrow(train.x)){
train.y[i,(train.y.temp[i]+1)]=1
}
actF = c("relu","relu","softmax")
network = init.network(c(784,100,50,10), actF, "momentum")
res = cu.nnet(train.x, train.y, network,HP=c(100,2000,0.1,0.001,0.8,0.999,0.9))
plot(res$loss,type='l')
Related