広域システム特別講義II 教師なし学習1A

dat <- read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.pca <- princomp(dat)
dat.pca$score
dat.pca$loadings

library(nnet)
dat <- read.csv("http://www.matsuka.info/data_folder/tdkReg01.csv")
set.seed(5)
x = dat[, 1:3]
y = dat[, 4]
dat.nnet = nnet(x, y, size = 150, linout = TRUE,maxit = 1000)
nnet.pred <- predict(dat.nnet, dat)
cor(dat.nnet$fitted.values,dat$sales)^2

dat.lm<-lm(sales~.,data=dat)
summary(dat.lm)

# pseudo-pca
dat<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.nnet<-nnet(dat,dat,size=2, maxit=1000, decay=0.01, linout=TRUE)
cor(dat.nnet$fitted.values,dat)


### text processing

txt = "You say goodbye and I say hello.";txt = tolower(txt)
txt = gsub('[.]', ' .',txt)
words = unlist(strsplit(txt, " "))
uniq.words = unique(words)
uniq.words[1]
which(uniq.words=="say")

n.uniq = length(uniq.words)
n.words = length(words)

corpus = rep(0,n.words)
corpus = match(words,uniq.words)

cc = matrix(0,nrow=n.uniq, ncol=n.uniq)
for (i.c in 1:n.uniq){
  loc = which(corpus==i.c)
  L = which(match(uniq.words,words[pmax(loc-1,1)])!='NA')
  R = which(match(uniq.words,words[pmin(loc+1,n.words)])!='NA')
  cc[i.c, c(L,R)]=cc[i.c, c(L,R)]+1
}
diag(cc)=0

contxt <- function(corpus, uniq.words, words){
  cc = matrix(0, nrow=n.uniq, ncol=n.uniq)
  for (i.c in 1:n.uniq){
    loc = which(corpus==i.c)
    L = which(match(uniq.words, words[pmax(loc-1,1)])!='NA')
    R = which(match(uniq.words, words[pmin(loc+1,n.words)])!='NA')
    cc[i.c, c(L,R)]=cc[i.c, c(L,R)]+1
  }
  diag(cc)=0
  return(cc)
}

words.sim <- function(word1, word2, eps=1e-8){
  nw1 = word1/(sqrt(sum(word1^2)) + eps)
  nw2 = word2/(sqrt(sum(word2^2)) + eps)
  return(nw1%*%nw2)
}

w1 = cc[which(uniq.words=="you"),]
w2 = cc[which(uniq.words=="i"),]

words.sim(w1,w2)

most.sim <- function(word, uniq.words, cc, N=5){
  n.uniq = length(uniq.words)
  word2 = cc[which(uniq.words==word),]
  res = data.frame(words = uniq.words, similarity = rep(0,n.uniq))
  top = data.frame(words = rep("",N+1), similarity=rep(0,N+1))
  res$similarity = apply(cc,1, function(x) words.sim(x,word2))
  sort.res = sort(res$similarity, decreasing = T,  index.return=T)
  top$words = res$words[sort.res$ix[1:(N+1)]]
  top$similarity = res$similarity[sort.res$ix[1:(N+1)]]
  self.idx = which(top$words == word)
  return(top[-self.idx,])
}

most.sim('i',uniq.words,cc,5)

ppmi <- function(cc, eps = 1e-8){
  n.uniq = ncol(cc)
  PPMI = matrix(0, n.uniq, n.uniq)
  N = sum(cc)
  r.sum = rowSums(cc)
  pmi = log2(cc*N/outer(r.sum,r.sum))
  PPMI=matrix(pmax(0,pmi),nrow=n.uniq)
  return(PPMI)
}


### LSA
PPMI <- ppmi(cc)

s = svd(PPMI)
plot(s$u[,2],s$u[,1],pch=20,col='red',cex=5)
text(s$u[,2],s$u[,1],uniq.words,cex=2)



### word2vec inefficient
corp2contxt1S = function(corpus){
  len.corp = length(corpus)
  # creating target matrix
  idxT = cbind(1:(len.corp-2), corpus[2:(len.corp-1)])
  targ1S = matrix(0,nrow=len.corp-2,ncol=length(unique(corpus)))
  targ1S[idxT]=1

  # creating context matrices
  idxC1 = cbind(1:(len.corp-2),corpus[1:(len.corp-2)])
  idxC2 = cbind(1:(len.corp-2),corpus[3:len.corp])
  contxt1 = matrix(0,nrow=len.corp-2,ncol=length(unique(corpus)))
  contxt2 = matrix(0,nrow=len.corp-2,ncol=length(unique(corpus)))
  contxt1[idxC1]=1
  contxt2[idxC2]=1
  return(list(target=targ1S,contxt1=contxt1,contxt2=contxt2))
}
txt = "You say goodbye and I say hello.";
corp = txt2corpus(txt)
dat <- corp2contxt1S(corp)
init.W2V <- function(n.uniq,size.hidden){
  W.in = matrix(rnorm(n.uniq*size.hidden),nrow=n.uniq)*0.01
  W.out = matrix(rnorm(n.uniq*size.hidden),nrow=size.hidden)*0.01
  return(list(W.in = W.in, W.out= W.out))
}

softmax.forwd <- function(x, target){
  max.x = apply(x,1,max)
  C = ncol(x)
  x = x - max.x%*%matrix(1,nrow=1,ncol=C)
  y = exp(x)/rowSums(exp(x))
  delta = 1e-7;
  R = nrow(as.matrix(y))
  return(-sum(target*log(y + delta))/R)
}

softmax.bckwd <- function(x, target,  dout = 1){
  max.x = apply(x, 1, max)
  R = nrow(x)
  C = ncol(x)
  x = x - max.x%*%matrix(1,nrow=1,ncol=C)
  y = exp(x)/rowSums(exp(x))
  return((y-target)/R)
}

MatMult.bckwd <- function(x, W, dout){
  dx = dout%*%t(W)
  dW = t(x)%*%dout
  return(list(dx = dx, dW = dW))
}

softmax.pred <- function(x, target){
  max.x = apply(x,1,max)
  C = ncol(x)
  x = x - max.x%*%matrix(1,nrow=1,ncol=C)
  y = exp(x)/rowSums(exp(x))
  return(y)
}

network<-init.W2V(7,5)
n.batch = 3;
n.data = nrow(dat$target)
n.iter = 2000;
lambda=0.1;
loss = rep(0, n.iter)
for (i.iter in 1:n.iter){
  samp.idx = sample(1:n.data,n.batch)
  batch.c1 = dat$contxt1[samp.idx,]
  batch.c2 = dat$contxt2[samp.idx,]
  batch.y = dat$target[samp.idx,]
  h1 <- MatMult.forwd(batch.c1, network$W.in)
  h2 <- MatMult.forwd(batch.c2, network$W.in)
  h = 0.5 * (h1 + h2)
  s = MatMult.forwd(h,network$W.out)
  z = softmax.forwd(s,batch.y)
  loss[i.iter] = z
  ds = softmax.bckwd(s, batch.y, 1)
  da = MatMult.bckwd(h,network$W.out,ds)
  da$dW = da$dW*0.5
  dIn1 = MatMult.bckwd(batch.c1,network$W.in,da$dx)
  dIn2 = MatMult.bckwd(batch.c2,network$W.in,da$dx)
  network$W.out = network$W.out - lambda*da$dW
  network$W.in = network$W.in - lambda*dIn1$dW
  network$W.in = network$W.in - lambda*dIn2$dW
}
plot(loss, type='l')

### word2vec
txt2corpus <- function(txt){
  txt = tolower(txt)
  txt = gsub('[.]', ' .',txt)
  words = unlist(strsplit(txt, " "))
  uniq.words = unique(words)
  n.uniq = length(uniq.words)
  n.words = length(words)
  corpus = rep(0,n.words)
  corpus = match(words,uniq.words)
  return(corpus)
}

corp2contxt <- function(corpus){
  len.corp = length(corpus)
  target = corpus[2:(len.corp-1)]
  col1 = corpus[1:(len.corp-2)]
  col2 = corpus[3:len.corp]
  context = cbind(col1,col2)
  return(list(context=context,target = target))
}


embed.forwd <- function(W, idx){
  return(W[idx,])
}

h1 <- embed.forwd(W,idx1)
h2 <- embed.forwd(W,idx2)
h = (h1 + h2)/2

embed.dot.forwd <- function(W, h, idx){
  return(O=rowSums(W[idx,]*h))
}

s = embed.dot.forwd(network$W.out,h,batch.y)

sigmoid.forwd <- function(O){
  return(1/(1+exp(-O)))
}

sigmoidWL.forwd <- function(O,target){
  #delta = 1e-5
  #y = max(0, (1/(1+exp(-O)) - delta))
  y = 1/(1+exp(-O))
  loss=-sum(target*log(y)+(1-target)*log(1 - y))
  return(loss)
}

sigmoidWL.backwd <- function(O,target,dout=1){
  #delta = 1e-5
  #y = 1/(1+exp(-O)) - delta
  y = 1/(1+exp(-O))
  dW = (y - target)*dout
  return(dW)
}

embed.dot.backwd <- function(W,h,idx,dout){
  dh = dout * W[idx,]
  dW = dout * h
  return(list(dh = dh, dW = dW))
}

embed.backwd <- function(W, idx, dout){
  dW = matrix(0,nrow(W),ncol(W))
  for (i.idx in 1:length(idx)){
    dW[idx[i.idx], ] = dW[idx[i.idx], ] + dout[i.idx, ]
  }
  return(dW)
}


init.W2V <- function(n.uniq,size.hidden){
  W.in = matrix(rnorm(n.uniq*size.hidden),nrow=n.uniq)*0.01
  W.out = matrix(rnorm(n.uniq*size.hidden),nrow=n.uniq)*0.01
  return(list(W.in = W.in, W.out= W.out))
}

### naive W2V
network<-init.W2V(8,5)
n.batch = 3;
txt = "You say goodbye and I say hello.."
corp = txt2corpus(txt)
corp = c(8,8,corp)
dat<-corp2contxt(corp)
n.data =  length(dat$target)

n.iter = 3000;
lambda=0.1;
loss = rep(0, n.iter)
for (i.iter in 1:n.iter){
  samp.idx = sample(1:n.data,n.batch)
  batch.c1 = dat$context[samp.idx,1]
  batch.c2 = dat$context[samp.idx,2]
  batch.y = dat$target[samp.idx]
  h1 <- embed.forwd(network$W.in,batch.c1)
  h2 <- embed.forwd(network$W.in,batch.c2)
  h = 0.5 * (h1 + h2)
  # positive only
  s = embed.dot.forwd(network$W.out,h,batch.y)
  z = sigmoidWL.forwd(s,1)
  loss[i.iter] = z
  ds = sigmoidWL.backwd(s, 1, 1)
  dE = embed.dot.backwd(network$W.out,h, batch.y, ds)
  dh = dE$dh*0.5
  dIn1 = embed.backwd(network$W.in,dat$context[samp.idx,1], dh)
  dIn2 = embed.backwd(network$W.in,dat$context[samp.idx,2], dh)
  network$W.out[batch.y,] = network$W.out[batch.y,] - lambda*dE$dW
  network$W.in = network$W.in - lambda*dIn1
  network$W.in = network$W.in - lambda*dIn2
}

par(mfrow=c(1,1))
plot(loss, type='l')

samp.idx = c(2:6,8,9,1)
batch.c1 = dat$context[samp.idx,1]
batch.c2 = dat$context[samp.idx,2]
batch.y = dat$target[samp.idx]
h1 <- embed.forwd(network$W.in,batch.c1)
h2 <- embed.forwd(network$W.in,batch.c2)
h = 0.5 * (h1 + h2)
s = MatMult.forwd(h,t(network$W.out))
z = sigmoid.forwd(s)
res=cbind(z,batch.y)
#par(mfrow=c(8,1))
#for (i in 1:8){
#  col.spec = rep("black",8)
#  col.spec[i]="orange"
# barplot(res[i, 1:8],col=col.spec)
#}

par(mfrow=c(4,1))
for (i in 1:4){
  col.spec = rep("black",8)
  col.spec[i]="orange"
  barplot(res[i, 1:8],col=col.spec)
}
par(mfrow=c(4,1))
for (i in 5:8){
  col.spec = rep("black",8)
  col.spec[i]="orange"
  barplot(res[i, 1:8],col=col.spec)
}






### with negative sampling
unigram.sampler <- function(corpus, target, power, sample.size){
  neg.corpus <- corpus[-which(corpus == target)]
  uniq.word = unique(neg.corpus)
  n.word = length(neg.corpus)
  prob = (as.vector(table(neg.corpus))/n.word)^power
  prob = prob/sum(prob)
  sample.idx = sample(uniq.word, sample.size, prob = prob)
  return(sort(sample.idx))
}
get.neg.sample <- function(corpus, target, power, sample.size){
  sample.id = matrix(0, nrow = length(target), ncol = sample.size)
  for (i.targ in 1:length(target)){
    sample.id[i.targ,] = unigram.sampler(corpus, target[i.targ], power, sample.size)
  }
  return(sample.id)
}


network<-init.W2V(8,5)
n.batch = 3;
txt = "You say goodbye and I say hello.."
corp = txt2corpus(txt)
corp = c(8,8,corp)
dat<-corp2contxt(corp)
n.data =  length(dat$target)

n.iter = 10000;
lambda=0.1;
loss = rep(0, n.iter)
sample.size = 2
for (i.iter in 1:n.iter){
  samp.idx = sample(1:n.data,n.batch)
  batch.c1 = dat$context[samp.idx,1]
  batch.c2 = dat$context[samp.idx,2]
  batch.y = dat$target[samp.idx]
  h1 <- embed.forwd(network$W.in,batch.c1)
  h2 <- embed.forwd(network$W.in,batch.c2)
  h = 0.5 * (h1 + h2)
  # positive
  s = embed.dot.forwd(network$W.out,h,batch.y)
  z = sigmoidWL.forwd(s,1)
  loss[i.iter] = z
  ds = sigmoidWL.backwd(s, 1, 1)
  dE = embed.dot.backwd(network$W.out,h, batch.y, ds)
  dh = dE$dh*0.5
  dIn1 = embed.backwd(network$W.in,dat$context[samp.idx,1], dh)
  dIn2 = embed.backwd(network$W.in,dat$context[samp.idx,2], dh)
  network$W.out[batch.y,] = network$W.out[batch.y,] - lambda*dE$dW
  network$W.in = network$W.in - lambda*dIn1
  network$W.in = network$W.in - lambda*dIn2
  # negative
  neg.set <- get.neg.sample(corp,batch.y, 0.75, sample.size)
  for (i.neg in 1:sample.size){
    s = embed.dot.forwd(network$W.out,h,neg.set[,i.neg])
    z = sigmoidWL.forwd(s,0)
    loss[i.iter] = loss[i.iter] + z
    ds = sigmoidWL.backwd(s, 0, dout=1)
    dE = embed.dot.backwd(network$W.out,h,neg.set[,i.neg], ds)
    dh = dE$dh*0.5
    dIn1 = embed.backwd(network$W.in, dat$context[samp.idx,1],dh)
    dIn2 = embed.backwd(network$W.in, dat$context[samp.idx,2],dh)
    network$W.out[neg.set[,i.neg],] = network$W.out[neg.set[,i.neg],] - lambda*dE$dW
    network$W.in = network$W.in - lambda*dIn1
    network$W.in = network$W.in - lambda*dIn2
  }
}
par(mfrow=c(1,1))
loss.dat <- data.frame(epoch=1:length(loss), loss = loss)
ggplot(loss.dat, aes(x = epoch, y = loss)) +
  geom_smooth(se=F)


samp.idx = c(2:6,8,9,1)
batch.c1 = dat$context[samp.idx,1]
batch.c2 = dat$context[samp.idx,2]
batch.y = dat$target[samp.idx]
h1 <- embed.forwd(network$W.in,batch.c1)
h2 <- embed.forwd(network$W.in,batch.c2)
h = 0.5 * (h1 + h2)
s = MatMult.forwd(h,t(network$W.out))
z = sigmoid.forwd(s)
res=cbind(z,batch.y)
#par(mfrow=c(8,1))
#for (i in 1:8){
# col.spec = rep("black",8)
#  col.spec[i]="orange"
#  barplot(res[i, 1:8],col=col.spec)
#}

par(mfrow=c(4,1))
for (i in 1:4){
  col.spec = rep("black",8)
  col.spec[i]="orange"
  barplot(res[i, 1:8],col=col.spec)
}

par(mfrow=c(4,1))
for (i in 5:8){
  col.spec = rep("black",8)
  col.spec[i]="orange"
  barplot(res[i, 1:8],col=col.spec)
}