Posted on October 24, 2018 by matsukat

認知情報解析演習b DL2 – ch.2

txt = "You say goodbye and I say hello.";txt = tolower(txt)
txt = gsub('[.]', ' .',txt)
words = unlist(strsplit(txt, " "))
uniq.words = unique(words)
uniq.words[1]
which(uniq.words=="say")

n.uniq = length(uniq.words)
n.words = length(words)

# creating corpus
corpus = rep(0,n.words)
corpus = match(words,uniq.words)

## co-occurance matrix
cc = matrix(0,nrow=n.uniq, ncol=n.uniq)
for (i.c in 1:n.uniq){
  loc = which(corpus==i.c)
  L = which(match(uniq.words,words[pmax(loc-1,1)])!='NA')
  R = which(match(uniq.words,words[pmin(loc+1,n.words)])!='NA')
  cc[i.c, c(L,R)]=cc[i.c, c(L,R)]+1
}
diag(cc)=0

## ppmi matrix
ppmi <- function(cc, eps = 1e-8){
  n.uniq = ncol(cc)
  PPMI = matrix(0, n.uniq, n.uniq)
  N = sum(cc)
  r.sum = rowSums(cc)
  pmi = log2(cc*N/outer(r.sum,r.sum))
  PPMI=pmax(0,pmi)
  return(PPMI)
}

# performing SVD
s = svd(PPMI)
plot(s$u[,1],s$u[,2],pch=20,col='red')
text(s$u[,1],s$u[,2],row.names(cc))

# calc. simliarity b/w words
words.sim <- function(word1, word2, eps=1e-8){
  nw1 = word1/(sqrt(sum(word1^2)) + eps)
  nw2 = word2/(sqrt(sum(word2^2)) + eps)
  return(nw1%*%nw2)
}

w1 = cc[which(uniq.words=="you"),]
w2 = cc[which(uniq.words=="i"),]
words.sim(w1,w2)

# reporting most simliar words
most.sim <- function(word, uniq.words, cc, N=5){
  n.uniq = length(uniq.words)
  word2 = cc[which(uniq.words==word),]
  res = data.frame(words = uniq.words, similarity = rep(0,n.uniq))
  top = data.frame(words = rep("",N+1), similarity=rep(0,N+1))
  res$similarity = apply(cc,1, function(x) words.sim(x,word2))
  sort.res = sort(res$similarity, decreasing = T,  index.return=T)
  top$words = res$words[sort.res$ix[1:(N+1)]]
  top$similarity = res$similarity[sort.res$ix[1:(N+1)]]
  self.idx = which(top$words == word)
  return(top[-self.idx,])
} 

## Penn TB
f = file("~/courses/CogMod/CMA2018/ptb.tex")
ptb<-readLines(con=f)
ptb = paste(ptb,"")
words = unlist(strsplit(ptb, " "))
uniq.words = unique(words)
n.uniq = length(uniq.words)
n.words = length(words)

corpus = rep(0,n.words)
corpus = match(words,uniq.words)

cc <- contxt(corpus,uniq.words,words)
PPMI<-ppmi(cc)
library(rsvd)
## 時間がかかります。
s = rsvd(PPMI)

Related