txt = "You say goodbye and I say hello.";txt = tolower(txt)
txt = gsub('[.]', ' .',txt)
words = unlist(strsplit(txt, " "))
uniq.words = unique(words)
uniq.words[1]
which(uniq.words=="say")
n.uniq = length(uniq.words)
n.words = length(words)
# creating corpus
corpus = rep(0,n.words)
corpus = match(words,uniq.words)
## co-occurance matrix
cc = matrix(0,nrow=n.uniq, ncol=n.uniq)
for (i.c in 1:n.uniq){
loc = which(corpus==i.c)
L = which(match(uniq.words,words[pmax(loc-1,1)])!='NA')
R = which(match(uniq.words,words[pmin(loc+1,n.words)])!='NA')
cc[i.c, c(L,R)]=cc[i.c, c(L,R)]+1
}
diag(cc)=0
## ppmi matrix
ppmi <- function(cc, eps = 1e-8){
n.uniq = ncol(cc)
PPMI = matrix(0, n.uniq, n.uniq)
N = sum(cc)
r.sum = rowSums(cc)
pmi = log2(cc*N/outer(r.sum,r.sum))
PPMI=pmax(0,pmi)
return(PPMI)
}
# performing SVD
s = svd(PPMI)
plot(s$u[,1],s$u[,2],pch=20,col='red')
text(s$u[,1],s$u[,2],row.names(cc))
# calc. simliarity b/w words
words.sim <- function(word1, word2, eps=1e-8){
nw1 = word1/(sqrt(sum(word1^2)) + eps)
nw2 = word2/(sqrt(sum(word2^2)) + eps)
return(nw1%*%nw2)
}
w1 = cc[which(uniq.words=="you"),]
w2 = cc[which(uniq.words=="i"),]
words.sim(w1,w2)
# reporting most simliar words
most.sim <- function(word, uniq.words, cc, N=5){
n.uniq = length(uniq.words)
word2 = cc[which(uniq.words==word),]
res = data.frame(words = uniq.words, similarity = rep(0,n.uniq))
top = data.frame(words = rep("",N+1), similarity=rep(0,N+1))
res$similarity = apply(cc,1, function(x) words.sim(x,word2))
sort.res = sort(res$similarity, decreasing = T, index.return=T)
top$words = res$words[sort.res$ix[1:(N+1)]]
top$similarity = res$similarity[sort.res$ix[1:(N+1)]]
self.idx = which(top$words == word)
return(top[-self.idx,])
}
## Penn TB
f = file("~/courses/CogMod/CMA2018/ptb.tex")
ptb<-readLines(con=f)
ptb = paste(ptb,"")
words = unlist(strsplit(ptb, " "))
uniq.words = unique(words)
n.uniq = length(uniq.words)
n.words = length(words)
corpus = rep(0,n.words)
corpus = match(words,uniq.words)
cc <- contxt(corpus,uniq.words,words)
PPMI<-ppmi(cc)
library(rsvd)
## 時間がかかります。
s = rsvd(PPMI)
Related