データ解析基礎論B 多変量解析

install.packages("ggfortify")
install.packages("ggdendro")
library(ggfortify)
library(ggdendro)

# pca
dat<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.pca<-princomp(dat)
autoplot(dat.pca, label = TRUE, label.size = 6, 
         loadings = TRUE, loadings.colour = 'red',
         loadings.label = TRUE, loadings.label.size = 5)
autoplot(dat.pca, shape = FALSE, label.size = 6, 
         loadings = TRUE, loadings.colour = 'red',
         loadings.label = TRUE, loadings.label.size = 5)


cldata<-data.frame(var1=c(4,1,5,1,5), var2=c(1,5,4,3,1))
rownames(cldata)  = c("A","B","C","D","E")
autoplot(dist(cldata))
cldata.cluster=hclust(dist(cldata),method="average")
ggdendrogram(cldata.cluster, rotate = T, theme_dendro = FALSE)+ xlab("Individual")

dat<-read.csv("http://matsuka.info/data_folder/tdkClust.csv", header=TRUE, row.names=1)
autoplot(dist(dat))
dat.cluster=hclust(dist(dat))
ggdendrogram(dat.cluster, rotate = T, theme_dendro = FALSE)+ xlab("Occupation")
dat.pca = princomp(dat)
autoplot(dat.pca, label = TRUE, shape = FALSE, label.size = 4, 
         loadings = TRUE, loadings.colour = 'red',
         loadings.label = TRUE, loadings.label.size = 5)


dat.HC.S=hclust(dist(dat), method = "single")
dat.HC.C=hclust(dist(dat), method = "complete")
dat.HC.A=hclust(dist(dat), method = "average")
dat.HC.W=hclust(dist(dat), method = "ward.D")
ggdendrogram(dat.HC.S, rotate = T, theme_dendro = FALSE)+ xlab("Occupation")+ggtitle("Method = Single")
ggdendrogram(dat.HC.C, rotate = T, theme_dendro = FALSE)+ xlab("Occupation")+ggtitle("Method = Complete")
ggdendrogram(dat.HC.A, rotate = T, theme_dendro = FALSE)+ xlab("Occupation")+ggtitle("Method = Average")
ggdendrogram(dat.HC.W, rotate = T, theme_dendro = FALSE)+ xlab("Occupation")+ggtitle("Method = Ward's MV")

dat.kmeans=kmeans(dat, centers=3, nstart=10)
pairs(dat, 
      main = "Clustering Occupations",
      pch = 21, 
      bg = c("red", "blue", "green")
[unclass(dat.kmeans$cluster)])
autoplot(dat.kmeans, dat, size = 3, label = TRUE, label.size = 5)

source("http://www.matsuka.info/univ/course_folder/cuUtil02.R") 
res<-cu.KMC.rep(dat,10,100)

autoplot(dat.kmeans, dat, frame = TRUE, frame.type = 'norm') + ylim(-0.7,0.7)+xlim(-1.2,0.7)
autoplot(dat.kmeans, dat, frame = TRUE)+ ylim(-0.7,0.7)+xlim(-1.2,0.7)

dat<-data.frame(writing=c(68,85,50,54,66,35,56,25,43,70),
                interview=c(65,80,95,70,75,55,65,75,50,40), 
                cl=c(rep("A",5),rep("N",5)))
library(MASS)
dat.lda<-lda(cl~.,data=dat)
intcpt = (dat.lda$scaling[1]*dat.lda$means[1,1]+dat.lda$scaling[2]*dat.lda$means[1,2]+
            dat.lda$scaling[1]*dat.lda$means[2,1]+dat.lda$scaling[2]*dat.lda$means[2,2])/2
new.dim.slope = dat.lda$scaling[1]/dat.lda$scaling[2]

disc.intcpt = intcpt / dat.lda$scaling[2]
disc.slope = -dat.lda$scaling[1] / dat.lda$scaling[2]

ggplot(dat, aes(x = writing, y= interview, color = cl)) +
  geom_point(size = 4) +
  geom_abline(aes(intercept = intcpt, slope = new.dim.slope )) +
  geom_abline(aes(intercept = disc.intcpt, slope = disc.slope ),color = "red") + xlim(30,100)+ylim(30,100)

dat<-read.csv("http://matsuka.info/data_folder/tdkDA01.csv", header=T)
dat.lda<-lda(class~.,dat)
lda.pred<-predict(dat.lda,dat)
table(lda.pred$class, dat$class)
dat.ldaCV<-lda(class~.,dat, CV=T)


dat<-read.csv("http://matsuka.info/data_folder/tdkDA02.csv",header=T)
dat.lda=lda(class~.,data=dat)

lda.pred <- predict(dat.lda)$x  %>% as.data.frame %>% cbind(class = dat$class)
ggplot(lda.pred) + geom_point(aes(x=LD1, y=LD2, color = class), size = 2.5)


dat<-data.frame(p1=c(4,1,5,1,5),p2=c(1,5,4,3,1))
rownames(dat)<-c('a','b','c','d','e')
dat.mds<-cmdscale(dist(dat),2)
ggplot(dat.mds, aes(x = dat.mds[,1],y = dat.mds[,2])) +
  geom_text(aes(label = row.names(dat.mds)), size = 6)