# データ解析基礎論B 判別分析＋

```library(MASS)
dat<-data.frame(writing=c(68,85,50,54,66,35,56,25,43,70),
interview=c(65,80,95,70,75,55,65,75,50,40),
cl=c(rep("A",5),rep("N",5)))
dat.lda<-lda(cl~.,data=dat)

intcpt = (dat.lda\$scaling[1]*dat.lda\$means[1,1] +
dat.lda\$scaling[2]*dat.lda\$means[1,2] +
dat.lda\$scaling[1]*dat.lda\$means[2,1] +
dat.lda\$scaling[2]*dat.lda\$means[2,2])/2
z = as.matrix(dat[,1:2])%*%dat.lda\$scaling-intcpt

new.dim.slope = dat.lda\$scaling[1]/dat.lda\$scaling[2]

disc.intcpt = intcpt / dat.lda\$scaling[2]
disc.slope = -dat.lda\$scaling[1] / dat.lda\$scaling[2]

ggplot(dat, aes(x = writing, y= interview, color = cl)) +
geom_point(size = 4) +
geom_abline(aes(intercept = intcpt, slope = new.dim.slope )) +
geom_abline(aes(intercept = disc.intcpt, slope = disc.slope ),color = "red") + xlim(30,100)+ylim(30,100)

dat.lda<-lda(class~.,dat)
lda.pred<-predict(dat.lda,dat)
table(lda.pred\$class, dat\$class)

dat.lda<-lda(class~.,dat, CV=T)
dat.cl = dat.lda\$posterior[,1]>dat.lda\$posterior[,2]
table(dat.cl, dat\$class)

dat.lda=lda(class~.,data=dat)
lda.pred <- predict(dat.lda, dat)
plot(dat.lda, dimen=3, col=as.numeric(lda.pred\$class),cex=2)

dat.km<-kmeans(dat[,1:6],5)
table(lda.pred\$class,dat.km\$cluster)
plot(dat,col=dat.km\$cluster,pch=20)
plot(dat.lda, dimen=2, col=as.numeric(lda.pred\$class),cex=2)

dat1<-subset(dat,dat\$popularity<5)
dat2<-subset(dat,dat\$popularity>4 & dat\$popularity<6)
dat3<-subset(dat,dat\$popularity>6)
dat1\$popularity="LP";dat2\$popularity="MP";dat3\$popularity="VP"
datT=rbind(dat1,dat2,dat3)
datT.lda<-lda(popularity~.,datT)
datT.pred<-predict(datT.lda,datT)
table(datT.pred\$class,datT\$popularity)
par(mfrow=c(1,2))
plot(datT.lda,col=c(rep('red',20),rep('blue',28),rep('green',29)),cex=1.5)
plot(datT.lda,col=as.numeric(datT.pred\$class),cex=1.5)
par(mfrow=c(1,1))

library(nnet)
set.seed(5)
x = dat[,1:3]
y = dat[,4]
dat.nnet = nnet(x,y, size = 150, linout= TRUE, maxit = 1000)
nnet.pred <-predict(dat.nnet,dat)
cor(dat.nnet\$fitted.values,dat\$sales)^2

dat.lm<-lm(sales~.,data=dat)
plot(dat.nnet\$fitted.values, dat\$sales,pch=20,col='black')
points(predict(dat.lm), dat\$sales,pch=20,col='red')

n.data = nrow(dat);n.sample = n.data*0.6; n.rep = 100
trainNN.cor = rep(0,n.rep); trainLM.cor = rep(0,n.rep)
testNN.cor = rep(0,n.rep); testLM.cor = rep(0,n.rep)
for (i.rep in 1:n.rep){
randperm = sample(1:n.data)
train.idx = randperm[1:n.sample]
test.idx = randperm[(n.sample+1):n.data]
dat.nnet <- nnet(sales~.,size = 100, linout=T, maxit=1000, data = dat[train.idx,])
dat.lm <-lm(sales~.,data=dat[train.idx, ])
trainNN.cor[i.rep] <- cor(predict(dat.nnet,dat[train.idx, ]), dat[train.idx,]\$sales)
trainLM.cor[i.rep] <- cor(predict(dat.lm,dat[train.idx, ]), dat[train.idx,]\$sales)
testNN.cor[i.rep] <- cor(predict(dat.nnet,dat[test.idx, ]), dat[test.idx,]\$sales)
testLM.cor[i.rep] <- cor(predict(dat.lm,dat[test.idx, ]), dat[test.idx,]\$sales)
}
print(c(mean(trainNN.cor,na.rm=T),mean(testNN.cor,na.rm=T)))
print(c(mean(trainLM.cor,na.rm=T),mean(testLM.cor,na.rm=T)))

n.data = nrow(dat);n.sample = n.data*0.6; n.rep = 100
trainNN.cor = rep(0,n.rep); trainLM.cor = rep(0,n.rep)
testNN.cor = rep(0,n.rep); testLM.cor = rep(0,n.rep)
for (i.rep in 1:n.rep){
randperm = sample(1:n.data)
train.idx = randperm[1:n.sample]
test.idx = randperm[(n.sample+1):n.data]
dat.nnet <- nnet(sales~.,size = 30, linout=T,decay= 0.1, maxit=1000, data = dat[train.idx,])
dat.lm <-lm(sales~.,data=dat[train.idx, ])
trainNN.cor[i.rep] <- cor(predict(dat.nnet,dat[train.idx, ]), dat[train.idx,]\$sales)
trainLM.cor[i.rep] <- cor(predict(dat.lm,dat[train.idx, ]), dat[train.idx,]\$sales)
testNN.cor[i.rep] <- cor(predict(dat.nnet,dat[test.idx, ]), dat[test.idx,]\$sales)
testLM.cor[i.rep] <- cor(predict(dat.lm,dat[test.idx, ]), dat[test.idx,]\$sales)
}
print(c(mean(trainNN.cor,na.rm=T),mean(testNN.cor,na.rm=T)))
print(c(mean(trainLM.cor,na.rm=T),mean(testLM.cor,na.rm=T)))

dat.nnet<-nnet(class~.,dat,size=30,maxit=1000,decay=0.05)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat\$class)

dat.glm<-glm(class~., family="binomial",dat)
glm.pred<-predict(dat.glm, dat, type="response")>0.5
table(glm.pred,dat\$class)

dat.nnet<-nnet(survival~., dat, size=30, maxit=1000, decay=0.01)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat\$survival)

Ns = summary(dat\$survival)
(Ns[1]/Ns[2])^-1

wts = rep(1,nrow(dat))
wts[which(dat\$survival=="no")]=45
dat.nnet<-nnet(survival~., weights=wts, dat, size=30, maxit=1000, decay=0.01)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat\$survival)

class.id<-class.ind(dat\$class)
x = dat[,1:6]
dat.nnet<-nnet(x,class.id,size=30, maxit=1000, decay=0.01, softmax=TRUE)
max.id = apply(dat.nnet\$fitted.values,1,which.max)
table(max.id,dat\$class)