データ解析基礎論B 人工神経回路

# Fit a single-hidden-layer network (150 units, linear output) predicting
# column 4 ('sales') from columns 1-3, then report R^2 against observed sales.
# Fix: restored '$' (source had escaped '\$', which is not valid R).
library(nnet)
set.seed(5)
x = dat[,1:3]
y = dat[,4]
dat.nnet = nnet(x, y, size = 150, linout = TRUE, maxit = 1000)
nnet.pred <- predict(dat.nnet, dat)
cor(dat.nnet$fitted.values, dat$sales)^2

# Repeated 60/40 train/test split (100 reps) comparing nnet vs lm on 'sales'.
# Fixes: '$' artifact, T -> TRUE, floor() so n.sample is a whole number,
# and size = 10 (nnet's 'size' is a scalar, not c(10)).
n.data = nrow(dat)
n.sample = floor(n.data * 0.6)  # guard against non-integer row counts
n.rep = 100
trainNN.cor = rep(0, n.rep); trainLM.cor = rep(0, n.rep)
testNN.cor = rep(0, n.rep); testLM.cor = rep(0, n.rep)
for (i.rep in 1:n.rep){
  randperm = sample(1:n.data)
  train.idx = randperm[1:n.sample]
  test.idx = randperm[(n.sample+1):n.data]
  dat.nnet <- nnet(sales~., size = 10, linout = TRUE, decay = 0.01,
                   maxit = 1000, data = dat[train.idx,])
  dat.lm <- lm(sales~., data = dat[train.idx, ])
  trainNN.cor[i.rep] <- cor(predict(dat.nnet, dat[train.idx, ]), dat[train.idx,]$sales)
  trainLM.cor[i.rep] <- cor(predict(dat.lm, dat[train.idx, ]), dat[train.idx,]$sales)
  testNN.cor[i.rep] <- cor(predict(dat.nnet, dat[test.idx, ]), dat[test.idx,]$sales)
  testLM.cor[i.rep] <- cor(predict(dat.lm, dat[test.idx, ]), dat[test.idx,]$sales)
}
# Mean / max / min of train vs test correlations (NN row first, then LM).
print(c(mean(trainNN.cor, na.rm = TRUE), mean(testNN.cor, na.rm = TRUE)))
print(c(mean(trainLM.cor, na.rm = TRUE), mean(testLM.cor, na.rm = TRUE)))
print(c(max(trainNN.cor, na.rm = TRUE), max(testNN.cor, na.rm = TRUE)))
print(c(max(trainLM.cor, na.rm = TRUE), max(testLM.cor, na.rm = TRUE)))
print(c(min(trainNN.cor, na.rm = TRUE), min(testNN.cor, na.rm = TRUE)))
print(c(min(trainLM.cor, na.rm = TRUE), min(testLM.cor, na.rm = TRUE)))

# Classification with nnet vs logistic regression; compare confusion tables.
# Fix: restored '$' (escaped in source).
dat.nnet <- nnet(class~., dat, size = 30, maxit = 1000, decay = 0.05)
dat.pred <- predict(dat.nnet, dat, type = "class")
table(dat.pred, dat$class)

dat.glm <- glm(class~., family = "binomial", dat)
glm.pred <- predict(dat.glm, dat, type = "response") > 0.5
table(glm.pred, dat$class)

# Up-weight the rare "no" survival cases (x45) so the network does not
# simply predict the majority class.  Fix: restored '$'.
wts = rep(1, nrow(dat))
wts[which(dat$survival == "no")] = 45
dat.nnet <- nnet(survival~., weights = wts, dat, size = 100, maxit = 1000, decay = 0.1)
dat.pred <- predict(dat.nnet, dat, type = "class")
table(dat.pred, dat$survival)

# Multi-class classification with a softmax output layer; class.ind()
# builds the 0/1 indicator matrix of class membership.
class.id <- class.ind(dat$class)
x = dat[,1:6]
dat.nnet <- nnet(x, class.id, size = 30, maxit = 1000, decay = 0.01, softmax = TRUE)
max.id = apply(dat.nnet$fitted.values, 1, which.max)
table(max.id, dat$class)

# Autoencoder-style network: dat is both input and target; the 2-unit hidden
# layer compresses the data (cf. 2-component PCA).
dat.nnet<-nnet(dat,dat,size=2, maxit=1000, decay=0.01, linout=TRUE)

データ解析基礎論B 判別分析＋

library(MASS)
# Toy two-group data: 5 admitted ("A") vs 5 not-admitted ("N") applicants.
# Fix: restored '$' (escaped in source).
dat <- data.frame(writing = c(68,85,50,54,66,35,56,25,43,70),
                  interview = c(65,80,95,70,75,55,65,75,50,40),
                  cl = c(rep("A",5), rep("N",5)))
dat.lda <- lda(cl~., data = dat)

# Midpoint of the two group means projected onto the discriminant axis.
intcpt = (dat.lda$scaling[1]*dat.lda$means[1,1] +
          dat.lda$scaling[2]*dat.lda$means[1,2] +
          dat.lda$scaling[1]*dat.lda$means[2,1] +
          dat.lda$scaling[2]*dat.lda$means[2,2])/2
# Discriminant scores centered at that midpoint.
z = as.matrix(dat[,1:2]) %*% dat.lda$scaling - intcpt

# Slope of the discriminant axis in (writing, interview) space.
new.dim.slope = dat.lda$scaling[1]/dat.lda$scaling[2]

# Decision boundary (red line below) in the original variable space.
disc.intcpt = intcpt / dat.lda$scaling[2]
disc.slope = -dat.lda$scaling[1] / dat.lda$scaling[2]

ggplot(dat, aes(x = writing, y = interview, color = cl)) +
  geom_point(size = 4) +
  geom_abline(aes(intercept = intcpt, slope = new.dim.slope)) +
  geom_abline(aes(intercept = disc.intcpt, slope = disc.slope), color = "red") +
  xlim(30,100) + ylim(30,100)

# LDA on the full data set; confusion table of predicted vs true class.
# Fixes: restored '$'; CV=T -> CV=TRUE.
dat.lda <- lda(class~., dat)
lda.pred <- predict(dat.lda, dat)
table(lda.pred$class, dat$class)

# Leave-one-out cross-validated LDA (CV=TRUE returns posteriors directly).
dat.lda <- lda(class~., dat, CV = TRUE)
dat.cl = dat.lda$posterior[,1] > dat.lda$posterior[,2]
table(dat.cl, dat$class)

dat.lda = lda(class~., data = dat)
lda.pred <- predict(dat.lda, dat)
plot(dat.lda, dimen = 3, col = as.numeric(lda.pred$class), cex = 2)

# Compare k-means clusters (k = 5) against LDA-predicted classes.
dat.km <- kmeans(dat[,1:6], 5)
table(lda.pred$class, dat.km$cluster)
plot(dat, col = dat.km$cluster, pch = 20)
plot(dat.lda, dimen = 2, col = as.numeric(lda.pred$class), cex = 2)

# Bin 'popularity' into low / medium / very-popular groups, then run LDA.
# Fix: restored '$'.
dat1 <- subset(dat, dat$popularity < 5)
dat2 <- subset(dat, dat$popularity > 4 & dat$popularity < 6)
# NOTE(review): rows with popularity == 6 fall in no group (condition is > 6,
# not >= 6) — TODO confirm this gap is intended.
dat3 <- subset(dat, dat$popularity > 6)
dat1$popularity = "LP"; dat2$popularity = "MP"; dat3$popularity = "VP"
datT = rbind(dat1, dat2, dat3)
datT.lda <- lda(popularity~., datT)
datT.pred <- predict(datT.lda, datT)
table(datT.pred$class, datT$popularity)
par(mfrow = c(1,2))
# Left: colored by the known group sizes (20/28/29); right: by predicted class.
plot(datT.lda, col = c(rep('red',20), rep('blue',28), rep('green',29)), cex = 1.5)
plot(datT.lda, col = as.numeric(datT.pred$class), cex = 1.5)
par(mfrow = c(1,1))

library(nnet)
set.seed(5)
# Regression network (150 hidden units, linear output) vs linear model.
# Fix: restored '$'.
x = dat[,1:3]
y = dat[,4]
dat.nnet = nnet(x, y, size = 150, linout = TRUE, maxit = 1000)
nnet.pred <- predict(dat.nnet, dat)
cor(dat.nnet$fitted.values, dat$sales)^2

dat.lm <- lm(sales~., data = dat)
# Overlay fitted values: black = network, red = linear model.
plot(dat.nnet$fitted.values, dat$sales, pch = 20, col = 'black')
points(predict(dat.lm), dat$sales, pch = 20, col = 'red')

# 60/40 split repeated 100 times: large network (size 100, no decay) vs lm.
# Fixes: '$' artifact, T -> TRUE, floor() for a whole-number sample size.
n.data = nrow(dat); n.sample = floor(n.data*0.6); n.rep = 100
trainNN.cor = rep(0,n.rep); trainLM.cor = rep(0,n.rep)
testNN.cor = rep(0,n.rep); testLM.cor = rep(0,n.rep)
for (i.rep in 1:n.rep){
  randperm = sample(1:n.data)
  train.idx = randperm[1:n.sample]
  test.idx = randperm[(n.sample+1):n.data]
  dat.nnet <- nnet(sales~., size = 100, linout = TRUE, maxit = 1000, data = dat[train.idx,])
  dat.lm <- lm(sales~., data = dat[train.idx, ])
  trainNN.cor[i.rep] <- cor(predict(dat.nnet, dat[train.idx, ]), dat[train.idx,]$sales)
  trainLM.cor[i.rep] <- cor(predict(dat.lm, dat[train.idx, ]), dat[train.idx,]$sales)
  testNN.cor[i.rep] <- cor(predict(dat.nnet, dat[test.idx, ]), dat[test.idx,]$sales)
  testLM.cor[i.rep] <- cor(predict(dat.lm, dat[test.idx, ]), dat[test.idx,]$sales)
}
print(c(mean(trainNN.cor, na.rm = TRUE), mean(testNN.cor, na.rm = TRUE)))
print(c(mean(trainLM.cor, na.rm = TRUE), mean(testLM.cor, na.rm = TRUE)))

# Same CV comparison with a smaller, regularized network (size 30, decay 0.1).
# Fixes: '$' artifact, T -> TRUE, floor() for a whole-number sample size.
n.data = nrow(dat); n.sample = floor(n.data*0.6); n.rep = 100
trainNN.cor = rep(0,n.rep); trainLM.cor = rep(0,n.rep)
testNN.cor = rep(0,n.rep); testLM.cor = rep(0,n.rep)
for (i.rep in 1:n.rep){
  randperm = sample(1:n.data)
  train.idx = randperm[1:n.sample]
  test.idx = randperm[(n.sample+1):n.data]
  dat.nnet <- nnet(sales~., size = 30, linout = TRUE, decay = 0.1, maxit = 1000, data = dat[train.idx,])
  dat.lm <- lm(sales~., data = dat[train.idx, ])
  trainNN.cor[i.rep] <- cor(predict(dat.nnet, dat[train.idx, ]), dat[train.idx,]$sales)
  trainLM.cor[i.rep] <- cor(predict(dat.lm, dat[train.idx, ]), dat[train.idx,]$sales)
  testNN.cor[i.rep] <- cor(predict(dat.nnet, dat[test.idx, ]), dat[test.idx,]$sales)
  testLM.cor[i.rep] <- cor(predict(dat.lm, dat[test.idx, ]), dat[test.idx,]$sales)
}
print(c(mean(trainNN.cor, na.rm = TRUE), mean(testNN.cor, na.rm = TRUE)))
print(c(mean(trainLM.cor, na.rm = TRUE), mean(testLM.cor, na.rm = TRUE)))

# nnet classifier vs logistic regression (confusion tables).  Fix: restored '$'.
dat.nnet <- nnet(class~., dat, size = 30, maxit = 1000, decay = 0.05)
dat.pred <- predict(dat.nnet, dat, type = "class")
table(dat.pred, dat$class)

dat.glm <- glm(class~., family = "binomial", dat)
glm.pred <- predict(dat.glm, dat, type = "response") > 0.5
table(glm.pred, dat$class)

# Unweighted survival classifier (baseline before the class-weighted fit below).
dat.nnet <- nnet(survival~., dat, size = 30, maxit = 1000, decay = 0.01)
dat.pred <- predict(dat.nnet, dat, type = "class")
table(dat.pred, dat$survival)

# Class imbalance ratio: summary() on a factor returns level counts, so this
# is the count of level 2 over level 1.
Ns = summary(dat$survival)
(Ns[1]/Ns[2])^-1

# Re-fit with case weights compensating for the imbalance ("no" cases x45).
# Fix: restored '$'.
wts = rep(1, nrow(dat))
wts[which(dat$survival == "no")] = 45
dat.nnet <- nnet(survival~., weights = wts, dat, size = 30, maxit = 1000, decay = 0.01)
dat.pred <- predict(dat.nnet, dat, type = "class")
table(dat.pred, dat$survival)

# Multi-class softmax network on the indicator matrix from class.ind().
class.id <- class.ind(dat$class)
x = dat[,1:6]
dat.nnet <- nnet(x, class.id, size = 30, maxit = 1000, decay = 0.01, softmax = TRUE)
max.id = apply(dat.nnet$fitted.values, 1, which.max)
table(max.id, dat$class)

# Autoencoder: 2 hidden units reconstructing the inputs; correlation of
# reconstructions with the originals.
dat.nnet <- nnet(dat, dat, size = 2, maxit = 1000, decay = 0.01, linout = TRUE)
cor(dat.nnet$fitted.values, dat)

データ解析基礎論B テスト理論

# Test theory: Cronbach's alpha (internal consistency) via the psych package.
install.packages("psych")
library("psych")
ca <- alpha(dat)

# Item intercorrelation matrix as an image (rows flipped so item 1 is top-left).
image(cor(dat2)[10:1,1:10])
ca2 <- alpha(dat2)
ca2

# Subscale alphas: items 1-5 and items 6-10 separately.
ca1_5 = alpha(dat[,1:5])
ca6_10 = alpha(dat[,6:10])

### not recommended ###
# FA
# Separate one-factor models for items 1-5 and items 6-10.
F1<-factanal(dat[,1:5],1)
F2<-factanal(dat[,6:10],1)

library(sem)
# NOTE(review): sem::cfa() called with no text/file argument reads the model
# specification interactively from stdin.  The indented "F1:"/"F2:" lines and
# the "delete, F1<->F2" edit for update() below are that interactive input —
# they are NOT valid R and this section cannot be source()'d as a script.
fa.model=cfa(reference.indicator=FALSE)
F1: q1,q2,q3,q4,q5
F2: q6,q7,q8,q9,q10

fa.model<-update(fa.model)
delete, F1<->F2

# Fit using the covariance matrix of dat2 with N = 300 observations.
fa.result<-sem(fa.model, cov(dat2), 300)
summary(fa.result)

### recommended approach
# Item response theory with the ltm package.  Fix: restored '$'.
install.packages("ltm")
library(ltm)
descript(dat)

# 1-parameter (Rasch) model.
irt1P <- rasch(dat)
irt1P
summary(irt1P)
theta = factor.scores(irt1P)

GoF.rasch(irt1P)
person.fit(irt1P)
item.fit(irt1P)
# Correlation between raw total scores and estimated ability (z1).
cor(rowSums(theta[[1]][,1:10]), theta[[1]]$z1)

## 2p IRT
irt2P <- ltm(dat ~ z1)
irt2P
plot(irt2P)
person.fit(irt2P)
item.fit(irt2P)
# Likelihood-ratio comparison of the 1P vs 2P models.
anova(irt1P, irt2P)

## jisshu

データ解析基礎論B LogLinear Analysis

# Log-linear analysis of a 2x2 (preference x region) frequency table.
# Fixes: restored '$'; removed a full-width space after "dat.table," that
# broke parsing; fit=T/param=T -> TRUE.
freq <- c(33,37,7,23)
pref <- factor(c('soba','udon','soba','udon'))
region <- factor(c('east','east','west','west'))
dat <- data.frame(pref, region, freq)
dat.table = table(pref, region)
dat.table[cbind(pref, region)] <- freq

# LRT of the equiprobable null model (expected count = 100/4 = 25 per cell).
ME.lrt = 2*sum((freq)*log(freq/25))

# Main effect of pref only.
dat.loglinCE_A <- loglin(dat.table, list(1), fit = TRUE, param = TRUE)
1 - pchisq(dat.loglinCE_A$lrt, dat.loglinCE_A$df)

# Main effect of region only.
dat.loglinCE_B <- loglin(dat.table, list(2), fit = TRUE, param = TRUE)
1 - pchisq(dat.loglinCE_B$lrt, dat.loglinCE_B$df)

# Independence model (both main effects, no interaction).
dat.loglinIND <- loglin(dat.table, list(1,2), fit = TRUE, param = TRUE)
1 - pchisq(dat.loglinIND$lrt, dat.loglinIND$df)

# Saturated model (reproduces the table exactly; lrt ~ 0).
dat.loglinSAT <- loglin(dat.table, list(c(1,2)), fit = TRUE, param = TRUE)

# Stepwise model comparisons; each LRT difference is chi-square with 1 df.
ME.lrt - dat.loglinCE_A$lrt
1 - pchisq(ME.lrt - dat.loglinCE_A$lrt, 1)
dat.loglinCE_A$lrt - dat.loglinIND$lrt
1 - pchisq(dat.loglinCE_A$lrt - dat.loglinIND$lrt, 1)
dat.loglinIND$lrt - dat.loglinSAT$lrt
1 - pchisq(dat.loglinIND$lrt - dat.loglinSAT$lrt, 1)

# 3-way frequency table: gender x affiliation x circle.
# BUG FIX: the original indexed with undefined 'gender'; the variable created
# above is 'gener'.  Also restored '$' and fit=T/param=T -> TRUE.
freq <- c(9,5,2,4,16,10,26,28)
gener <- factor(c(rep('female',4), rep('male',4)))
affil <- factor(rep(c('L','L','E','E'), 2))
circle <- factor(rep(c('tennis','astro'), 4))
dat <- data.frame(gener, affil, circle, freq)

dat.table <- table(gener, affil, circle)
dat.table[cbind(gener, affil, circle)] <- freq

# Hierarchy of log-linear models, from one main effect up to saturated.
dat.loglin2 <- loglin(dat.table, list(1), fit = TRUE, param = TRUE)
dat.loglin3 <- loglin(dat.table, list(1,3), fit = TRUE, param = TRUE)
dat.loglin4 <- loglin(dat.table, list(1,2,3), fit = TRUE, param = TRUE)
dat.loglin5 <- loglin(dat.table, list(c(1,3),2), fit = TRUE, param = TRUE)
dat.loglin6 <- loglin(dat.table, list(c(1,3),c(1,2)), fit = TRUE, param = TRUE)
dat.loglin7 <- loglin(dat.table, list(c(1,3),c(1,2),c(2,3)), fit = TRUE, param = TRUE)
dat.loglin8 <- loglin(dat.table, list(c(1,2,3)), fit = TRUE, param = TRUE)

# Course-provided utilities (model-comparison helpers).
source('http://peach.l.chiba-u.ac.jp/course_folder/cuUtil02.R')

# Model with each factor's association with dimension 4 of a table built
# from 'dat' — NOTE(review): table(dat) here is 4-way because 'dat' still
# contains the freq column; TODO confirm this matches the intended exercise.
dat.tab <- table(dat)
dat.LL <- loglin(dat.tab, list(c(1,4),c(2,4),c(3,4)), fit = TRUE, param = TRUE)
1 - pchisq(dat.LL$lrt, dat.LL$df)

データ解析基礎論B GLM

library(tidyverse)
# Scatter of pass/fail against hours studied.  Fix: restored '$'.
ggplot(dat) +
  geom_point(aes(x = study, y = pass), size = 3) +
  xlab("Hours studied") +
  ylab("Passing")

# Linear-model fit for contrast with logistic regression.
dat.lm <- lm(pass~study, dat)
ggplot(dat, aes(x = study, y = pass)) +
  geom_point(size = 3) +
  xlab("Hours studied") +
  ylab("Passing") +
  # NOTE(review): the +1 offset suggests 'pass' is plotted on a 1/2 coding —
  # TODO confirm against the data.
  geom_abline(slope = dat.lm$coefficients[2], intercept = dat.lm$coefficients[1]+1, color = "red", size = 2)
par(mfrow = c(2,2))
plot(dat.lm)

# Observed pass probability at each value of hours studied (0..30).
real.p = data.frame(real.p = table(dat$pass, dat$study)[2,] / colSums(table(dat$pass, dat$study)), x = 0:30)
ggplot(real.p, aes(x = x, y = real.p)) +
  geom_point(size = 3) +
  xlab("Hours studied") +
  ylab("Passing (actual probability)")

# Logistic regression and its fitted probability curve.
dat.lr <- glm(pass~study, family = binomial, data = dat)
summary(dat.lr)
coef = coefficients(dat.lr)
temp.x = seq(0, 30, 0.1)
pred.pass.p = data.frame(pred.p = 1/(1+exp(-(coef[1]+coef[2]*temp.x))), x = temp.x)
ggplot(dat, aes(x = study, y = pass)) +
  geom_point(size = 3) +
  # +1 matches the offset used for the points above (see NOTE).
  geom_line(aes(x = x, y = pred.p + 1), data = pred.pass.p, color = 'red', size = 1) +
  xlab("Hours studied") +
  ylab("Passing")

##
# Odds at 10..15 hours; successive odds ratios all equal exp(slope).
pred.pass.p = 1/(1+exp(-(coef[1]+coef[2]*c(10:15))))
odds = pred.pass.p/(1-pred.pass.p)
exp(coef[2])
odds[2:6]/odds[1:5]

# Logistic regressions predicting gender; anova(..., test="Chisq") gives the
# likelihood-ratio test of the shoesize term.  dat.lr0 is the null model;
# dat.lrS / dat.lrh refit with shoesize and height as single predictors.
dat.lr<-glm(gender~shoesize,family=binomial,data=dat)
anova(dat.lr, test ="Chisq")
dat.lr0<-glm(gender~1,family="binomial",data=dat)
dat.lrS<-glm(gender~shoesize,family=binomial,data=dat)
dat.lrh<-glm(gender~h,family="binomial",data=dat)

# 2x2 cancer x smoking table, expanded to one row per case for glm().
# Fix: restored '$'.
M = matrix(c(52,48,8,42), nrow = 2)
rownames(M) <- c("present", "absent")
colnames(M) <- c("smoker", 'non-smoker')
dat <- as.data.frame((as.table(M)))
colnames(dat) <- c("cancer", "smoking", "freq")
# Replicate each cell's row 'freq' times and drop the freq column.
dat = dat[rep(1:nrow(dat), dat$freq), 1:2]
rownames(dat) <- c()

dat.glm <- glm(cancer~smoking, family = binomial, data = dat)

# Single-predictor logistic models for infant survival, then a full model
# with all interactions, reduced by stepwise AIC.
dat.glm<-glm(survival~age, family=binomial,data=dat)
dat.glm2<-glm(survival~Ncigarettes,family=binomial,data=dat)
dat.glm3<-glm(survival~NdaysGESTATION,family=binomial,data=dat)
dat.glmAllMult=glm(survival~age*Ncigarettes*NdaysGESTATION,family=binomial,data=dat)
library(MASS)
stepAIC(dat.glmAllMult)

# Histogram of eye-contact counts (count outcome -> Poisson regression).
ggplot(dat) +
  geom_histogram(aes(eye.count), fill = 'red') +
  xlab("Number of times looking at eyes")

# Fit the models first.  BUG FIX: the original plotted with dat.lm before
# defining it; the fits are moved ahead of the first plot that uses them.
# Also restored '$'.
dat.lm <- lm(eye.count~attr, data = dat)
dat.pr <- glm(eye.count~attr, family = "poisson", data = dat)

# Scatter with the (inappropriate) linear fit in red.
ggplot(dat, aes(x = attr, y = eye.count)) +
  geom_point(size = 2) +
  ylab("Number of times looking at eyes") +
  xlab("Attractiveness") +
  geom_abline(slope = dat.lm$coefficients[2], intercept = dat.lm$coefficients[1], color = "red", size = 2)

# Poisson curve: predicted mean = exp(b0 + b1 * attr), in blue.
cf = coefficients(dat.pr)
x.temp <- seq(0, 10, 0.1)
pred.c = data.frame(x = x.temp, pred = exp(cf[1]+cf[2]*x.temp))
ggplot(dat, aes(x = attr, y = eye.count)) +
  geom_point(size = 2) +
  ylab("Number of times looking at eyes") +
  xlab("Attractiveness") +
  geom_abline(slope = dat.lm$coefficients[2], intercept = dat.lm$coefficients[1], color = "red", size = 2) +
  geom_line(aes(x = x.temp, y = pred), data = pred.c, color = "blue", size = 2)

データ解析基礎論B 多変量解析

install.packages("ggfortify")
install.packages("ggdendro")
library(ggfortify)
library(ggdendro)

# pca
dat.pca <- princomp(dat)
# NOTE(review): the next two calls were truncated in the source (they ended
# with a dangling comma); completed minimally so the file parses — TODO
# confirm the intended extra arguments (e.g. loadings = TRUE).
autoplot(dat.pca, label = TRUE, label.size = 6)
autoplot(dat.pca, shape = FALSE, label.size = 6)

# Small hierarchical-clustering example with 5 labelled points.
# Fixes: rotate=T -> TRUE; completed the truncated autoplot() on the last line.
cldata <- data.frame(var1 = c(4,1,5,1,5), var2 = c(1,5,4,3,1))
rownames(cldata) = c("A","B","C","D","E")
autoplot(dist(cldata))
cldata.cluster = hclust(dist(cldata), method = "average")
ggdendrogram(cldata.cluster, rotate = TRUE, theme_dendro = FALSE) + xlab("Individual")

# Same workflow on the occupation data.
autoplot(dist(dat))
dat.cluster = hclust(dist(dat))
ggdendrogram(dat.cluster, rotate = TRUE, theme_dendro = FALSE) + xlab("Occupation")
dat.pca = princomp(dat)
# NOTE(review): call was truncated in the source; completed minimally.
autoplot(dat.pca, label = TRUE, shape = FALSE, label.size = 4)

# Compare the four standard linkage methods.  Fix: rotate=T -> TRUE.
dat.HC.S = hclust(dist(dat), method = "single")
dat.HC.C = hclust(dist(dat), method = "complete")
dat.HC.A = hclust(dist(dat), method = "average")
dat.HC.W = hclust(dist(dat), method = "ward.D")
ggdendrogram(dat.HC.S, rotate = TRUE, theme_dendro = FALSE) + xlab("Occupation") + ggtitle("Method = Single")
ggdendrogram(dat.HC.C, rotate = TRUE, theme_dendro = FALSE) + xlab("Occupation") + ggtitle("Method = Complete")
ggdendrogram(dat.HC.A, rotate = TRUE, theme_dendro = FALSE) + xlab("Occupation") + ggtitle("Method = Average")
ggdendrogram(dat.HC.W, rotate = TRUE, theme_dendro = FALSE) + xlab("Occupation") + ggtitle("Method = Ward's MV")

# k-means (k = 3, 10 random restarts); color points by cluster membership.
# Fix: restored '$'.
dat.kmeans = kmeans(dat, centers = 3, nstart = 10)
pairs(dat,
      main = "Clustering Occupations",
      pch = 21,
      bg = c("red", "blue", "green")[unclass(dat.kmeans$cluster)])
autoplot(dat.kmeans, dat, size = 3, label = TRUE, label.size = 5)

source("http://www.matsuka.info/univ/course_folder/cuUtil02.R")
# cu.KMC.rep is a course-provided utility (sourced above); arguments are
# presumably max k and number of restarts — TODO confirm its signature.
res <- cu.KMC.rep(dat, 10, 100)

autoplot(dat.kmeans, dat, frame = TRUE, frame.type = 'norm') + ylim(-0.7,0.7) + xlim(-1.2,0.7)
autoplot(dat.kmeans, dat, frame = TRUE) + ylim(-0.7,0.7) + xlim(-1.2,0.7)

# Two-group LDA demo (writing/interview scores, admitted vs not).
# Fix: restored '$'.
dat <- data.frame(writing = c(68,85,50,54,66,35,56,25,43,70),
                  interview = c(65,80,95,70,75,55,65,75,50,40),
                  cl = c(rep("A",5), rep("N",5)))
library(MASS)
dat.lda <- lda(cl~., data = dat)
# Midpoint of the two group means on the discriminant axis.
intcpt = (dat.lda$scaling[1]*dat.lda$means[1,1] + dat.lda$scaling[2]*dat.lda$means[1,2] +
          dat.lda$scaling[1]*dat.lda$means[2,1] + dat.lda$scaling[2]*dat.lda$means[2,2])/2
new.dim.slope = dat.lda$scaling[1]/dat.lda$scaling[2]

# Decision boundary in the original variable space (red line below).
disc.intcpt = intcpt / dat.lda$scaling[2]
disc.slope = -dat.lda$scaling[1] / dat.lda$scaling[2]

ggplot(dat, aes(x = writing, y = interview, color = cl)) +
  geom_point(size = 4) +
  geom_abline(aes(intercept = intcpt, slope = new.dim.slope)) +
  geom_abline(aes(intercept = disc.intcpt, slope = disc.slope), color = "red") +
  xlim(30,100) + ylim(30,100)

# LDA confusion table, leave-one-out CV variant, and a ggplot of the first
# two discriminant dimensions.  Fixes: restored '$'; CV=T -> TRUE.
dat.lda <- lda(class~., dat)
lda.pred <- predict(dat.lda, dat)
table(lda.pred$class, dat$class)
dat.ldaCV <- lda(class~., dat, CV = TRUE)

dat.lda = lda(class~., data = dat)

lda.pred <- predict(dat.lda)$x %>% as.data.frame %>% cbind(class = dat$class)
ggplot(lda.pred) + geom_point(aes(x = LD1, y = LD2, color = class), size = 2.5)

# Classical MDS on 5 labelled points; plot the 2-D configuration.
dat <- data.frame(p1 = c(4,1,5,1,5), p2 = c(1,5,4,3,1))
rownames(dat) <- c('a','b','c','d','e')
# BUG FIX: cmdscale() returns a matrix, but ggplot() requires a data.frame;
# wrap in as.data.frame() (row names are preserved for the labels).
dat.mds <- as.data.frame(cmdscale(dist(dat), 2))
ggplot(dat.mds, aes(x = dat.mds[,1], y = dat.mds[,2])) +
  geom_text(aes(label = row.names(dat.mds)), size = 6)

データ解析基礎論B W05 Factor Analysis

# Goodness-of-fit chi-square on observed counts (72,23,16,49; N = 160).
# NOTE(review): p = rep(40,4) does not sum to 1, so with rescale.p = F the
# first call raises an error — presumably shown deliberately as a contrast
# with the correct p = rep(0.25,4) call; TODO confirm.
chisq.test(c(72,23,16,49),p=rep(40,4),rescale.p=F)
chisq.test(c(72,23,16,49),p=rep(0.25,4),rescale.p=F)

# 2x2 contingency table; correct = T applies Yates' continuity correction.
M=matrix(c(52,48,8,42),nrow=2)

chisq.test(M,correct=T)

# Hand computation of the Yates-corrected statistic, kept for reference:
#(abs(52-40)-0.5)^2/40+(abs(48-60)-0.5)^2/60
# +(abs(8-20)-0.5)^2/20+(abs(42-30)-0.5)^2/30

# One-factor model; compare factor scores with the first principal component.
# Fixes: restored '$'; spelled out 'scores' (the original 'score' relied on
# partial argument matching).
dat.fa <- factanal(dat, 1)

dat.pca <- princomp(dat)
dat.fa <- factanal(dat, 1)

dat.fa <- factanal(dat, 1, scores = "regression")
plot(dat.fa$scores~dat.pca$scores[,1], pch = 20, cex = 2, xlab = "Component Score", ylab = "Factor Score")

# Factor score vs component score vs raw total score.
fa_pca.scores = tibble(fa = dat.fa$scores, pca = dat.pca$scores[,1], total.score = rowSums(dat))
ggplot(fa_pca.scores) +
  geom_point(aes(x = fa, y = pca), size = 3) +
  xlab("Factor Score") + ylab("Component Score")

cor(dat.fa$scores, dat.pca$scores)

ggplot(fa_pca.scores) +
  geom_point(aes(x = fa, y = total.score), size = 3) +
  xlab("Factor Score") + ylab("Total Score")

# Two-factor solutions without and with varimax rotation.
dat.faWOR <- factanal(dat, 2, rotation = "none", scores = "regression")
dat.faWR <- factanal(dat, 2, rotation = "varimax", scores = "regression")

# NOTE(review): the three snippets below are residue from slides — each is
# the tail of a ggplot() chain whose opening ggplot() call and the start of
# the scale_fill_gradient2() call (feeding the high/mid/low lines) are
# missing, so none of them parse on their own.  Kept verbatim for reference.
facet_wrap(~ factor, nrow=1) +
geom_bar(stat="identity") +
coord_flip() +
high = "blue", mid = "white", low = "red",
midpoint=0, guide=F) +

facet_wrap(~ factor, nrow=1) +
geom_bar(stat="identity") +
coord_flip() +
high = "blue", mid = "white", low = "red",
midpoint=0, guide=F) +

# Tail of a loadings scatter plot (unit circle axes).
geom_point(size = 3, color = "red") +
geom_vline(xintercept=0) +
geom_hline(yintercept=0) +
ylim(-1.1, 1.1) + xlim(-1.1, 1.1)

# Fit 1- to 4-factor models; compare with the course utilities sourced below.
dat.model1<-factanal(dat,1)
dat.model2<-factanal(dat,2)
dat.model3<-factanal(dat,3)
dat.model4<-factanal(dat,4)

source("http://www.matsuka.info/univ/course_folder/cuUtil02.R")
# cu.lrtest.csq / cu.AIC.csq: course-provided LR-test and AIC helpers for
# factanal objects — TODO confirm their signatures in cuUtil02.R.
cu.lrtest.csq(dat.model3,dat.model4)
cu.AIC.csq(dat.model1)

library(sem)

# NOTE(review): sem::cfa() with no text/file argument reads the model
# specification interactively from stdin; the indented "F1:"/"F2:" lines are
# that interactive input and are NOT valid R when this file is sourced.
model01=cfa(reference.indicator=FALSE)
F1:extrovert,cheerful, leadership, antisocial, talkative, motivated, hesitance, popularity

# sem(model, covariance matrix, N): 100 is the assumed sample size.
cv.mat = cov(dat)
mod1<-sem(model01,cv.mat,100)

model02=cfa(reference.indicator=FALSE)
F2: cheerful, antisocial, talkative, popularity

mod2<-sem(model02, cov(dat), nrow(dat))

# Request RMSEA among the reported fit indices.
opt <- options(fit.indices = c("RMSEA"))
summary(mod2)

データ解析基礎論B PCA

# PCA summary and biplot (loadings + scores in one display).
dat.pca<-princomp(dat)
summary(dat.pca)
biplot(dat.pca)

dat.pca<-princomp(dat)

# Scree plot of component variances to choose the number of components.
plot(dat,pch=20)
dat.pca<-princomp(dat)
screeplot(dat.pca,type="lines")

# Presumably race distances in meters (100 m up to the 42195 m marathon);
# the log transform linearizes the wide range — TODO confirm data source.
dist = c(100,200,400,800,1500,5000,10000,42195)
log.dist=log(dist)

データ解析基礎論B W03 R graphics

library(tidyverse)

# CLT demo: distribution of means of NperSample uniform draws,
# repeated SampleSize times.
NperSample = 10
SampleSize = 300000

# Pipeline: draw -> reshape to NperSample x SampleSize -> column means
# -> one-column tibble assigned into 'means'.
runif(NperSample * SampleSize) %>%
matrix(nrow=NperSample) %>%
colMeans() %>% tibble(sample.mean = .) -> means

# Histogram + kernel density of the sample means.
# NOTE(review): '..density..' is the legacy spelling; ggplot2 >= 3.4 prefers
# after_stat(density) — kept as-is for compatibility with the course setup.
ggplot(means,aes(x = sample.mean, y = ..density..)) +
geom_histogram(bins=200) +
geom_density(colour = "orange",size=2)

# Same plot, drawing the density as a line layer instead of geom_density().
ggplot(means,aes(x = sample.mean, y = ..density..)) +
geom_histogram(bins=200) +
geom_line(stat = "density", colour = "orange",size=2)

# Entire pipeline piped straight into ggplot (no intermediate variable).
runif(NperSample * SampleSize) %>%
matrix(nrow=NperSample) %>%
colMeans() %>% tibble(sample.mean = .) %>%
ggplot(., aes(x = sample.mean, y = ..density..)) +
geom_histogram(bins=100,colour = "grey20") +
geom_line(stat = "density", colour = "skyblue",size=2)

dt <- as_tibble(dat)

# Scatter: hours worked vs number of books read.
ggplot(dt, aes(x = Hworked, y = nbooks)) +
geom_point(size = 3)

# Map grade onto color, then onto point shape.
ggplot(dt) +
geom_point(aes(x = Hworked, y = nbooks, color = grade),size = 3)

ggplot(dt) +
geom_point(aes(x = Hworked, y = nbooks, shape = grade),size = 5)

# Point and smoother layers.  BUG FIX: two statements ended with a dangling
# '+', which makes R continue the expression across the blank line and try to
# add the next ggplot() object to the previous plot (an error); removed.
ggplot(dt) +
  geom_point(aes(x = Hworked, y = nbooks), size = 5)

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks))

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks, linetype = grade))

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks))

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks)) +
  geom_point(aes(x = Hworked, y = nbooks), size = 4)

# Smoother + points: shared black smoother vs per-grade smoothers.
ggplot(dt) +
geom_smooth(aes(x = Hworked, y = nbooks), colour = "black", se = FALSE) +
geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)

ggplot(dt) +
geom_smooth(aes(x = Hworked, y = nbooks, color = grade), se = FALSE) +
geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)

# Store the plot object, then add labels / themes incrementally.
plot1 <- ggplot(dt) +
geom_smooth(aes(x = Hworked, y = nbooks, color = grade), se = FALSE) +
geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)
plot1 + xlab("Hours worked") + ylab("Number of books read")

plot1 + xlab("Hours worked") +  ylab("Number of books read") +
theme(axis.title.x = element_text(face = "italic",size = 14, colour = "navy"),
axis.title.y = element_text(face = "bold",size = 10, colour = "darkgreen"))

# Same scatter restricted to the LA affiliation.
ggplot(filter(dt, affil == "LA")) +
geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)

# Mean books per grade as a bar chart (shown twice in the original).
group_by(dt, grade) %>% summarize(ave.books = mean(nbooks, na.rm = TRUE)) %>%
  ggplot() + geom_bar(aes(x = grade, y = ave.books), stat = "identity")

group_by(dt, grade) %>% summarize(ave.books = mean(nbooks, na.rm = TRUE)) %>%
  ggplot() + geom_bar(aes(x = grade, y = ave.books), stat = "identity")

# Bars with standard-error bars.  BUG FIXES: the standard error is
# sd/sqrt(n), not sd/n; and the statement ended with a dangling '+' that
# merged it with the next expression — removed.
group_by(dt, grade) %>% summarize(ave.books = mean(nbooks, na.rm = TRUE),
                                  se = sd(nbooks, na.rm = TRUE)/sqrt(n())) %>%
  ggplot(aes(x = grade, y = ave.books)) +
  geom_bar(stat = "identity", fill = "grey70") +
  geom_errorbar(aes(ymin = ave.books - se, ymax = ave.books + se), width = 0.2)

# 2-D density contours / tiles over the scatter.
ggplot(dt, aes(x = Hworked, y = nbooks)) +
  stat_density2d(aes(colour = ..level..)) +
  geom_point()

ggplot(dt, aes(x = Hworked, y = nbooks)) +
  stat_density2d(aes(alpha = ..density..), geom = "tile", contour = FALSE) +
  geom_point(alpha = 0.4)

# Mean +/- 1 SD per grade.  FIX: fun.y/fun.ymin/fun.ymax were deprecated in
# ggplot2 3.3 in favor of fun/fun.min/fun.max.
ggplot(dt) +
  stat_summary(aes(x = grade, y = nbooks),
               fun = mean,
               fun.min = function(x) mean(x) - sd(x),
               fun.max = function(x) mean(x) + sd(x))

# Boxplots per grade, vertical then flipped.
ggplot(dt) +
  geom_boxplot(aes(x = grade, y = nbooks))
ggplot(dt) +
  geom_boxplot(aes(x = grade, y = nbooks)) +
  coord_flip()

# Regression of height on shoe size, with R^2 annotated on the plot.
# Fix: restored '$'.
dt <- as_tibble(dat)
dt.lm <- lm(h~shoesize, dt)
cfs <- coef(dt.lm)
ggplot(dt, aes(x = shoesize, y = h)) +
  geom_point() +
  geom_abline(intercept = cfs[1], slope = cfs[2], col = "red") +
  geom_text(x = 22, y = 175, aes(label = paste("r^2  =", round(summary(dt.lm)$r.squared, 3))))

データ解析基礎論B W02

install.packages("tidyverse")
library(tidyverse)

# Three equivalent ways to take the mean of 1000 normal draws; note that in
# the second call the assignment happens inside mean(), so random.number is
# refreshed as a side effect.
random.number <- rnorm(1000)
mean(random.number)
mean(random.number <- rnorm(1000))

# Same computation with the magrittr pipe.
rnorm(1000) %>% mean()

# CLT
# Distribution of means of NperSample uniforms, SampleSize repetitions,
# in base graphics.  Fixes: plot-title typo "Distributionf" -> "Distribution";
# removed the stray empty argument in the piped lines() call.
NperSample = 10
SampleSize = 300000

random.number <- runif(NperSample * SampleSize)
dat <- matrix(random.number, nrow = NperSample)
means <- colMeans(dat)
dens <- density(means)
hist(means, breaks = 100, probability = T, main = "Distribution of Means")
lines(dens, lwd = 3, col = "orange")

# Same computation, piped.
runif(NperSample * SampleSize) %>%
  matrix(nrow = NperSample) %>%
  colMeans() -> means
hist(means, breaks = 100, probability = T, main = "Distribution of Means")
means %>% density() %>% lines(lwd = 3, col = 'orange')

# Reusable helper: histogram with an overlaid kernel-density curve.
histWdensity <- function(dat, nbreaks = 30, main.txt){
  dens <- density(dat)
  hist(dat, breaks = nbreaks, probability = T, main = main.txt)
  lines(dens, lwd = 3, col = "orange")
}

runif(NperSample * SampleSize) %>%
  matrix(nrow = NperSample) %>%
  colMeans() %>%
  histWdensity(nbreaks = 100, main.txt = "Distribution of Means")

dt <- as_tibble(dat)
# Row filters.  Fix: restored '$' in the plot call at the end.
dt.la <- filter(dt, affil == "LA")

dt.la2 <- filter(dt, affil == "LA" & grade == "SP")
dt.laNot2 <- filter(dt, affil == "LA" & grade != "SP")

# Drop a column.
dtWOgender <- select(dt, -gender)

# Derived columns.
dt.weekly <- mutate(dt, nbooksWeek = nbooks / 52)

# NOTE(review): the second mutate overwrites the first (52 weeks vs 12
# months) — presumably per-week vs per-month variants; TODO confirm which
# is intended.
dt.atLeastOneBook <- mutate(dt, atleastOneBook = (nbooks/52) >= 1)
dt.atLeastOneBook <- mutate(dt, atleastOneBook = (nbooks/12) >= 1)

dt.BWindex = mutate(dt, nbooksWeek = nbooks / 52,
                    idx = nbooksWeek / (12*7-Hworked))

# NOTE(review): dt.byGrade / dt.byGrAf are group_by() results created
# outside this excerpt — TODO confirm where they are defined.
summarize(dt.byGrade, ave.books = mean(nbooks, na.rm = TRUE),
          ave.Hworked = mean(Hworked, na.rm = TRUE))

dt.summ <- summarize(dt.byGrAf, ave.books = mean(nbooks, na.rm = TRUE),
                     ave.Hworked = mean(Hworked, na.rm = TRUE), N = n())

# Summaries piped through filter and arrange.
dt.summ2 <- dt %>%
  summarize(ave.books = mean(nbooks, na.rm = TRUE),
            ave.Hworked = mean(Hworked, na.rm = TRUE),
            N = n()) %>% filter(N > 2) %>% arrange(desc(ave.books))

plot(x = dt.summ2$ave.books, y = dt.summ2$ave.Hworked, pch = 20, cex = 3,
     xlab = "Ave. # books read", ylab = "Ave hours worked")

# NOTE(review): orphan fragment — the tail of a readr read_*/write_* call
# whose opening line was lost in excerpting; commented out so the file
# parses.  TODO restore the full call.
# col_names = TRUE)

# Overall means, then reshape between long and wide forms.
dt.summ3 <- dt %>%
summarize(ave.books = mean(nbooks,na.rm = TRUE),
ave.Hworked = mean(Hworked, na.rm = TRUE))

# NOTE(review): gather()/spread() are superseded by pivot_longer()/
# pivot_wider() in current tidyr; kept as-is for the course material.
dt.summ3G <- dt.summ3 %>% gather('ave.books', 'ave.Hworked',
key = 'ave.values', value = "BorW")

dt.summ3SformG <- spread(dt.summ3G, key = ave.values, value =BorW)

# Mean books per grade within the LA affiliation (used for the join below).
dt.sumLA <- dt %>% filter(affil=="LA") %>% group_by(grade) %>%
summarize(ave.books = mean(nbooks))

# BUG FIX: the original tibble had a dangling trailing comma (a parse-time
# empty argument) and no 'grade' column, yet the join below is by "grade".
# Grade values here are placeholders — TODO confirm the intended levels.
toeic <- tibble(
  grade = c("SP", "JR"),
  score = c(800, 830)
)

new.dt1 <- dt.sumLA %>% inner_join(toeic, by = "grade")