library(nnet)

## --- Regression with a feed-forward net (nnet) on the sales data ---
dat<-read.csv("http://www.matsuka.info/data_folder/tdkReg01.csv")
set.seed(5)
x = dat[,1:3]   # predictors
y = dat[,4]     # outcome (sales)
# linout = TRUE -> linear output unit (regression); 150 hidden units
dat.nnet = nnet(x,y, size = 150, linout= TRUE, maxit = 1000)
nnet.pred <-predict(dat.nnet,dat)
cor(dat.nnet$fitted.values,dat$sales)^2   # training R^2

## --- Repeated 60/40 holdout: nnet (10 units, decay=0.01) vs. lm ---
n.data = nrow(dat); n.sample = n.data*0.6; n.rep = 100
trainNN.cor = rep(0,n.rep); trainLM.cor = rep(0,n.rep)
testNN.cor = rep(0,n.rep); testLM.cor = rep(0,n.rep)
for (i.rep in 1:n.rep){
  randperm = sample(1:n.data)
  train.idx = randperm[1:n.sample]
  test.idx = randperm[(n.sample+1):n.data]
  dat.nnet <- nnet(sales~.,size = c(10), linout=T,decay= 0.01, maxit=1000,
                   data = dat[train.idx,])
  dat.lm <-lm(sales~.,data=dat[train.idx, ])
  trainNN.cor[i.rep] <- cor(predict(dat.nnet,dat[train.idx, ]), dat[train.idx,]$sales)
  trainLM.cor[i.rep] <- cor(predict(dat.lm,dat[train.idx, ]), dat[train.idx,]$sales)
  testNN.cor[i.rep] <- cor(predict(dat.nnet,dat[test.idx, ]), dat[test.idx,]$sales)
  testLM.cor[i.rep] <- cor(predict(dat.lm,dat[test.idx, ]), dat[test.idx,]$sales)
}
# Train vs. test summaries; na.rm=T guards against NA correlations
# (e.g. when a net's predictions are constant on a split)
print(c(mean(trainNN.cor,na.rm=T),mean(testNN.cor,na.rm=T)))
print(c(mean(trainLM.cor,na.rm=T),mean(testLM.cor,na.rm=T)))
print(c(max(trainNN.cor,na.rm=T),max(testNN.cor,na.rm=T)))
print(c(max(trainLM.cor,na.rm=T),max(testLM.cor,na.rm=T)))
print(c(min(trainNN.cor,na.rm=T),min(testNN.cor,na.rm=T)))
print(c(min(trainLM.cor,na.rm=T),min(testLM.cor,na.rm=T)))

## --- Binary classification: nnet vs. logistic regression (tdkDA01) ---
dat<-read.csv("http://matsuka.info/data_folder/tdkDA01.csv", header=T)
dat.nnet<-nnet(class~.,dat,size=30,maxit=1000,decay=0.05)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat$class)   # confusion matrix
dat.glm<-glm(class~., family="binomial",dat)
glm.pred<-predict(dat.glm, dat, type="response")>0.5
table(glm.pred,dat$class)

## --- Imbalanced data: upweight the rare "no" survival cases ---
dat<-read.csv("http://www.matsuka.info/data_folder/cda7-16.csv")
wts = rep(1,nrow(dat))
# 45 is presumably the yes/no class ratio -- verify against summary(dat$survival)
wts[which(dat$survival=="no")]=45
dat.nnet<-nnet(survival~., weights=wts, dat, size=100, maxit=1000, decay=0.1)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat$survival)
## --- Multi-class classification with a softmax output (tdkDA02) ---
dat<-read.csv("http://matsuka.info/data_folder/tdkDA02.csv",header=T)
class.id<-class.ind(dat$class)   # 1-of-K indicator matrix for the class labels
x = dat[,1:6]
# softmax=TRUE -> one output unit per class, multinomial fit
dat.nnet<-nnet(x, class.id, size=30, maxit=1000, decay=0.01, softmax=TRUE)
max.id = apply(dat.nnet$fitted.values,1,which.max)   # predicted class = argmax
table(max.id,dat$class)

## --- Autoencoder-style net: input reproduced through 2 hidden units (cf. PCA) ---
dat<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.nnet<-nnet(dat,dat,size=2, maxit=1000, decay=0.01, linout=TRUE)
Category Archives: データ解析
データ解析基礎論B 判別分析+
library(MASS)

## --- LDA on a toy writing/interview data set (two groups A vs. N) ---
dat<-data.frame(writing=c(68,85,50,54,66,35,56,25,43,70),
                interview=c(65,80,95,70,75,55,65,75,50,40),
                cl=c(rep("A",5),rep("N",5)))
dat.lda<-lda(cl~.,data=dat)
# Cutoff: average of the two group centroids projected onto the
# discriminant weights (dat.lda$scaling)
intcpt = (dat.lda$scaling[1]*dat.lda$means[1,1] +
          dat.lda$scaling[2]*dat.lda$means[1,2] +
          dat.lda$scaling[1]*dat.lda$means[2,1] +
          dat.lda$scaling[2]*dat.lda$means[2,2])/2
z = as.matrix(dat[,1:2])%*%dat.lda$scaling-intcpt   # discriminant scores
# Slopes/intercepts for drawing the discriminant axis and the boundary
new.dim.slope = dat.lda$scaling[1]/dat.lda$scaling[2]
disc.intcpt = intcpt / dat.lda$scaling[2]
disc.slope = -dat.lda$scaling[1] / dat.lda$scaling[2]
# NOTE(review): ggplot() assumes ggplot2/tidyverse is already attached.
ggplot(dat, aes(x = writing, y= interview, color = cl)) +
  geom_point(size = 4) +
  geom_abline(aes(intercept = intcpt, slope = new.dim.slope )) +
  geom_abline(aes(intercept = disc.intcpt, slope = disc.slope ),color = "red") +
  xlim(30,100)+ylim(30,100)

## --- LDA on tdkDA01: resubstitution vs. leave-one-out CV ---
dat<-read.csv("http://matsuka.info/data_folder/tdkDA01.csv", header=T)
dat.lda<-lda(class~.,dat)
lda.pred<-predict(dat.lda,dat)
table(lda.pred$class, dat$class)   # resubstitution confusion matrix
dat.lda<-lda(class~.,dat, CV=T)    # CV=T -> leave-one-out posteriors
dat.cl = dat.lda$posterior[,1]>dat.lda$posterior[,2]
table(dat.cl, dat$class)

## --- Multi-group LDA (tdkDA02) compared with k-means ---
dat<-read.csv("http://matsuka.info/data_folder/tdkDA02.csv",header=T)
dat.lda=lda(class~.,data=dat)
lda.pred <- predict(dat.lda, dat)
plot(dat.lda, dimen=3, col=as.numeric(lda.pred$class),cex=2)
dat.km<-kmeans(dat[,1:6],5)   # 5-cluster k-means on the predictors
table(lda.pred$class,dat.km$cluster)
plot(dat,col=dat.km$cluster,pch=20)
plot(dat.lda, dimen=2, col=as.numeric(lda.pred$class),cex=2)

## --- LDA on popularity groups (low / medium / very popular) ---
dat<-read.csv("http://www.matsuka.info/data_folder/tdkCFA.csv")
dat1<-subset(dat,dat$popularity<5)
dat2<-subset(dat,dat$popularity>4 & dat$popularity<6)
# NOTE(review): rows with popularity exactly 6 fall into no subset
# (dat2 is <6, dat3 is >6) -- confirm this is intended.
dat3<-subset(dat,dat$popularity>6)
dat1$popularity="LP";dat2$popularity="MP";dat3$popularity="VP"
datT=rbind(dat1,dat2,dat3)
datT.lda<-lda(popularity~.,datT)
datT.pred<-predict(datT.lda,datT)
table(datT.pred$class,datT$popularity)
par(mfrow=c(1,2))
plot(datT.lda,col=c(rep('red',20),rep('blue',28),rep('green',29)),cex=1.5)
plot(datT.lda,col=as.numeric(datT.pred$class),cex=1.5)
par(mfrow=c(1,1))
library(nnet)   # loaded here for the neural-network section that follows
## --- nnet regression vs. lm on the sales data ---
dat<-read.csv("http://www.matsuka.info/data_folder/tdkReg01.csv")
set.seed(5)
x = dat[,1:3]   # predictors
y = dat[,4]     # outcome (sales)
dat.nnet = nnet(x,y, size = 150, linout= TRUE, maxit = 1000)
nnet.pred <-predict(dat.nnet,dat)
cor(dat.nnet$fitted.values,dat$sales)^2   # training R^2
dat.lm<-lm(sales~.,data=dat)
# Overlay nnet fits (black) and lm fits (red) against observed sales
plot(dat.nnet$fitted.values, dat$sales,pch=20,col='black')
points(predict(dat.lm), dat$sales,pch=20,col='red')

## --- Holdout comparison 1: large net (size=100, no weight decay) ---
n.data = nrow(dat);n.sample = n.data*0.6; n.rep = 100
trainNN.cor = rep(0,n.rep); trainLM.cor = rep(0,n.rep)
testNN.cor = rep(0,n.rep); testLM.cor = rep(0,n.rep)
for (i.rep in 1:n.rep){
  randperm = sample(1:n.data)
  train.idx = randperm[1:n.sample]
  test.idx = randperm[(n.sample+1):n.data]
  dat.nnet <- nnet(sales~.,size = 100, linout=T, maxit=1000, data = dat[train.idx,])
  dat.lm <-lm(sales~.,data=dat[train.idx, ])
  trainNN.cor[i.rep] <- cor(predict(dat.nnet,dat[train.idx, ]), dat[train.idx,]$sales)
  trainLM.cor[i.rep] <- cor(predict(dat.lm,dat[train.idx, ]), dat[train.idx,]$sales)
  testNN.cor[i.rep] <- cor(predict(dat.nnet,dat[test.idx, ]), dat[test.idx,]$sales)
  testLM.cor[i.rep] <- cor(predict(dat.lm,dat[test.idx, ]), dat[test.idx,]$sales)
}
print(c(mean(trainNN.cor,na.rm=T),mean(testNN.cor,na.rm=T)))
print(c(mean(trainLM.cor,na.rm=T),mean(testLM.cor,na.rm=T)))

## --- Holdout comparison 2: smaller net with weight decay ---
n.data = nrow(dat);n.sample = n.data*0.6; n.rep = 100
trainNN.cor = rep(0,n.rep); trainLM.cor = rep(0,n.rep)
testNN.cor = rep(0,n.rep); testLM.cor = rep(0,n.rep)
for (i.rep in 1:n.rep){
  randperm = sample(1:n.data)
  train.idx = randperm[1:n.sample]
  test.idx = randperm[(n.sample+1):n.data]
  dat.nnet <- nnet(sales~.,size = 30, linout=T,decay= 0.1, maxit=1000,
                   data = dat[train.idx,])
  dat.lm <-lm(sales~.,data=dat[train.idx, ])
  trainNN.cor[i.rep] <- cor(predict(dat.nnet,dat[train.idx, ]), dat[train.idx,]$sales)
  trainLM.cor[i.rep] <- cor(predict(dat.lm,dat[train.idx, ]), dat[train.idx,]$sales)
  testNN.cor[i.rep] <- cor(predict(dat.nnet,dat[test.idx, ]), dat[test.idx,]$sales)
  testLM.cor[i.rep] <- cor(predict(dat.lm,dat[test.idx, ]), dat[test.idx,]$sales)
}
print(c(mean(trainNN.cor,na.rm=T),mean(testNN.cor,na.rm=T)))
print(c(mean(trainLM.cor,na.rm=T),mean(testLM.cor,na.rm=T)))

## --- Classification: nnet vs. glm (tdkDA01) ---
dat<-read.csv("http://matsuka.info/data_folder/tdkDA01.csv", header =T)
dat.nnet<-nnet(class~.,dat,size=30,maxit=1000,decay=0.05)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat$class)
dat.glm<-glm(class~., family="binomial",dat)
glm.pred<-predict(dat.glm, dat, type="response")>0.5
table(glm.pred,dat$class)

## --- Imbalanced classes: infant survival ---
dat<-read.csv("http://www.matsuka.info/data_folder/cda7-16.csv")
dat.nnet<-nnet(survival~., dat, size=30, maxit=1000, decay=0.01)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat$survival)
Ns = summary(dat$survival)   # class counts
(Ns[1]/Ns[2])^-1             # ratio motivating the case weight used below
wts = rep(1,nrow(dat))
wts[which(dat$survival=="no")]=45
dat.nnet<-nnet(survival~., weights=wts, dat, size=30, maxit=1000, decay=0.01)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat$survival)

## --- Multi-class with softmax (tdkDA02) ---
dat<-read.csv("http://matsuka.info/data_folder/tdkDA02.csv",header=T)
class.id<-class.ind(dat$class)   # 1-of-K indicator matrix
x = dat[,1:6]
dat.nnet<-nnet(x,class.id,size=30, maxit=1000, decay=0.01, softmax=TRUE)
max.id = apply(dat.nnet$fitted.values,1,which.max)
table(max.id,dat$class)

## --- Autoencoder with 2 hidden units (compare with PCA) ---
dat<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.nnet<-nnet(dat,dat,size=2, maxit=1000, decay=0.01, linout=TRUE)
cor(dat.nnet$fitted.values,dat)
データ解析基礎論b テスト理論
install.packages("psych")
library("psych")

## --- Classical test theory: Cronbach's alpha ---
dat<-read.table("http://peach.l.chiba-u.ac.jp/course_folder/survey_data01.txt")
ca <- alpha(dat)
dat2<-read.csv("http://peach.l.chiba-u.ac.jp/course_folder/survey2.csv")
image(cor(dat2)[10:1,1:10])   # visualize the inter-item correlation matrix
ca2 <- alpha(dat2)
ca2
# Alpha per presumed subscale
ca1_5 = alpha(dat[,1:5])
ca6_10 = alpha(dat[,6:10])

### not recommended ###
# Separate one-factor FAs per subscale
F1<-factanal(dat[,1:5],1)
F2<-factanal(dat[,6:10],1)

## --- Confirmatory FA with sem::cfa ---
# NOTE(review): cfa() with no model= argument reads the model specification
# interactively from the console; the "F1:"/"F2:"/"delete," lines below are
# that console input, NOT standalone R statements -- this section fails if
# the file is source()'d non-interactively.
library(sem)
fa.model=cfa(reference.indicator=FALSE)
F1: q1,q2,q3,q4,q5
F2: q6,q7,q8,q9,q10
fa.model<-update(fa.model)
delete, F1<->F2
fa.result<-sem(fa.model, cov(dat2), 300)
summary(fa.result)

### recommended approach: item response theory ###
install.packages("ltm")
library(ltm)
# The "-1" shifts all responses down by one (presumably 1/2 -> 0/1 coding
# required by ltm) -- verify against the raw data.
dat<-read.table("http://peach.l.chiba-u.ac.jp/course_folder/survey_data01.txt")-1
descript(dat)
irt1P<-rasch(dat)   # 1-parameter (Rasch) model
irt1P
summary(irt1P)
theta = factor.scores(irt1P)
GoF.rasch(irt1P)
person.fit(irt1P)
item.fit(irt1P)
cor(rowSums(theta[[1]][,1:10]),theta[[1]]$z1)   # total score vs. latent trait

## 2p IRT
irt2P<-ltm(dat~z1)   # 2-parameter logistic model
irt2P
plot(irt2P)
person.fit(irt2P)
item.fit(irt2P)
anova(irt1P,irt2P)   # likelihood-ratio comparison of 1P vs. 2P

## jisshu (exercise)
dat <- read.csv("http://peach.l.chiba-u.ac.jp/course_folder/test_data01.csv")
データ解析基礎論B LogLinear Analysis
## --- Log-linear analysis of a 2x2 preference (soba/udon) x region table ---
freq<-c(33,37,7,23)
pref<-factor(c('soba','udon','soba','udon'))
region<-factor(c('east','east','west','west'))
dat<-data.frame(pref,region,freq)
dat.table=table(pref,region)
dat.table[cbind(pref,region)]<-freq   # fill the table with the observed counts
# LRT against the equiprobable model: expected count = 100/4 = 25 per cell
ME.lrt=2*sum((freq)*log(freq/25))
dat.loglinCE_A<-loglin(dat.table, list(1), fit=T,param=T)    # pref main effect only
1-pchisq(dat.loglinCE_A$lrt, dat.loglinCE_A$df)
dat.loglinCE_B<-loglin(dat.table,list(2), fit=T,param=T)     # region main effect only
1-pchisq(dat.loglinCE_B$lrt, dat.loglinCE_B$df)
dat.loglinIND<-loglin(dat.table,list(1,2), fit=T,param=T)    # independence model
1-pchisq(dat.loglinIND$lrt, dat.loglinIND$df)
dat.loglinSAT<-loglin(dat.table,list(c(1,2)), fit=T,param=T) # saturated model
# Nested model comparisons: difference in LRT statistics, df = 1 each
ME.lrt-dat.loglinCE_A$lrt
1-pchisq(ME.lrt-dat.loglinCE_A$lrt,1)
dat.loglinCE_A$lrt-dat.loglinIND$lrt
1-pchisq(dat.loglinCE_A$lrt-dat.loglinIND$lrt,1)
dat.loglinIND$lrt-dat.loglinSAT$lrt
1-pchisq(dat.loglinIND$lrt-dat.loglinSAT$lrt,1)

## --- 3-way table: gender x affiliation x circle ---
freq<-c(9,5,2,4,16,10,26,28)
# BUG FIX: this factor was created as `gener` but referenced as `gender`
# below, so table(gender, affil, circle) stopped with
# "object 'gender' not found".  Named consistently as `gender`.
gender<-factor(c(rep('female',4),c(rep('male',4))))
affil<-factor(rep(c('L','L','E','E'),2))
circle<-factor(rep(c('tennis','astro'),4))
dat<-data.frame(gender,affil,circle,freq)
dat.table<-table(gender,affil,circle)
dat.table[cbind(gender,affil,circle)]<-freq
# Models of increasing complexity (dimension 1=gender, 2=affil, 3=circle)
dat.loglin2<-loglin(dat.table,list(1), fit=T,param=T)
dat.loglin3<-loglin(dat.table,list(1,3), fit=T,param=T)
dat.loglin4<-loglin(dat.table,list(1,2,3), fit=T,param=T)
dat.loglin5<-loglin(dat.table,list(c(1,3),2), fit=T,param=T)
dat.loglin6<-loglin(dat.table,list(c(1,3),c(1,2)), fit=T,param=T)
dat.loglin7<-loglin(dat.table,list(c(1,3),c(1,2),c(2,3)), fit=T,param=T)
dat.loglin8<-loglin(dat.table,list(c(1,2,3)), fit=T,param=T)   # saturated

## --- Survival data: all two-way terms with the 4th variable ---
source('http://peach.l.chiba-u.ac.jp/course_folder/cuUtil02.R')
dat<-read.csv("http://www.matsuka.info/data_folder/cda7-16.csv")
dat.tab <- table(dat)
# Presumably each predictor crossed with dimension 4 (survival) -- verify
# the column order of cda7-16.csv.
dat.LL <- loglin(dat.tab, list(c(1,4),c(2,4),c(3,4)), fit = T, param=T)
1-pchisq(dat.LL$lrt, dat.LL$df)
データ解析基礎論B GLM
library(tidyverse)

## --- Logistic regression: passing an exam vs. hours studied ---
# NOTE(review): the URL string ends with a space -- likely a copy/paste
# artifact; confirm the file still resolves.
dat<-read.csv("http://peach.l.chiba-u.ac.jp/course_folder/logisticReg01.txt ")
ggplot(dat) + geom_point(aes(x = study, y = pass), size =3) +
  xlab("Hours studied") + ylab("Passing")
# Linear model on a binary outcome, shown for contrast
dat.lm <- lm(pass~study,dat)
# NOTE(review): the +1 added to the intercept here (and to pred.p below)
# presumably shifts the fitted line/curve onto the coding of `pass` used
# in the raw data -- verify against the data file.
ggplot(dat,aes(x = study, y = pass)) + geom_point(size =3) +
  xlab("Hours studied") + ylab("Passing") +
  geom_abline(slope = dat.lm$coefficients[2],
              intercept = dat.lm$coefficients[1]+1, color = "red",size = 2)
par(mfrow=c(2,2))
plot(dat.lm)   # diagnostics: assumption violations of the linear fit
# Observed probability of passing at each value of hours studied
real.p = data.frame( real.p = table(dat$pass, dat$study)[2,] /
                       colSums(table(dat$pass, dat$study)), x = 0:30)
ggplot(real.p,aes(x = x, y = real.p)) + geom_point(size =3) +
  xlab("Hours studied") + ylab("Passing (actual probability)")
# Proper logistic regression
dat.lr <- glm(pass~study,family=binomial, data=dat)
summary(dat.lr)
coef = coefficients(dat.lr)
temp.x = seq(0,30,0.1)
# Inverse-logit curve from the fitted coefficients
pred.pass.p = data.frame(pred.p = 1/(1+exp(-(coef[1]+coef[2]*temp.x))), x = temp.x)
ggplot(dat, aes(x = study,y = pass)) + geom_point(size=3) +
  geom_line(aes(x =x, y= pred.p + 1), data = pred.pass.p, color = 'red',size = 1)+
  xlab("Hours studied") + ylab("Passing")
## Odds and odds ratios
pred.pass.p = 1/(1+exp(-(coef[1]+coef[2]*c(10:15))))
odds=pred.pass.p/(1-pred.pass.p)
exp(coef[2])          # odds ratio per additional hour studied
odds[2:6]/odds[1:5]   # empirical check: consecutive odds ratios

## --- Logistic regression on other data sets ---
dat<-read.csv("http://www.matsuka.info/data_folder/datWA01.txt")
dat.lr<-glm(gender~shoesize,family=binomial,data=dat)
anova(dat.lr, test ="Chisq")
dat.lr0<-glm(gender~1,family="binomial",data=dat)   # intercept-only (null) model
dat.lrS<-glm(gender~shoesize,family=binomial,data=dat)
dat.lrh<-glm(gender~h,family="binomial",data=dat)

## --- 2x2 frequency table expanded to case-level data ---
M=matrix(c(52,48,8,42),nrow=2)
rownames(M)<-c("present", "absent")
colnames(M)<-c("smoker",'non-smoker')
dat<-as.data.frame((as.table(M)))
colnames(dat)<-c("cancer","smoking","freq")
dat=dat[rep(1:nrow(dat),dat$freq),1:2]   # replicate each row `freq` times
rownames(dat)<-c()
dat.glm<-glm(cancer~smoking,family=binomial,data=dat)

## --- Infant survival data ---
dat<-read.csv("http://www.matsuka.info/data_folder/cda7-16.csv")
dat.glm<-glm(survival~age, family=binomial,data=dat)
dat.glm2<-glm(survival~Ncigarettes,family=binomial,data=dat)
dat.glm3<-glm(survival~NdaysGESTATION,family=binomial,data=dat)
## Additive vs. fully interactive predictor sets
dat.glmAllAdd=glm(survival~age+Ncigarettes+NdaysGESTATION,family=binomial,data=dat)
dat.glmAllMult=glm(survival~age*Ncigarettes*NdaysGESTATION,family=binomial,data=dat)
library(MASS)
stepAIC(dat.glmAllMult)   # stepwise model selection by AIC

## --- Poisson regression: eye-contact counts vs. attractiveness ---
# FIX: removed the trailing space inside the URL (copy/paste artifact).
dat<-read.csv("http://peach.l.chiba-u.ac.jp/course_folder/poissonReg01.txt")
ggplot(dat) + geom_histogram(aes(eye.count), fill='red') +
  xlab("Number of times looking at eyes")
# BUG FIX: dat.lm was referenced by the scatter plot below BEFORE it was
# fitted, so the plot silently reused the stale `pass~study` model from the
# logistic-regression section.  Fit the linear model first.
dat.lm <- lm(eye.count~attr,data = dat)
ggplot(dat, aes(x=attr, y = eye.count)) + geom_point(size =2) +
  ylab("Number of times looking at eyes")+ xlab("Attractiveness") +
  geom_abline(slope = dat.lm$coefficients[2],
              intercept = dat.lm$coefficients[1], color = "red",size = 2)
# Poisson GLM: log link, mean curve exp(b0 + b1 * attr)
dat.pr<-glm(eye.count~attr,family = "poisson",data=dat)
cf = coefficients(dat.pr)
x.temp <- seq(0,10,0.1)
pred.c = data.frame(x=x.temp, pred = exp(cf[1]+cf[2]*x.temp))
# Overlay: linear fit (red) vs. Poisson mean curve (blue)
ggplot(dat, aes(x=attr, y = eye.count)) + geom_point(size =2) +
  ylab("Number of times looking at eyes")+ xlab("Attractiveness") +
  geom_abline(slope = dat.lm$coefficients[2],
              intercept = dat.lm$coefficients[1], color = "red",size = 2)+
  geom_line(aes(x = x.temp, y= pred),data = pred.c, color="blue", size=2)
データ解析基礎論B 多変量解析
install.packages("ggfortify")
install.packages("ggdendro")
library(ggfortify)
library(ggdendro)

# pca
dat<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.pca<-princomp(dat)
# Score biplots with loading vectors overlaid
autoplot(dat.pca, label = TRUE, label.size = 6, loadings = TRUE,
         loadings.colour = 'red', loadings.label = TRUE, loadings.label.size = 5)
autoplot(dat.pca, shape = FALSE, label.size = 6, loadings = TRUE,
         loadings.colour = 'red', loadings.label = TRUE, loadings.label.size = 5)

## --- Hierarchical clustering on a 5-point toy set ---
cldata<-data.frame(var1=c(4,1,5,1,5), var2=c(1,5,4,3,1))
rownames(cldata) = c("A","B","C","D","E")
autoplot(dist(cldata))   # distance-matrix heat map
cldata.cluster=hclust(dist(cldata),method="average")
ggdendrogram(cldata.cluster, rotate = T, theme_dendro = FALSE)+ xlab("Individual")

## --- Clustering occupations ---
dat<-read.csv("http://matsuka.info/data_folder/tdkClust.csv",
              header=TRUE, row.names=1)
autoplot(dist(dat))
dat.cluster=hclust(dist(dat))   # default: complete linkage
ggdendrogram(dat.cluster, rotate = T, theme_dendro = FALSE)+ xlab("Occupation")
dat.pca = princomp(dat)
autoplot(dat.pca, label = TRUE, shape = FALSE, label.size = 4, loadings = TRUE,
         loadings.colour = 'red', loadings.label = TRUE, loadings.label.size = 5)
# Compare linkage methods
dat.HC.S=hclust(dist(dat), method = "single")
dat.HC.C=hclust(dist(dat), method = "complete")
dat.HC.A=hclust(dist(dat), method = "average")
dat.HC.W=hclust(dist(dat), method = "ward.D")
ggdendrogram(dat.HC.S, rotate = T, theme_dendro = FALSE)+
  xlab("Occupation")+ggtitle("Method = Single")
ggdendrogram(dat.HC.C, rotate = T, theme_dendro = FALSE)+
  xlab("Occupation")+ggtitle("Method = Complete")
ggdendrogram(dat.HC.A, rotate = T, theme_dendro = FALSE)+
  xlab("Occupation")+ggtitle("Method = Average")
ggdendrogram(dat.HC.W, rotate = T, theme_dendro = FALSE)+
  xlab("Occupation")+ggtitle("Method = Ward's MV")

## --- k-means (k = 3, 10 random restarts) ---
dat.kmeans=kmeans(dat, centers=3, nstart=10)
pairs(dat, main = "Clustering Occupations", pch = 21,
      bg = c("red", "blue", "green") [unclass(dat.kmeans$cluster)])
autoplot(dat.kmeans, dat, size = 3, label = TRUE, label.size = 5)
source("http://www.matsuka.info/univ/course_folder/cuUtil02.R")
# Repeated k-means via the course utility; assumes `dat` and `dat.kmeans`
# from the preceding section are still in the workspace.
res<-cu.KMC.rep(dat,10,100)
autoplot(dat.kmeans, dat, frame = TRUE, frame.type = 'norm') +
  ylim(-0.7,0.7)+xlim(-1.2,0.7)
autoplot(dat.kmeans, dat, frame = TRUE)+ ylim(-0.7,0.7)+xlim(-1.2,0.7)

## --- LDA on the toy writing/interview data ---
dat<-data.frame(writing=c(68,85,50,54,66,35,56,25,43,70),
                interview=c(65,80,95,70,75,55,65,75,50,40),
                cl=c(rep("A",5),rep("N",5)))
library(MASS)
dat.lda<-lda(cl~.,data=dat)
# Cutoff: midpoint of the two group centroids on the discriminant axis
intcpt = (dat.lda$scaling[1]*dat.lda$means[1,1]+dat.lda$scaling[2]*dat.lda$means[1,2]+
          dat.lda$scaling[1]*dat.lda$means[2,1]+dat.lda$scaling[2]*dat.lda$means[2,2])/2
new.dim.slope = dat.lda$scaling[1]/dat.lda$scaling[2]
disc.intcpt = intcpt / dat.lda$scaling[2]
disc.slope = -dat.lda$scaling[1] / dat.lda$scaling[2]
ggplot(dat, aes(x = writing, y= interview, color = cl)) +
  geom_point(size = 4) +
  geom_abline(aes(intercept = intcpt, slope = new.dim.slope )) +
  geom_abline(aes(intercept = disc.intcpt, slope = disc.slope ),color = "red") +
  xlim(30,100)+ylim(30,100)

## --- LDA: resubstitution and leave-one-out CV (tdkDA01) ---
dat<-read.csv("http://matsuka.info/data_folder/tdkDA01.csv", header=T)
dat.lda<-lda(class~.,dat)
lda.pred<-predict(dat.lda,dat)
table(lda.pred$class, dat$class)
dat.ldaCV<-lda(class~.,dat, CV=T)   # leave-one-out cross-validated posteriors

## --- Multi-group LDA, discriminant scores plotted with ggplot (tdkDA02) ---
dat<-read.csv("http://matsuka.info/data_folder/tdkDA02.csv",header=T)
dat.lda=lda(class~.,data=dat)
lda.pred <- predict(dat.lda)$x %>% as.data.frame %>% cbind(class = dat$class)
ggplot(lda.pred) + geom_point(aes(x=LD1, y=LD2, color = class), size = 2.5)

## --- Classical MDS on a 5-point toy set ---
dat<-data.frame(p1=c(4,1,5,1,5),p2=c(1,5,4,3,1))
rownames(dat)<-c('a','b','c','d','e')
dat.mds<-cmdscale(dist(dat),2)   # 2-D configuration (returns a matrix)
# NOTE(review): dat.mds is a matrix; recent ggplot2 versions require a
# data.frame -- confirm this runs, otherwise wrap it in as.data.frame().
ggplot(dat.mds, aes(x = dat.mds[,1],y = dat.mds[,2])) +
  geom_text(aes(label = row.names(dat.mds)), size = 6)
データ解析基礎論B W05 Factor Analysis
## --- Chi-square goodness-of-fit and independence tests ---
# NOTE(review): p=rep(40,4) does not sum to 1, so with rescale.p=F this
# first call stops with an error (presumably shown deliberately); the
# second call uses proper probabilities.
chisq.test(c(72,23,16,49),p=rep(40,4),rescale.p=F)
chisq.test(c(72,23,16,49),p=rep(0.25,4),rescale.p=F)
M=matrix(c(52,48,8,42),nrow=2)
chisq.test(M,correct=T)   # with Yates' continuity correction
#(abs(52-40)-0.5)^2/40+(abs(48-60)-0.5)^2/60
# +(abs(8-20)-0.5)^2/20+(abs(42-30)-0.5)^2/30

## --- Exploratory factor analysis ---
dat<-read.csv("http://peach.l.chiba-u.ac.jp/course_folder/FA01.csv")
dat.fa<-factanal(dat,1)   # single-factor model
dat<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.pca<-princomp(dat)
dat.fa<-factanal(dat,1)
# "score" is partially matched to factanal's `scores` argument
dat.fa<-factanal(dat,1,score="regression")
plot(dat.fa$score~dat.pca$score[,1],pch=20,cex=2,
     xlab="Component Score", ylab="Factor Score")
# NOTE(review): tibble()/ggplot() below assume the tidyverse is attached.
fa_pca.scores = tibble(fa = dat.fa$scores, pca = dat.pca$scores[,1],
                       total.score = rowSums(dat))
ggplot(fa_pca.scores) + geom_point(aes(x = fa, y = pca), size = 3) +
  xlab("Factor Score") + ylab("Component Score")
cor(dat.fa$score,dat.pca$score)
ggplot(fa_pca.scores) + geom_point(aes(x = fa, y = total.score), size = 3) +
  xlab("Factor Score") + ylab("Total Score")

## --- Rotation: none vs. varimax ---
dat<-read.csv("http://www.matsuka.info/data_folder/tdkCFA.csv")
dat.faWOR<-factanal(dat,2, rotation="none", score="regression")
dat.faWR<-factanal(dat,2, rotation="varimax", score="regression")
# Reshape the loading matrices to long format for plotting
loadingsWOR <- dat.faWOR$loadings[] %>% as.tibble() %>%
  add_column(variable = row.names(dat.faWOR$loadings)) %>%
  gather("Factor1","Factor2", key = "factor", value = "loadings")
loadingsWR <- dat.faWR$loadings[] %>% as.tibble() %>%
  add_column(variable = row.names(dat.faWR$loadings)) %>%
  gather("Factor1","Factor2", key = "factor", value = "loadings")
ggplot(loadingsWOR, aes(variable, abs(loadings), fill=loadings)) +
  facet_wrap(~ factor, nrow=1) + geom_bar(stat="identity") + coord_flip() +
  scale_fill_gradient2(name = "loadings", high = "blue", mid = "white",
                       low = "red", midpoint=0, guide=F) +
  ylab("Loading Strength")
ggplot(loadingsWR, aes(variable, abs(loadings), fill=loadings)) +
  facet_wrap(~ factor, nrow=1) + geom_bar(stat="identity") + coord_flip() +
  scale_fill_gradient2(name = "loadings", high = "blue", mid = "white",
                       low = "red", midpoint=0, guide=F) +
  ylab("Loading Strength")
# Loadings plotted in factor space
loadingsWR2 <- as.data.frame(dat.faWR$loadings[])
ggplot(loadingsWR2, aes(x = Factor1, y = Factor2)) +
  geom_point(size = 3, color = "red") +
  geom_vline(xintercept=0) + geom_hline(yintercept=0) +
  geom_text(aes(label = rownames(loadingsWR2))) +
  ylim(-1.1, 1.1) + xlim(-1.1, 1.1)

## --- Choosing the number of factors ---
dat.model1<-factanal(dat,1)
dat.model2<-factanal(dat,2)
dat.model3<-factanal(dat,3)
dat.model4<-factanal(dat,4)
source("http://www.matsuka.info/univ/course_folder/cuUtil02.R")
cu.lrtest.csq(dat.model3,dat.model4)   # course utilities: LR test / AIC
cu.AIC.csq(dat.model1)

## --- Confirmatory FA with sem ---
# NOTE(review): cfa() without model= reads the model interactively; the
# "F1:"/"F2:" lines are console input, not standalone R statements, so this
# section only works interactively.
library(sem)
model01=cfa(reference.indicator=FALSE)
F1:extrovert,cheerful, leadership, antisocial, talkative, motivated, hesitance, popularity
cv.mat = cov(dat)
mod1<-sem(model01,cv.mat,100)
model02=cfa(reference.indicator=FALSE)
F1: extrovert, leadership, motivated, hesitance
F2: cheerful, antisocial, talkative, popularity
mod2<-sem(model02, cov(dat), nrow(dat))
opt <- options(fit.indices = c("RMSEA"))
summary(mod2)
データ解析基礎論B PCA
## --- PCA with princomp: three small case studies ------------------------
# Entrance-exam toy data (writing / thesis / interview).
dat <- read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.pca <- princomp(dat)
summary(dat.pca)
biplot(dat.pca)

# Loadings on the first principal component.
dat.pca$loadings[, 1]

# First student's PC1 score reproduced by hand:
# sum over variables of loading * (value - variable mean).
scoreP1S1 = dat.pca$loadings[1, 1] * (dat[1, 1] - mean(dat$writing)) +
  dat.pca$loadings[2, 1] * (dat[1, 2] - mean(dat$thesis)) +
  dat.pca$loadings[3, 1] * (dat[1, 3] - mean(dat$interview))

# PCA of the personality-rating data.
dat <- read.csv("http://www.matsuka.info/data_folder/tdkCFA.csv")
dat.pca <- princomp(dat)

# PCA of men's track national records (one row per country).
dat <- read.csv("http://peach.l.chiba-u.ac.jp/course_folder/AMSA_track_mens.csv",
                row.names = 1)
plot(dat, pch = 20)
dat.pca <- princomp(dat)
screeplot(dat.pca, type = "lines")

# Interpret the first two PCs against log race distance.
# (Renamed from `dist`, which shadowed stats::dist.)
race.dist <- c(100, 200, 400, 800, 1500, 5000, 10000, 42195)
log.dist <- log(race.dist)
plot(log.dist, dat.pca$loadings[, 1], pch = 20, col = 'red',
     cex = 2, ylab = "PC loadings (1st)", xlab = "Log(distance)")
plot(log.dist, dat.pca$loadings[, 2], pch = 20, col = 'red',
     cex = 5, ylab = "PC loadings (2nd)", xlab = "Log(distance)")
データ解析基礎論B W03 R graphics
library(tidyverse)

# CLT demo: distribution of sample means of uniform draws
NperSample = 10
SampleSize = 300000
runif(NperSample * SampleSize) %>%
  matrix(nrow=NperSample) %>%
  colMeans() %>%
  tibble(sample.mean = .) -> means
ggplot(means,aes(x = sample.mean, y = ..density..)) +
  geom_histogram(bins=200) + geom_density(colour = "orange",size=2)
ggplot(means,aes(x = sample.mean, y = ..density..)) +
  geom_histogram(bins=200) + geom_line(stat = "density", colour = "orange",size=2)
# Same thing as a single pipeline ending in the plot
runif(NperSample * SampleSize) %>%
  matrix(nrow=NperSample) %>%
  colMeans() %>%
  tibble(sample.mean = .) %>%
  ggplot(., aes(x = sample.mean, y = ..density..)) +
  geom_histogram(bins=100,colour = "grey20") +
  geom_line(stat = "density", colour = "skyblue",size=2)

## --- ggplot2 basics on the survey data ---
dat <- read.csv("http://www.matsuka.info/data_folder/sampleData2013.txt")
dt <- as_tibble(dat)
ggplot(dt, aes(x = Hworked, y = nbooks)) + geom_point(size = 3)
# Map a third variable to color / shape / facets
ggplot(dt) + geom_point(aes(x = Hworked, y = nbooks, color = grade),size = 3)
ggplot(dt) + geom_point(aes(x = Hworked, y = nbooks, shape = grade),size = 5)
ggplot(dt) + geom_point(aes(x = Hworked, y = nbooks),size = 5) +
  facet_wrap(~ grade, nrow = 1)
# Smoothers
ggplot(dt) + geom_smooth(aes(x = Hworked, y = nbooks))
ggplot(dt) + geom_smooth(aes(x = Hworked, y = nbooks, linetype = grade))
ggplot(dt) + geom_smooth(aes(x = Hworked, y = nbooks)) + facet_wrap(~ grade, nrow = 4)
ggplot(dt) + geom_smooth(aes(x = Hworked, y = nbooks)) +
  geom_point(aes(x = Hworked, y = nbooks), size = 4)
ggplot(dt) + geom_smooth(aes(x = Hworked, y = nbooks), colour = "black", se = FALSE) +
  geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)
ggplot(dt) + geom_smooth(aes(x = Hworked, y = nbooks, color = grade), se = FALSE) +
  geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)
# Save a plot object and add labels / themes afterwards
plot1 <- ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks, color = grade), se = FALSE) +
  geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)
plot1 + xlab("Hours worked") + ylab("Number of books read")
plot1 + xlab("Hours worked") + ylab("Number of books read") +
  theme(axis.title.x = element_text(face = "italic",size = 14, colour = "navy"),
        axis.title.y = element_text(face = "bold",size = 10, colour = "darkgreen"))
ggplot(filter(dt, affil == "LA")) +
  geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)
# Put grades in their natural order before plotting
dt$grade <- fct_relevel(dt$grade, "FR","SP","JR","SR")
group_by(dt, grade) %>% summarize(ave.books = mean(nbooks, na.rm = T)) %>%
  ggplot() + geom_bar(aes(x = grade, y = ave.books), stat = "identity")
group_by(dt, grade) %>% summarize(ave.books = mean(nbooks, na.rm = T)) %>%
  ggplot() + geom_bar(aes(x = grade, y = ave.books), stat = "identity")
# Bar chart with standard-error bars.
# BUG FIX: the standard error is sd/sqrt(n); the original divided by n(),
# which understates the error bars.
group_by(dt, grade) %>%
  summarize(ave.books = mean(nbooks, na.rm = T),
            se = sd(nbooks, na.rm =T)/sqrt(n())) %>%
  ggplot(aes(x = grade, y = ave.books)) +
  geom_bar(stat = "identity", fill = "grey70") +
  geom_errorbar(aes(ymin = ave.books - se, ymax = ave.books +se), width = 0.2) +
  ylab("Average # books read")
# 2-D density representations
ggplot(dt,aes(x = Hworked, y = nbooks)) +
  stat_density2d(aes(colour =..level..)) + geom_point()
ggplot(dt,aes(x = Hworked, y = nbooks)) +
  stat_density2d(aes(alpha =..density..), geom="tile",contour=F) +
  geom_point(alpha =0.4)
# Mean +/- 1 SD summaries and boxplots
ggplot(dt) + stat_summary(aes(x = grade, y = nbooks), fun.y = mean,
                          fun.ymin = function(x) mean(x) - sd(x),
                          fun.ymax = function(x) mean(x) + sd(x))
ggplot(dt) + geom_boxplot(aes(x = grade, y = nbooks))
ggplot(dt) + geom_boxplot(aes(x = grade, y = nbooks)) + coord_flip()

## --- Regression line with an r^2 annotation ---
dat <- read.csv("http://www.matsuka.info/data_folder/datWA01.txt")
dt <- as_tibble(dat)
dt.lm <- lm(h~shoesize, dt)
cfs <- coef(dt.lm)
ggplot(dt, aes(x = shoesize, y = h)) + geom_point() +
  geom_abline(intercept = cfs[1], slope = cfs[2], col = "red") +
  geom_text( x= 22, y =175,
             aes(label = paste("r^2 =",round(summary(dt.lm)$r.squared,3))))
データ解析基礎論B W02
install.packages("tidyverse")
library(tidyverse)

## --- Pipes ---
random.number <- rnorm(1000)
mean(random.number)
# Assignment inside a call: the mean is of the NEW draws just assigned
mean(random.number <- rnorm(1000))
rnorm(1000) %>% mean()

# CLT
NperSample = 10
SampleSize = 300000
# "traditional"
random.number <- runif(NperSample * SampleSize)
dat <- matrix(random.number, nrow=NperSample)
means <- colMeans(dat)
dens <- density(means)
hist(means, breaks = 100, probability = T, main = "Distributionf of Means")
lines(dens, lwd = 3, col = "orange")
# pipe version of the same computation
runif(NperSample * SampleSize) %>% matrix(nrow=NperSample) %>% colMeans() -> means
hist(means, breaks=100,probability = T, main = "Distributionf of Means")
means %>% density() %>% lines(,lwd=3,col='orange')
# Helper: histogram with a density overlay
histWdensity <- function(dat, nbreaks=30, main.txt){
  dens <- density(dat)
  hist(dat, breaks = nbreaks, probability = T, main = main.txt)
  lines(dens, lwd = 3, col = "orange")
}
runif(NperSample * SampleSize) %>% matrix(nrow=NperSample) %>% colMeans() %>%
  histWdensity(nbreaks = 100, main.txt = "Distributionf of Means")

## --- dplyr verbs on the survey data ---
dat <- read.csv("http://www.matsuka.info/data_folder/sampleData2013.txt")
dt <- as_tibble(dat)
dt.la <- filter(dt, affil == "LA")
dt.la2 <- filter(dt, affil == "LA" & grade == "SP")
dt.laNot2 <- filter(dt, affil == "LA" & grade != "SP")
dt.GB <- select(dt, grade, nbooks)
dtWOgender <- select(dt, -gender)   # drop one column
dt.arranged <- arrange(dt, affil, grade)
dt.weekly <- mutate(dt,nbooksWeek = nbooks / 52)
dt.atLeastOneBook <- mutate(dt, atleastOneBook = (nbooks/52) >= 1)
# Per-month variant (overwrites the weekly version above)
dt.atLeastOneBook <- mutate(dt, atleastOneBook = (nbooks/12) >= 1)
dt.BWindex = mutate(dt, nbooksWeek = nbooks / 52,
                    idx = nbooksWeek / (12*7-Hworked))

## --- Grouped summaries ---
dt.byGrade <- group_by(dt, grade)
summarize(dt.byGrade, ave.books = mean(nbooks,na.rm = TRUE),
          ave.Hworked = mean(Hworked, na.rm = TRUE))
dt.byGrAf <- group_by(dt, grade, affil)
dt.summ <- summarize(dt.byGrAf, ave.books = mean(nbooks,na.rm = TRUE),
                     ave.Hworked = mean(Hworked, na.rm = TRUE), N = n())
# Same summary as one pipeline, filtered and sorted
dt.summ2 <- dt %>% group_by(grade, affil) %>%
  summarize(ave.books = mean(nbooks,na.rm = TRUE),
            ave.Hworked = mean(Hworked, na.rm = TRUE), N = n()) %>%
  filter(N > 2) %>% arrange(desc(ave.books))
plot(x = dt.summ2$ave.books, y = dt.summ2$ave.Hworked, pch=20, cex = 3,
     xlab = "Ave. # books read",ylab = "Ave hours worked")

## --- Reshaping (gather/spread; superseded by pivot_longer/pivot_wider) ---
dt <- read_csv("http://www.matsuka.info/data_folder/sampleData2013.txt",
               col_names = TRUE)
dt.summ3 <- dt %>% group_by(grade, gender) %>%
  summarize(ave.books = mean(nbooks,na.rm = TRUE),
            ave.Hworked = mean(Hworked, na.rm = TRUE))
dt.summ3G <- dt.summ3 %>% gather('ave.books', 'ave.Hworked',
                                 key = 'ave.values', value = "BorW")
dt.summ3SformG <- spread(dt.summ3G, key = ave.values, value =BorW)

## --- Joins ---
dt.sumLA <- dt %>% filter(affil=="LA") %>% group_by(grade) %>%
  summarize(ave.books = mean(nbooks))
toeic <- tibble( grade = factor(c("SP", "JR")), score = c(800,830), )
new.dt1 <- dt.sumLA %>% inner_join(toeic, by = "grade")   # matching keys only
dt.sumLA <- add_row(dt.sumLA, grade = c("MS"), ave.books = (13))
toeic2 <- tibble( grade = factor(c("SP", "JR","DR")), score = c(800,830,900), )
new.dt3 <- full_join(dt.sumLA, toeic2)    # union of keys
new.dt4 <- left_join(dt.sumLA, toeic2)    # keep all rows of dt.sumLA
new.dt5 <- right_join(dt.sumLA, toeic2)   # keep all rows of toeic2