データ解析基礎論B Review02

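# simulate two groups drawn from the same population and compute a one-way ANOVA "by hand"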
n.subj = 5
set.seed(10)
groupA1 = rnorm(n.subj, mean = 100, sd =15)
groupA2 = rnorm(n.subj, mean = 100, sd =15)
groupID = c(rep("A1",n.subj),rep("A2",n.subj))
dat = data.frame(score = c(groupA1,groupA2), ID = factor(groupID))  # factor ID so plot() draws boxplots and as.numeric(ID) works
grand.mean = mean(dat$score)
mean.A1 = mean(groupA1)
mean.A2 = mean(groupA2)
plot(dat$ID, dat$score)
points(as.numeric(dat$ID), dat$score, col=c(rep('red',n.subj),rep("blue",n.subj)),pch=20)

idx = 1:(n.subj+n.subj)
plot(idx,dat$score,col=c(rep('red',n.subj),rep("blue",n.subj)),
     pch=20,ylim=c(65,145),ylab ="score",xlab="subjID")
abline(h=grand.mean,lwd=2)
abline(h=mean.A1,lty=2,col='red',lwd=2)
abline(h=mean.A2,lty=2,col='blue',lwd=2)
legend("topright",c("overall mean", "G1 mean","G2 mean"),
       lty=c(1,2,2),col=c('black','red','blue'),lwd=2)

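# sums of squares: ss.total decomposes into ss.group (between) + ss.error (within)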
ss.total = sum((dat$score-grand.mean)^2)
ss.group1 = n.subj*(mean.A1-grand.mean)^2
ss.group2 = n.subj*(mean.A2-grand.mean)^2
ss.group = ss.group1+ss.group2
ss.error1 = sum((groupA1-mean.A1)^2)
ss.error2 = sum((groupA2-mean.A2)^2)
ss.error = ss.error1+ss.error2
df.group = 2 - 1
df.error = (n.subj-1)+(n.subj-1)
ms.group = ss.group/df.group
ms.error = ss.error/df.error
f.value = ms.group/ms.error
p.value = 1 - pf(f.value, df.group, df.error)
summary(aov(score~ID, data=dat))
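
# simulate the null distribution of the F statistic by repeatedly
# drawing both groups from the same population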
n.rep = 1e5
n.subj = 5
f.hist = rep(0,n.rep)
for (i.rep in 1:n.rep){
  groupA1 = rnorm(n.subj, mean = 100, sd =15)
  groupA2 = rnorm(n.subj, mean = 100, sd =15)
  groupID = c(rep("A1",n.subj),rep("A2",n.subj))
  dat = data.frame(score = c(groupA1,groupA2),ID = groupID)
  grand.mean = mean(dat$score)
  mean.A1 = mean(groupA1)
  mean.A2 = mean(groupA2)
  ss.total = sum((dat$score-grand.mean)^2)
  ss.group = n.subj*(mean.A1-grand.mean)^2+n.subj*(mean.A2-grand.mean)^2
  ss.error = sum((groupA1-mean.A1)^2)+sum((groupA2-mean.A2)^2)
  df.group = 2 - 1
  df.error = (n.subj-1)+(n.subj-1)
  ms.group = ss.group/df.group
  ms.error = ss.error/df.error
  f.hist[i.rep] = ms.group/ms.error
}

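# compare the simulated F values with the theoretical F density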
hist(f.hist,1000,xlim=c(0,15),probability = T)
f.temp = seq(0,15,0.05)
f.true = df(f.temp,df.group,df.error)
lines(f.temp,f.true,col='red',lty=2,lwd=2)
thres.f = qf(0.95,df.group,df.error)
abline(v = thres.f, col='green')
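# empirical false-positive rate; should be close to the nominal 0.05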
prop.FP = sum(f.hist > thres.f)/n.rep

データ解析基礎論B Review01

# central limit theorem
n.sample = 1e6
n.data = 10
dat = matrix(runif(n.sample*n.data), nrow = n.data)
means = colMeans(dat)
sds = apply(dat,2,sd)

hist(means,100,probability = T)
x.temp  = seq(0,1, 0.01)
true.sd = sqrt(1/12)   # sd of a Uniform(0,1) random variable
true.dens = dnorm(x.temp,mean=0.5, sd = true.sd/sqrt(n.data))
lines(x.temp, true.dens,col='red',lwd=2)
ci.low = qnorm(0.025,mean=0.5, sd = true.sd/sqrt(n.data))
ci.high = qnorm(0.975,mean=0.5, sd = true.sd/sqrt(n.data))
abline(v=ci.low,col='blue',lwd=2)
abline(v=ci.high,col='blue',lwd=2)

# standardizing data
z = (means-0.5)/(true.sd/sqrt(n.data))
hist(z,100,probability = T)
x.tempZ = seq(-4,4,0.01)
z.dens = dnorm(x.tempZ,mean=0, sd = 1)
lines(x.tempZ, z.dens,col='red',lwd=2)
ci.lowZ = qnorm(0.025,mean=0, sd = 1)
ci.highZ = qnorm(0.975,mean=0, sd = 1)
abline(v=ci.lowZ,col='blue',lwd=2)
abline(v=ci.highZ,col='blue',lwd=2)

# t distribution
t.temp = seq(-4,4,0.01)
t.dens = dt(t.temp, df=(n.data-1))
lines(t.temp, t.dens,col='green',lwd=2)
ci.lowT = qt(0.025,df=(n.data-1))
ci.highT = qt(0.975,df=(n.data-1))
abline(v=ci.lowT,col='cyan',lwd=2)
abline(v=ci.highT,col='cyan',lwd=2)


### cellular automata
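# each new cell is the sum (mod 2) of its three neighbors in the previous generation; edges wrap around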
nCell=201
nGen=100
res=matrix(0,nrow=nGen,ncol=nCell)
res[1,ceiling(nCell/2)]=1  # single live cell in the center
for (i_gen in 2:nGen) {
  for (i_cell in 2:(nCell-1)) {
    res[i_gen,i_cell]=sum(res[i_gen-1,(i_cell-1):(i_cell+1)])%%2;
  } 
  # left edge: wraps around to the right edge
  res[i_gen,1]=(res[i_gen-1,nCell]+res[i_gen-1,1]+res[i_gen-1,2])%%2
  # right edge: wraps around to the left edge
  res[i_gen,nCell]=(res[i_gen-1,(nCell-1)]+res[i_gen-1,nCell]+res[i_gen-1,1])%%2
} 
image(res)


# general version
# convert a decimal number to a binary vector, zero-padded to 'digits' bits
dec2bin<-function(num, digits=8) {
  bin=c()
  if (num==0){
    bin=0
  } else {
    while(num!=0){
      rem=num%%2
      num=num%/%2
      bin=c(rem,bin)
    }
  }
  if (length(bin) < digits){
    res=matrix(0,nrow=1,ncol=digits)
    res[(digits-length(bin)+1):digits]=bin
  } else {
    res=bin
  }
  return(res)
}

# look up the next state of a cell from its 3-cell neighborhood 'st'
# under the Wolfram rule number 'ruleID'
transFUN<-function(st,ruleID){
  output=dec2bin(ruleID,8)
  # the 8 possible neighborhoods, from 111 down to 000
  a=matrix(c(1,1,1,1,1,0,1,0,1,1,0,0,0,1,1,0,1,0,0,0,1,0,0,0),nrow=8,byrow=T)
  newSt=output[which(apply(a,1,function(x) {all(x==st)}))]
  return(newSt)
}
# run an elementary cellular automaton: nCell cells, nGen generations,
# Wolfram rule 'ruleID', with wrap-around edges
ECA<-function(nCell, nGen,ruleID){
  res=matrix(0,nrow=nGen,ncol=nCell)
  res[1,ceiling(nCell/2)]=1;
  for (i_gen in 2:nGen) {
    for (i_cell in 2:(nCell-1)) {
      res[i_gen,i_cell]=transFUN(res[i_gen-1,(i_cell-1):(i_cell+1)],ruleID)
     }
   res[i_gen,1]=transFUN(c(res[i_gen-1,nCell],res[i_gen-1,1],res[i_gen-1,2]),ruleID)
   res[i_gen,nCell]=transFUN(c(res[i_gen-1,(nCell-1)],res[i_gen-1,nCell],res[i_gen-1,1]),ruleID)
  }
  return(res)
}
res<-ECA(200,100,99)
image(res) 

データ解析基礎論B W10

install.packages("nnet")
library(nnet)

# regression
dat<-read.csv("http://www.matsuka.info/data_folder/tdkReg01.csv")
for (i.iter in 1:10){
set.seed(5)
print(i.iter)
x = dat[,1:3]
y = dat[,4]
dat.nnet = nnet(x,y, size = 150, linout= TRUE,maxit = 1000)
nnet.pred <-predict(dat.nnet,dat)
print(c(cor(nnet.pred,dat$sales),cor(nnet.pred,dat$sales)^2))
plot(dat.nnet$fitted.values, dat$sales,pch=20,col='black')
points(predict(dat.lm), dat$sales,pch=20,col='red')
abline(h=max(dat$sales),col='green')
abline(v=max(dat$sales),col='green')
abline(h=min(dat$sales),col='green')
abline(v=min(dat$sales),col='green')
abline(h=mean(dat$sales),col='green')
abline(v=mean(dat$sales),col='green')


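# cross-validation: repeated random 60/40 train/test splits, comparing nnet and lm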
n.data = nrow(dat);n.sample = n.data*0.6; n.rep = 1000 
trainNN.cor = rep(0,n.rep); trainLM.cor = rep(0,n.rep)
testNN.cor = rep(0,n.rep); testLM.cor = rep(0,n.rep)
for (i.rep in 1:n.rep){  
  randperm = sample(1:n.data)  
  train.idx = randperm[1:n.sample]  
  test.idx = randperm[(n.sample+1):n.data]  
  dat.nnet <- nnet(sales~.,size = 10, linout=T,decay= 0.1, maxit=1000, data = dat[train.idx,])  
  dat.lm <-lm(sales~.,data=dat[train.idx, ])  
  trainNN.cor[i.rep] <- cor(predict(dat.nnet,dat[train.idx, ]), dat[train.idx,]$sales)  
  trainLM.cor[i.rep] <- cor(predict(dat.lm,dat[train.idx, ]), dat[train.idx,]$sales)  
  testNN.cor[i.rep] <- cor(predict(dat.nnet,dat[test.idx, ]), dat[test.idx,]$sales)  
  testLM.cor[i.rep] <- cor(predict(dat.lm,dat[test.idx, ]), dat[test.idx,]$sales)
}
print(c(mean(trainNN.cor,na.rm=T),mean(testNN.cor,na.rm=T)))
print(c(mean(trainLM.cor,na.rm=T),mean(testLM.cor,na.rm=T)))

# logistic regression
dat<-read.csv("http://www.matsuka.info/data_folder/tdkDA01.csv")
dat.nnet<-nnet(class~.,dat,size=30,maxit=1000,decay=0.05)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat$class)

dat.glm<-glm(class~., family="binomial",dat)
glm.pred<-predict(dat.glm, dat, type="response")>0.5
table(glm.pred,dat$class)
dat.glm<-glm(class~social*coop.*dilig.*enterp., family="binomial",dat)


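# survival data: note the strong class imbalance handled below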
dat<-read.csv("http://www.matsuka.info/data_folder/cda7-16.csv")
dat.nnet<-nnet(survival~., dat, size=30, maxit=1000, decay=0.01)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat$survival)

Ns = table(dat$survival)   # class frequencies
Ns[2]/Ns[1]                # imbalance ratio, used as the case weight below

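# reweight the minority class ("no") so that both classes contribute comparably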
wts = rep(1,nrow(dat))
wts[which(dat$survival=="no")]=45
dat.nnet<-nnet(survival~., weights=wts, dat, size=30, maxit=1000, decay=0.01)
dat.pred<-predict(dat.nnet,dat,type="class")
table(dat.pred,dat$survival)

# discriminant analysis
dat<-read.csv("http://www.matsuka.info/data_folder/tdkDA02.csv")
class.id<-class.ind(dat$class)
x = dat[,1:6]
dat.nnet<-nnet(x,class.id,size=30, maxit=1000, decay=0.01, softmax=TRUE)
max.id = apply(dat.nnet$fitted.values,1,which.max)
table(max.id,dat$class)


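# autoassociator: the network reproduces its input through 2 hidden units (a nonlinear analogue of PCA)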
dat<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.nnet<-nnet(dat,dat,size=2, maxit=1000, decay=0.01, linout=TRUE)

データ解析基礎論B W08

dat<-data.frame(x1=c(50,69,93,76,88,43,56,38,21,25),
                x2=c(15.5,18.4,26.4,22.9,18.6,16.9,21.6,12.2,16.0,10.5), 
                cl=c(rep("h",5),rep("d",5)))
library(MASS)
dat.lda<-lda(cl~.,data=dat)
# (plotting call truncated in the original notes; only "... 2, col='skyblue')" survives)

dat<-read.csv("http://matsuka.info/data_folder/tdkDA01.csv", header=T)
dat.lda<-lda(class~.,dat)


dat<-read.csv("http://matsuka.info/data_folder/tdkDA02.csv",header=T)
dat.lda=lda(class~.,data=dat)
lda.pred<-predict(dat.lda,dat)

plot(dat.lda, dimen=2, col=as.numeric(dat$class), cex=3)
plot(dat.lda, dimen=5, col=as.numeric(lda.pred$class),cex=2)


dat.km<-kmeans(dat[,1:6],5)

dat<-read.csv("http://www.matsuka.info/data_folder/tdkCFA.csv")
dat1<-subset(dat,dat$popularity<5)
dat2<-subset(dat,dat$popularity>4 & dat$popularity<6)
dat3<-subset(dat,dat$popularity>6)
datT=rbind(dat1,dat2,dat3)
datT.lda<-lda(popularity~.,datT)
datT.pred<-predict(datT.lda,datT)
table(datT.pred$class,datT$popularity)   

plot(datT.lda,col=as.numeric(datT.pred$class)+2,cex=1)
colors=rep("black",nrow(datT))
miss.idx = which(datT.pred$class!= datT$popularity)
colors[miss.idx]='red'
points(datT.pred$x,pch=20,col=colors)
legend("bottomright",c("correct pred.", "missed"), pch=20,col=(1:2))


dat<-read.csv("http://matsuka.info/data_folder/tdkDA01.csv", header=T)
dat.lda<-lda(class~.,dat)
dat.glm<-glm(class~.,family=binomial,data=dat)

#MDS
dat<-data.frame(p1=c(4,1,5,1,5),p2=c(1,5,4,3,1))
rownames(dat)<-c('a','b','c','d','e')
dat.mds<-cmdscale(dist(dat),2)
plot(dat.mds[,1],dat.mds[,2], type='n')
text(dat.mds[,1],dat.mds[,2],labels=row.names(dat))
dat<-read.csv("http://matsuka.info/data_folder/tdkMDS02.csv", row.name=1)
dat.mds<-cmdscale(dist(t(dat)),2,eig=T)

plot(dat.mds$points[,1],dat.mds$points[,2], type='n')
text(dat.mds$points[,1],dat.mds$points[,2],labels=colnames(dat), cex=2)

dat<-read.csv("http://www.matsuka.info/data_folder/tdkCFA.csv")


データ解析基礎論B WAB08: IRT and related topics
Submission deadline: before the start of the lecture on 2017.12.12

WAB08.1
This data set is from an exam (30 items in total). First, use descript from the ltm package (and related functions) to obtain descriptive statistics for each item, Cronbach's alpha, and so on.
Next, analyze the data with a one-parameter IRT model (rasch) and a two-parameter IRT model (ltm). Compare the models and explain which one is more appropriate. Finally, compute the correlation between the estimated item difficulties (Dffclt) of the better model and each item's mean.
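
A minimal sketch of these steps, assuming the responses have already been read into a data frame dat (the data link is not preserved in these notes):

library(ltm)
descript(dat)                # item statistics, Cronbach's alpha
irt1P <- rasch(dat)          # 1-parameter (Rasch) model
irt2P <- ltm(dat ~ z1)       # 2-parameter model
anova(irt1P, irt2P)          # likelihood-ratio comparison of the two models
# correlation between estimated difficulties and item means
cor(coef(irt2P)[, "Dffclt"], colMeans(dat))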

WAB08.2
Using this data set, carry out the same analysis as in WAB08.1.

データ解析基礎論B W07

# reliability
install.packages("psych")
library("psych")
dat<-read.table("http://peach.l.chiba-u.ac.jp/course_folder/survey_data01.txt")
ca<-alpha(dat)
dat<-read.csv("http://peach.l.chiba-u.ac.jp/course_folder/fa2015.csv")
caALL<-alpha(dat[,1:10])
ca1_5<-alpha(dat[,1:5])
ca6_10<-alpha(dat[,6:10])

# w/ factor analysis
F1<-factanal(dat[,1:5],1)
F2<-factanal(dat[,6:10],1)
library(sem)
# cfa() reads the model specification from the lines that follow
# (a blank line ends the specification)
fa.model=cfa(reference.indicator=FALSE)
F1: q1,q2,q3,q4,q5
F2: q6,q7,q8,q9,q10

# update() likewise reads edit commands from the following lines;
# this deletes the F1-F2 covariance
fa.model<-update(fa.model)
delete, F1<->F2

fa.result<-sem(fa.model, cov(dat), 300)  # sem(model, covariance matrix, number of observations)

# IRT models
install.packages("ltm")
library(ltm)
dat<-read.table("http://peach.l.chiba-u.ac.jp/course_folder/survey_data01.txt")
descript(dat)
irt1P<-rasch(dat)
plot(irt1P)
irt2P<-ltm(dat~z1)
plot(irt2P)

item.fit(irt1P)
item.fit(irt2P)
anova(irt1P,irt2P)

# cluster analysis
cldata<-data.frame(var1=c(4,1,5,1,5), var2=c(1,5,4,3,1))
cldata.cluster=hclust(dist(cldata),method="average")
plot(cldata.cluster,cex=2)
cutree(cldata.cluster,2)

dat<-read.csv("http://matsuka.info/data_folder/tdkClust.csv", header=TRUE, row.names=1)
dat.cluster=hclust(dist(dat),method="average")
plot(dat.cluster,cex=1.5)
dat.pca=princomp(dat)
biplot(dat.pca)

dat.kmeans=kmeans(dat, centers=3)
plot(dat,col=dat.kmeans$cluster,pch=20,cex=1.75)


データ解析基礎論B WAB07: Factor Analysis and Principal Component Analysis
Submission deadline: before the start of the lecture on 2017.12.05

WAB07.1
This data set contains responses to a questionnaire (10 items in total). The questionnaire was designed so that the first five items measure a factor F1 and the remaining five items measure another factor F2. Run an exploratory factor analysis (using factanal) to verify whether this design assumption is validly reflected in the questionnaire. Visualize the data before the analysis.
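
A possible sketch, assuming the responses are in a data frame dat with the items in columns 1-10 (the data link is not preserved in these notes):

plot(dat, pch = 20)                    # visualize the data first
dat.fa <- factanal(dat, factors = 2)
print(dat.fa$loadings, cutoff = 0.3)   # do items 1-5 and 6-10 load on separate factors?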

WAB07.2
Run a principal component analysis on the data above, and use the PCA results to comment on the design assumption described above.
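
A sketch of the PCA step, under the same assumption about dat:

dat.pca <- princomp(dat)
summary(dat.pca)
biplot(dat.pca)   # check whether items 1-5 and 6-10 separate along the leading components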

WAB07.3
Verify the validity of the design assumption above with a confirmatory factor analysis (using sem).
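
A sketch using the sem package, assuming the items are named q1-q10 as in the W07 notes; cfa() reads the model from the lines that follow:

library(sem)
fa.model = cfa(reference.indicator = FALSE)
F1: q1, q2, q3, q4, q5
F2: q6, q7, q8, q9, q10

fa.result <- sem(fa.model, cov(dat), nrow(dat))
summary(fa.result)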

データ解析基礎論B W06

# PCA
dat<-read.csv("http://peach.l.chiba-u.ac.jp/course_folder/AMSA_track_mens.csv",row.names=1)
plot(dat,pch=20)
dat.pca<-princomp(dat)
summary(dat.pca)
screeplot(dat.pca,type='l',lwd=2,pch=20,cex=4,col='red')
# average-eigenvalue criterion: retain components whose variance exceeds the mean
abline(h=mean(dat.pca$sdev^2),lty=2,lwd=2)

# factor analysis
dat<-read.csv("http://peach.l.chiba-u.ac.jp/course_folder/FA01.csv")
dat.fa<-factanal(dat,1)
dat.fa$uniquenesses
1-dat.fa$loadings[1:3]^2

dat<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt")
dat.pca<-princomp(dat)

dat.fa<-factanal(dat,1,scores="regression")
plot(dat.fa$scores~dat.pca$scores[,1],pch=20,cex=2,xlab="Component Score", ylab="Factor Score")
cor(dat.fa$scores,dat.pca$scores)
cor(rowSums(dat),dat.fa$scores)

# plotting results
dat<-read.csv("http://www.matsuka.info/data_folder/tdkCFA.csv")
dat.fa<-factanal(dat,2,scores="regression")
plot(dat.fa$loadings[,1],dat.fa$loadings[,2],pch=20,cex=2,col="red",
     xlab="Factor 1", ylab="Factor 2")
abline(h=0,v=0)
n.var = length(dat.fa$loadings[,1])
for (i.var  in 1:n.var){
  lines(c(0,dat.fa$loadings[i.var,1]),c(0,dat.fa$loadings[i.var,2]),
        lty=2,lwd=2,col='skyblue')
}
text(dat.fa$loadings[,1],dat.fa$loadings[,2],colnames(dat),cex=1.4)

# model comparison
dat.model1<-factanal(dat,1)
dat.model2<-factanal(dat,2)
dat.model3<-factanal(dat,3)
dat.model4<-factanal(dat,4)
print(m1res<-c(dat.model1$dof,dat.model1$PVAL))
print(m2res<-c(dat.model2$dof,dat.model2$PVAL))
print(m3res<-c(dat.model3$dof,dat.model3$PVAL))
print(m4res<-c(dat.model4$dof,dat.model4$PVAL))
1-pchisq(dat.model3$STATISTIC-dat.model4$STATISTIC,dat.model3$dof-dat.model4$dof)
dat.model2$STATISTIC
dat.model2$dof

# CFA with SEM
install.packages('sem')
library(sem)

# cfa() reads the model specification from the lines that follow
model01=cfa(reference.indicator=FALSE)
F1: extrovert, cheerful, leadership, antisocial, talkative, motivated, hesitance, popularity

mod1<-sem(model01, cov(dat), 100)

model02=cfa(reference.indicator=FALSE)
F1: extrovert, leadership, motivated, hesitance 
F2: cheerful, antisocial, talkative, popularity

mod2<-sem(model02, cov(dat), 100)

opt <- options(fit.indices = c("RMSEA"))

データ解析基礎論B W04

# Split Plot Factorial (SPF)
source("http://peach.l.chiba-u.ac.jp/course_folder/tsme.txt")
dat<-read.csv("http://www.matsuka.info/data_folder/dktb3211.txt")
dat.aov<-aov(result~method*duration+Error(s+s:duration),dat)

TSME<-SPF.tsme(dat.aov,dat,"result")
dat.aov.sum<-summary(dat.aov)

# within-subjects factor
DF_error.W = dat.aov.sum[[2]][[1]][3,1]
MS_error.W = dat.aov.sum[[2]][[1]][3,3]

q <- qtukey(0.95,4,DF_error.W)
hsd<-q*sqrt(MS_error.W/(5*2))
dat.means.duration<-tapply(dat$result,dat$duration,mean)
abs(outer(dat.means.duration,dat.means.duration,'-'))>hsd

# between-subjects factor
DF_error.B = dat.aov.sum[[1]][[1]][2,1]
MS_error.B = dat.aov.sum[[1]][[1]][2,3]

q <- qtukey(0.95,2,DF_error.B)
hsd<-q*sqrt(MS_error.B/(5*4))

dat.means.method<-tapply(dat$result,dat$method,mean)
abs(outer(dat.means.method,dat.means.method,'-'))>hsd

# randomized block factorial
dat<-read.csv("http://www.matsuka.info/data_folder/dktb3218.txt")
dat.aov<-aov(result~method+duration+method:duration +
  Error(s+method:s+duration:s+method:duration:s), data=dat)
TSME<-RBF.tsme(dat.aov, dat, "result")

dat.aov.sum<-summary(dat.aov)
# testing for method
mse  = dat.aov.sum[[2]][[1]][2,3]
qv=qtukey(0.95,2,4)
hsd=qv*sqrt(mse/(5*4))
dat.means.method=tapply(dat$result,dat$method,mean)
abs(outer(dat.means.method,dat.means.method,'-')) > hsd

# testing for duration
mse  = dat.aov.sum[[3]][[1]][2,3]
qv=qtukey(0.95,4,12)
hsd=qv*sqrt(mse/(5*2))
dat.means.duration=tapply(dat$result,dat$duration,mean)
abs(outer(dat.means.duration,dat.means.duration,'-')) > hsd