データ解析基礎論A Week05 t検定

dat<-read.csv("http://www.matsuka.info/data_folder/datWA01.txt")
mean.M <-mean(dat$h[dat$gender=="M"])
sigma = 10
n.M = length(dat$h[dat$gender=="M"])
z.value=(mean.M-171)/(sqrt(sigma/n.M))
(1-pnorm(abs(z.value)))*2

ssize = c(24,25,26,23.5,25,27,24,22,27.5,28)
ssize.mean = mean(ssize)
ssize.var = var(ssize)
N = 10
t.value=(ssize.mean-24)/(sqrt(ssize.var/N))
(1-pt(abs(t.value),df=9))*2

h.mean.M <-mean(dat$h[dat$gender=="M"])
h.var.M <- var(dat$h[dat$gender=="M"])
n.M = length(dat$h[dat$gender=="M"])
t.value=(h.mean.M-171)/(sqrt(h.var.M/n.M))
(1-pt(abs(t.value),df = (n.M-1)))*2

# RCMD
A=c(12,19,10,10,14,18,15,11,16)
B=c(15,20,16,14,17,16,12,12,19)
d=A-B
tValue<-mean(d)/sqrt(var(d)/length(d))
(1-pt(abs(tValue), df=8))*2

t.test(d,mu=0)


X1=c(78,70,66,76,78,76,88,76)
X2=c(76,72,60,72,70,72,84,70)
t.value=(mean(X1)-mean(X2))/sqrt((var(X1)+var(X2))/8)
2*(1-pt(abs(t.value),14))

データ解析基礎論a Week04

nSample=10;nRep=10^5;
CLT.unif <- function(nSample, nRep) {
  x=matrix(runif(nSample*nRep),nrow=nSample);
  x.means<-colMeans(x)
  hist(x.means,50,main='Distribution of Means of Uniform Distribution', 
    xlab='Means', probability=T)
  x.means.dens<-density(x.means)
  lines(x.means.dens,lwd=3,lty=1,col='red')
  x2=seq(0,1,0.001);CLT=dnorm(x2,mean=0.5,sd=(sqrt(1/12))/(sqrt(nSample)))
  lines(x2,CLT,lwd=3,lty=3,col='cyan')
  legend("topright",c("Density of Actual Means","Normal Distribution"), 
   lty=c(1,3), col=c('red','cyan'),lwd=3)
}

> CLT.unif(10,100000)

データ解析基礎論A Week03

dat<-read.table("http://www.matsuka.info/data_folder/aov01.txt",header=T)
head(dat)
summary(dat)
boxplot(dat$shoesize, col = "skyblue", main = "Dist. of Shoesize",
 ylab = "Size of shoe (in cm)")
boxplot(dat$h, col = "coral", main = "Dist. of Height", ylab = "Height (in cm)")

dat.meter = dat[,1:2]*0.01 
dat.h.ext5 = dat$h+5
dat.ss.ext1 = dat$shoesize+1
hANDshoe = dat$h+dat$shoesize
dat.h.meter.ext5 = dat$h*0.01+0.05

データ解析基礎論A グラフの基礎

データ解析基礎論A 第2週 グラフの基礎

# descriptive statistics
dat<-read.table("http://www.matsuka.info/data_folder/aov01.txt",header=T)
summary(dat)
mean(dat$shoesize)
var(dat[,1:2])
cov(dat[,1:2])
cor(dat[,1:2])

# basics - plotting
x=seq(-3,3,0.1);y=x^2;
plot(x,y)
plot(x,y,col='red')
plot(x,y,pch=20)
plot(x,y,type='l')
plot(x,y,type='l',lty=4,lwd=3)
plot(x,y,main="THIS IS THE TITLE", xlab="Label for X-axis",ylab="Label for Y-axis")
plot(x,y,main="TITLE", xlab="X here",ylab="Y here",xlim=c(-3.5,3.5),ylim=c(-0.5, 10))
plot(x,y,col='blue',type='o',lty=2,pch=19,lwd=3,main="Y=X*X", xlab="X",ylab="X*X",
  xlim=c(-3.5,3.5),ylim=c(-0.5, 10))

# histogram
dat<-read.table("http://www.matsuka.info/data_folder/aov01.txt",header=T)
hist(dat$h,main="Histogram of Height",xlab="Height",col='blue',xlim=c(140,190))
dens<-density(dat$h);
hist(dat$h,main="Histogram of Height",xlab="Height",xlim=c(140,190),probability=T)
lines(dens,lwd=2,col='red',lty=2)

# boxplot
boxplot(dat$h,main="Boxplot of Height",ylab="Height",col='cyan',ylim=c(140,190))
boxplot(dat$h~dat$gender,main="Distribution of Height by Gender",ylab="Gender",
 xlab="Height",col=c('blue','cyan'),ylim=c(140,190),horizontal=T)
boxplot(dat$h~dat$gender+dat$affil, main="Distribution of Height by Gender and Affiliation",
  ylab="Gender x Affiliation", xlab="Height", col=c('blue','cyan','red','magenta'),
  ylim=c(140,190),horizontal=T)

# barplot
install.packages("gplots")
library(gplots)

means <- tapply(dat$h, dat$gender, mean)
sds<-tapply(dat$h,dat$gender,sd)     
ns<-tapply(dat$h,dat$gender,length) 
sems = sds/sqrt(ns) 
barplot2(means, plot.ci=T,
 ci.l = means - sems,
 ci.u = means + sems,
 ylim = c(140,180),
 names.arg = c("Female","Male"),
 xpd = F,
 ylab = "height",
 xlab = "gender")

# another barplot 
means <- tapply(dat$h,list(dat$gender,dat$affil),mean)
sds <- tapply(dat$h,list(dat$gender,dat$affil),sd)
ns <- tapply(dat$h,list(dat$gender,dat$affil),length)
sem = sds/sqrt(ns)
barplot2(means[1:4], plot.ci=T, ci.l=means[1:4]-sem[1:4],   
        ci.u=means[1:4] + sem[1:4], ylim=c(150,175), 
    names.arg=c("Female,CS","Male,CS","Female,PSY","Male,PSY"),   
        xpd=F,ylab="height",xlab="gender & affiliation")

# histogram again
par(mfrow=c(1,2))
hist(dat[dat$gender=='F',]$h,main="Dist. of Height for Female Participants",
 xlab="Height",xlim=c(140,190),probability=T)
dens.F=density(dat[dat$gender=='F',]$h);lines(dens.F,col='blue',lwd=2)
hist(dat[dat$gender=='M',]$h,main="Dist. of Height for Male Participants",
 xlab="Height",xlim=c(140,190),probability=T,ylim=c(0,0.08))
dens.M=density(dat[dat$gender=='M',]$h);lines(dens.M,col='green',lwd=2)

par(mfrow=c(2,1))

par(mfrow=c(1,1))
plot(dens.F,col='blue',lwd=2,main="Dist. of Height by gender",xlab='Height',
  ylab='density',xlim=c(140,190))
lines(dens.M,col='green',lwd=2)
legend("topleft", c('Female','Male'),col=c('blue','green'),cex=1.5,lwd=2)

# inserting text
text(157.5,0.04,'Female',col='blue',cex=2)
text(170,0.04,'Male',col='green',cex=2)

# scatterplot
plot(dat$shoesize,dat$h,main="Relationship b/w shoesize and height",xlab='shoe size', 
  ylab='height',pch=19,col='red')
text(22,175,paste("r =",substr(cor(dat$shoesize,dat$h),1,5)),cex=1.5)
abline(h=mean(dat$h),col='blue');
abline(v=mean(dat$shoesize),col='green');
text(21.5,165,'mean height',col='blue')
text(25.7,145,'mean shoesize',col='green')
abline(lm(dat$h~dat$shoesize),lty=2,lwd=2)

plot(dat[dat$gender=='F',]$shoesize,dat[dat$gender=='F',]$h,
 main="Relationship b/w shoesize and height",xlab='shoesize',ylab='height',
 cex.lab=1.5,pch=19,col='blue',xlim=c(20,29),ylim=c(140,190))
lines(dat[dat$gender=='M',]$shoesize,dat[dat$gender=='M',]$h,type='p',pch=15,col='green')
legend("topleft", c('Female','Male'), pch =c(19,15),col=c('blue','green'),cex=1.5)
dat.reg<-read.csv("http://www.matsuka.info/data_folder/tdkReg01.csv", header=T)
plot(dat.reg,pch=20,col=c('blue'))
dat.pca<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt",header=T)

# intro to central limit theorem
ckCLT=function(n_rep,n_sample){
  dat<-matrix(rnorm(n_rep*n_sample),nrow=n_rep,ncol=n_sample);
  means<-rowMeans(dat)}
n_rep=10^6
n5=ckCLT(n_rep,5)
hist(n5,main="Dist. of sample meanx",xlab="sample mean",xlim=c(-3,3),probability=T)
den5=density(n5);lines(den5,col='blue',lwd=2)

n10=ckCLT(n_rep,10)
n25=ckCLT(n_rep,25)
n100=ckCLT(n_rep,100)
plot(den5,col='blue',lwd=2,,main="Dist. of sample meanx",xlab="sample mean",
  xlim=c(-2,2),ylim=c(0,4))
den10=density(n10);lines(den10,col='red',lwd=2)
den25=density(n25);lines(den25,col='black',lwd=2)
den100=density(n100);lines(den100,col='green',lwd=2)
legend("topleft", c('N=5','N=10','N=25','N=100'),col=c('blue','red','black','green'),
  cex=1.5,lwd=2)

データ解析基礎論a WAA01

データ解析基礎論A Weekly Assignment WAA01
提出期限:2017.04.18 4限開始まで
提出物: Rのコマンドとその出力

WAA01.1
下記のデータをdata.frameのコマンドを用いてRに入力してください。

> dat
 id gender score
  1      M    73
  2      M    64
  3      M    78
  4      F    74
  5      F    84
  6      F    78 

WAA01.2
gender == “M” や gender == “F” などを用いて、上記のデータ男性と女性のscoreの平均値を求めてください。

WAA01.3
このファイルをダウンロードして、scoreが100の男性と女性の数を調べてください。

解答例
WAA01.1
例1
dat<-data.frame(id = 1:6, gender = c(rep("M",3),rep("F",3)), 
  score = c(73,64,78,74,84,78))
例2
id = 1:6
gender = c(rep("M",3),rep("F",3))
score = c(73,64,78,74,84,78)
dat2 <- data.frame(id = id, gender = gender, score = score)

WAA01.2
例1
mean(dat$score[which(dat$gender=="M")])
mean(dat$score[which(dat$gender=="F")])
例2
dat.male = dat[which(dat$gender=="M"),]
mean(dat.male$score)
dat.female = dat[which(dat$gender=="F"),]
mean(dat.female$score)

WAA01.3
例1
dat3 <- read.csv("http://peach.l.chiba-u.ac.jp/course_folder/waa01.csv")
head(dat3)
which(dat3[which(dat3$gender == "M"),]$score == 100)
which(dat3[which(dat3$gender == "F"),]$score == 100)
例2
dat3.male = dat3[which(dat3$gender == "M"), ]
dat3.female = dat3[which(dat3$gender == "F"), ]
dat3.male$id[which(dat3.male$score == 100)]
dat3.female$id[which(dat3.female$score == 100)]

データ解析基礎論a 2017 W01

データ解析基礎論a 2017 W01で使用するコードなど

data01<-data.frame(score = c(2,4,3,4),     
                   dose = c(rep(10,2),rep(100,2)),  
                   condition = rep(c('exp','control'),2))

dat01<-read.csv("http://www.matsuka.info/data_folder/temp_data01.txt",
                header=T)

dat02<-read.csv("http://www.matsuka.info/data_folder/temp_data02.txt", 
                header=T, row.name=1)

dat03<-read.table("http://www.matsuka.info/data_folder/temp_data03.txt",
                  header=T, row.name=4)

dat04<-data.frame(score=c(dat03$x,dat03$y,dat03$z),   
                  names=rep(c('sazae','masuo','tarachan'),3),
                  condition=sort(rep(c("x","y","z"),3)))

dat<-read.csv("http://www.matsuka.info/data_folder/datWA01.txt",   
              header=T);

mean(dat$shoesize[dat$gender == "M"])
mean(dat$shoesize[dat$gender == "F"])
mean(dat$shoesize[dat$h > 180])

単純主効果の検定など

SPF.tsmeとRBF.tsmeを更新しました。
使用方法は:
SPF.tsme(分散分析の結果、データ名、従属変数名)
RBF.tsme(分散分析の結果、データ名、従属変数名)
具体例は以下の通りです。
なお、これらを使用する前に
source(“http://peach.l.chiba-u.ac.jp/course_folder/tsme.txt”)
を必ず実行してください。

source("http://peach.l.chiba-u.ac.jp/course_folder/tsme.txt")

# SPFの例
dat<-read.csv("http://www.matsuka.info/data_folder/dktb3211.txt")
dat.aov<-aov(result~method*duration+Error(s+s:duration),dat)
tsme <- SPF.tsme(dat.aov, dat, "result")
simple main effect test for BETWEEEN subject factor 
           ss df   ms       f         p
1hr       2.5  1  2.5  1.0417 0.3150887
2hr       0.4  1  0.4  0.1667 0.6858104
3hr      12.1  1 12.1  5.0417 0.0317831
4hr      32.4  1 32.4 13.5000 0.0008662
residual 76.8 32  2.4                  

 Tukey HSD test - between subject factor @ duration = 3hr 
      D     E
D FALSE  TRUE
E  TRUE FALSE

 Tukey HSD test - between subject factor @ duration = 4hr 
      D     E
D FALSE  TRUE
E  TRUE FALSE

 simple main effect test for WITHIN subject factor 
           ss df      ms      f         p
D        49.2  3 16.4000 12.990 3.047e-05
E         1.0  3  0.3333  0.264 8.506e-01
residual 30.3 24  1.2625                 

 Tukey HSD test - within subject factor @ method = D 
      1hr   2hr   3hr   4hr
1hr FALSE FALSE  TRUE  TRUE
2hr FALSE FALSE FALSE  TRUE
3hr  TRUE FALSE FALSE FALSE
4hr  TRUE  TRUE FALSE FALSE

# RBFの例
dat<-read.csv("http://www.matsuka.info/data_folder/dktb3218.txt")
dat.aov<-aov(result~method*duration+Error(s+s:duration+s:method),dat)
tsme <- RBF.tsme(dat.aov, dat, "result")
simple main effect test for the 1st within-subject factor 
           ss df   ms      f         p
1hr       2.5  1  2.5  1.563 0.2292743
2hr       0.4  1  0.4  0.250 0.6238816
3hr      12.1  1 12.1  7.562 0.0142341
4hr      32.4  1 32.4 20.250 0.0003635
residual 25.6 16  1.6                 

 Tukey HSD test - 1st within-subject factor @ duration = 3hr 
      D     E
D FALSE  TRUE
E  TRUE FALSE

 Tukey HSD test - 1st within-subject factor @ duration = 4hr 
      D     E
D FALSE  TRUE
E  TRUE FALSE

 simple main effect test for the 2nd within-subject factor 
           ss df      ms      f         p
D        49.2  3 16.4000 12.990 3.047e-05
E         1.0  3  0.3333  0.264 8.506e-01
residual 30.3 24  1.2625                 

 Tukey HSD test - 2nd within-subject factor @ method = D 
      1hr   2hr   3hr   4hr
1hr FALSE FALSE  TRUE  TRUE
2hr FALSE FALSE FALSE  TRUE
3hr  TRUE FALSE FALSE FALSE
4hr  TRUE  TRUE FALSE FALSE

データ解析基礎論 分散分析の例

# 可視化
interaction.plot(dat$gender,
  dat$affil,
  dat$shoesize, 
  pch=c(20,20), 
  col=c("skyblue","orange"), 
  xlab="gender", ylab="shoesize", 
  lwd=3,type='b',cex=2,
  trace.label="Affiliation")

# 可視化2(非効率)
means<-tapply(dat$shoesize,list(dat$gender, dat$affil),mean)
Ns<-tapply(dat$shoesize,list(dat$gender, dat$affil),length)
sds<-tapply(dat$shoesize,list(dat$gender, dat$affil),sd)
sems<-sds/sqrt(Ns)

plot(c(0,1),means[,1],type='o',col='skyblue',
 ylim=c(min(means)*0.975,max(means)*1.025),
  xlim=c(-0.1,1.1),lwd=2,cex=2,pch=20,xlab="gender",ylab="shoesize")
lines(c(0,1),means[,2],type='o',col='orange',lwd=2,cex=2,pch=20)
legend("topleft",c("CogSci","PsySci"),col=c("skyblue","orange"),lwd=2)
lines(c(0,0),c(means[1,1]-sems[1,1],means[1,1]+sems[1,1]),col="skyblue",lwd=2.5)
lines(c(1,1),c(means[2,1]-sems[2,1],means[2,1]+sems[2,1]),col="skyblue",lwd=2.5)
lines(c(0,0),c(means[1,2]-sems[1,2],means[1,2]+sems[1,2]),col="orange",lwd=2.5)
lines(c(1,1),c(means[2,2]-sems[2,2],means[2,2]+sems[2,2]),col="orange",lwd=2.5)

# 分散分析
dat.aov=aov(shoesize~gender*affil, data=dat)
dat.aov.sum=summary(dat.aov)

# 単純主効果の検定
#  各所属講座における性別の効果
means<-tapply(dat$shoesize, list(dat$gender,dat$affil), mean)
SS_gen_CS<- 5*(means[2,1]^2 + means[1,1]^2 -0.5*sum(means[,1])^2) # SS_gender CS
SS_gen_PS<- 5*(means[2,2]^2 + means[1,2]^2 -0.5*sum(means[,2])^2) # SS_gender PS
dat.aov.sum=summary(dat.aov)   # ANOVA table
MSe=dat.aov.sum[[1]][4,3]      # MSE from ANOVA table or MSe=0.62
dfE=dat.aov.sum[[1]][4,1]      # DF for error or dfE=16
dfG=1                          # DF for gender
F_gen_CS=(SS_gen_CS/dfG)/MSe   # F-value for gender effect given CS
F_gen_PS=(SS_gen_PS/dfG)/MSe   # F-value for gender effect given PS
P_gen_CS=1-pf(F_gen_CS,1,dfE)  # p-value for gender effect given CS
P_gen_PS=1-pf(F_gen_PS,1,dfE)  # p-value for gender effect given PS

#  各性別における所属講座の効果
SS_affil_F<- 5*(means[1,1]^2+means[1,2]^2-0.5*sum(means[1,])^2) #SS_affil | F
SS_affil_M<- 5*(means[2,1]^2+means[2,2]^2-0.5*sum(means[2,])^2) #SS_affil | M
dfA=1				          # DF for affil
F_affil_F=SS_affil_F/dfA/MSe         # F-value for affiliation effect | F
F_affil_M=SS_affil_M/dfA/MSe         # F-value for affiliation effect | M
P_affil_F=1-pf(F_affil_F,1,dfE)      # p-value for affiliation effect | F
P_affil_M=1-pf(F_affil_M,1,dfE)      # p-value for affiliation effect | M

# 例2
interaction.plot(dat$duration,dat$method,dat$result, 
  pch=c(20,20), col=c("skyblue","orange"), ylab="score", 
  xlab="Duration",lwd=3,type='b',cex=2,trace.label="Method")
mod1=aov(result~method+duration,data=dat)
mod1.sum=print(summary(mod1))
mod2=aov(result~method*duration,data=dat)
mod2.sum=print(summary(mod2))
means<-tapply(dat$result,list(dat$method,dat$duration),mean)
ssM_1=5*(sum(means[,1]^2)-0.5*(sum(means[,1])^2))
ssM_2=5*(sum(means[,2]^2)-0.5*(sum(means[,2])^2))
ssM_3=5*(sum(means[,3]^2)-0.5*(sum(means[,3])^2))
ssM_4=5*(sum(means[,4]^2)-0.5*(sum(means[,4])^2))
MSe=mod2.sum[[1]][4,3]
DFe=mod2.sum[[1]][4,1]
DFm=1
fM_1=(ssM_1/DFm)/MSe
1-pf(fM_1,DFm,DFe)
fM_2=(ssM_2/DFm)/MSe
1-pf(fM_2,DFm,DFe)
fM_3=(ssM_3/DFm)/MSe
1-pf(fM_3,DFm,DFe)
fM_4=(ssM_4/DFm)/MSe
1-pf(fM_4,DFm,DFe)
ssD_X=5*(sum(means[1,]^2)-1/4*(sum(means[1,])^2))
ssD_Y=5*(sum(means[2,]^2)-1/4*(sum(means[2,])^2))
DFd=3
fD_X=(ssD_X/DFd)/MSe
fD_Y=(ssD_Y/DFd)/MSe
1-pf(fD_X,DFd,DFe)
1-pf(fD_X,DFd,DFe)
qv=qtukey(0.95,DFd+1,DFe)
hsd=qv*(sqrt(MSe/5))
print(diffM<-outer(means[1,],means[1,],"-"))           
abs(diffM)>hsd      

CLT simulation

中央極限定理の実験

source("http://peach.l.chiba-u.ac.jp/course_folder/ckCLT.txt")

# r command
genR<-function(n_rep,n_sample,prob,distID){
switch(distID,
  binom=rbinom(n_sample*n_rep,n_sample,prob),
  normal=rnorm(n_rep*n_sample),
  uniform=runif(n_rep*n_sample))
}
ckCLT=function(n_rep,n_sample,prob,Distr){
  vecR<-genR(n_rep,n_sample,prob,Distr)
  dat<-matrix(vecR,nrow=n_rep,ncol=n_sample);
  means<-rowMeans(dat)
  par(mfrow=c(2,1))
  hist(vecR,main="Dist. of the original data set")
  hist(means,main="Dist. of sample meanx",xlab="sample mean",probability=T)
  if (Distr=="binom"){
    denS=density(means,bw=0.125)
  } else {denS=density(means)}
  lines(denS,col='blue',lwd=2)
 return(means);
}