Posted on May 16, 2017 by matsukat

データ解析基礎論A Week05 t検定

dat<-read.csv("http://www.matsuka.info/data_folder/datWA01.txt")
mean.M <-mean(dat$h[dat$gender=="M"])
sigma = 10
n.M = length(dat$h[dat$gender=="M"])
z.value=(mean.M-171)/(sqrt(sigma/n.M))
(1-pnorm(abs(z.value)))*2

ssize = c(24,25,26,23.5,25,27,24,22,27.5,28)
ssize.mean = mean(ssize)
ssize.var = var(ssize)
N = 10
t.value=(ssize.mean-24)/(sqrt(ssize.var/N))
(1-pt(abs(t.value),df=9))*2

h.mean.M <-mean(dat$h[dat$gender=="M"])
h.var.M <- var(dat$h[dat$gender=="M"])
n.M = length(dat$h[dat$gender=="M"])
t.value=(h.mean.M-171)/(sqrt(h.var.M/n.M))
(1-pt(abs(t.value),df = (n.M-1)))*2

# RCMD
A=c(12,19,10,10,14,18,15,11,16)
B=c(15,20,16,14,17,16,12,12,19)
d=A-B
tValue<-mean(d)/sqrt(var(d)/length(d))
(1-pt(abs(tValue), df=8))*2

t.test(d,mu=0)


X1=c(78,70,66,76,78,76,88,76)
X2=c(76,72,60,72,70,72,84,70)
t.value=(mean(X1)-mean(X2))/sqrt((var(X1)+var(X2))/8)
2*(1-pt(abs(t.value),14))

Posted on May 8, 2017 by matsukat

データ解析基礎論a Week04

nSample=10;nRep=10^5;
CLT.unif <- function(nSample, nRep) {
  x=matrix(runif(nSample*nRep),nrow=nSample);
  x.means<-colMeans(x)
  hist(x.means,50,main='Distribution of Means of Uniform Distribution', 
    xlab='Means', probability=T)
  x.means.dens<-density(x.means)
  lines(x.means.dens,lwd=3,lty=1,col='red')
  x2=seq(0,1,0.001);CLT=dnorm(x2,mean=0.5,sd=(sqrt(1/12))/(sqrt(nSample)))
  lines(x2,CLT,lwd=3,lty=3,col='cyan')
  legend("topright",c("Density of Actual Means","Normal Distribution"), 
   lty=c(1,3), col=c('red','cyan'),lwd=3)
}

> CLT.unif(10,100000)

Posted on April 25, 2017 by matsukat

データ解析基礎論A Week03

dat<-read.table("http://www.matsuka.info/data_folder/aov01.txt",header=T)
head(dat)
summary(dat)
boxplot(dat$shoesize, col = "skyblue", main = "Dist. of Shoesize",
 ylab = "Size of shoe (in cm)")
boxplot(dat$h, col = "coral", main = "Dist. of Height", ylab = "Height (in cm)")

dat.meter = dat[,1:2]*0.01 
dat.h.ext5 = dat$h+5
dat.ss.ext1 = dat$shoesize+1
hANDshoe = dat$h+dat$shoesize
dat.h.meter.ext5 = dat$h*0.01+0.05

Posted on April 17, 2017 by matsukat

データ解析基礎論A グラフの基礎

データ解析基礎論A 第2週　グラフの基礎

# descriptive statistics
dat<-read.table("http://www.matsuka.info/data_folder/aov01.txt",header=T)
summary(dat)
mean(dat$shoesize)
var(dat[,1:2])
cov(dat[,1:2])
cor(dat[,1:2])

# basics - plotting
x=seq(-3,3,0.1);y=x^2;
plot(x,y)
plot(x,y,col='red')
plot(x,y,pch=20)
plot(x,y,type='l')
plot(x,y,type='l',lty=4,lwd=3)
plot(x,y,main="THIS IS THE TITLE", xlab="Label for X-axis",ylab="Label for Y-axis")
plot(x,y,main="TITLE", xlab="X here",ylab="Y here",xlim=c(-3.5,3.5),ylim=c(-0.5, 10))
plot(x,y,col='blue',type='o',lty=2,pch=19,lwd=3,main="Y=X*X", xlab="X",ylab="X*X",
  xlim=c(-3.5,3.5),ylim=c(-0.5, 10))

# histogram
dat<-read.table("http://www.matsuka.info/data_folder/aov01.txt",header=T)
hist(dat$h,main="Histogram of Height",xlab="Height",col='blue',xlim=c(140,190))
dens<-density(dat$h);
hist(dat$h,main="Histogram of Height",xlab="Height",xlim=c(140,190),probability=T)
lines(dens,lwd=2,col='red',lty=2)

# boxplot
boxplot(dat$h,main="Boxplot of Height",ylab="Height",col='cyan',ylim=c(140,190))
boxplot(dat$h~dat$gender,main="Distribution of Height by Gender",ylab="Gender",
 xlab="Height",col=c('blue','cyan'),ylim=c(140,190),horizontal=T)
boxplot(dat$h~dat$gender+dat$affil, main="Distribution of Height by Gender and Affiliation",
  ylab="Gender x Affiliation", xlab="Height", col=c('blue','cyan','red','magenta'),
  ylim=c(140,190),horizontal=T)

# barplot
install.packages("gplots")
library(gplots)

means <- tapply(dat$h, dat$gender, mean)
sds<-tapply(dat$h,dat$gender,sd)     
ns<-tapply(dat$h,dat$gender,length) 
sems = sds/sqrt(ns) 
barplot2(means, plot.ci=T,
 ci.l = means - sems,
 ci.u = means + sems,
 ylim = c(140,180),
 names.arg = c("Female","Male"),
 xpd = F,
 ylab = "height",
 xlab = "gender")

# another barplot 
means <- tapply(dat$h,list(dat$gender,dat$affil),mean)
sds <- tapply(dat$h,list(dat$gender,dat$affil),sd)
ns <- tapply(dat$h,list(dat$gender,dat$affil),length)
sem = sds/sqrt(ns)
barplot2(means[1:4], plot.ci=T, ci.l=means[1:4]-sem[1:4],   
        ci.u=means[1:4] + sem[1:4], ylim=c(150,175), 
    names.arg=c("Female,CS","Male,CS","Female,PSY","Male,PSY"),   
        xpd=F,ylab="height",xlab="gender & affiliation")

# histogram again
par(mfrow=c(1,2))
hist(dat[dat$gender=='F',]$h,main="Dist. of Height for Female Participants",
 xlab="Height",xlim=c(140,190),probability=T)
dens.F=density(dat[dat$gender=='F',]$h);lines(dens.F,col='blue',lwd=2)
hist(dat[dat$gender=='M',]$h,main="Dist. of Height for Male Participants",
 xlab="Height",xlim=c(140,190),probability=T,ylim=c(0,0.08))
dens.M=density(dat[dat$gender=='M',]$h);lines(dens.M,col='green',lwd=2)

par(mfrow=c(2,1))

par(mfrow=c(1,1))
plot(dens.F,col='blue',lwd=2,main="Dist. of Height by gender",xlab='Height',
  ylab='density',xlim=c(140,190))
lines(dens.M,col='green',lwd=2)
legend("topleft", c('Female','Male'),col=c('blue','green'),cex=1.5,lwd=2)

# inserting text
text(157.5,0.04,'Female',col='blue',cex=2)
text(170,0.04,'Male',col='green',cex=2)

# scatterplot
plot(dat$shoesize,dat$h,main="Relationship b/w shoesize and height",xlab='shoe size', 
  ylab='height',pch=19,col='red')
text(22,175,paste("r =",substr(cor(dat$shoesize,dat$h),1,5)),cex=1.5)
abline(h=mean(dat$h),col='blue');
abline(v=mean(dat$shoesize),col='green');
text(21.5,165,'mean height',col='blue')
text(25.7,145,'mean shoesize',col='green')
abline(lm(dat$h~dat$shoesize),lty=2,lwd=2)

plot(dat[dat$gender=='F',]$shoesize,dat[dat$gender=='F',]$h,
 main="Relationship b/w shoesize and height",xlab='shoesize',ylab='height',
 cex.lab=1.5,pch=19,col='blue',xlim=c(20,29),ylim=c(140,190))
lines(dat[dat$gender=='M',]$shoesize,dat[dat$gender=='M',]$h,type='p',pch=15,col='green')
legend("topleft", c('Female','Male'), pch =c(19,15),col=c('blue','green'),cex=1.5)
dat.reg<-read.csv("http://www.matsuka.info/data_folder/tdkReg01.csv", header=T)
plot(dat.reg,pch=20,col=c('blue'))
dat.pca<-read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt",header=T)

# intro to central limit theorem
ckCLT=function(n_rep,n_sample){
  dat<-matrix(rnorm(n_rep*n_sample),nrow=n_rep,ncol=n_sample);
  means<-rowMeans(dat)}
n_rep=10^6
n5=ckCLT(n_rep,5)
hist(n5,main="Dist. of sample meanx",xlab="sample mean",xlim=c(-3,3),probability=T)
den5=density(n5);lines(den5,col='blue',lwd=2)

n10=ckCLT(n_rep,10)
n25=ckCLT(n_rep,25)
n100=ckCLT(n_rep,100)
plot(den5,col='blue',lwd=2,,main="Dist. of sample meanx",xlab="sample mean",
  xlim=c(-2,2),ylim=c(0,4))
den10=density(n10);lines(den10,col='red',lwd=2)
den25=density(n25);lines(den25,col='black',lwd=2)
den100=density(n100);lines(den100,col='green',lwd=2)
legend("topleft", c('N=5','N=10','N=25','N=100'),col=c('blue','red','black','green'),
  cex=1.5,lwd=2)

Posted on April 12, 2017 by matsukat

データ解析基礎論a WAA01

データ解析基礎論A　Weekly Assignment WAA01
提出期限：2017.04.18 4限開始まで
提出物：　Rのコマンドとその出力

WAA01.1
下記のデータをdata.frameのコマンドを用いてRに入力してください。

> dat
 id gender score
  1      M    73
  2      M    64
  3      M    78
  4      F    74
  5      F    84
  6      F    78

WAA01.2
gender == “M” や　gender == “F” などを用いて、上記のデータ男性と女性のscoreの平均値を求めてください。

WAA01.3
このファイルをダウンロードして、scoreが100の男性と女性の数を調べてください。

解答例
WAA01.1
例１
dat<-data.frame(id = 1:6, gender = c(rep("M",3),rep("F",3)), 
  score = c(73,64,78,74,84,78))
例２
id = 1:6
gender = c(rep("M",3),rep("F",3))
score = c(73,64,78,74,84,78)
dat2 <- data.frame(id = id, gender = gender, score = score)

WAA01.2
例１
mean(dat$score[which(dat$gender=="M")])
mean(dat$score[which(dat$gender=="F")])
例２
dat.male = dat[which(dat$gender=="M"),]
mean(dat.male$score)
dat.female = dat[which(dat$gender=="F"),]
mean(dat.female$score)

WAA01.3
例１
dat3 <- read.csv("http://peach.l.chiba-u.ac.jp/course_folder/waa01.csv")
head(dat3)
which(dat3[which(dat3$gender == "M"),]$score == 100)
which(dat3[which(dat3$gender == "F"),]$score == 100)
例２
dat3.male = dat3[which(dat3$gender == "M"), ]
dat3.female = dat3[which(dat3$gender == "F"), ]
dat3.male$id[which(dat3.male$score == 100)]
dat3.female$id[which(dat3.female$score == 100)]

Posted on April 11, 2017 by matsukat

データ解析基礎論a 2017 W01

データ解析基礎論a 2017 W01で使用するコードなど

data01<-data.frame(score = c(2,4,3,4),     
                   dose = c(rep(10,2),rep(100,2)),  
                   condition = rep(c('exp','control'),2))

dat01<-read.csv("http://www.matsuka.info/data_folder/temp_data01.txt",
                header=T)

dat02<-read.csv("http://www.matsuka.info/data_folder/temp_data02.txt", 
                header=T, row.name=1)

dat03<-read.table("http://www.matsuka.info/data_folder/temp_data03.txt",
                  header=T, row.name=4)

dat04<-data.frame(score=c(dat03$x,dat03$y,dat03$z),   
                  names=rep(c('sazae','masuo','tarachan'),3),
                  condition=sort(rep(c("x","y","z"),3)))

dat<-read.csv("http://www.matsuka.info/data_folder/datWA01.txt",   
              header=T);

mean(dat$shoesize[dat$gender == "M"])
mean(dat$shoesize[dat$gender == "F"])
mean(dat$shoesize[dat$h > 180])

Posted on October 27, 2016 by matsukat

単純主効果の検定など

SPF.tsmeとRBF.tsmeを更新しました。
使用方法は：
SPF.tsme(分散分析の結果、データ名、従属変数名）
RBF.tsme(分散分析の結果、データ名、従属変数名）
具体例は以下の通りです。
なお、これらを使用する前に
source(“http://peach.l.chiba-u.ac.jp/course_folder/tsme.txt”)
を必ず実行してください。

source("http://peach.l.chiba-u.ac.jp/course_folder/tsme.txt")

# SPFの例
dat<-read.csv("http://www.matsuka.info/data_folder/dktb3211.txt")
dat.aov<-aov(result~method*duration+Error(s+s:duration),dat)
tsme <- SPF.tsme(dat.aov, dat, "result")
simple main effect test for BETWEEEN subject factor 
           ss df   ms       f         p
1hr       2.5  1  2.5  1.0417 0.3150887
2hr       0.4  1  0.4  0.1667 0.6858104
3hr      12.1  1 12.1  5.0417 0.0317831
4hr      32.4  1 32.4 13.5000 0.0008662
residual 76.8 32  2.4                  

 Tukey HSD test - between subject factor @ duration = 3hr 
      D     E
D FALSE  TRUE
E  TRUE FALSE

 Tukey HSD test - between subject factor @ duration = 4hr 
      D     E
D FALSE  TRUE
E  TRUE FALSE

 simple main effect test for WITHIN subject factor 
           ss df      ms      f         p
D        49.2  3 16.4000 12.990 3.047e-05
E         1.0  3  0.3333  0.264 8.506e-01
residual 30.3 24  1.2625                 

 Tukey HSD test - within subject factor @ method = D 
      1hr   2hr   3hr   4hr
1hr FALSE FALSE  TRUE  TRUE
2hr FALSE FALSE FALSE  TRUE
3hr  TRUE FALSE FALSE FALSE
4hr  TRUE  TRUE FALSE FALSE

# RBFの例
dat<-read.csv("http://www.matsuka.info/data_folder/dktb3218.txt")
dat.aov<-aov(result~method*duration+Error(s+s:duration+s:method),dat)
tsme <- RBF.tsme(dat.aov, dat, "result")
simple main effect test for the 1st within-subject factor 
           ss df   ms      f         p
1hr       2.5  1  2.5  1.563 0.2292743
2hr       0.4  1  0.4  0.250 0.6238816
3hr      12.1  1 12.1  7.562 0.0142341
4hr      32.4  1 32.4 20.250 0.0003635
residual 25.6 16  1.6                 

 Tukey HSD test - 1st within-subject factor @ duration = 3hr 
      D     E
D FALSE  TRUE
E  TRUE FALSE

 Tukey HSD test - 1st within-subject factor @ duration = 4hr 
      D     E
D FALSE  TRUE
E  TRUE FALSE

 simple main effect test for the 2nd within-subject factor 
           ss df      ms      f         p
D        49.2  3 16.4000 12.990 3.047e-05
E         1.0  3  0.3333  0.264 8.506e-01
residual 30.3 24  1.2625                 

 Tukey HSD test - 2nd within-subject factor @ method = D 
      1hr   2hr   3hr   4hr
1hr FALSE FALSE  TRUE  TRUE
2hr FALSE FALSE FALSE  TRUE
3hr  TRUE FALSE FALSE FALSE
4hr  TRUE  TRUE FALSE FALSE

Posted on June 14, 2016 by matsukat

データ解析基礎論分散分析の例

# 可視化
interaction.plot(dat$gender,
  dat$affil,
  dat$shoesize, 
  pch=c(20,20), 
  col=c("skyblue","orange"), 
  xlab="gender", ylab="shoesize", 
  lwd=3,type='b',cex=2,
  trace.label="Affiliation")

# 可視化２（非効率）
means<-tapply(dat$shoesize,list(dat$gender, dat$affil),mean)
Ns<-tapply(dat$shoesize,list(dat$gender, dat$affil),length)
sds<-tapply(dat$shoesize,list(dat$gender, dat$affil),sd)
sems<-sds/sqrt(Ns)

plot(c(0,1),means[,1],type='o',col='skyblue',
　ylim=c(min(means)*0.975,max(means)*1.025),
  xlim=c(-0.1,1.1),lwd=2,cex=2,pch=20,xlab="gender",ylab="shoesize")
lines(c(0,1),means[,2],type='o',col='orange',lwd=2,cex=2,pch=20)
legend("topleft",c("CogSci","PsySci"),col=c("skyblue","orange"),lwd=2)
lines(c(0,0),c(means[1,1]-sems[1,1],means[1,1]+sems[1,1]),col="skyblue",lwd=2.5)
lines(c(1,1),c(means[2,1]-sems[2,1],means[2,1]+sems[2,1]),col="skyblue",lwd=2.5)
lines(c(0,0),c(means[1,2]-sems[1,2],means[1,2]+sems[1,2]),col="orange",lwd=2.5)
lines(c(1,1),c(means[2,2]-sems[2,2],means[2,2]+sems[2,2]),col="orange",lwd=2.5)

# 分散分析
dat.aov=aov(shoesize~gender*affil, data=dat)
dat.aov.sum=summary(dat.aov)

# 単純主効果の検定
#  各所属講座における性別の効果
means<-tapply(dat$shoesize, list(dat$gender,dat$affil), mean)
SS_gen_CS<- 5*(means[2,1]^2 + means[1,1]^2 -0.5*sum(means[,1])^2) # SS_gender CS
SS_gen_PS<- 5*(means[2,2]^2 + means[1,2]^2 -0.5*sum(means[,2])^2) # SS_gender PS
dat.aov.sum=summary(dat.aov)   # ANOVA table
MSe=dat.aov.sum[[1]][4,3]      # MSE from ANOVA table or MSe=0.62
dfE=dat.aov.sum[[1]][4,1]      # DF for error or dfE=16
dfG=1                          # DF for gender
F_gen_CS=(SS_gen_CS/dfG)/MSe   # F-value for gender effect given CS
F_gen_PS=(SS_gen_PS/dfG)/MSe   # F-value for gender effect given PS
P_gen_CS=1-pf(F_gen_CS,1,dfE)  # p-value for gender effect given CS
P_gen_PS=1-pf(F_gen_PS,1,dfE)  # p-value for gender effect given PS

#  各性別における所属講座の効果
SS_affil_F<- 5*(means[1,1]^2+means[1,2]^2-0.5*sum(means[1,])^2) #SS_affil | F
SS_affil_M<- 5*(means[2,1]^2+means[2,2]^2-0.5*sum(means[2,])^2) #SS_affil | M
dfA=1				          # DF for affil
F_affil_F=SS_affil_F/dfA/MSe         # F-value for affiliation effect | F
F_affil_M=SS_affil_M/dfA/MSe         # F-value for affiliation effect | M
P_affil_F=1-pf(F_affil_F,1,dfE)      # p-value for affiliation effect | F
P_affil_M=1-pf(F_affil_M,1,dfE)      # p-value for affiliation effect | M

# 例２
interaction.plot(dat$duration,dat$method,dat$result, 
  pch=c(20,20), col=c("skyblue","orange"), ylab="score", 
  xlab="Duration",lwd=3,type='b',cex=2,trace.label="Method")
mod1=aov(result~method+duration,data=dat)
mod1.sum=print(summary(mod1))
mod2=aov(result~method*duration,data=dat)
mod2.sum=print(summary(mod2))
means<-tapply(dat$result,list(dat$method,dat$duration),mean)
ssM_1=5*(sum(means[,1]^2)-0.5*(sum(means[,1])^2))
ssM_2=5*(sum(means[,2]^2)-0.5*(sum(means[,2])^2))
ssM_3=5*(sum(means[,3]^2)-0.5*(sum(means[,3])^2))
ssM_4=5*(sum(means[,4]^2)-0.5*(sum(means[,4])^2))
MSe=mod2.sum[[1]][4,3]
DFe=mod2.sum[[1]][4,1]
DFm=1
fM_1=(ssM_1/DFm)/MSe
1-pf(fM_1,DFm,DFe)
fM_2=(ssM_2/DFm)/MSe
1-pf(fM_2,DFm,DFe)
fM_3=(ssM_3/DFm)/MSe
1-pf(fM_3,DFm,DFe)
fM_4=(ssM_4/DFm)/MSe
1-pf(fM_4,DFm,DFe)
ssD_X=5*(sum(means[1,]^2)-1/4*(sum(means[1,])^2))
ssD_Y=5*(sum(means[2,]^2)-1/4*(sum(means[2,])^2))
DFd=3
fD_X=(ssD_X/DFd)/MSe
fD_Y=(ssD_Y/DFd)/MSe
1-pf(fD_X,DFd,DFe)
1-pf(fD_X,DFd,DFe)
qv=qtukey(0.95,DFd+1,DFe)
hsd=qv*(sqrt(MSe/5))
print(diffM<-outer(means[1,],means[1,],"-"))           
abs(diffM)>hsd

Posted on May 12, 2015 by matsukat

CLT simulation

中央極限定理の実験

source("http://peach.l.chiba-u.ac.jp/course_folder/ckCLT.txt")

# r command
genR<-function(n_rep,n_sample,prob,distID){
switch(distID,
  binom=rbinom(n_sample*n_rep,n_sample,prob),
  normal=rnorm(n_rep*n_sample),
  uniform=runif(n_rep*n_sample))
}
ckCLT=function(n_rep,n_sample,prob,Distr){
  vecR<-genR(n_rep,n_sample,prob,Distr)
  dat<-matrix(vecR,nrow=n_rep,ncol=n_sample);
  means<-rowMeans(dat)
  par(mfrow=c(2,1))
  hist(vecR,main="Dist. of the original data set")
  hist(means,main="Dist. of sample meanx",xlab="sample mean",probability=T)
  if (Distr=="binom"){
    denS=density(means,bw=0.125)
  } else {denS=density(means)}
  lines(denS,col='blue',lwd=2)
 return(means);
}

CourseLog

Dept Cognitive & Info Science

Category Archives: データ解析