# データ解析基礎論A 第2週 グラフの基礎 (Fundamentals of Data Analysis A, Week 2: Basics of graphs)
# Lecture script: descriptive statistics and base-graphics plotting.
# Each section can be run on its own once `dat` has been loaded.
# Top-level object names are kept as-is because other code may refer to them.

# Descriptive statistics ----
dat <- read.table("http://www.matsuka.info/data_folder/aov01.txt",
                  header = TRUE)
summary(dat)
mean(dat$shoesize)
# var() and cov() on a two-column data frame both give the 2 x 2
# variance-covariance matrix; cor() gives the correlation matrix.
var(dat[, 1:2])
cov(dat[, 1:2])
cor(dat[, 1:2])

# Basics - plotting ----
x <- seq(-3, 3, 0.1)
y <- x^2
plot(x, y)
plot(x, y, col = 'red')
plot(x, y, pch = 20)
plot(x, y, type = 'l')
plot(x, y, type = 'l', lty = 4, lwd = 3)
plot(x, y, main = "THIS IS THE TITLE",
     xlab = "Label for X-axis", ylab = "Label for Y-axis")
plot(x, y, main = "TITLE", xlab = "X here", ylab = "Y here",
     xlim = c(-3.5, 3.5), ylim = c(-0.5, 10))
plot(x, y, col = 'blue', type = 'o', lty = 2, pch = 19, lwd = 3,
     main = "Y=X*X", xlab = "X", ylab = "X*X",
     xlim = c(-3.5, 3.5), ylim = c(-0.5, 10))

# Histogram ----
# Reloaded so this section can be run without the one above.
dat <- read.table("http://www.matsuka.info/data_folder/aov01.txt",
                  header = TRUE)
hist(dat$h, main = "Histogram of Height", xlab = "Height", col = 'blue',
     xlim = c(140, 190))
dens <- density(dat$h)
# probability = TRUE puts the histogram on the density scale so the kernel
# density estimate can be overlaid on the same axes.
hist(dat$h, main = "Histogram of Height", xlab = "Height",
     xlim = c(140, 190), probability = TRUE)
lines(dens, lwd = 2, col = 'red', lty = 2)

# Boxplot ----
boxplot(dat$h, main = "Boxplot of Height", ylab = "Height", col = 'cyan',
        ylim = c(140, 190))
# With horizontal = TRUE the measured values run along the x-axis, hence
# xlab = "Height" and the grouping label on the y-axis.
boxplot(dat$h ~ dat$gender, main = "Distribution of Height by Gender",
        ylab = "Gender", xlab = "Height", col = c('blue', 'cyan'),
        ylim = c(140, 190), horizontal = TRUE)
boxplot(dat$h ~ dat$gender + dat$affil,
        main = "Distribution of Height by Gender and Affiliation",
        ylab = "Gender x Affiliation", xlab = "Height",
        col = c('blue', 'cyan', 'red', 'magenta'),
        ylim = c(140, 190), horizontal = TRUE)

# Barplot ----
# Install gplots only when it is missing, so re-running the script does not
# trigger a fresh download every time.
if (!requireNamespace("gplots", quietly = TRUE)) {
  install.packages("gplots")
}
library(gplots)
means <- tapply(dat$h, dat$gender, mean)
sds <- tapply(dat$h, dat$gender, sd)
ns <- tapply(dat$h, dat$gender, length)
sems <- sds / sqrt(ns)  # standard error of the mean per group
# xpd = FALSE clips the bars to the plot region, since ylim does not start
# at zero.
barplot2(means, plot.ci = TRUE, ci.l = means - sems, ci.u = means + sems,
         ylim = c(140, 180), names.arg = c("Female", "Male"), xpd = FALSE,
         ylab = "height", xlab = "gender")

# Another barplot ----
means <- tapply(dat$h, list(dat$gender, dat$affil), mean)
sds <- tapply(dat$h, list(dat$gender, dat$affil), sd)
ns <- tapply(dat$h, list(dat$gender, dat$affil), length)
sem <- sds / sqrt(ns)
# means[1:4] flattens the gender x affil table column by column; names.arg
# below is written in that order.
barplot2(means[1:4], plot.ci = TRUE,
         ci.l = means[1:4] - sem[1:4], ci.u = means[1:4] + sem[1:4],
         ylim = c(150, 175),
         names.arg = c("Female,CS", "Male,CS", "Female,PSY", "Male,PSY"),
         xpd = FALSE, ylab = "height", xlab = "gender & affiliation")

# Histogram again ----
par(mfrow = c(1, 2))  # two panels side by side
hist(dat[dat$gender == 'F', ]$h,
     main = "Dist. of Height for Female Participants",
     xlab = "Height", xlim = c(140, 190), probability = TRUE)
dens.F <- density(dat[dat$gender == 'F', ]$h)
lines(dens.F, col = 'blue', lwd = 2)
hist(dat[dat$gender == 'M', ]$h,
     main = "Dist. of Height for Male Participants",
     xlab = "Height", xlim = c(140, 190), probability = TRUE,
     ylim = c(0, 0.08))
dens.M <- density(dat[dat$gender == 'M', ]$h)
lines(dens.M, col = 'green', lwd = 2)

# The 2 x 1 layout is overridden immediately by the next call; both calls
# are kept in their original order.
par(mfrow = c(2, 1))
par(mfrow = c(1, 1))
plot(dens.F, col = 'blue', lwd = 2, main = "Dist. of Height by gender",
     xlab = 'Height', ylab = 'density', xlim = c(140, 190))
lines(dens.M, col = 'green', lwd = 2)
legend("topleft", c('Female', 'Male'), col = c('blue', 'green'),
       cex = 1.5, lwd = 2)

# Inserting text ----
text(157.5, 0.04, 'Female', col = 'blue', cex = 2)
text(170, 0.04, 'Male', col = 'green', cex = 2)

# Scatterplot ----
plot(dat$shoesize, dat$h, main = "Relationship b/w shoesize and height",
     xlab = 'shoe size', ylab = 'height', pch = 19, col = 'red')
# Format the correlation numerically (three decimals, rounded) rather than
# slicing characters out of its default string form.
text(22, 175, paste("r =", sprintf("%.3f", cor(dat$shoesize, dat$h))),
     cex = 1.5)
abline(h = mean(dat$h), col = 'blue')
abline(v = mean(dat$shoesize), col = 'green')
text(21.5, 165, 'mean height', col = 'blue')
text(25.7, 145, 'mean shoesize', col = 'green')
abline(lm(dat$h ~ dat$shoesize), lty = 2, lwd = 2)  # least-squares line

plot(dat[dat$gender == 'F', ]$shoesize, dat[dat$gender == 'F', ]$h,
     main = "Relationship b/w shoesize and height",
     xlab = 'shoesize', ylab = 'height',
     cex.lab = 1.5, pch = 19, col = 'blue',
     xlim = c(20, 29), ylim = c(140, 190))
# lines() with type = 'p' adds the second group as points to the same plot.
lines(dat[dat$gender == 'M', ]$shoesize, dat[dat$gender == 'M', ]$h,
      type = 'p', pch = 15, col = 'green')
legend("topleft", c('Female', 'Male'), pch = c(19, 15),
       col = c('blue', 'green'), cex = 1.5)

dat.reg <- read.csv("http://www.matsuka.info/data_folder/tdkReg01.csv",
                    header = TRUE)
# plot() on a data frame draws a scatterplot matrix of its columns.
plot(dat.reg, pch = 20, col = c('blue'))
dat.pca <- read.table("http://www.matsuka.info/data_folder/tdkPCA01.txt",
                      header = TRUE)

# Intro to central limit theorem ----

# Draw `n_rep` independent samples of size `n_sample` from the standard
# normal distribution and return the mean of each sample.
#
# Args:
#   n_rep:    number of samples to draw (rows of the internal matrix).
#   n_sample: number of observations per sample (columns).
#
# Returns:
#   A numeric vector of length `n_rep` holding the sample means.
ckCLT <- function(n_rep, n_sample) {
  samples <- matrix(rnorm(n_rep * n_sample), nrow = n_rep, ncol = n_sample)
  rowMeans(samples)
}

# NOTE(review): with n_rep = 10^6 the n_sample = 100 call allocates a
# 10^8-element double matrix (about 800 MB); lower n_rep if memory is tight.
n_rep <- 10^6
n5 <- ckCLT(n_rep, 5)
hist(n5, main = "Dist. of sample meanx", xlab = "sample mean",
     xlim = c(-3, 3), probability = TRUE)
den5 <- density(n5)
lines(den5, col = 'blue', lwd = 2)

n10 <- ckCLT(n_rep, 10)
n25 <- ckCLT(n_rep, 25)
n100 <- ckCLT(n_rep, 100)
plot(den5, col = 'blue', lwd = 2, main = "Dist. of sample meanx",
     xlab = "sample mean", xlim = c(-2, 2), ylim = c(0, 4))
den10 <- density(n10)
lines(den10, col = 'red', lwd = 2)
den25 <- density(n25)
lines(den25, col = 'black', lwd = 2)
den100 <- density(n100)
lines(den100, col = 'green', lwd = 2)
legend("topleft", c('N=5', 'N=10', 'N=25', 'N=100'),
       col = c('blue', 'red', 'black', 'green'), cex = 1.5, lwd = 2)