認知情報解析演習 06

Traveling Salesman Problem

TSP.inv <- function(x, n){
  x.length = length(x)
  start.idx = sample(1:x.length,1)
  end.idx = min(x.length, (start.idx + n -1))
  x[start.idx:end.idx]=x[end.idx:start.idx]
  return(x)
}

TSP.switch <- function(x){
  x.length = length(x)
  switch.idx = sample(1:x.length,2)
  x[switch.idx] = x[rev(switch.idx)]
  return(x)
}

TSP.trans <-function(x){
  x.length = length(x)
  trans.idx = sample(1:x.length,2)
  new.x = x[-trans.idx[1]]
  new.x = append(new.x, x[trans.idx[1]],after = (trans.idx[2]-1))
  return(new.x)
}

TSP.init<-function(n_city=10) {
  return(matrix(runif(n_city*2,1,100),nrow=n_city,ncol=2))
}

cities = TSP.init(10)

TSP.totDist<-function(route, cities) {
  route=c(route,route[1]);
  n.cities=nrow(cities);
  totDist=0;
  for (i.cities in 1:n.cities) {
    totDist = totDist + dist(cities[route[i.cities:(i.cities + 1)],])
  }
  return(totDist)
}

TSP.totDist(route,cities)

TSP.demo<-function(n.city=20, maxItr=1000, parm.set = c(1,0.99, 50)) {
  loc=TSP.init(n.city);
  route=1:n.city
  ## param. for simulated annealing - change values if necessary 
  C=parm.set[1];
  eta=parm.set[2];
  TEMP=parm.set[3]; 
  ##
  optDist=TSP.totDist(route,loc)
  optHist=matrix(0,nrow=maxItr,ncol=(length(route)+1))
  optHist[1,]=c(route,optDist)
  for (i_loop in 2:maxItr) {
    rand.op=sample(c('inv','sw','trans'),1,prob=c(0.75,0.125,0.125))
    if (rand.op=='inv') {
      new.route=TSP.inv(route,sample(2:n.city,1))
    } else if (rand.op=='sw') {
      new.route=TSP.switch(route)
      } else { 
        new.route=TSP.trans(route)
    }
    new.dist=TSP.totDist(new.route,loc)
    delta=new.dist-optDist
    prob=1/(1+exp(delta/(C*TEMP)))
    if (runif(1) < prob) {
      route=new.route;optDist=new.dist;
    } 
    optHist[i_loop,]=c(route,optDist);
    TEMP=min(TEMP*eta,0.1)
  }
  par(mfrow=c(1,2)) 
  plot(rbind(loc,loc[1,]),type='o',pch=20,cex=2.5, col='red',
       xlab='location @X',ylab='location @Y',main='Initial route')
  plot(loc[optHist[1000,c(1:n.city,1)],],type='o',pch=20, col='magenta',
       cex=2.5,xlab='location @X',ylab='location @Y',main='Optimized route')
  return(optHist)
}
# regression with SimAnneal.
reg.ERR<-function(b,X,y){
  yhat<-X%*%b
  return(sum((y-yhat)^2))
}
##
SA.reg <- function(maxItr,parm.set=c(C=1,eta=0.995,TEMP=100,sigma=100)){
  set.seed(20)
  nData = 100
  x=matrix(rnorm(9*nData,mean=10,sd=2),ncol=9);X=cbind(rep(1,nData),x)
  y=X%*%c(10,2,5,-3,-5,0,0,0,0,0)+rnorm(nData,mean=0,sd=2);
  lm.sum = summary(lm(y~X[,2:10]))
  C = parm.set[1]
  eta = parm.set[2]
  TEMP= parm.set[3]
  sigma = parm.set[4]
  b = rnorm(10,0,100)
  optERR=reg.ERR(b,X,y)
  optHist=matrix(0,nrow=maxItr,ncol=(10+1))
  optHist[1,]=c(b,optERR)
  for (i_loop in 2:maxItr) {
    new.b = b + rnorm(10,0,sigma*TEMP)
    new.ERR=Fun.reg(new.b, X,y)
    delta=new.ERR-optERR
    prob=1/(1+exp(delta/(C*TEMP)))
    if (runif(1) < prob) {
      b=new.b;
      optERR=new.ERR;
    } 
    optHist[i_loop,]=c(b,optERR);
    TEMP=max(TEMP*eta,0.01)
  }
  print(optHist[maxItr,])
  print(coef(lm.sum)[,1])
  return(list(optHist=optHist, gt = coef(lm.sum)))
}
res=SA.reg(1e5,c(80,0.999,1,1))

データ解析基礎論A W07

dat<-read.csv("http://www.matsuka.info/data_folder/hwsk8-17-6.csv")
dat.lm <- lm(ani~otouto, data=dat)
summary(dat.lm)
t.test(dat$ani, dat$otouto, var.equal=T)

dat2 <- data.frame(score=c(dat$ani,dat$otouto),order=c(rep("ani",10),rep("otouto",10)))
plot(dat2$score~as.numeric(dat2$order),pch=20,xlab="order",
     ylab="score",xlim=c(0.5,2.5),cex=2,xaxt="n")
axis(1,c(1,2),c("ani","otouto"))
dat2.lm<-lm(score~order,data=dat2)
abline(dat2.lm,col='red',lwd=3)

dat.D = dat$ani - dat$otouto
boxplot(dat.D,col="skyblue",ylab="Difference")
t.test(dat.D)
plot(dat.D~rep(1,10),pch=20,xlab="",ylab="Difference",cex=3)
dat.D.lm<-lm(dat.D~1)
abline(dat.D.lm,col='red',lwd=3)

dat<-read.csv("http://www.matsuka.info/data_folder/tdkReg01.csv")

認知情報解析演習 04

fun01 <- function(x1,x2){
  return(x1^2 - x1*x2 -4*x1 +x2^2 -x2 )
}

x1 = seq(-10,10,0.05); x2 = seq(-10,10,0.05)
v <- outer(x1,x2,function(x1,x2) fun01(x1,x2))
contour(x1,x2,v,nlevels= 100, drawlabels = F,col='blue')

a = runif(2,0,10); b = runif(2,0,10);c = runif(2,0,10)

sortABC <-  function(a,b,c,func){
  fA = do.call(func,list(a[1],a[2]))
  fB = do.call(func,list(b[1],b[2]))
  fC = do.call(func,list(c[1],c[2]))
  sort.F = sort(c(fA,fB,fC),index.return=T)
  temp.set = rbind(a,b,c)
  a = temp.set[sort.F$ix[1],]; fA = sort.F$x[1]
  b = temp.set[sort.F$ix[2],]; fB = sort.F$x[2]
  c = temp.set[sort.F$ix[3],]; fC = sort.F$x[3]
  return(list(abc = rbind(a,b,c), fs = c(fA,fB,fC)))
}

nelder.mead01 <- function(a,b,c, func, tol){
# not in the original form
# expansion -> reflection -> contraction -> shrink
# minimizes f with 2 variables 
  repeat {
    sort.abc = sortABC(a,b,c,func)
    a = sort.abc$abc[1,];b = sort.abc$abc[2,];c = sort.abc$abc[3,]
    fA = sort.abc$fs[1];fB = sort.abc$fs[2];fC = sort.abc$fs[3];
    max.diffs = max(abs(outer(c(fA,fB,fC),c(fA,fB,fC),'-')))
    if (max.diffs < tol){
      return(list(abc=rbind(a,b,c), fs = c(fA,fB,fC)));
      break
    }
    points(a[1],a[2],pch = 20, col ='red')
    points(b[1],b[2],pch = 20, col ='cyan')
    points(c[1],c[2],pch = 20, col ='green')
    m = (a + b)/2
    e = m + 2*(m - c)
    fE = do.call(func,list(e[1],e[2]))
    if (fE < fB){
      c = e; fC = fE
    } else {
      r = 2*m -c
      fR = do.call(func,list(r[1],r[2]))
      if (fR < fC){
        c = r; fC = fR
      }
      if (fR >= fB){
        s = (c + m)/2; fS = do.call(func,list(s[1],s[2]))
        if (fS < fC){
          c = s; fC =fS
        } else {
          b = m; fB = do.call(func,list(m[1],m[2]))
          c = (a + c)/2; fC = do.call(func,list(c[1],c[2]))
        }
      }
    }
  }
}
# simulated annealing
# version 1
SimAneal01<-function(func, initXY, maxItr=1000, C=1, eta=0.99, width=10) {
  x=initXY
  opt.value = do.call(func,list(x))
  n.var = length(x)
  opt.hist=matrix(0, nrow=maxItr, ncol=5)
  opt.hist[1,]=c(x,x,opt.value,opt.value,0)
  for (i_loop in 2:maxItr) {
    accept = 0
    temp.x = x + rnorm(n.var, mean = 0, sd=width)
    temp.value= do.call(func,list(temp.x))
    delta=temp.value-opt.value;
    prob=1/(1+exp(delta/(C*width)))
    if (runif(1) < prob) {
      x = temp.x;
      opt.value = temp.value;
      accept = 1
    } 
    opt.hist[i_loop,]=c(x, temp.x, opt.value, temp.value, accept);
    width=width*eta
  }
  return(data.frame(opt.hist))
}

set.seed(50)
n.iter =500
fun01<-function(x){x^2+2*x}
res <- SimAneal01(fun01, 3, n.iter, 1, 0.985, 1)

# version 2
funZ<-function(x,y) {x^4-16*x^2-5*x+y^4-16*y^2-5*y}
minSA<-function(initXY,maxItr=1000,C=1,eta=0.99,width) {
  x=initXY[1];y=initXY[2];z=funZ(x,y); 
  opHist=matrix(0,nrow=maxItr,ncol=3)
  opHist[1,]=c(x,y,z)
  for (i_loop in 2:maxItr) {
    xTemp=x+rnorm(1,mean=0,sd=width)
    yTemp=y+rnorm(1,mean=0,sd=width)
    zTemp=funZ(xTemp, yTemp);
    delta=zTemp-z;
    prob=1/(1+exp(delta/(C*width)))
    if (runif(1) < prob) {
      x=xTemp;y=yTemp;z=zTemp;
    } 
    opHist[i_loop,]=c(x,y,z);
    width=width*eta
  }
  return(opHist)
}

funZ<-function(x,y) {x^4-16*x^2-5*x+y^4-16*y^2-5*y}
xmin=-5;xmax=5;n_len=100;
x<-seq(xmin, xmax,length.out=n_len);
y<-x;z<-outer(x,y,funZ)

res<-minSA(c(0,0),1000,1,0.99,10)
lines(res[,1:2],type='o',lwd=2,col='green',pch=20)

データ解析基礎論A 06

ssize = c(24,25,26,23.5,25,27,24,22,27.5,28)
t.test(ssize, mu=24)

dat<-read.csv("http://www.matsuka.info/data_folder/datWA01.txt")
t.test(dat$h[dat$gender=="M"],mu=171)
t.test(dat$h[dat$gender=="F"],mu=158)

A=c(12,19,10,10,14,18,15,11,16)
B=c(15,20,16,14,17,16,12,12,19)
d=A-B
t.test(d, mu=0)

X1=c(78,70,66,76,78,76,88,76)
X2=c(76,72,60,72,70,72,84,70)
t.value=(mean(X1)-mean(X2))/sqrt((var(X1)+var(X2))/8)

t.test(X1,X2,var.equal=T)
t.test(X1,X2)

dat<-read.csv("http://www.matsuka.info/data_folder//mobH.csv")

###
may = dat$height[dat$mob=="may"]
dec = dat$height[dat$mob=="dec"]
dat3 = data.frame(height = c(may,dec),
 mob = c(rep('may',length(may)),rep('dec',length(dec))))
boxplot(height~mob, data=dat3,col=c('skyblue','orange'),
  ylab = "height", xlab ="mob")

spring = dat$height[dat$mob =="march"|
                    dat$mob =="April"|
                    dat$mob =="may"]
fall = dat$height[dat$mob =="sep"|
                  dat$mob =="oct"|
                  dat$mob =="nov"]
dat4 = data.frame(height = c(spring,fall),
 season = c(rep('spring',length(spring)), 
 rep('fall',length(fall))))
boxplot(height~season, data=dat4, col=c('skyblue','orange'),
  ylab = "height", xlab ="season")

認知情報解析 03

# objective: find x that minimizes y: y = x^2 +2*x

# simple gradient descent (GD)
tol = 1e-7;lr = 0.1; 
x = 10; x.hist = x
repeat{
  grad = 2*x + 2
  if (abs(grad) <= tol) { break }
  x = x - lr*grad
  x.hist = c(x.hist,x)
}
x.temp = seq(-10,10,length.out = 100)
plot(x.temp, x.temp^2+2*x.temp,type='l',lwd = 3, ylim = c(-5,120),
     ylab = "y",xlab="x")
lines(x.hist,x.hist^2+2*x.hist,type='o',col='red',lwd=2,pch=19)
points(x.hist,rep(-4,length(x.hist)),col='red',pch="I")

# GD w/ momentum
tol = 1e-7;lr = 0.1; gamma = 0.36
x = 10; x.histM = x; v = 0;
repeat{
  grad = 2*x + 2
  if (abs(grad) <= tol) { break }
  v = gamma*v - lr*grad
  x = x + v
  x.histM = c(x.histM,x)
}
lines(x.histM,x.histM^2+2*x.histM,type='o',col='blue',pch=19)
points(x.histM,rep(-2,length(x.histM)),col='blue',pch="I")
legend("topleft",c("standard GD","GD w/ moment."), pch=1,lwd=2,col=c('red','blue'))
c(length(x.hist),length(x.histM))

データ解析基礎論a W05


dat<-read.csv("http://www.matsuka.info/data_folder/datWA01.txt")
# Male
mean.M <-mean(dat$h[dat$gender=="M"])
stddev = 10
n.M = length(dat$h[dat$gender=="M"])
z.value=(mean.M-171)/(sqrt(stddev^2/n.M))
(1-pnorm(abs(z.value)))*2
# Female
mean.F <-mean(dat$h[dat$gender=="F"])
stddev = 5
n.F = length(dat$h[dat$gender=="F"])
z.value=(mean.F-158)/(sqrt(stddev^2/n.F))
(1-pnorm(abs(z.value)))*2

データ解析基礎論A 練習課題WA04

データ解析基礎論A 練習課題(提出不要)

練習課題WA04.1
マーケティングリサーチで商品Aと商品Bをレーティングしてもらい、そしてどちらか一方をレーティングに基づいて選択してもらった結果が以下のデータです。

   S1 S2 S3 S4 S5 S6 S7 S8 S9 
A  12 19 10 10 14 18 15 11 16 
B  15 20 16 14 17 16 12 12 19
-----------------------------------
    B  B  B  B  B  A  A  B  B

つまり、9人中7人が商品Bを選択しました。
商品Aと商品Bが同じ割合で選択されると仮定した場合、以下の確率を求めてください:
A: 9人中7人以上がBを選択する確率を計算してください。
B: 9人中7人以上が一方の商品を選択する確率を計算してください。

練習課題WA04.2
ある通りの平日午前10~11時の1時間の車の交通量は800台で標準偏差が40台だとします。また、この通りの同じ曜日の同じ時間帯の交通量は正規分布に従うと仮定して、以下の数値を求めてください;
A) 交通量が750台から850台になる確率
B)交通量が700台以下となる確率
C)平均値の800を中心に95%の範囲(下限、上限)
D)99%の上限(99%の確率でX台以下となるようなX)

練習課題WA04.3
一般成人の血液100cc中の赤血球の量の平均値が150ユニットで標準偏差が15とします。また、その分布は正規分布に従うと仮定します。運動部Xの10名の赤血球の量を計測したら、165ユニットでした。
A) 運動部Xの赤血球の量は一般成人と同等であると仮定した場合、部員の赤血球の平均値が165ユニット以上となる確率を求めてください。
B)運動部Xの赤血球の量は一般成人と同等であると仮定したが、Aで得られた確率がどの程度だとこの仮説が妥当だと思いますか?

認知情報解析 02

# ケーキの分配ゲーム
n.cake = 10 
# 利得行列
pay.mat = matrix(0,(n.cake+1),(n.cake+1))
for (i.cake in 1:n.cake){
  pay.mat[(i.cake+1),1:(n.cake-i.cake+1)] =i.cake
}
# 初期化(各戦略の確率や時間など)
p.cake = runif(n.cake+1)
p.cake = p.cake/sum(p.cake)

max.time = 50
dt = 0.01
t = seq(0,max.time,dt)
n.iter = length(t)
p.hist = matrix(0,nrow = n.iter, ncol = (n.cake+1))
p.hist[1,] = p.cake
 
# メインのスクリプト
for (i.time in 2:n.iter){
  W = colSums(p.cake*t(pay.mat))
  W.ave = sum(outer(p.cake,p.cake)*pay.mat)
  p.cake = p.cake + p.cake*(W - W.ave)/W.ave * dt
  p.hist[i.time,] = p.cake
}
 
# 結果の可視化
plot(p.hist[,1],ylim=c(0,1),type='l',lwd=2,ylab = 'Proportion',xlab="time")  
for (i.strat in 2:(n.cake+1)){
  lines(p.hist[,i.strat],col=i.strat,lwd=2)
}
legend("topleft",paste("request = ",0:10),col=1:(n.cake+1),lwd =2,cex=0.75)
# 中央極限定理の実験
ss.female = rnorm(6000,mean = 24, sd = 0.5)
ss.male = rnorm(6000,mean = 26, sd = 0.5)
pop = c(ss.female,ss.male)
parm.mu = mean(pop)
parm.sd = sd(pop)

n.per.one.sample = 10;
n.rep = 100000;

samp.data = matrix(sample(pop,n.per.one.sample*n.rep,replace = T),
  ncol = n.per.one.sample)
samp.mean = rowMeans(samp.data)


par(mfrow=c(1,3))
# fig 1
hist(pop,  col='skyblue',breaks = 50, probability = T, 
  main = "Population Dist.",xlab = "shoesizd")
pop.dens<-density(pop)
lines(pop.dens,col='orange',lwd=4)

# fig 2
rand.p = sample(1:n.rep,300)
dat.dens = density(samp.data[rand.p[1],])
plot(dat.dens, col='gray',xlim=c(22,28),ylim=c(0,0.5), 
  main = "Sample Dist. (N = 10)",xlab = "shoesizd")
for (i.d in 2:300){
  dat.dens = density(samp.data[rand.p[i.d],])
  lines(dat.dens, col='gray')
}

# fig 3
mean.dens<-density(samp.mean)
hist(samp.mean, col='skyblue',breaks = 50, probability = T, 
  main = 'Dist of Sample Means',xlab = "shoesizd")
lines(mean.dens,col='red',lwd=4)
x = seq(22,27,0.01)
y = dnorm(x,parm.mu, parm.sd/sqrt(n.per.one.sample))
lines(x, y ,col='darkblue',lty=3,lwd=3)
legend("topright",c("Empirical", "Theoretical"),
  lty=c(1,3),col=c("red","blue"),lwd=4,cex=1)