データ解析基礎論B W03 R graphics

library(tidyverse)


# CLT
NperSample = 10
SampleSize = 300000

runif(NperSample * SampleSize) %>%
  matrix(nrow=NperSample) %>%
     colMeans() %>% tibble(sample.mean = .) -> means

ggplot(means,aes(x = sample.mean, y = ..density..)) +
  geom_histogram(bins=200) +
    geom_density(colour = "orange",size=2)

ggplot(means,aes(x = sample.mean, y = ..density..)) +
  geom_histogram(bins=200) +
  geom_line(stat = "density", colour = "orange",size=2)

runif(NperSample * SampleSize) %>%
  matrix(nrow=NperSample) %>%
  colMeans() %>% tibble(sample.mean = .) %>%
  ggplot(., aes(x = sample.mean, y = ..density..)) +
    geom_histogram(bins=100,colour = "grey20") +
    geom_line(stat = "density", colour = "skyblue",size=2)


dat <- read.csv("http://www.matsuka.info/data_folder/sampleData2013.txt")
dt <- as_tibble(dat)




ggplot(dt, aes(x = Hworked, y = nbooks)) +
  geom_point(size = 3)

ggplot(dt) +
  geom_point(aes(x = Hworked, y = nbooks, color = grade),size = 3)

ggplot(dt) +
  geom_point(aes(x = Hworked, y = nbooks, shape = grade),size = 5)

ggplot(dt) +
  geom_point(aes(x = Hworked, y = nbooks),size = 5) +
  facet_wrap(~ grade, nrow = 1)

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks))

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks, linetype = grade))

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks)) +
    facet_wrap(~ grade, nrow = 4)

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks)) +
  geom_point(aes(x = Hworked, y = nbooks), size = 4)

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks), colour = "black", se = FALSE) +
  geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)

ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks, color = grade), se = FALSE) +
  geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)


plot1 <- ggplot(dt) +
  geom_smooth(aes(x = Hworked, y = nbooks, color = grade), se = FALSE) +
  geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)
plot1 + xlab("Hours worked") + ylab("Number of books read")

plot1 + xlab("Hours worked") +  ylab("Number of books read") +
  theme(axis.title.x = element_text(face = "italic",size = 14, colour = "navy"),
        axis.title.y = element_text(face = "bold",size = 10, colour = "darkgreen"))

ggplot(filter(dt, affil == "LA")) +
  geom_point(aes(x = Hworked, y = nbooks, color = grade), size = 4)


dt$grade <- fct_relevel(dt$grade, "FR","SP","JR","SR")
group_by(dt, grade) %>% summarize(ave.books = mean(nbooks, na.rm = T)) %>%
  ggplot() + geom_bar(aes(x = grade, y = ave.books), stat = "identity")

group_by(dt, grade) %>% summarize(ave.books = mean(nbooks, na.rm = T)) %>%
  ggplot() + geom_bar(aes(x = grade, y = ave.books), stat = "identity")

group_by(dt, grade) %>% summarize(ave.books = mean(nbooks, na.rm = T),
                                  se = sd(nbooks, na.rm =T)/n()) %>%
ggplot(aes(x = grade, y = ave.books)) +
  geom_bar(stat = "identity", fill = "grey70") +
  geom_errorbar(aes(ymin = ave.books - se, ymax = ave.books +se), width = 0.2) +
  ylab("Average # books read")

ggplot(dt,aes(x = Hworked, y = nbooks)) +
  stat_density2d(aes(colour =..level..)) +
  geom_point()

ggplot(dt,aes(x = Hworked, y = nbooks)) +
  stat_density2d(aes(alpha =..density..), geom="tile",contour=F) +
 geom_point(alpha =0.4)


ggplot(dt) +
  stat_summary(aes(x = grade, y = nbooks),
               fun.y = mean,
               fun.ymin = function(x) mean(x) - sd(x),
               fun.ymax = function(x) mean(x) + sd(x))

ggplot(dt) +
  geom_boxplot(aes(x = grade, y = nbooks))
ggplot(dt) +
  geom_boxplot(aes(x = grade, y = nbooks)) +
  coord_flip()

dat <- read.csv("http://www.matsuka.info/data_folder/datWA01.txt")
dt <- as_tibble(dat)
dt.lm <- lm(h~shoesize, dt)
cfs <- coef(dt.lm)
ggplot(dt, aes(x = shoesize, y = h)) +
  geom_point() +
  geom_abline(intercept = cfs[1], slope = cfs[2], col = "red") +
  geom_text( x= 22, y =175, aes(label = paste("r^2  =",round(summary(dt.lm)$r.squared,3))))

広域システム特別講義II 10.11の講義

10.11ABの講義は家族が急病の為、休講とさせてください。直前の連絡で申し訳ありません。
次回の講義・実習は10.25日です。
補講日は次回、皆さんの都合で調整しましょう。
補講日の候補は11.01、12.27、 および、東大が指定している補講日である、12.25(午後)、01.17(午前)、01.20、01.21です。

Posted in UT

データ解析基礎論B W02

install.packages("tidyverse")
library(tidyverse)

random.number <- rnorm(1000)
mean(random.number)
mean(random.number <- rnorm(1000))

rnorm(1000) %>% mean()

# CLT
NperSample = 10
SampleSize = 300000

# "traditional"
random.number <- runif(NperSample * SampleSize)
dat <- matrix(random.number, nrow=NperSample)
means <- colMeans(dat)
dens <- density(means)
hist(means, breaks = 100, probability = T, main = "Distributionf of Means")
lines(dens, lwd = 3, col = "orange")


runif(NperSample * SampleSize) %>% 
  matrix(nrow=NperSample) %>%
     colMeans() -> means
hist(means, breaks=100,probability = T, main = "Distributionf of Means")
means %>% density() %>% lines(,lwd=3,col='orange')


histWdensity <- function(dat, nbreaks=30, main.txt){
  dens <- density(dat)
  hist(dat, breaks = nbreaks, probability = T, main = main.txt)
  lines(dens, lwd = 3, col = "orange")
}

runif(NperSample * SampleSize) %>% 
  matrix(nrow=NperSample) %>%
  colMeans() %>% 
    histWdensity(nbreaks = 100, main.txt = "Distributionf of Means")

dat <- read.csv("http://www.matsuka.info/data_folder/sampleData2013.txt")
dt <- as_tibble(dat)
dt.la <- filter(dt, affil == "LA")

dt.la2 <- filter(dt, affil == "LA" & grade == "SP")
dt.laNot2 <- filter(dt, affil == "LA" & grade != "SP")

dt.GB <- select(dt, grade, nbooks)
dtWOgender <- select(dt, -gender)

dt.arranged <- arrange(dt, affil, grade)

dt.weekly <- mutate(dt,nbooksWeek = nbooks / 52)

dt.atLeastOneBook <- mutate(dt, atleastOneBook = (nbooks/52) >= 1)
dt.atLeastOneBook <- mutate(dt, atleastOneBook = (nbooks/12) >= 1)

dt.BWindex = mutate(dt, nbooksWeek = nbooks / 52, 
                        idx = nbooksWeek / (12*7-Hworked))

dt.byGrade <- group_by(dt, grade)
summarize(dt.byGrade, ave.books = mean(nbooks,na.rm = TRUE), 
                      ave.Hworked = mean(Hworked, na.rm = TRUE))

dt.byGrAf <- group_by(dt, grade, affil)
dt.summ <- summarize(dt.byGrAf, ave.books = mean(nbooks,na.rm = TRUE),
                     ave.Hworked = mean(Hworked, na.rm = TRUE), N = n())

dt.summ2 <- dt %>% 
  group_by(grade, affil) %>% 
  summarize(ave.books = mean(nbooks,na.rm = TRUE), 
            ave.Hworked = mean(Hworked, na.rm = TRUE), 
            N = n()) %>% filter(N > 2) %>% arrange(desc(ave.books))

plot(x = dt.summ2$ave.books, y = dt.summ2$ave.Hworked, pch=20, cex = 3,
  xlab = "Ave. # books read",ylab = "Ave hours worked")

dt <- read_csv("http://www.matsuka.info/data_folder/sampleData2013.txt",
  col_names = TRUE)

dt.summ3 <- dt %>% 
  group_by(grade, gender) %>% 
  summarize(ave.books = mean(nbooks,na.rm = TRUE), 
    ave.Hworked = mean(Hworked, na.rm = TRUE)) 

dt.summ3G <- dt.summ3 %>% gather('ave.books', 'ave.Hworked', 
  key = 'ave.values', value = "BorW")

dt.summ3SformG <- spread(dt.summ3G, key = ave.values, value =BorW)

dt.sumLA <- dt %>% filter(affil=="LA") %>% group_by(grade) %>% 
   summarize(ave.books = mean(nbooks))

toeic <- tibble(
  grade = factor(c("SP", "JR")),
  score = c(800,830),
)

new.dt1 <- dt.sumLA %>% inner_join(toeic, by = "grade")

dt.sumLA <- add_row(dt.sumLA, grade = c("MS"), ave.books = (13))
toeic2 <- tibble(
  grade = factor(c("SP", "JR","DR")),
  score = c(800,830,900),
)
new.dt3 <- full_join(dt.sumLA, toeic2)

new.dt4 <- left_join(dt.sumLA, toeic2)
new.dt5 <- right_join(dt.sumLA, toeic2)