library(tidyverse)
library(brms)
library(tidybayes)
library(ggdist)
library(broom.mixed)
library(glue)
# Plot stuff
<- MetBrewer::met.brewer("Lakota", 6)
clrs theme_set(theme_bw())
# Seed stuff
<- 1234
BAYES_SEED set.seed(1234)
Chapter 3 exercises
Posteriors from grids
This births
data shows the sex of the first and second children born to 100 different families (1
= boy, 2
= girl). The first family had a boy then a girl, the second had a girl and then a boy, the thrid had no boys, etc.
data(homeworkch3, package = "rethinking")
<- tibble(birth1, birth2)
all_births
all_births## # A tibble: 100 × 2
## birth1 birth2
## <dbl> <dbl>
## 1 1 0
## 2 0 1
## 3 0 0
## 4 0 1
## 5 1 0
## 6 1 1
## 7 0 1
## 8 1 1
## 9 0 0
## 10 1 0
## # … with 90 more rows
In these exercises we don’t care about birth order, so we’ll pool all the births into one long 200-birth vector:
<- all_births %>%
births pivot_longer(cols = everything(), values_to = "boy")
births## # A tibble: 200 × 2
## name boy
## <chr> <dbl>
## 1 birth1 1
## 2 birth2 0
## 3 birth1 0
## 4 birth2 1
## 5 birth1 0
## 6 birth2 0
## 7 birth1 0
## 8 birth2 1
## 9 birth1 1
## 10 birth2 0
## # … with 190 more rows
What proportion of births were boys?
<- births %>%
births_summary summarize(n_boys = sum(boy),
prop_boys = n_boys / n())
births_summary## # A tibble: 1 × 2
## n_boys prop_boys
## <dbl> <dbl>
## 1 111 0.555
<- pull(births_summary, n_boys)
n_boys <- pull(births_summary, prop_boys)
prop_boys <- nrow(births)
total_births <- total_births - n_boys n_girls
3H1
Using grid approximation, compute the posterior distribution for the probability of being a boy. Assume a uniform prior probability. Which parameter value maximizes the posterior probability?
<- tibble(p_grid = seq(0, 1, length.out = 1001),
birth_grid prior_uniform = 1) %>%
mutate(likelihood = dbinom(n_boys, size = total_births, prob = p_grid)) %>%
mutate(posterior_uniform = (likelihood * prior_uniform) / sum(likelihood * prior_uniform))
birth_grid## # A tibble: 1,001 × 4
## p_grid prior_uniform likelihood posterior_uniform
## <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 0
## 2 0.001 1 2.48e-275 4.98e-276
## 3 0.002 1 5.89e-242 1.18e-242
## 4 0.003 1 1.89e-222 3.81e-223
## 5 0.004 1 1.28e-208 2.57e-209
## 6 0.005 1 6.68e-198 1.34e-198
## 7 0.006 1 3.76e-189 7.56e-190
## 8 0.007 1 9.28e-182 1.86e-182
## 9 0.008 1 2.32e-175 4.66e-176
## 10 0.009 1 1.01e-169 2.03e-170
## # … with 991 more rows
%>%
birth_grid ggplot(aes(x = p_grid, y = posterior_uniform)) +
geom_area(fill = clrs[6])
Parameter that maximizes the probability:
%>%
birth_grid filter(posterior_uniform == max(posterior_uniform))
## # A tibble: 1 × 4
## p_grid prior_uniform likelihood posterior_uniform
## <dbl> <dbl> <dbl> <dbl>
## 1 0.555 1 0.0567 0.0114
With brms
<- brm(
model_births bf(boy | trials(total_births) ~ 0 + Intercept),
data = list(boy = n_boys, total_births = total_births),
family = binomial(link = "identity"),
prior = prior(beta(1, 1), class = b, lb = 0, ub = 1),
iter = 5000, warmup = 1000, seed = BAYES_SEED, cores = 4
)## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling
%>%
model_births gather_draws(b_Intercept) %>%
ggplot(aes(x = .value, y = .variable)) +
stat_halfeye(fill = clrs[6])
%>%
model_births gather_draws(b_Intercept) %>%
summarize(median = median(.value))
## # A tibble: 1 × 2
## .variable median
## <chr> <dbl>
## 1 b_Intercept 0.555
3H2
Using the
sample
function, draw 10,000 random parameter values from the posterior distribution you calculated above. Use these samples to estimate the 50%, 89%, and 97% highest posterior density intervals.
<- sample(birth_grid$p_grid, prob = birth_grid$posterior_uniform,
birth_samples size = 10000, replace = TRUE)
tibble(x = birth_samples) %>%
ggplot(aes(x = x)) +
stat_slab(
aes(fill_ramp = stat(level)),
.width = c(0.02, 0.5, 0.89, 0.97, 1),
fill = clrs[3]
+
) scale_fill_ramp_discrete(range = c(0.2, 1), guide = "none")
::hdi(birth_samples, credMass = 0.5)
HDInterval## lower upper
## 0.530 0.577
## attr(,"credMass")
## [1] 0.5
::hdi(birth_samples, credMass = 0.89)
HDInterval## lower upper
## 0.494 0.605
## attr(,"credMass")
## [1] 0.89
::hdi(birth_samples, credMass = 0.97)
HDInterval## lower upper
## 0.473 0.626
## attr(,"credMass")
## [1] 0.97
With brms
%>%
model_births spread_draws(b_Intercept) %>%
median_hdci(b_Intercept, .width = c(0.5, 0.89, 0.97))
## # A tibble: 3 × 6
## b_Intercept .lower .upper .width .point .interval
## <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 0.555 0.532 0.578 0.5 median hdci
## 2 0.555 0.501 0.612 0.89 median hdci
## 3 0.555 0.480 0.629 0.97 median hdci
%>%
model_births tidy_draws() %>%
ggplot(aes(x = b_Intercept)) +
stat_slab(
aes(fill_ramp = stat(level)),
.width = c(0.02, 0.5, 0.89, 0.97, 1),
fill = clrs[3]
+
) scale_fill_ramp_discrete(range = c(0.2, 1), guide = "none")
3H3
Use
rbinom
to simulate 10,000 replicates of 200 births. You should end up with 10,000 numbers, each one a acount of boyts out of 200 births. Compare the distribution of predicted numbers of boys to the actual count in the data (111 boys out of 200 births). … Does it look like the model fits the data well? That is, does the distribution of predcitions include the actual observation as a central, likely outcome?
Looks good!
# Posterior predictive distribution
<- rbinom(10000, size = 200, prob = birth_samples)
posterior_pred_births
%>%
posterior_pred_births enframe() %>%
ggplot(aes(x = value)) +
geom_histogram(binwidth = 2, color = "white", size = 0.25, fill = clrs[1]) +
geom_vline(xintercept = n_boys, color = "red") +
labs(caption = glue("Red line shows actual observed number of boys ({n_boys})"))
With brms
%>%
model_births predicted_draws(newdata = tibble(total_births = 200)) %>%
ggplot(aes(x = .prediction)) +
geom_histogram(binwidth = 2, color = "white", size = 0.25, fill = clrs[1]) +
geom_vline(xintercept = n_boys, color = "red") +
labs(caption = glue("Red line shows actual observed number of boys ({n_boys})"))
3H4
Now compare 10,000 counts of boys from 100 simulated first borns only to the number of boys in the first births,
birth1
. How does the model look in this light?
We need to just look at first births:
<- sum(birth1)
n_boys_first <- length(birth1)
total_births_first
<- tibble(p_grid = seq(0, 1, length.out = 1001),
birth_grid_first prior_uniform = 1) %>%
mutate(likelihood = dbinom(n_boys_first, size = total_births_first, prob = p_grid)) %>%
mutate(posterior_uniform = (likelihood * prior_uniform) / sum(likelihood * prior_uniform))
<- sample(birth_grid_first$p_grid, prob = birth_grid_first$posterior_uniform,
first_samples size = 10000, replace = TRUE)
<- rbinom(10000, size = 100, prob = first_samples)
posterior_pred_first
%>%
posterior_pred_first enframe() %>%
ggplot(aes(x = value)) +
geom_histogram(binwidth = 2, color = "white", size = 0.25, fill = clrs[6]) +
geom_vline(xintercept = n_boys_first, color = "red") +
labs(caption = glue("Red line shows actual observed number of firstborn boys ({n_boys_first})"))
Looks good still!
<- brm(
model_births_first bf(boy | trials(total_births) ~ 0 + Intercept),
data = list(boy = n_boys_first, total_births = total_births_first),
family = binomial(link = "identity"),
prior = prior(beta(1, 1), class = b, lb = 0, ub = 1),
iter = 5000, warmup = 1000, seed = BAYES_SEED, cores = 4
)## Compiling Stan program...
## recompiling to avoid crashing R session
## Trying to compile a simple C file
## Start sampling
With brms
Looks the same with brms too:
%>%
model_births_first predicted_draws(newdata = tibble(total_births = 100)) %>%
ggplot(aes(x = .prediction)) +
geom_histogram(binwidth = 2, color = "white", size = 0.25, fill = clrs[6]) +
geom_vline(xintercept = n_boys_first, color = "red") +
labs(caption = glue("Red line shows actual observed number of firstborn boys ({n_boys_first})"))
3H5
The model assumes that sex of first and second births are independent. To check this assumption, focus now on second births that followed female first borns. Compare 10,000 simulated conts of boys to only those second births that followed girls. To do this correctly, you need to count the number of first borns who were girls and simulate that many births, 10,000 times. Compare the counts of boys in your simulations to the actual observed count of boys following girls. How does the model look in this light? Any guesses what is going on in these data?
<- length(birth1) - sum(birth1)
n_girls_first <- all_births %>%
n_boys_after_girls filter(birth1 == 0) %>% # All families with a firstborn girl
summarize(boy_after_girl = sum(birth2)) %>%
pull(boy_after_girl)
<- rbinom(10000, size = n_girls_first, prob = first_samples)
posterior_pred_first_girl
%>%
posterior_pred_first_girl enframe() %>%
ggplot(aes(x = value)) +
geom_histogram(binwidth = 2, color = "white", size = 0.25, fill = clrs[5]) +
geom_vline(xintercept = n_boys_after_girls, color = "red") +
labs(caption = glue("Red line shows actual observed number of boys born after girls ({n_boys_after_girls})"))
With brms
<- brm(
model_births_first_girls bf(girl | trials(total_births) ~ 0 + Intercept),
data = list(girl = n_girls_first, total_births = total_births_first),
family = binomial(link = "identity"),
prior = prior(beta(1, 1), class = b, lb = 0, ub = 1),
iter = 5000, warmup = 1000, seed = BAYES_SEED, cores = 4
)## Compiling Stan program...
## recompiling to avoid crashing R session
## Trying to compile a simple C file
## Start sampling
%>%
model_births_first_girls predicted_draws(newdata = tibble(total_births = n_girls_first)) %>%
ggplot(aes(x = .prediction)) +
geom_histogram(binwidth = 2, color = "white", size = 0.25, fill = clrs[5]) +
geom_vline(xintercept = n_boys_after_girls, color = "red") +
labs(caption = glue("Red line shows actual observed number of boys born after girls ({n_boys_after_girls})"))