## 6.14 Exercise solutions

# Exercise 1
## Make a histogram, Q-Q plot, and a box-plot for the life expectancy
## for a continent of your choice, but for all years.
## Do the data appear normally distributed?

# Keep only the rows for Asia; all years are retained.
asia_data <- filter(mydata, continent %in% c("Asia"))

# Histogram of life expectancy (15 bins).
p1 <- ggplot(asia_data, aes(x = lifeExp)) +
  geom_histogram(bins = 15)

# Q-Q plot: the variable is supplied through the `sample` aesthetic.
p2 <- ggplot(asia_data, aes(sample = lifeExp)) +
  geom_qq() +
  geom_qq_line(colour = "blue")

# Box plot of life expectancy by year, with the raw points jittered on top.
# The fill uses year as a factor (optional); the legend is hidden.
p3 <- ggplot(asia_data, aes(x = year, y = lifeExp)) +
  geom_boxplot(aes(fill = factor(year))) +
  geom_jitter(alpha = 0.4) +
  theme(legend.position = "none")

library(patchwork)
# Layout: histogram stacked over the Q-Q plot, box plot alongside.
(p1 / p2) | p3

# Exercise 2
## Select any 2 years in any continent and perform a *t*-test to
## determine whether mean life expectancy is significantly different.
## Remember to plot your data first.

# Restrict the Asia data to the two years being compared.
asia_2years <- filter(asia_data, year %in% c(1952, 1972))

# Histogram of life expectancy, one panel per year.
p1 <- ggplot(asia_2years, aes(x = lifeExp)) +
  geom_histogram(bins = 15) +
  facet_wrap(~year)

# Q-Q plot, one panel per year.
p2 <- ggplot(asia_2years, aes(sample = lifeExp)) +
  geom_qq() +
  geom_qq_line(colour = "blue") +
  facet_wrap(~year)

# Box plot with year as a factor on the x axis, raw points jittered on top;
# the legend is hidden.
p3 <- ggplot(asia_2years, aes(x = factor(year), y = lifeExp)) +
  geom_boxplot(aes(fill = factor(year))) +
  geom_jitter(alpha = 0.4) +
  theme(legend.position = "none")

library(patchwork)
# Layout: histogram stacked over the Q-Q plot, box plot alongside.
(p1 / p2) | p3

# Two-sample t-test of life expectancy between the two years.
t.test(lifeExp ~ year, data = asia_2years)
##
##  Welch Two Sample t-test
##
## data:  lifeExp by year
## t = -4.7007, df = 63.869, p-value = 1.428e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -15.681981  -6.327769
## sample estimates:
## mean in group 1952 mean in group 1972
##           46.31439           57.31927
# Exercise 3
## In 2007, in which continents did mean life expectancy differ from 70?
# One-sample t-test of life expectancy against mu = 70, run separately for
# each continent using the 2007 rows only. Each test result is passed to
# tidy() (presumably broom::tidy -- confirm it is loaded earlier in the file)
# so that do() can bind the per-continent results into one table.
mydata %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  do(
    # Inside do(), `.` is the data for the current continent.
    # Note: plain `$` here -- a backslash-escaped `\$` is not valid R.
    t.test(.$lifeExp, mu = 70) %>%
      tidy()
  )
## # A tibble: 5 x 9
## # Groups:   continent [5]
##   continent estimate statistic  p.value parameter conf.low conf.high method
##   <fct>        <dbl>     <dbl>    <dbl>     <dbl>    <dbl>     <dbl> <chr>
## 1 Africa        54.8   -11.4   1.33e-15        51     52.1      57.5 One S…
## 2 Americas      73.6     4.06  4.50e- 4        24     71.8      75.4 One S…
## 3 Asia          70.7     0.525 6.03e- 1        32     67.9      73.6 One S…
## 4 Europe        77.6    14.1   1.76e-14        29     76.5      78.8 One S…
## 5 Oceania       80.7    20.8   3.06e- 2         1     74.2      87.3 One S…
## # … with 1 more variable: alternative <chr>
# Exercise 4
## Use Kruskal-Wallis to determine if the mean population changed
## significantly through the 1990s/2000s in individual continents.

# Box plots of population by year (years from 1990 onwards), one panel per
# continent.
ggplot(
  filter(mydata, year >= 1990),
  aes(x = factor(year), y = pop)
) +
  geom_boxplot() +
  facet_wrap(~continent)

# Kruskal-Wallis test of population across years, run separately for each
# continent; each result is converted with tidy() before do() combines them.
mydata %>%
  filter(year >= 1990) %>%
  group_by(continent) %>%
  do(tidy(kruskal.test(pop ~ year, data = .)))
## # A tibble: 5 x 5
## # Groups:   continent [5]
##   continent statistic p.value parameter method
##   <fct>         <dbl>   <dbl>     <int> <chr>
## 1 Africa        2.10    0.553         3 Kruskal-Wallis rank sum test
## 2 Americas      0.847   0.838         3 Kruskal-Wallis rank sum test
## 3 Asia          1.57    0.665         3 Kruskal-Wallis rank sum test
## 4 Europe        0.207   0.977         3 Kruskal-Wallis rank sum test
## 5 Oceania       1.67    0.644         3 Kruskal-Wallis rank sum test