7.5 Solutions

Solution to Exercise 7.4.2:

# GDP per capita against year for the European countries, one panel per
# country, with a straight-line (lm) fit overlaid on the points.
ggplot(
  data = filter(gapdata, continent == "Europe"),
  mapping = aes(x = year, y = gdpPercap)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(country ~ .)
## `geom_smooth()` using formula 'y ~ x'

# Countries not linear: Ireland, Montenegro, Serbia.

# Add quadratic term: refit each country's trend with a second-degree
# polynomial in year. The formula is supplied as a formula object (the form
# geom_smooth() documents) rather than as a character string, which only
# works through implicit coercion inside lm().
gapdata %>%
  filter(continent == "Europe") %>%
  ggplot(aes(x = year, y = gdpPercap)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  facet_wrap(country ~ .)

Solution to Exercise 7.4.3:

# Plot first: GDP per capita against year for Albania and Austria,
# with points coloured by country.
ggplot(
  data = filter(gapdata, country %in% c("Albania", "Austria")),
  mapping = aes(x = year, y = gdpPercap, colour = country)
) +
  geom_point()

# Fit average line between two countries: a single regression of GDP per
# capita on year, pooling the observations from both countries.
fit_both1 <- gapdata %>%
  filter(country %in% c("Albania", "Austria")) %>%
  lm(gdpPercap ~ year, data = .)

# Overlay the pooled fitted line on the coloured points. The colour mapping
# stays on the point layer only, so the line is not split by country.
ggplot(
  data = filter(gapdata, country %in% c("Albania", "Austria")),
  mapping = aes(x = year)
) +
  geom_point(aes(y = gdpPercap, colour = country)) +
  geom_line(aes(y = predict(fit_both1)))

# Fit a separate line for each country: the year * country interaction
# expands to year + country + year:country, so each country gets its own
# intercept and its own slope.
fit_both3 <- gapdata %>%
  filter(country %in% c("Albania", "Austria")) %>%
  lm(gdpPercap ~ year * country, data = .)

# Plot the fitted values; group = country draws one line per country
# instead of joining all predictions into a single path.
gapdata %>%
  filter(country %in% c("Albania", "Austria")) %>%
  ggplot() +
  geom_point(aes(x = year, y = gdpPercap, colour = country)) +
  geom_line(aes(x = year, y = predict(fit_both3), group = country))

# You can use the regression equation by hand to work out the difference

# Or pass newdata to predict to estimate the two points of interest:
# the fitted GDP per capita for each country in 1980.
gdp_1980 <- predict(
  fit_both3,
  newdata = data.frame(
    country = c("Albania", "Austria"),
    year = c(1980, 1980)
  )
)
# Difference in predicted GDP per capita: Austria (row 2) minus Albania (row 1)
gdp_1980[2] - gdp_1980[1]

Solution to Exercise 7.4.4:

# Plot data first: cholesterol against age with a straight-line (lm) fit.
# The formula is given as a formula object, the form geom_smooth() documents,
# rather than as a character string.
wcgsdata %>%
  ggplot(aes(x = age, y = chol)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
## Warning: Removed 12 rows containing non-finite values (stat_smooth).
## Warning: Removed 12 rows containing missing values (geom_point).

# Weak positive relationship

# Simple linear regression: cholesterol modelled on age alone,
# with model metrics included in the output.
dependent <- "chol"
explanatory <- "age"
finalfit(
  wcgsdata,
  dependent = dependent,
  explanatory = explanatory,
  metrics = TRUE
)
## Note: dependent includes missing data. These are dropped.
# For each year of age, cholesterol increases by 0.7 mg/100 ml. 
# This gradient differs from zero. 

# Is this effect independent of other available variables?

# Make BMI as above, then refit with the additional explanatory variables.
# NOTE(review): the 0.4536 and 0.0254 factors look like lb -> kg and
# in -> m conversions, i.e. weight in pounds and height in inches -- confirm
# against the data dictionary.
dependent <- "chol"
explanatory <- c("age", "bmi", "sbp", "smoking", "personality_2L")
wcgsdata %>%
  mutate(
    bmi = (weight * 0.4536) / (height * 0.0254)^2
  ) %>%
  finalfit(dependent, explanatory, metrics = TRUE)
## Note: dependent includes missing data. These are dropped.
# Effect size is reduced, but still present. 
# Model poorly describes data, R2=0.033.