library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(broom)
bears <- read_csv("./data/bears.csv") %>%
  rename(active_hunters = `Active Hunters`) %>%
  filter(WMU != "Total") %>%
  filter(WMU %in% c("82A", "83", "84", "73", "74", "75", "76"))
bears
## # A tibble: 63 x 4
##    WMU    Year active_hunters Harvest
##    <chr> <dbl>          <dbl>   <dbl>
##  1 73     2012             58       4
##  2 73     2013             44       0
##  3 73     2014             52       2
##  4 73     2015             79       7
##  5 73     2016            111      15
##  6 73     2017            109       7
##  7 73     2018             99       9
##  8 73     2019            144      21
##  9 73     2020            148      13
## 10 74     2012             74       9
## # ... with 53 more rows
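Before fitting anything, a quick scatterplot (a small sketch, not part of the original output) gives a sense of the relationship between hunters and harvest:

# Quick look at the raw relationship between active hunters and harvest
ggplot(bears, aes(x = active_hunters, y = Harvest)) +
  geom_point()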
bears_lm <- lm(Harvest ~ active_hunters, data = bears)
summary(bears_lm)
## 
## Call:
## lm(formula = Harvest ~ active_hunters, data = bears)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2036  -2.8621   0.0816   3.3201  25.6144 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -5.048595   1.560276  -3.236  0.00196 ** 
## active_hunters  0.188743   0.009188  20.542  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.749 on 61 degrees of freedom
## Multiple R-squared:  0.8737, Adjusted R-squared:  0.8716 
## F-statistic:   422 on 1 and 61 DF,  p-value: < 2.2e-16
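As a quick sanity check (a sketch, not part of the original walkthrough), the residual standard error reported above can be reproduced directly from the residuals:

# Residual standard error by hand: sqrt(RSS / residual df); should be ~7.749
sqrt(sum(residuals(bears_lm)^2) / df.residual(bears_lm))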
We are interested in a 95% confidence interval for the mean harvest when the number of active hunters equals 150. Using the functions built into R, we can easily obtain this interval.
predict(
  bears_lm,
  newdata = data.frame(active_hunters = 150),
  interval = "confidence"
)
##        fit      lwr      upr
## 1 23.26288 21.28423 25.24152
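predict() also takes several new values at once. A minimal sketch, with hunter counts chosen purely for illustration:

# Confidence intervals for several hypothetical hunter counts at once
new_hunters <- data.frame(active_hunters = c(50, 100, 150))
predict(bears_lm, newdata = new_hunters, interval = "confidence")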
The formula for a 95% confidence interval for the mean response is:
$$\hat{Y} \pm t_{1-\alpha/2,\, df} \cdot \hat{\sigma} \cdot \sqrt{\frac{1}{n} + \frac{(X^* - \bar{X})^2}{\sum_i (X_i - \bar{X})^2}}$$
Let's collect all these values and store them in individual columns in a tibble, using broom::augment() and broom::glance().

broom::augment() augments a model by returning the data used to fit it along with diagnostic information.

augment(bears_lm)
## # A tibble: 63 x 8
##    Harvest active_hunters .fitted  .resid   .hat .sigma    .cooksd .std.resid
##      <dbl>          <dbl>   <dbl>   <dbl>  <dbl>  <dbl>      <dbl>      <dbl>
##  1       4             58    5.90 -1.90   0.0237   7.81 0.000745      -0.248 
##  2       0             44    3.26 -3.26   0.0269   7.80 0.00251       -0.426 
##  3       2             52    4.77 -2.77   0.0250   7.81 0.00167       -0.361 
##  4       7             79    9.86 -2.86   0.0199   7.80 0.00141       -0.373 
##  5      15            111   15.9  -0.902  0.0165   7.81 0.000116      -0.117 
##  6       7            109   15.5  -8.52   0.0166   7.73 0.0104        -1.11  
##  7       9             99   13.6  -4.64   0.0174   7.79 0.00324       -0.604 
##  8      21            144   22.1  -1.13   0.0161   7.81 0.000177      -0.147 
##  9      13            148   22.9  -9.89   0.0162   7.71 0.0136        -1.29  
## 10       9             74    8.92  0.0816 0.0207   7.81 0.00000120     0.0106
## # ... with 53 more rows
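augment() also accepts a newdata argument, so we can score new hunter counts directly (a sketch; the x values below are just for illustration):

# Fitted values for new x values via augment()
augment(bears_lm, newdata = tibble(active_hunters = c(55, 150)))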
broom::glance() returns general information on the model and goodness-of-fit measures.

glance(bears_lm)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC deviance df.residual  nobs
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
## 1     0.874         0.872  7.75      422. 4.26e-29     1  -217.  441.  447.    3663.          61    63
augment(bears_lm) %>%
  summarise(
    xbar = mean(active_hunters),
    Sxx  = sum((active_hunters - xbar)^2)
  ) %>%
  bind_cols(
    glance(bears_lm) %>% select(sigma, df.residual, nobs)
  )
## # A tibble: 1 x 5
##    xbar     Sxx sigma df.residual  nobs
##   <dbl>   <dbl> <dbl>       <int> <int>
## 1  132. 711260.  7.75          61    63
augment(bears_lm) %>%
  summarise(
    xbar = mean(active_hunters),
    Sxx  = sum((active_hunters - xbar)^2)
  ) %>%
  bind_cols(
    glance(bears_lm) %>% select(sigma, df.residual, nobs)
  ) %>%
  mutate(
    alpha = 0.05,
    beta0 = coef(bears_lm)[1],
    beta1 = coef(bears_lm)[2],
    xstar = 150,
    yhat  = beta0 + beta1 * xstar,
    tval  = qt(1 - alpha/2, df = df.residual)
  )
## # A tibble: 1 x 11
##    xbar     Sxx sigma df.residual  nobs alpha beta0 beta1 xstar  yhat  tval
##   <dbl>   <dbl> <dbl>       <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  132. 711260.  7.75          61    63  0.05 -5.05 0.189   150  23.3  2.00
Finally, create a variable to store these values.
values <- augment(bears_lm) %>%
  summarise(
    xbar = mean(active_hunters),
    Sxx  = sum((active_hunters - xbar)^2)
  ) %>%
  bind_cols(
    glance(bears_lm) %>% select(sigma, df.residual, nobs)
  ) %>%
  mutate(
    alpha = 0.05,
    beta0 = coef(bears_lm)[1],
    beta1 = coef(bears_lm)[2],
    xstar = 150,
    yhat  = beta0 + beta1 * xstar,
    tval  = qt(1 - alpha/2, df = df.residual)
  )
values %>%
  summarise(
    fit   = beta0 + beta1 * xstar,
    lower = fit - tval * sigma * sqrt((1 / nobs) + (xstar - xbar)^2 / Sxx),
    upper = fit + tval * sigma * sqrt((1 / nobs) + (xstar - xbar)^2 / Sxx)
  ) %>%
  as.matrix()  # only to see more digits
##           fit    lower    upper
## [1,] 23.26288 21.28423 25.24152
Compare these to our results from predict():
predict(
  bears_lm,
  newdata = data.frame(active_hunters = 150),
  interval = "confidence"
)
##        fit      lwr      upr
## 1 23.26288 21.28423 25.24152
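As a visual check (a sketch, not part of the original analysis), geom_smooth(method = "lm") shades the same 95% confidence band for the mean response across the observed range of active_hunters:

# Shaded ribbon = pointwise 95% confidence band for the mean response,
# matching predict(..., interval = "confidence") at each x
ggplot(bears, aes(x = active_hunters, y = Harvest)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE)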
We are interested in a 90% prediction interval for the response value of a new observation, where the number of active hunters equals 55. Using the functions built into R, we can easily obtain this interval.
predict(
  bears_lm,
  newdata = data.frame(active_hunters = 55),
  interval = "prediction",
  level = 0.90
)
##        fit       lwr      upr
## 1 5.332277 -7.766788 18.43134
The formula for a 90% prediction interval for the response value of a new observation is:
$$\hat{Y} \pm t_{1-\alpha/2,\, df} \cdot \hat{\sigma} \cdot \sqrt{1 + \frac{1}{n} + \frac{(X^* - \bar{X})^2}{\sum_i (X_i - \bar{X})^2}}$$
We can reuse the values that we had previously collected and update alpha, xstar, and tval:
values2 <- values %>%
  mutate(
    alpha = 0.10,
    xstar = 55,
    tval  = qt(1 - alpha/2, df = df.residual)
  )
values2
## # A tibble: 1 x 11
##    xbar     Sxx sigma df.residual  nobs alpha beta0 beta1 xstar  yhat  tval
##   <dbl>   <dbl> <dbl>       <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  132. 711260.  7.75          61    63   0.1 -5.05 0.189    55  23.3  1.67
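One thing to watch: the yhat column still shows 23.3 because the mutate() above only updated alpha, xstar, and tval. The summarise() below recomputes the fit from beta0 and beta1, so the interval is unaffected, but if you want the stored yhat to stay consistent you could refresh it (an optional sketch):

# Optional: recompute yhat so it matches the new xstar = 55
values2 %>%
  mutate(yhat = beta0 + beta1 * xstar)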
The formula differs only by a 1+ term under the square root.
values2 %>%
  summarise(
    fit   = beta0 + beta1 * xstar,
    lower = fit - tval * sigma * sqrt(1 + (1 / nobs) + (xstar - xbar)^2 / Sxx),
    upper = fit + tval * sigma * sqrt(1 + (1 / nobs) + (xstar - xbar)^2 / Sxx)
  ) %>%
  as.matrix()  # only to see more digits
##           fit     lower    upper
## [1,] 5.332277 -7.766788 18.43134
Compare these to our results from predict():
predict(
  bears_lm,
  newdata = data.frame(active_hunters = 55),
  interval = "prediction",
  level = 0.90
)
##        fit       lwr      upr
## 1 5.332277 -7.766788 18.43134
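To see how the two interval types compare side by side, we can ask predict() for both across a range of hypothetical hunter counts (the grid below is made up purely for illustration; prediction intervals are always wider):

# Illustrative grid of active-hunter counts
grid <- data.frame(active_hunters = seq(40, 200, by = 40))
cbind(
  grid,
  conf = predict(bears_lm, newdata = grid, interval = "confidence"),
  pred = predict(bears_lm, newdata = grid, interval = "prediction", level = 0.90)
)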
We can extract the design matrix, X, from our linear model using the model.matrix() function.
X <- model.matrix(bears_lm)
head(X)
##   (Intercept) active_hunters
## 1           1             58
## 2           1             44
## 3           1             52
## 4           1             79
## 5           1            111
## 6           1            109
t(X) %*% X
##                (Intercept) active_hunters
## (Intercept)             63           8345
## active_hunters        8345        1816641
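As an aside, base R's crossprod() computes the same product in a single call:

# Equivalent to t(X) %*% X
crossprod(X)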
The hat matrix is defined as
$$H = X (X^T X)^{-1} X^T$$
H <- X %*% solve(t(X) %*% X) %*% t(X)
H[1:6, 1:6]
##            1          2          3          4          5          6
## 1 0.02366811 0.02513375 0.02429624 0.02146967 0.01811965 0.01832903
## 2 0.02513375 0.02687494 0.02587997 0.02252195 0.01854206 0.01879081
## 3 0.02429624 0.02587997 0.02497498 0.02192064 0.01830069 0.01852693
## 4 0.02146967 0.02252195 0.02192064 0.01989125 0.01748603 0.01763636
## 5 0.01811965 0.01854206 0.01830069 0.01748603 0.01652052 0.01658087
## 6 0.01832903 0.01879081 0.01852693 0.01763636 0.01658087 0.01664684
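The diagonal of H holds the leverages, which is exactly the .hat column that augment() returned earlier. A quick check:

# Leverages from the hat matrix vs. the .hat column from augment()
head(diag(H))
head(augment(bears_lm)$.hat)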
The variance-covariance matrix of the parameter estimates is obtained as
$$\mathrm{Var}(\hat{\beta}) = \hat{\sigma}^2 (X^T X)^{-1}$$
We already have the value of $\hat{\sigma}$ from our previous collection of values, so we just need to remember to square it.
var_beta <- values$sigma^2 * solve(t(X) %*% X)
var_beta
##                (Intercept) active_hunters
## (Intercept)     2.43446196  -1.118305e-02
## active_hunters -0.01118305   8.442565e-05
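Base R provides this matrix directly via vcov(), which should agree with our hand computation:

# Variance-covariance matrix of the coefficient estimates from base R
vcov(bears_lm)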
Since the variances lie along the diagonal, taking their square roots gives the standard errors. We can extract the diagonal elements of a matrix using diag().
se_beta <- sqrt(diag(var_beta))
se_beta
##    (Intercept) active_hunters 
##    1.560276244    0.009188343
Compare these with the standard errors computed by summary() or broom::tidy():
tidy(bears_lm)
## # A tibble: 2 x 5
##   term           estimate std.error statistic  p.value
##   <chr>             <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)      -5.05    1.56        -3.24 1.96e- 3
## 2 active_hunters    0.189   0.00919     20.5  4.26e-29
They're the same (as they should be)!
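As a final sketch, the t statistics and p-values that tidy() reports can also be reproduced from the pieces we already have:

# Reproduce tidy()'s statistic and p.value columns by hand
t_stat <- coef(bears_lm) / se_beta
p_val  <- 2 * pt(abs(t_stat), df = df.residual(bears_lm), lower.tail = FALSE)
cbind(estimate = coef(bears_lm), std.error = se_beta,
      statistic = t_stat, p.value = p_val)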