# Methods for estimating the confidence of regression coefficients in R
# Ben Fasoli
# Packages used below: gtrends() comes from gtrendsR, the pipe and select()
# from dplyr, and pivot_wider() from tidyr. Loading them here keeps the script
# runnable from a fresh session. The workspace is intentionally not cleared
# with rm(list = ls()), which would silently delete the caller's objects.
library(gtrendsR)
library(dplyr)
library(tidyr)

# Get google trend data for searches for skis and ski boots
query <- c('skis', 'ski boots')
trend <- gtrends(query, geo = 'US', time = 'all')

# Reshape the long table (a keyword column plus a hits column) into one hits
# column per keyword, then keep only the date and the two series of interest.
trend_ts <- trend$interest_over_time %>%
  pivot_wider(names_from = keyword, values_from = hits) %>%
  select(date, skis, `ski boots`)
# Linear regression to predict searches for skis by searches for ski boots
#   skis = m * `ski boots` + b
mod <- lm(skis ~ `ski boots`, data = trend_ts)

# Calculate the confidence interval of the slope by hand.
# Every input is read back from the fitted model rather than from trend_ts, so
# the vectors stay aligned when lm() drops rows with missing values (default
# na.action) instead of silently recycling mismatched lengths.
yi <- model.response(model.frame(mod)) # actual values used in the fit
yih <- predict(mod)                    # model predicted values
N <- nobs(mod)                         # number of observations used in the fit
xi <- model.matrix(mod)[, 2]           # independent variable values
xh <- mean(xi)                         # mean of independent variable
DF <- df.residual(mod)                 # degrees of freedom (N - 2 here)

# Calculate coefficient errors from the sum of squared errors
se_sse <- sqrt(sum((yi - yih)^2) / DF) / sqrt(sum((xi - xh)^2))
# 0.06204275

# or from the variances (diagonals given by variance-covariance matrix)
se <- sqrt(diag(vcov(mod)))[2]
# 0.06204275

# Both routes estimate the same quantity; fail loudly if they ever disagree
# instead of letting the second assignment mask the first.
stopifnot(isTRUE(all.equal(se_sse, unname(se))))

# Manually calculate 95% confidence interval
alpha <- 1 - 0.95
cp <- 1 - alpha / 2 # critical probability
cv <- qt(cp, DF)    # critical value from quantile of t-distribution
me <- cv * se       # margin of error

# Lower bound, point estimate and upper bound of the slope
ci <- rep(coef(mod)['`ski boots`'], times = 3) + c(-me, 0, me)
ci
# `ski boots` `ski boots` `ski boots`
#    2.485902    2.608391    2.730880
# Validate with confidence interval derived by R from the model object
confint(mod, level = 0.95)
#                 2.5 %   97.5 %
# (Intercept) 13.476521 16.26735
# `ski boots`  2.485902  2.73088

# Compare coefficient standard error calculation methods
# To first order, the margin of error at the 95% confidence level should be
# roughly 2 * se assuming a gaussian distribution

# Derived from variance-covariance matrix diagonals (variances)
se
# [1] 0.06204275
se * 2
# 0.1240855

# Exact margin of error using the t-distribution critical value. The recorded
# value equals the half-width of the manual interval computed earlier
# (2.730880 - 2.608391).
me
# 0.1224892

# Calculated within summary.lm table
# NOTE(review): the table below is printed summary.lm output; the call that
# produced it was missing from this section and is presumed to be summary(mod).
summary(mod)
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)
# (Intercept) 14.87194    0.70680   21.04   <2e-16 ***
# `ski boots`  2.60839    0.06204   42.04   <2e-16 ***
# Alternative technique to estimate error in regression coefficients using
# random sampling with replacement (bootstrapping). Results are highly sensitive
# to the size parameter of the resampled populations
#
# bootstrap(x, fun, size, iter)
#   x    : data frame (or matrix) whose rows are resampled
#   fun  : function applied to each resampled copy of x
#   size : number of rows drawn per resample (defaults to nrow(x))
#   iter : number of resamples to draw
# Returns the sapply()-simplified collection of fun's results, one per
# resample. Draws use the global RNG, so call set.seed() beforehand for
# reproducible output.
bootstrap <- function (x, fun, size = nrow(x), iter = 10) {
  # seq_len() yields zero iterations for iter = 0, where 1:iter would give 2.
  # sapply() is kept (rather than vapply) because fun's return shape is not
  # constrained by this interface.
  sapply(seq_len(iter), function(i) {
    # sample.int() draws row indices with replacement and, unlike 1:nrow(x),
    # does not invent rows 1 and 0 when x is empty
    id <- sample.int(nrow(x), size = size, replace = TRUE)
    # drop = FALSE keeps a one-column input as a table instead of a vector
    y <- x[id, , drop = FALSE]
    fun(y)
  })
}
# Refit the regression on 10000 resampled copies of the data and collect the
# slope from each fit
co <- bootstrap(trend_ts, iter = 10000, fun = function(df) {
  # Extract second coefficient (slope) from model output
  coef(lm(skis ~ `ski boots`, data = df))[2]
})

# Spread of the resampled slopes, to compare against the analytic standard
# error of the slope.
# NOTE(review): the recorded value below had no statement producing it; it is
# presumed to be sd(co), since it is close to the analytic 0.06204275 - confirm.
sd(co)
# 0.06463646
