library(tidyverse) # for graphing and data cleaning
library(tidymodels) # for modeling
library(stacks) # for stacking models
library(naniar) # for analyzing missing values
library(vip) # for variable importance plots
library(usemodels) # for tidymodels suggestions
library(xgboost) # for boosting - must be installed, though loading it is optional since parsnip calls the engine
library(doParallel) # for parallel processing
library(lubridate) # for dates
library(moderndive) # for King County housing data
library(patchwork) # for combining plots nicely
library(rmarkdown) # for paged tables
theme_set(theme_minimal()) # Lisa's favorite theme
pollution <- read.csv("data_standardized.csv")
pollution %>%
select(where(is.numeric)) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(vars(variable),
scales = "free",
nrow = 5)
pollution %>%
select(Diseases.of.heart) %>%
ggplot(aes(x = Diseases.of.heart)) +
geom_histogram()
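Before imputing, it is worth quantifying how much is actually missing in each column. A quick sketch using naniar (loaded above), whose miss_var_summary() returns per-variable missing counts and percentages:
# Per-variable missing-value counts and percentages, largest first
pollution %>%
  miss_var_summary() %>%
  head(10)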
# Impute missing values in each numeric column with that column's median
# (skipping character columns, where a median is undefined)
for (i in 1:ncol(pollution)) {
  if (is.numeric(pollution[[i]])) {
    col_median <- median(pollution[[i]], na.rm = TRUE)
    pollution[[i]][is.na(pollution[[i]])] <- col_median
  }
}
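For reference, the same median imputation can be written without the explicit loop; a sketch using dplyr and tidyr (both attached via tidyverse):
# Equivalent vectorized version: replace NAs in every numeric column
# with that column's median
pollution <- pollution %>%
  mutate(across(where(is.numeric),
                ~ replace_na(.x, median(.x, na.rm = TRUE))))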
pollution_mod <- pollution %>%
mutate(log_heart = log(Diseases.of.heart, base = 10)) %>%
select(-Diseases.of.heart) %>%
mutate(across(where(is.character), as.factor)) %>%
add_n_miss() %>%
filter(n_miss_all == 0) %>%
select(-n_miss_all)
set.seed(456)
pollution_split <- initial_split(pollution_mod, prop = .75)
pollution_training <- training(pollution_split)
pollution_testing <- testing(pollution_split)
pollution_recipe <- recipe(log_heart ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
#Diseases.of.heart,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct,
asthma_er_avg) %>%
step_normalize(all_predictors(),
-all_nominal()) %>%
step_dummy(all_nominal(),
-all_outcomes())
pollution_recipe %>%
prep(pollution_training) %>%
juice()
## # A tibble: 114 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.769 1.91 -0.438 1.20
## 2 -0.791 -0.0862 -0.176 -0.426
## 3 0.0376 -0.153 -0.176 0.136
## 4 0.130 -0.118 -0.176 0.0213
## 5 -0.851 0.0999 -0.969 -0.949
## 6 -0.625 -0.107 -0.860 -0.442
## 7 0.861 -0.0905 -0.176 -0.150
## 8 1.70 0.645 -0.176 -1.08
## 9 -1.25 -1.03 -0.176 -0.150
## 10 3.09 -0.0905 -0.176 -0.150
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
pollution_lasso_mod <-
linear_reg(mixture = 1) %>%
set_engine("glmnet") %>%
set_args(penalty = tune()) %>%
set_mode("regression")
pollution_lasso_wf <-
workflow() %>%
add_recipe(pollution_recipe) %>%
add_model(pollution_lasso_mod)
penalty_grid <- grid_regular(penalty(),
levels = 20)
penalty_grid
## # A tibble: 20 × 1
## penalty
## <dbl>
## 1 1 e-10
## 2 3.36e-10
## 3 1.13e- 9
## 4 3.79e- 9
## 5 1.27e- 8
## 6 4.28e- 8
## 7 1.44e- 7
## 8 4.83e- 7
## 9 1.62e- 6
## 10 5.46e- 6
## 11 1.83e- 5
## 12 6.16e- 5
## 13 2.07e- 4
## 14 6.95e- 4
## 15 2.34e- 3
## 16 7.85e- 3
## 17 2.64e- 2
## 18 8.86e- 2
## 19 2.98e- 1
## 20 1 e+ 0
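These 20 values are evenly spaced on the log10 scale over penalty()'s default range of 10^-10 to 10^0; a quick sketch to confirm:
# Adjacent penalties should differ by a constant step in log10 units
penalty_grid %>%
  mutate(log10_penalty = log10(penalty)) %>%
  summarize(step = mean(diff(log10_penalty)))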
ctrl_grid <- control_stack_grid()
set.seed(456)
pollution_cv <- vfold_cv(pollution_training, v = 5)
pollution_lasso_tune <-
pollution_lasso_wf %>%
tune_grid(
resamples = pollution_cv,
grid = penalty_grid,
control = ctrl_grid
)
pollution_lasso_tune
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [91/23]> Fold1 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 2 <split [91/23]> Fold2 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 3 <split [91/23]> Fold3 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 4 <split [91/23]> Fold4 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 5 <split [92/22]> Fold5 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
pollution_lasso_tune %>%
select(id, .metrics) %>%
unnest(.metrics) %>%
filter(.metric == "rmse")
## # A tibble: 100 × 6
## id penalty .metric .estimator .estimate .config
## <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 Fold1 1 e-10 rmse standard 0.0656 Preprocessor1_Model01
## 2 Fold1 3.36e-10 rmse standard 0.0656 Preprocessor1_Model02
## 3 Fold1 1.13e- 9 rmse standard 0.0656 Preprocessor1_Model03
## 4 Fold1 3.79e- 9 rmse standard 0.0656 Preprocessor1_Model04
## 5 Fold1 1.27e- 8 rmse standard 0.0656 Preprocessor1_Model05
## 6 Fold1 4.28e- 8 rmse standard 0.0656 Preprocessor1_Model06
## 7 Fold1 1.44e- 7 rmse standard 0.0656 Preprocessor1_Model07
## 8 Fold1 4.83e- 7 rmse standard 0.0656 Preprocessor1_Model08
## 9 Fold1 1.62e- 6 rmse standard 0.0656 Preprocessor1_Model09
## 10 Fold1 5.46e- 6 rmse standard 0.0656 Preprocessor1_Model10
## # … with 90 more rows
pollution_lasso_tune %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
labs(x = "penalty", y = "rmse")
pollution_lasso_tune %>%
show_best(metric = "rmse")
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 e-10 rmse standard 0.0797 5 0.00856 Preprocessor1_Model01
## 2 3.36e-10 rmse standard 0.0797 5 0.00856 Preprocessor1_Model02
## 3 1.13e- 9 rmse standard 0.0797 5 0.00856 Preprocessor1_Model03
## 4 3.79e- 9 rmse standard 0.0797 5 0.00856 Preprocessor1_Model04
## 5 1.27e- 8 rmse standard 0.0797 5 0.00856 Preprocessor1_Model05
best_param <- pollution_lasso_tune %>%
select_best(metric = "rmse")
best_param
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 0.0000000001 Preprocessor1_Model01
one_se_param <- pollution_lasso_tune %>%
select_by_one_std_err(metric = "rmse", desc(penalty))
one_se_param
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.00234 rmse standard 0.0861 5 0.00366 Preprocessor1_M… 0.0797 0.0883
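For intuition, this is roughly what select_by_one_std_err() does: take the best mean RMSE, add its standard error to get a bound (the .best and .bound columns above), keep the candidates under that bound, and choose the largest penalty among them. A sketch:
# Manual version of the one-standard-error rule
pollution_lasso_tune %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  mutate(bound = min(mean) + std_err[which.min(mean)]) %>%
  filter(mean <= bound) %>%
  slice_max(penalty, n = 1)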
pollution_lasso_final_wf <- pollution_lasso_wf %>%
finalize_workflow(one_se_param)
pollution_lasso_final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 0.00233572146909012
## mixture = 1
##
## Computational engine: glmnet
pollution_lasso_final_mod <- pollution_lasso_final_wf %>%
fit(data = pollution_training)
pollution_lasso_final_mod %>%
pull_workflow_fit() %>%
tidy()
## # A tibble: 33 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -2.76 0.00234
## 2 annual_mean_pm25 0.0134 0.00234
## 3 annual_mean_ozone 0.00800 0.00234
## 4 annual_mean_pb -0.00795 0.00234
## 5 annual_mean_pm10 -0.0126 0.00234
## 6 annual_mean_co -0.0170 0.00234
## 7 annual_mean_no2 0.0106 0.00234
## 8 annual_mean_so2 -0.0190 0.00234
## 9 hpi2score 0 0.00234
## 10 economic 0 0.00234
## # … with 23 more rows
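Because the lasso penalty shrinks some coefficients exactly to zero, we can count how many predictors actually stay in the model; a quick sketch:
# How many coefficients did the lasso zero out vs. keep?
pollution_lasso_final_mod %>%
  pull_workflow_fit() %>%
  tidy() %>%
  count(zeroed = estimate == 0)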
pollution_lasso_test <- pollution_lasso_final_wf %>%
last_fit(pollution_split)
# Metrics for model applied to test data
pollution_lasso_test %>%
collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.0628 Preprocessor1_Model1
## 2 rsq standard 0.726 Preprocessor1_Model1
use_xgboost(log_heart ~ ., data = pollution_training)
## xgboost_recipe <-
## recipe(formula = log_heart ~ ., data = pollution_training) %>%
## step_novel(all_nominal(), -all_outcomes()) %>%
## step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("regression") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(90624)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
boost_recipe <-
recipe(formula = log_heart ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
#Diseases.of.heart,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct,
asthma_er_avg) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors(),
-all_nominal())
boost_recipe %>%
prep() %>%
juice()
## # A tibble: 114 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.769 1.91 -0.438 1.20
## 2 -0.791 -0.0862 -0.176 -0.426
## 3 0.0376 -0.153 -0.176 0.136
## 4 0.130 -0.118 -0.176 0.0213
## 5 -0.851 0.0999 -0.969 -0.949
## 6 -0.625 -0.107 -0.860 -0.442
## 7 0.861 -0.0905 -0.176 -0.150
## 8 1.70 0.645 -0.176 -1.08
## 9 -1.25 -1.03 -0.176 -0.150
## 10 3.09 -0.0905 -0.176 -0.150
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
boost_spec <- boost_tree(
  trees = 1000,           # number of trees, T in the equations above
  tree_depth = 2,         # max depth of each tree
  min_n = 5,              # min points required for a node to be split further
  loss_reduction = 10^-5, # min loss reduction to split - smaller allows more splits
  sample_size = 1,        # proportion of training data to use
  learn_rate = tune(),    # lambda from the equations above
  stop_iter = 50          # iterations without improvement before stopping early
) %>%
  set_engine("xgboost", colsample_bytree = 1) %>%
  set_mode("regression")
boost_grid <- grid_regular(learn_rate(),
levels = 10)
boost_grid
## # A tibble: 10 × 1
## learn_rate
## <dbl>
## 1 0.0000000001
## 2 0.000000001
## 3 0.00000001
## 4 0.0000001
## 5 0.000001
## 6 0.00001
## 7 0.0001
## 8 0.001
## 9 0.01
## 10 0.1
boost_wf <- workflow() %>%
add_recipe(boost_recipe) %>%
add_model(boost_spec)
set.seed(456)
val_split <- validation_split(pollution_training,
prop = .6)
val_split
## # Validation Set Split (0.6/0.4)
## # A tibble: 1 × 2
## splits id
## <list> <chr>
## 1 <split [68/46]> validation
set.seed(456)
registerDoParallel()
boost_tune <- boost_wf %>%
  tune_grid(
    # we tune on the 5-fold CV object (rather than the single validation
    # split created above) so these results share resamples with the
    # other candidates, which stacking requires
    # resamples = val_split,
    resamples = pollution_cv,
    grid = boost_grid,
    control = ctrl_grid
  )
collect_metrics(boost_tune)
## # A tibble: 20 × 7
## learn_rate .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model01
## 2 0.0000000001 rsq standard NaN 0 NA Preprocessor1_Model01
## 3 0.000000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model02
## 4 0.000000001 rsq standard NaN 0 NA Preprocessor1_Model02
## 5 0.00000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model03
## 6 0.00000001 rsq standard NaN 0 NA Preprocessor1_Model03
## 7 0.0000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model04
## 8 0.0000001 rsq standard NaN 0 NA Preprocessor1_Model04
## 9 0.000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model05
## 10 0.000001 rsq standard NaN 0 NA Preprocessor1_Model05
## 11 0.00001 rmse standard 3.23 5 0.0115 Preprocessor1_Model06
## 12 0.00001 rsq standard NaN 0 NA Preprocessor1_Model06
## 13 0.0001 rmse standard 2.96 5 0.0117 Preprocessor1_Model07
## 14 0.0001 rsq standard NaN 0 NA Preprocessor1_Model07
## 15 0.001 rmse standard 1.22 5 0.0130 Preprocessor1_Model08
## 16 0.001 rsq standard NaN 0 NA Preprocessor1_Model08
## 17 0.01 rmse standard 0.0701 5 0.00863 Preprocessor1_Model09
## 18 0.01 rsq standard 0.749 5 0.0468 Preprocessor1_Model09
## 19 0.1 rmse standard 0.0548 5 0.00336 Preprocessor1_Model10
## 20 0.1 rsq standard 0.811 5 0.0227 Preprocessor1_Model10
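The NaN R-squared values at the small learning rates are expected: with such tiny steps, the boosted model barely moves away from its constant starting prediction (about 0.5, xgboost's default base score), and R-squared is undefined for a constant predictor. The saved fold predictions show this directly; a sketch:
# The out-of-fold predictions for the smallest learning rate are
# essentially constant, which is why rsq is NaN there
collect_predictions(boost_tune) %>%
  filter(.config == "Preprocessor1_Model01") %>%
  summarize(sd_pred = sd(.pred))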
collect_metrics(boost_tune) %>%
filter(.metric == "rmse") %>%
ggplot(aes(x = learn_rate, y = mean)) +
geom_point() +
geom_line() +
scale_x_log10() +
labs(y = "rmse") +
theme_minimal()
best_lr <- select_best(boost_tune, metric = "rmse")
best_lr
## # A tibble: 1 × 2
## learn_rate .config
## <dbl> <chr>
## 1 0.1 Preprocessor1_Model10
# finalize workflow
final_boost_wf <- finalize_workflow(
boost_wf,
best_lr
)
# fit final
final_boost <- final_boost_wf %>%
fit(data = pollution_training)
final_boost %>%
pull_workflow_fit() %>%
vip(geom = "col")
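If you prefer the importance scores as numbers rather than bars, vip's vi() should return the same values as a tibble; a sketch:
# Variable importance scores as a tibble (same values the plot uses)
final_boost %>%
  pull_workflow_fit() %>%
  vi() %>%
  head(10)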
# Use model on test data
test_preds <- pollution_testing %>%
bind_cols(predict(final_boost, new_data = pollution_testing))
# Compute test rmse
test_preds %>%
summarize(rmse = sqrt(mean((log_heart - .pred)^2))) %>%
pull(rmse)
## [1] 0.04564211
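The same quantity can be computed with yardstick, which keeps it consistent with the collect_metrics() output elsewhere; a sketch:
# Equivalent test RMSE via yardstick
test_preds %>%
  rmse(truth = log_heart, estimate = .pred)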
# Graph results
test_preds %>%
ggplot(aes(x = log_heart,
y = .pred)) +
geom_point(alpha = .5,
size = .5) +
geom_smooth(se = FALSE) +
geom_abline(slope = 1,
intercept = 0,
color = "darkred") +
labs(x = "Actual log(heart)",
y = "Predicted log(heart)")
pollution_rfrecipe <- recipe(log_heart ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
#Diseases.of.heart,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct,
asthma_er_avg) %>%
step_normalize(all_predictors(),
-all_nominal()) %>%
step_dummy(all_nominal(),
-all_outcomes())
pollution_rfrecipe %>%
prep(pollution_training) %>%
juice()
## # A tibble: 114 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.769 1.91 -0.438 1.20
## 2 -0.791 -0.0862 -0.176 -0.426
## 3 0.0376 -0.153 -0.176 0.136
## 4 0.130 -0.118 -0.176 0.0213
## 5 -0.851 0.0999 -0.969 -0.949
## 6 -0.625 -0.107 -0.860 -0.442
## 7 0.861 -0.0905 -0.176 -0.150
## 8 1.70 0.645 -0.176 -1.08
## 9 -1.25 -1.03 -0.176 -0.150
## 10 3.09 -0.0905 -0.176 -0.150
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
ranger_pollution <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 200) %>%
set_mode("regression") %>%
set_engine("ranger")
pollution_rfworkflow <-
workflow() %>%
add_recipe(pollution_rfrecipe) %>%
add_model(ranger_pollution)
# Set up the random forest tuning grid for mtry and min_n. finalize() sets
# mtry's upper bound from the raw training data, before the recipe removes
# columns, so parsnip caps mtry at the actual predictor count when fitting.
rf_penalty_grid <- grid_regular(finalize(mtry(),
                                         pollution_training %>%
                                           select(-log_heart)),
                                min_n(),
                                levels = 3)
set.seed(456)
# same seed and data as before, so these are the identical folds used for
# the lasso and boosting candidates
pollution_cv <- vfold_cv(pollution_training, v = 5)
# Tune model
pollution_rfTUNE <-
pollution_rfworkflow %>%
tune_grid(
resamples = pollution_cv,
grid = rf_penalty_grid,
control = ctrl_grid
)
best_rmse <-
pollution_rfTUNE %>%
select_best(metric = "rmse")
best_rmse
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 63 2 Preprocessor1_Model3
pol_rfFinal <- pollution_rfworkflow %>%
finalize_workflow(best_rmse) %>%
fit(data = pollution_training)
# Note: this refits the workflow with its tuning placeholders unfinalized,
# so the Mtry and node-size values printed below are not the tuned ones;
# pol_rfFinal above is the finalized fit
pollution_rffit <- pollution_rfworkflow %>%
  fit(pollution_training)
pollution_rffit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~tune(), x), num.trees = ~200, min.node.size = min_rows(~tune(), x), num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Regression
## Number of trees: 200
## Sample size: 114
## Number of independent variables: 32
## Mtry: 32
## Target node size: 114
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.01520217
## R squared (OOB): -0.007846003
# OOB error (MSE)
pollution_rffit$fit$fit$fit$prediction.error
## [1] 0.01520217
# OOB RMSE
sqrt(pollution_rffit$fit$fit$fit$prediction.error)
## [1] 0.1232971
set.seed(456)
metric <- metric_set(rmse)
ctrl_res <- control_stack_resamples()
pollution_rfcv <- pollution_rfworkflow %>%
fit_resamples(pollution_cv,
metrics = metric,
control = ctrl_res)
collect_metrics(pollution_rfTUNE)
Now that we have our set of candidate models (9 random forest, 20 lasso, and 10 boosted tree), we're ready to stack! We'll follow the process laid out on the stacks webpage.
First, we create the stack. As the number of columns shows, some of the lasso models have been dropped, likely because their predictions were too similar to those of the other lasso candidates. The set of candidates now has 9 random forest, 7 lasso, and 10 boosted tree models.
pollution_stack <-
stacks() %>%
add_candidates(boost_tune) %>%
add_candidates(pollution_lasso_tune) %>%
add_candidates(pollution_rfTUNE)
We can look at the predictions from the candidate models in a tibble. Most of the predictions are very close to the actual values, except those from the first few boosted-tree models, whose tiny learning rates leave their predictions stuck near the 0.5 starting value.
as_tibble(pollution_stack)
## # A tibble: 114 × 28
## log_heart boost_tune_1_01 boost_tune_1_03 boost_tune_1_04 boost_tune_1_05
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -2.80 0.5 0.500 0.500 0.497
## 2 -2.77 0.5 0.500 0.500 0.497
## 3 -2.76 0.5 0.500 0.500 0.497
## 4 -2.53 0.5 0.500 0.500 0.497
## 5 -2.70 0.5 0.500 0.500 0.497
## 6 -2.79 0.5 0.500 0.500 0.497
## 7 -2.63 0.5 0.500 0.500 0.497
## 8 -2.63 0.5 0.500 0.500 0.497
## 9 -2.57 0.5 0.500 0.500 0.497
## 10 -2.52 0.5 0.500 0.500 0.497
## # … with 104 more rows, and 23 more variables: boost_tune_1_06 <dbl>,
## # boost_tune_1_07 <dbl>, boost_tune_1_08 <dbl>, boost_tune_1_09 <dbl>,
## # boost_tune_1_10 <dbl>, pollution_lasso_tune_1_01 <dbl>,
## # pollution_lasso_tune_1_11 <dbl>, pollution_lasso_tune_1_12 <dbl>,
## # pollution_lasso_tune_1_13 <dbl>, pollution_lasso_tune_1_14 <dbl>,
## # pollution_lasso_tune_1_15 <dbl>, pollution_lasso_tune_1_16 <dbl>,
## # pollution_lasso_tune_1_17 <dbl>, pollution_lasso_tune_1_18 <dbl>, …
set.seed(456)
pollution_blend <-
pollution_stack %>%
blend_predictions()
pollution_blend
## # A tibble: 3 × 3
## member type weight
## <chr> <chr> <dbl>
## 1 boost_tune_1_10 boost_tree 0.557
## 2 pollution_rfTUNE_1_3 rand_forest 0.329
## 3 pollution_lasso_tune_1_01 linear_reg 0.115
pollution_blend$metrics %>%
filter(.metric == "rmse")
## # A tibble: 6 × 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.000001 1 rmse standard 0.0542 25 0.00145 Preprocessor1_Model1
## 2 0.00001 1 rmse standard 0.0542 25 0.00145 Preprocessor1_Model2
## 3 0.0001 1 rmse standard 0.0541 25 0.00144 Preprocessor1_Model3
## 4 0.001 1 rmse standard 0.0539 25 0.00143 Preprocessor1_Model4
## 5 0.01 1 rmse standard 0.0536 25 0.00138 Preprocessor1_Model5
## 6 0.1 1 rmse standard 0.113 25 0.00284 Preprocessor1_Model6
autoplot(pollution_blend)
autoplot(pollution_blend, type = "members")
autoplot(pollution_blend, type = "weights")
pollution_final_stack <- pollution_blend %>%
fit_members()
pollution_final_stack
## # A tibble: 3 × 3
## member type weight
## <chr> <chr> <dbl>
## 1 boost_tune_1_10 boost_tree 0.557
## 2 pollution_rfTUNE_1_3 rand_forest 0.329
## 3 pollution_lasso_tune_1_01 linear_reg 0.115
pollution_final_stack %>%
predict(new_data = pollution_testing) %>%
bind_cols(pollution_testing) %>%
ggplot(aes(x = log_heart,
y = .pred)) +
geom_point(alpha = .5,
size = .5) +
geom_smooth(se = FALSE) +
geom_abline(slope = 1,
intercept = 0,
color = "darkred") +
labs(x = "Actual log(heart)",
y = "Predicted log(heart)")
library(Metrics)
data_rmse <- pollution_final_stack %>%
  predict(new_data = pollution_testing) %>%
  bind_cols(pollution_testing)
# Test RMSE of the stacked model (the outcome column is log_heart)
rmse(data_rmse$log_heart, data_rmse$.pred)
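Equivalently with yardstick; note that loading Metrics masks yardstick's rmse(), so the namespace prefix matters here:
# Test RMSE of the stacked model via yardstick
data_rmse %>%
  yardstick::rmse(truth = log_heart, estimate = .pred)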