library(tidyverse) # for graphing and data cleaning
library(tidymodels) # for modeling
library(stacks) # for stacking models
library(naniar) # for analyzing missing values
library(vip) # for variable importance plots
library(usemodels) # for tidymodels suggestions
library(xgboost) # for boosting - must be installed, though loading it is optional since parsnip calls the engine
library(doParallel) # for parallel processing
library(lubridate) # for dates
library(moderndive) # for King County housing data
library(patchwork) # for combining plots nicely
library(rmarkdown) # for paged tables
theme_set(theme_minimal()) # Lisa's favorite theme
pollution <- read.csv("data_standardized.csv")
pollution %>%
select(where(is.numeric)) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(vars(variable),
scales = "free",
nrow = 5)
pollution %>%
select(Diseases.of.heart) %>%
ggplot(aes(x = Diseases.of.heart)) +
geom_histogram()
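Before imputing, it is worth quantifying how much is actually missing in each column. A quick sketch using naniar (loaded above), whose miss_var_summary() returns per-variable missing counts and percentages:
# Per-variable missing-value counts and percentages, largest first
pollution %>%
  miss_var_summary() %>%
  head(10)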
# Impute missing values in each numeric column with that column's median
# (skipping character columns, where a median is undefined)
for (i in 1:ncol(pollution)) {
  if (is.numeric(pollution[[i]])) {
    col_median <- median(pollution[[i]], na.rm = TRUE)
    pollution[[i]][is.na(pollution[[i]])] <- col_median
  }
}
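For reference, the same median imputation can be written without the explicit loop; a sketch using dplyr and tidyr (both attached via tidyverse):
# Equivalent vectorized version: replace NAs in every numeric column
# with that column's median
pollution <- pollution %>%
  mutate(across(where(is.numeric),
                ~ replace_na(.x, median(.x, na.rm = TRUE))))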
pollution_mod <- pollution %>%
mutate(log_heart = log(Diseases.of.heart, base = 10)) %>%
select(-Diseases.of.heart) %>%
mutate(across(where(is.character), as.factor)) %>%
add_n_miss() %>%
filter(n_miss_all == 0) %>%
select(-n_miss_all)
set.seed(456)
pollution_split <- initial_split(pollution_mod, prop = .75)
pollution_training <- training(pollution_split)
pollution_testing <- testing(pollution_split)
pollution_recipe <- recipe(log_heart ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
#Diseases.of.heart,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct,
asthma_er_avg) %>%
step_normalize(all_predictors(),
-all_nominal()) %>%
step_dummy(all_nominal(),
-all_outcomes())
pollution_recipe %>%
prep(pollution_training) %>%
juice()
## # A tibble: 114 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.769 1.91 -0.438 1.20
## 2 -0.791 -0.0862 -0.176 -0.426
## 3 0.0376 -0.153 -0.176 0.136
## 4 0.130 -0.118 -0.176 0.0213
## 5 -0.851 0.0999 -0.969 -0.949
## 6 -0.625 -0.107 -0.860 -0.442
## 7 0.861 -0.0905 -0.176 -0.150
## 8 1.70 0.645 -0.176 -1.08
## 9 -1.25 -1.03 -0.176 -0.150
## 10 3.09 -0.0905 -0.176 -0.150
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
pollution_lasso_mod <-
linear_reg(mixture = 1) %>%
set_engine("glmnet") %>%
set_args(penalty = tune()) %>%
set_mode("regression")
pollution_lasso_wf <-
workflow() %>%
add_recipe(pollution_recipe) %>%
add_model(pollution_lasso_mod)
penalty_grid <- grid_regular(penalty(),
levels = 20)
penalty_grid
## # A tibble: 20 × 1
## penalty
## <dbl>
## 1 1 e-10
## 2 3.36e-10
## 3 1.13e- 9
## 4 3.79e- 9
## 5 1.27e- 8
## 6 4.28e- 8
## 7 1.44e- 7
## 8 4.83e- 7
## 9 1.62e- 6
## 10 5.46e- 6
## 11 1.83e- 5
## 12 6.16e- 5
## 13 2.07e- 4
## 14 6.95e- 4
## 15 2.34e- 3
## 16 7.85e- 3
## 17 2.64e- 2
## 18 8.86e- 2
## 19 2.98e- 1
## 20 1 e+ 0
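These 20 values are evenly spaced on the log10 scale over penalty()'s default range of 10^-10 to 10^0; a quick sketch to confirm:
# Adjacent penalties should differ by a constant step in log10 units
penalty_grid %>%
  mutate(log10_penalty = log10(penalty)) %>%
  summarize(step = mean(diff(log10_penalty)))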
ctrl_grid <- control_stack_grid()
set.seed(456)
pollution_cv <- vfold_cv(pollution_training, v = 5)
pollution_lasso_tune <-
pollution_lasso_wf %>%
tune_grid(
resamples = pollution_cv,
grid = penalty_grid,
control = ctrl_grid
)
pollution_lasso_tune
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [91/23]> Fold1 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 2 <split [91/23]> Fold2 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 3 <split [91/23]> Fold3 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 4 <split [91/23]> Fold4 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 5 <split [92/22]> Fold5 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
pollution_lasso_tune %>%
select(id, .metrics) %>%
unnest(.metrics) %>%
filter(.metric == "rmse")
## # A tibble: 100 × 6
## id penalty .metric .estimator .estimate .config
## <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 Fold1 1 e-10 rmse standard 0.0656 Preprocessor1_Model01
## 2 Fold1 3.36e-10 rmse standard 0.0656 Preprocessor1_Model02
## 3 Fold1 1.13e- 9 rmse standard 0.0656 Preprocessor1_Model03
## 4 Fold1 3.79e- 9 rmse standard 0.0656 Preprocessor1_Model04
## 5 Fold1 1.27e- 8 rmse standard 0.0656 Preprocessor1_Model05
## 6 Fold1 4.28e- 8 rmse standard 0.0656 Preprocessor1_Model06
## 7 Fold1 1.44e- 7 rmse standard 0.0656 Preprocessor1_Model07
## 8 Fold1 4.83e- 7 rmse standard 0.0656 Preprocessor1_Model08
## 9 Fold1 1.62e- 6 rmse standard 0.0656 Preprocessor1_Model09
## 10 Fold1 5.46e- 6 rmse standard 0.0656 Preprocessor1_Model10
## # … with 90 more rows
pollution_lasso_tune %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
labs(x = "penalty", y = "rmse")
pollution_lasso_tune %>%
show_best(metric = "rmse")
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 e-10 rmse standard 0.0797 5 0.00856 Preprocessor1_Model01
## 2 3.36e-10 rmse standard 0.0797 5 0.00856 Preprocessor1_Model02
## 3 1.13e- 9 rmse standard 0.0797 5 0.00856 Preprocessor1_Model03
## 4 3.79e- 9 rmse standard 0.0797 5 0.00856 Preprocessor1_Model04
## 5 1.27e- 8 rmse standard 0.0797 5 0.00856 Preprocessor1_Model05
best_param <- pollution_lasso_tune %>%
select_best(metric = "rmse")
best_param
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 0.0000000001 Preprocessor1_Model01
one_se_param <- pollution_lasso_tune %>%
select_by_one_std_err(metric = "rmse", desc(penalty))
one_se_param
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.00234 rmse standard 0.0861 5 0.00366 Preprocessor1_M… 0.0797 0.0883
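For intuition, this is roughly what select_by_one_std_err() does: take the best mean RMSE, add its standard error to get a bound (the .best and .bound columns above), keep the candidates under that bound, and choose the largest penalty among them. A sketch:
# Manual version of the one-standard-error rule
pollution_lasso_tune %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  mutate(bound = min(mean) + std_err[which.min(mean)]) %>%
  filter(mean <= bound) %>%
  slice_max(penalty, n = 1)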
pollution_lasso_final_wf <- pollution_lasso_wf %>%
finalize_workflow(one_se_param)
pollution_lasso_final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 0.00233572146909012
## mixture = 1
##
## Computational engine: glmnet
pollution_lasso_final_mod <- pollution_lasso_final_wf %>%
fit(data = pollution_training)
pollution_lasso_final_mod %>%
pull_workflow_fit() %>%
tidy()
## # A tibble: 33 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -2.76 0.00234
## 2 annual_mean_pm25 0.0134 0.00234
## 3 annual_mean_ozone 0.00800 0.00234
## 4 annual_mean_pb -0.00795 0.00234
## 5 annual_mean_pm10 -0.0126 0.00234
## 6 annual_mean_co -0.0170 0.00234
## 7 annual_mean_no2 0.0106 0.00234
## 8 annual_mean_so2 -0.0190 0.00234
## 9 hpi2score 0 0.00234
## 10 economic 0 0.00234
## # … with 23 more rows
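Because the lasso penalty shrinks some coefficients exactly to zero, we can count how many predictors actually stay in the model; a quick sketch:
# How many coefficients did the lasso zero out vs. keep?
pollution_lasso_final_mod %>%
  pull_workflow_fit() %>%
  tidy() %>%
  count(zeroed = estimate == 0)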
pollution_lasso_test <- pollution_lasso_final_wf %>%
last_fit(pollution_split)
# Metrics for model applied to test data
pollution_lasso_test %>%
collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.0628 Preprocessor1_Model1
## 2 rsq standard 0.726 Preprocessor1_Model1
use_xgboost(log_heart ~ ., data = pollution_training)
## xgboost_recipe <-
## recipe(formula = log_heart ~ ., data = pollution_training) %>%
## step_novel(all_nominal(), -all_outcomes()) %>%
## step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("regression") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(90624)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
boost_recipe <-
recipe(formula = log_heart ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
#Diseases.of.heart,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct,
asthma_er_avg) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors(),
-all_nominal())
boost_recipe %>%
prep() %>%
juice()
## # A tibble: 114 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.769 1.91 -0.438 1.20
## 2 -0.791 -0.0862 -0.176 -0.426
## 3 0.0376 -0.153 -0.176 0.136
## 4 0.130 -0.118 -0.176 0.0213
## 5 -0.851 0.0999 -0.969 -0.949
## 6 -0.625 -0.107 -0.860 -0.442
## 7 0.861 -0.0905 -0.176 -0.150
## 8 1.70 0.645 -0.176 -1.08
## 9 -1.25 -1.03 -0.176 -0.150
## 10 3.09 -0.0905 -0.176 -0.150
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
boost_spec <- boost_tree(
  trees = 1000,           # number of trees, T in the equations above
  tree_depth = 2,         # max depth of each tree
  min_n = 5,              # min points required for a node to be split further
  loss_reduction = 10^-5, # min loss reduction to split - smaller allows more splits
  sample_size = 1,        # proportion of training data to use
  learn_rate = tune(),    # lambda from the equations above
  stop_iter = 50          # iterations without improvement before stopping early
) %>%
  set_engine("xgboost", colsample_bytree = 1) %>%
  set_mode("regression")
boost_grid <- grid_regular(learn_rate(),
levels = 10)
boost_grid
## # A tibble: 10 × 1
## learn_rate
## <dbl>
## 1 0.0000000001
## 2 0.000000001
## 3 0.00000001
## 4 0.0000001
## 5 0.000001
## 6 0.00001
## 7 0.0001
## 8 0.001
## 9 0.01
## 10 0.1
boost_wf <- workflow() %>%
add_recipe(boost_recipe) %>%
add_model(boost_spec)
set.seed(456)
val_split <- validation_split(pollution_training,
prop = .6)
val_split
## # Validation Set Split (0.6/0.4)
## # A tibble: 1 × 2
## splits id
## <list> <chr>
## 1 <split [68/46]> validation
set.seed(456)
registerDoParallel()
boost_tune <- boost_wf %>%
  tune_grid(
    # we tune on the 5-fold CV object (rather than the single validation
    # split created above) so these results share resamples with the
    # other candidates, which stacking requires
    # resamples = val_split,
    resamples = pollution_cv,
    grid = boost_grid,
    control = ctrl_grid
  )
collect_metrics(boost_tune)
## # A tibble: 20 × 7
## learn_rate .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model01
## 2 0.0000000001 rsq standard NaN 0 NA Preprocessor1_Model01
## 3 0.000000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model02
## 4 0.000000001 rsq standard NaN 0 NA Preprocessor1_Model02
## 5 0.00000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model03
## 6 0.00000001 rsq standard NaN 0 NA Preprocessor1_Model03
## 7 0.0000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model04
## 8 0.0000001 rsq standard NaN 0 NA Preprocessor1_Model04
## 9 0.000001 rmse standard 3.26 5 0.0114 Preprocessor1_Model05
## 10 0.000001 rsq standard NaN 0 NA Preprocessor1_Model05
## 11 0.00001 rmse standard 3.23 5 0.0115 Preprocessor1_Model06
## 12 0.00001 rsq standard NaN 0 NA Preprocessor1_Model06
## 13 0.0001 rmse standard 2.96 5 0.0117 Preprocessor1_Model07
## 14 0.0001 rsq standard NaN 0 NA Preprocessor1_Model07
## 15 0.001 rmse standard 1.22 5 0.0130 Preprocessor1_Model08
## 16 0.001 rsq standard NaN 0 NA Preprocessor1_Model08
## 17 0.01 rmse standard 0.0701 5 0.00863 Preprocessor1_Model09
## 18 0.01 rsq standard 0.749 5 0.0468 Preprocessor1_Model09
## 19 0.1 rmse standard 0.0548 5 0.00336 Preprocessor1_Model10
## 20 0.1 rsq standard 0.811 5 0.0227 Preprocessor1_Model10
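The NaN R-squared values at the small learning rates are expected: with such tiny steps, the boosted model barely moves away from its constant starting prediction (about 0.5, xgboost's default base score), and R-squared is undefined for a constant predictor. The saved fold predictions show this directly; a sketch:
# The out-of-fold predictions for the smallest learning rate are
# essentially constant, which is why rsq is NaN there
collect_predictions(boost_tune) %>%
  filter(.config == "Preprocessor1_Model01") %>%
  summarize(sd_pred = sd(.pred))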
collect_metrics(boost_tune) %>%
filter(.metric == "rmse") %>%
ggplot(aes(x = learn_rate, y = mean)) +
geom_point() +
geom_line() +
scale_x_log10() +
labs(y = "rmse") +
theme_minimal()
best_lr <- select_best(boost_tune, metric = "rmse")
best_lr
## # A tibble: 1 × 2
## learn_rate .config
## <dbl> <chr>
## 1 0.1 Preprocessor1_Model10
# finalize workflow
final_boost_wf <- finalize_workflow(
boost_wf,
best_lr
)
# fit final
final_boost <- final_boost_wf %>%
fit(data = pollution_training)
final_boost %>%
pull_workflow_fit() %>%
vip(geom = "col")
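If you prefer the importance scores as numbers rather than bars, vip's vi() should return the same values as a tibble; a sketch:
# Variable importance scores as a tibble (same values the plot uses)
final_boost %>%
  pull_workflow_fit() %>%
  vi() %>%
  head(10)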
# Use model on test data
test_preds <- pollution_testing %>%
bind_cols(predict(final_boost, new_data = pollution_testing))
# Compute test rmse
test_preds %>%
summarize(rmse = sqrt(mean((log_heart - .pred)^2))) %>%
pull(rmse)
## [1] 0.04564211
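The same quantity can be computed with yardstick, which keeps it consistent with the collect_metrics() output elsewhere; a sketch:
# Equivalent test RMSE via yardstick
test_preds %>%
  rmse(truth = log_heart, estimate = .pred)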
# Graph results
test_preds %>%
ggplot(aes(x = log_heart,
y = .pred)) +
geom_point(alpha = .5,
size = .5) +
geom_smooth(se = FALSE) +
geom_abline(slope = 1,
intercept = 0,
color = "darkred") +
labs(x = "Actual log(heart)",
y = "Predicted log(heart)")
pollution_rfrecipe <- recipe(log_heart ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
#Diseases.of.heart,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct,
asthma_er_avg) %>%
step_normalize(all_predictors(),
-all_nominal()) %>%
step_dummy(all_nominal(),
-all_outcomes())
pollution_rfrecipe %>%
prep(pollution_training) %>%
juice()
## # A tibble: 114 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.769 1.91 -0.438 1.20
## 2 -0.791 -0.0862 -0.176 -0.426
## 3 0.0376 -0.153 -0.176 0.136
## 4 0.130 -0.118 -0.176 0.0213
## 5 -0.851 0.0999 -0.969 -0.949
## 6 -0.625 -0.107 -0.860 -0.442
## 7 0.861 -0.0905 -0.176 -0.150
## 8 1.70 0.645 -0.176 -1.08
## 9 -1.25 -1.03 -0.176 -0.150
## 10 3.09 -0.0905 -0.176 -0.150
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
ranger_pollution <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 200) %>%
set_mode("regression") %>%
set_engine("ranger")
pollution_rfworkflow <-
workflow() %>%
add_recipe(pollution_rfrecipe) %>%
add_model(ranger_pollution)
# Set up the random forest tuning grid for mtry and min_n. finalize() sets
# mtry's upper bound from the raw training data, before the recipe removes
# columns, so parsnip caps mtry at the actual predictor count when fitting.
rf_penalty_grid <- grid_regular(finalize(mtry(),
                                         pollution_training %>%
                                           select(-log_heart)),
                                min_n(),
                                levels = 3)
set.seed(456)
# same seed and data as before, so these are the identical folds used for
# the lasso and boosting candidates
pollution_cv <- vfold_cv(pollution_training, v = 5)
# Tune model
pollution_rfTUNE <-
pollution_rfworkflow %>%
tune_grid(
resamples = pollution_cv,
grid = rf_penalty_grid,
control = ctrl_grid
)
best_rmse <-
pollution_rfTUNE %>%
select_best(metric = "rmse")
best_rmse
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 63 2 Preprocessor1_Model3
pol_rfFinal <- pollution_rfworkflow %>%
finalize_workflow(best_rmse) %>%
fit(data = pollution_training)
# Note: this refits the workflow with its tuning placeholders unfinalized,
# so the Mtry and node-size values printed below are not the tuned ones;
# pol_rfFinal above is the finalized fit
pollution_rffit <- pollution_rfworkflow %>%
  fit(pollution_training)
pollution_rffit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~tune(), x), num.trees = ~200, min.node.size = min_rows(~tune(), x), num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Regression
## Number of trees: 200
## Sample size: 114
## Number of independent variables: 32
## Mtry: 32
## Target node size: 114
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.01520217
## R squared (OOB): -0.007846003
# OOB error (MSE)
pollution_rffit$fit$fit$fit$prediction.error
## [1] 0.01520217
# OOB RMSE
sqrt(pollution_rffit$fit$fit$fit$prediction.error)
## [1] 0.1232971
set.seed(456)
metric <- metric_set(rmse)
ctrl_res <- control_stack_resamples()
pollution_rfcv <- pollution_rfworkflow %>%
fit_resamples(pollution_cv,
metrics = metric,
control = ctrl_res)
collect_metrics(pollution_rfTUNE)
Now that we have our set of candidate models (9 random forest, 20 lasso, and 10 boosted tree), we're ready to stack! We'll follow the process laid out on the stacks webpage.
First, we create the stack. As the number of columns shows, some of the lasso models have been dropped, likely because their predictions were too similar to those of the other lasso candidates. The set of candidates now has 9 random forest, 7 lasso, and 10 boosted tree models.
pollution_stack <-
stacks() %>%
add_candidates(boost_tune) %>%
add_candidates(pollution_lasso_tune) %>%
add_candidates(pollution_rfTUNE)
We can look at the predictions from the candidate models in a tibble. Most of the predictions are very close to the actual values, except those from the first few boosted-tree models, whose tiny learning rates leave their predictions stuck near the 0.5 starting value.
as_tibble(pollution_stack)
## # A tibble: 114 × 28
## log_heart boost_tune_1_01 boost_tune_1_03 boost_tune_1_04 boost_tune_1_05
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -2.80 0.5 0.500 0.500 0.497
## 2 -2.77 0.5 0.500 0.500 0.497
## 3 -2.76 0.5 0.500 0.500 0.497
## 4 -2.53 0.5 0.500 0.500 0.497
## 5 -2.70 0.5 0.500 0.500 0.497
## 6 -2.79 0.5 0.500 0.500 0.497
## 7 -2.63 0.5 0.500 0.500 0.497
## 8 -2.63 0.5 0.500 0.500 0.497
## 9 -2.57 0.5 0.500 0.500 0.497
## 10 -2.52 0.5 0.500 0.500 0.497
## # … with 104 more rows, and 23 more variables: boost_tune_1_06 <dbl>,
## # boost_tune_1_07 <dbl>, boost_tune_1_08 <dbl>, boost_tune_1_09 <dbl>,
## # boost_tune_1_10 <dbl>, pollution_lasso_tune_1_01 <dbl>,
## # pollution_lasso_tune_1_11 <dbl>, pollution_lasso_tune_1_12 <dbl>,
## # pollution_lasso_tune_1_13 <dbl>, pollution_lasso_tune_1_14 <dbl>,
## # pollution_lasso_tune_1_15 <dbl>, pollution_lasso_tune_1_16 <dbl>,
## # pollution_lasso_tune_1_17 <dbl>, pollution_lasso_tune_1_18 <dbl>, …
set.seed(456)
pollution_blend <-
pollution_stack %>%
blend_predictions()
pollution_blend
## # A tibble: 3 × 3
## member type weight
## <chr> <chr> <dbl>
## 1 boost_tune_1_10 boost_tree 0.557
## 2 pollution_rfTUNE_1_3 rand_forest 0.329
## 3 pollution_lasso_tune_1_01 linear_reg 0.115
pollution_blend$metrics %>%
filter(.metric == "rmse")
## # A tibble: 6 × 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.000001 1 rmse standard 0.0542 25 0.00145 Preprocessor1_Model1
## 2 0.00001 1 rmse standard 0.0542 25 0.00145 Preprocessor1_Model2
## 3 0.0001 1 rmse standard 0.0541 25 0.00144 Preprocessor1_Model3
## 4 0.001 1 rmse standard 0.0539 25 0.00143 Preprocessor1_Model4
## 5 0.01 1 rmse standard 0.0536 25 0.00138 Preprocessor1_Model5
## 6 0.1 1 rmse standard 0.113 25 0.00284 Preprocessor1_Model6
autoplot(pollution_blend)
autoplot(pollution_blend, type = "members")
autoplot(pollution_blend, type = "weights")
pollution_final_stack <- pollution_blend %>%
fit_members()
pollution_final_stack
## # A tibble: 3 × 3
## member type weight
## <chr> <chr> <dbl>
## 1 boost_tune_1_10 boost_tree 0.557
## 2 pollution_rfTUNE_1_3 rand_forest 0.329
## 3 pollution_lasso_tune_1_01 linear_reg 0.115
pollution_final_stack %>%
predict(new_data = pollution_testing) %>%
bind_cols(pollution_testing) %>%
ggplot(aes(x = log_heart,
y = .pred)) +
geom_point(alpha = .5,
size = .5) +
geom_smooth(se = FALSE) +
geom_abline(slope = 1,
intercept = 0,
color = "darkred") +
labs(x = "Actual log(heart)",
y = "Predicted log(heart)")
library(Metrics)
data_rmse <- pollution_final_stack %>%
  predict(new_data = pollution_testing) %>%
  bind_cols(pollution_testing)
# Test RMSE of the stacked model (the outcome column is log_heart)
rmse(data_rmse$log_heart, data_rmse$.pred)
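Equivalently with yardstick; note that loading Metrics masks yardstick's rmse(), so the namespace prefix matters here:
# Test RMSE of the stacked model via yardstick
data_rmse %>%
  yardstick::rmse(truth = log_heart, estimate = .pred)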