MATH 427: Workflow Sets and Feature Selection

Eric Friedlander

Computational Set-Up

library(tidyverse)
library(tidymodels)
library(knitr)
library(janitor) # for contingency tables
library(ISLR2)
library(readODS)

tidymodels_prefer()

set.seed(427)

Workflow Sets in R

Data: Different Ames Housing Prices

Goal: Predict Sale_Price.

ames <- read_rds("../data/AmesHousing.rds")
ames |> glimpse()
Rows: 881
Columns: 20
$ Sale_Price    <int> 244000, 213500, 185000, 394432, 190000, 149000, 149900, …
$ Gr_Liv_Area   <int> 2110, 1338, 1187, 1856, 1844, NA, NA, 1069, 1940, 1544, …
$ Garage_Type   <fct> Attchd, Attchd, Attchd, Attchd, Attchd, Attchd, Attchd, …
$ Garage_Cars   <dbl> 2, 2, 2, 3, 2, 2, 2, 2, 3, 3, 2, 3, 3, 2, 2, 2, 3, 2, 2,…
$ Garage_Area   <dbl> 522, 582, 420, 834, 546, 480, 500, 440, 606, 868, 532, 7…
$ Street        <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
$ Utilities     <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
$ Pool_Area     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Neighborhood  <fct> North_Ames, Stone_Brook, Gilbert, Stone_Brook, Northwest…
$ Screen_Porch  <int> 0, 0, 0, 0, 0, 0, 0, 165, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Overall_Qual  <fct> Good, Very_Good, Above_Average, Excellent, Above_Average…
$ Lot_Area      <int> 11160, 4920, 7980, 11394, 11751, 11241, 12537, 4043, 101…
$ Lot_Frontage  <dbl> 93, 41, 0, 88, 105, 0, 0, 53, 83, 94, 95, 90, 105, 61, 6…
$ MS_SubClass   <fct> One_Story_1946_and_Newer_All_Styles, One_Story_PUD_1946_…
$ Misc_Val      <int> 0, 0, 500, 0, 0, 700, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ Open_Porch_SF <int> 0, 0, 21, 0, 122, 0, 0, 55, 95, 35, 70, 74, 130, 82, 48,…
$ TotRms_AbvGrd <int> 8, 6, 6, 8, 7, 5, 6, 4, 8, 7, 7, 7, 7, 6, 7, 7, 10, 7, 7…
$ First_Flr_SF  <int> 2110, 1338, 1187, 1856, 1844, 1004, 1078, 1069, 1940, 15…
$ Second_Flr_SF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 563, 0, 886, 656, 11…
$ Year_Built    <int> 1968, 2001, 1992, 2010, 1977, 1970, 1971, 1977, 2009, 20…

Clean Data Set

ames <- ames |> 
  mutate(Overall_Qual = factor(Overall_Qual, levels = c("Very_Poor", "Poor", 
                                                        "Fair", "Below_Average",
                                                        "Average", "Above_Average", 
                                                        "Good", "Very_Good",
                                                        "Excellent", "Very_Excellent")), # put quality levels in their natural order
         Garage_Type = if_else(is.na(Garage_Type), "No_Garage", Garage_Type), # make missing garages an explicit category
         Garage_Type = as_factor(Garage_Type) # convert back to a factor
         )
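
Note that Gr_Liv_Area and Year_Built contain missing values (visible in the glimpse() output above). A quick check, sketched here though not part of the original slides, shows what the imputation steps later on must handle:

ames |> 
  summarise(across(everything(), \(x) sum(is.na(x)))) |> # count NAs per column
  glimpse()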

Initial Data Split

data_split <- initial_split(ames, strata = "Sale_Price")
ames_train <- training(data_split)
ames_test  <- testing(data_split)

Define Folds

ames_folds <- vfold_cv(ames_train, v = 10, repeats = 10)
ames_folds
#  10-fold cross-validation repeated 10 times 
# A tibble: 100 × 3
   splits           id       id2   
   <list>           <chr>    <chr> 
 1 <split [594/66]> Repeat01 Fold01
 2 <split [594/66]> Repeat01 Fold02
 3 <split [594/66]> Repeat01 Fold03
 4 <split [594/66]> Repeat01 Fold04
 5 <split [594/66]> Repeat01 Fold05
 6 <split [594/66]> Repeat01 Fold06
 7 <split [594/66]> Repeat01 Fold07
 8 <split [594/66]> Repeat01 Fold08
 9 <split [594/66]> Repeat01 Fold09
10 <split [594/66]> Repeat01 Fold10
# ℹ 90 more rows

Define Model(s)

lm_model <- linear_reg() |> 
  set_engine("lm")

knn5_model <- nearest_neighbor(neighbors = 5) |>
  set_engine("kknn") |>
  set_mode("regression")

knn10_model <- nearest_neighbor(neighbors = 10) |>
  set_engine("kknn") |>
  set_mode("regression")

Define Preprocessing: Linear Regression (KNN imputation)

lm_knnimpute <- recipe(Sale_Price ~ ., data = ames_train) |> 
  step_nzv(all_predictors()) |>  # remove zero- or near-zero-variance predictors
  step_impute_knn(Year_Built, Gr_Liv_Area) |>  # impute missing values in Year_Built and Gr_Liv_Area via KNN
  step_integer(Overall_Qual) |> # convert Overall_Qual to an ordinal (integer) encoding
  step_other(all_nominal_predictors(), threshold = 0.01, other = "Other") |> # lump categories with less than 1% representation into "Other"
  step_dummy(all_nominal_predictors(), one_hot = FALSE) |>  # one_hot = FALSE drops one level per factor, appropriate for linear regression
  step_corr(all_numeric_predictors(), threshold = 0.5) |> # remove highly correlated predictors
  step_lincomb(all_numeric_predictors()) # remove exact linear combinations
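
To see what a recipe actually produces, prep() estimates each step on the training data and bake() applies them (a quick sketch using standard recipes functions):

lm_knnimpute |> 
  prep() |>                  # estimate the steps on ames_train
  bake(new_data = NULL) |>   # apply them to the training data
  glimpse()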

Define Preprocessing: Linear Regression (mean imputation)

lm_meanimpute <- recipe(Sale_Price ~ ., data = ames_train) |> 
  step_nzv(all_predictors()) |>  # remove zero- or near-zero-variance predictors
  step_impute_mean(Year_Built, Gr_Liv_Area) |>  # impute missing values in Year_Built and Gr_Liv_Area with the column mean
  step_integer(Overall_Qual) |> # convert Overall_Qual to an ordinal (integer) encoding
  step_other(all_nominal_predictors(), threshold = 0.01, other = "Other") |> # lump categories with less than 1% representation into "Other"
  step_dummy(all_nominal_predictors(), one_hot = FALSE) |>  # one_hot = FALSE drops one level per factor, appropriate for linear regression
  step_corr(all_numeric_predictors(), threshold = 0.5) |> # remove highly correlated predictors
  step_lincomb(all_numeric_predictors()) # remove exact linear combinations

Define Preprocessing: Linear Regression (median imputation)

lm_medianimpute <- recipe(Sale_Price ~ ., data = ames_train) |> 
  step_nzv(all_predictors()) |>  # remove zero- or near-zero-variance predictors
  step_impute_median(Year_Built, Gr_Liv_Area) |>  # impute missing values in Year_Built and Gr_Liv_Area with the column median
  step_integer(Overall_Qual) |> # convert Overall_Qual to an ordinal (integer) encoding
  step_other(all_nominal_predictors(), threshold = 0.01, other = "Other") |> # lump categories with less than 1% representation into "Other"
  step_dummy(all_nominal_predictors(), one_hot = FALSE) |>  # one_hot = FALSE drops one level per factor, appropriate for linear regression
  step_corr(all_numeric_predictors(), threshold = 0.5) |> # remove highly correlated predictors
  step_lincomb(all_numeric_predictors()) # remove exact linear combinations

Define Preprocessing: KNN (KNN imputation)

knn_preproc1 <- recipe(Sale_Price ~ ., data = ames_train) |> 
  step_nzv(all_predictors()) |>  # remove zero- or near-zero-variance predictors
  step_impute_knn(Year_Built, Gr_Liv_Area) |>  # impute missing values in Year_Built and Gr_Liv_Area via KNN
  step_integer(Overall_Qual) |> # convert Overall_Qual to an ordinal (integer) encoding
  step_other(all_nominal_predictors(), threshold = 0.01, other = "Other") |> # lump categories with less than 1% representation into "Other"
  step_dummy(all_nominal_predictors(), one_hot = TRUE) |>  # one_hot = TRUE keeps every level; fine here since KNN is unaffected by collinearity
  step_nzv(all_predictors()) |> # remove sparse dummy columns created above
  step_normalize(all_numeric_predictors()) # center and scale so all predictors contribute comparably to distances

Define Preprocessing: KNN (mean imputation)

knn_preproc2 <- recipe(Sale_Price ~ ., data = ames_train) |> 
  step_nzv(all_predictors()) |>  # remove zero- or near-zero-variance predictors
  step_impute_mean(Year_Built, Gr_Liv_Area) |>  # impute missing values in Year_Built and Gr_Liv_Area with the column mean
  step_integer(Overall_Qual) |> # convert Overall_Qual to an ordinal (integer) encoding
  step_other(all_nominal_predictors(), threshold = 0.01, other = "Other") |> # lump categories with less than 1% representation into "Other"
  step_dummy(all_nominal_predictors(), one_hot = TRUE) |>  # one_hot = TRUE keeps every level; fine here since KNN is unaffected by collinearity
  step_nzv(all_predictors()) |> # remove sparse dummy columns created above
  step_normalize(all_numeric_predictors()) # center and scale so all predictors contribute comparably to distances

Define Preprocessing: KNN (median imputation)

knn_preproc3 <- recipe(Sale_Price ~ ., data = ames_train) |> 
  step_nzv(all_predictors()) |>  # remove zero- or near-zero-variance predictors
  step_impute_median(Year_Built, Gr_Liv_Area) |>  # impute missing values in Year_Built and Gr_Liv_Area with the column median
  step_integer(Overall_Qual) |> # convert Overall_Qual to an ordinal (integer) encoding
  step_other(all_nominal_predictors(), threshold = 0.01, other = "Other") |> # lump categories with less than 1% representation into "Other"
  step_dummy(all_nominal_predictors(), one_hot = TRUE) |>  # one_hot = TRUE keeps every level; fine here since KNN is unaffected by collinearity
  step_nzv(all_predictors()) |> # remove sparse dummy columns created above
  step_normalize(all_numeric_predictors()) # center and scale so all predictors contribute comparably to distances
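
Since the three KNN recipes differ only in their imputation step, they could also be built from a shared helper (a sketch, not from the original slides; the result is equivalent to the definitions above):

# the shared tail of the KNN recipes; only the imputation step differs
add_knn_steps <- function(rec) {
  rec |> 
    step_integer(Overall_Qual) |> 
    step_other(all_nominal_predictors(), threshold = 0.01, other = "Other") |> 
    step_dummy(all_nominal_predictors(), one_hot = TRUE) |> 
    step_nzv(all_predictors()) |> 
    step_normalize(all_numeric_predictors())
}

knn_base <- recipe(Sale_Price ~ ., data = ames_train) |> 
  step_nzv(all_predictors())

knn_preproc1 <- knn_base |> step_impute_knn(Year_Built, Gr_Liv_Area) |> add_knn_steps()
knn_preproc2 <- knn_base |> step_impute_mean(Year_Built, Gr_Liv_Area) |> add_knn_steps()
knn_preproc3 <- knn_base |> step_impute_median(Year_Built, Gr_Liv_Area) |> add_knn_steps()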

Workflow Sets

  • workflow_set() takes a named list of preprocessors (here, recipes) and a named list of models
  • With cross = TRUE, every preprocessor is combined with every model

Create Lists: KNN

knn_preprocessors <- list(
  knn_knn_impute = knn_preproc1,
  knn_mean_impute = knn_preproc2,
  knn_median_imput = knn_preproc3
)

knn_models <- list(
  knn5 = knn5_model,
  knn10 = knn10_model
)

Create Lists: Linear Regression

lm_preprocessors <- list(
  lm_knn_impute = lm_knnimpute,
  lm_mean_impute = lm_meanimpute,
  lm_median_imput = lm_medianimpute
)

lm_models <- list(
  lm_model = lm_model
)

Define Workflow Sets

# use new names so the model lists defined above aren't overwritten
knn_wflows <- workflow_set(knn_preprocessors, knn_models, cross = TRUE)
lm_wflows  <- workflow_set(lm_preprocessors, lm_models, cross = TRUE)
all_models <- lm_wflows |> 
  bind_rows(knn_wflows)
  
all_models
# A workflow set/tibble: 9 × 4
  wflow_id                 info             option    result    
  <chr>                    <list>           <list>    <list>    
1 lm_knn_impute_lm_model   <tibble [1 × 4]> <opts[0]> <list [0]>
2 lm_mean_impute_lm_model  <tibble [1 × 4]> <opts[0]> <list [0]>
3 lm_median_imput_lm_model <tibble [1 × 4]> <opts[0]> <list [0]>
4 knn_knn_impute_knn5      <tibble [1 × 4]> <opts[0]> <list [0]>
5 knn_knn_impute_knn10     <tibble [1 × 4]> <opts[0]> <list [0]>
6 knn_mean_impute_knn5     <tibble [1 × 4]> <opts[0]> <list [0]>
7 knn_mean_impute_knn10    <tibble [1 × 4]> <opts[0]> <list [0]>
8 knn_median_imput_knn5    <tibble [1 × 4]> <opts[0]> <list [0]>
9 knn_median_imput_knn10   <tibble [1 × 4]> <opts[0]> <list [0]>

Define Metrics

ames_metrics <- metric_set(rmse, rsq)

Fit Resamples

all_fits <- all_models |> 
  workflow_map("fit_resamples",  # apply fit_resamples() to every workflow in the set
               resamples = ames_folds,
               metrics = ames_metrics)
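
Beyond collect_metrics(), workflowsets can rank the fitted workflows directly (a small sketch using rank_results() from workflowsets):

all_fits |> 
  rank_results(rank_metric = "rmse") |>  # order workflows from lowest to highest RMSE
  select(wflow_id, .metric, mean, rank)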

View Metrics: RMSE

collect_metrics(all_fits) |> 
  filter(.metric == "rmse") |> 
  kable()
| wflow_id                 | .config              | preproc | model            | .metric | .estimator |     mean |   n |   std_err |
|:-------------------------|:---------------------|:--------|:-----------------|:--------|:-----------|---------:|----:|----------:|
| lm_knn_impute_lm_model   | Preprocessor1_Model1 | recipe  | linear_reg       | rmse    | standard   | 39754.78 | 100 |  724.5845 |
| lm_mean_impute_lm_model  | Preprocessor1_Model1 | recipe  | linear_reg       | rmse    | standard   | 40339.76 | 100 |  790.8570 |
| lm_median_imput_lm_model | Preprocessor1_Model1 | recipe  | linear_reg       | rmse    | standard   | 40303.18 | 100 |  792.2404 |
| knn_knn_impute_knn5      | Preprocessor1_Model1 | recipe  | nearest_neighbor | rmse    | standard   | 40865.74 | 100 | 1168.9516 |
| knn_knn_impute_knn10     | Preprocessor1_Model1 | recipe  | nearest_neighbor | rmse    | standard   | 40585.89 | 100 | 1196.3147 |
| knn_mean_impute_knn5     | Preprocessor1_Model1 | recipe  | nearest_neighbor | rmse    | standard   | 40973.29 | 100 | 1169.1127 |
| knn_mean_impute_knn10    | Preprocessor1_Model1 | recipe  | nearest_neighbor | rmse    | standard   | 40749.35 | 100 | 1197.0999 |
| knn_median_imput_knn5    | Preprocessor1_Model1 | recipe  | nearest_neighbor | rmse    | standard   | 40979.23 | 100 | 1166.2676 |
| knn_median_imput_knn10   | Preprocessor1_Model1 | recipe  | nearest_neighbor | rmse    | standard   | 40747.77 | 100 | 1196.2514 |

View Metrics: R-squared

collect_metrics(all_fits) |> 
  filter(.metric == "rsq") |> 
  kable()
| wflow_id                 | .config              | preproc | model            | .metric | .estimator |      mean |   n |   std_err |
|:-------------------------|:---------------------|:--------|:-----------------|:--------|:-----------|----------:|----:|----------:|
| lm_knn_impute_lm_model   | Preprocessor1_Model1 | recipe  | linear_reg       | rsq     | standard   | 0.7728989 | 100 | 0.0055434 |
| lm_mean_impute_lm_model  | Preprocessor1_Model1 | recipe  | linear_reg       | rsq     | standard   | 0.7661074 | 100 | 0.0067038 |
| lm_median_imput_lm_model | Preprocessor1_Model1 | recipe  | linear_reg       | rsq     | standard   | 0.7667739 | 100 | 0.0066014 |
| knn_knn_impute_knn5      | Preprocessor1_Model1 | recipe  | nearest_neighbor | rsq     | standard   | 0.7574393 | 100 | 0.0080339 |
| knn_knn_impute_knn10     | Preprocessor1_Model1 | recipe  | nearest_neighbor | rsq     | standard   | 0.7662157 | 100 | 0.0070608 |
| knn_mean_impute_knn5     | Preprocessor1_Model1 | recipe  | nearest_neighbor | rsq     | standard   | 0.7566579 | 100 | 0.0076762 |
| knn_mean_impute_knn10    | Preprocessor1_Model1 | recipe  | nearest_neighbor | rsq     | standard   | 0.7651128 | 100 | 0.0068290 |
| knn_median_imput_knn5    | Preprocessor1_Model1 | recipe  | nearest_neighbor | rsq     | standard   | 0.7568404 | 100 | 0.0076513 |
| knn_median_imput_knn10   | Preprocessor1_Model1 | recipe  | nearest_neighbor | rsq     | standard   | 0.7653082 | 100 | 0.0068162 |

Plotting Results: RMSE

library(ggrepel)
autoplot(all_fits, metric = "rmse") +
  geom_text_repel(aes(label = wflow_id), nudge_x = 1/8, nudge_y = 1/100) +
  theme(legend.position = "none")

Plotting Results: R-squared

autoplot(all_fits, metric = "rsq") +
  geom_text_repel(aes(label = wflow_id), nudge_x = 1/8, nudge_y = 1/100) +
  theme(legend.position = "none")
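
After comparing workflows, a natural next step (a sketch, assuming the KNN-imputed linear model is the one chosen) is to pull that workflow out of the set and evaluate it on the held-out test data via the initial split:

best_wflow <- all_fits |> 
  extract_workflow("lm_knn_impute_lm_model")  # pull one workflow out of the set

final_fit <- best_wflow |> 
  last_fit(data_split, metrics = ames_metrics)  # fit on training data, evaluate on test data

collect_metrics(final_fit)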

Feature Selection for Linear Regression

What is feature selection?

  • How do we choose which variables to include in our model?
  • Up to now we have included all of them, which is probably not the best approach
  • An advantage of linear regression is interpretability
  • Including every feature decreases that interpretability
  • Reasons for feature selection:
    • Improve model performance
    • Improve model interpretability
  • Parsimony: simpler models are called more parsimonious
  • Occam’s Razor: all else equal, prefer the more parsimonious model

Types of feature selection

  • Subset selection: Forward/Backward/Best-Subset Selection
  • Shrinkage-based methods: LASSO and Ridge Regression (see the sketch after this list)
  • Dimension reduction: consider linear combinations of predictors
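
As a preview of the shrinkage-based methods above (a sketch; the glmnet engine is assumed to be installed, and the penalty value is arbitrary):

lasso_model <- linear_reg(penalty = 0.1, mixture = 1) |>  # mixture = 1 gives the LASSO penalty
  set_engine("glmnet")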

Subset Selection

Exercise

  • With your group, write out the steps for the following algorithms on the board
  • Group 1: Forward selection
  • Group 2: Backward elimination
  • Group 3: Step-wise selection
  • Group 4: Best-subset selection

Subset Selection in R

  • tidymodels does not implement any subset selection techniques
  • regularization (shrinkage-based) methods almost always perform better
  • the colino package provides a tidymodels implementation
  • Other options:
    • the caret package
    • the olsrr and blorr packages, if you don’t care about cross-validation
    • implement it yourself (see the sketch after this list)
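
For the implement-it-yourself route, base R’s step() performs stepwise selection by AIC on a single fit, with no cross-validation (a sketch; step() needs a constant sample size, hence the complete-case filter):

ames_cc <- ames_train |> drop_na()              # step() requires a fixed set of rows
full_fit <- lm(Sale_Price ~ ., data = ames_cc)  # start from the full model
back_fit <- step(full_fit, direction = "backward", trace = 0)  # backward elimination by AIC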

Feature Selection in R

  • When creating a recipe, you don’t need to include every variable in the formula:
int_recipe <- recipe(pred ~ var1 + var2, data = training_data) |> 
  step_interact(~ var1:var2) |>  # recipe formulas can’t contain in-line interactions; use step_interact() instead
  step_x(...)

Re-using a Workflow but Changing the Formula

# remove_formula() and add_formula() come from the workflows package, so they
# act on a workflow, not on a recipe; int_wflow is a hypothetical workflow
# built earlier with add_formula()
noint_wflow <- int_wflow |> 
  remove_formula() |> 
  add_formula(pred ~ var1 + var2)