Totally Tiny Tuning Tools
Max Kuhn (RStudio)
1 / 24

2 / 24

The Ames Housing Data

3 / 24

Pre-processing with {recipes}

library(tidymodels)
library(AmesHousing)
# Ames housing data, built by make_ames() from the AmesHousing package.
ames <- make_ames()
# Start a recipe: Sale_Price is the outcome and the ten variables on the
# right-hand side of the formula are the predictors.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  )

4 / 24

Pre-processing with {recipes}

library(tidymodels)
library(AmesHousing)
# Ames housing data, built by make_ames() from the AmesHousing package.
ames <- make_ames()
# Recipe: Sale_Price is the outcome and the ten variables on the right-hand
# side of the formula are the predictors.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  ) %>%
  # Log (base 10) transform the outcome and the two area variables.
  step_log(Sale_Price, Lot_Area, Gr_Liv_Area, base = 10)

5 / 24

Pre-processing with {recipes}

library(tidymodels)
library(AmesHousing)
# Ames housing data, built by make_ames() from the AmesHousing package.
ames <- make_ames()
# Recipe: Sale_Price is the outcome and the ten variables on the right-hand
# side of the formula are the predictors.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  ) %>%
  # Log (base 10) transform the outcome and the two area variables.
  step_log(Sale_Price, Lot_Area, Gr_Liv_Area, base = 10) %>%
  # Pool infrequent Neighborhood levels (threshold = 0.05) into one level.
  step_other(Neighborhood, threshold = 0.05)

6 / 24

Pre-processing with {recipes}

library(tidymodels)
library(AmesHousing)
# Ames housing data, built by make_ames() from the AmesHousing package.
ames <- make_ames()
# Recipe: Sale_Price is the outcome and the ten variables on the right-hand
# side of the formula are the predictors.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  ) %>%
  # Log (base 10) transform the outcome and the two area variables.
  step_log(Sale_Price, Lot_Area, Gr_Liv_Area, base = 10) %>%
  # Pool infrequent Neighborhood levels (threshold = 0.05) into one level.
  step_other(Neighborhood, threshold = 0.05) %>%
  # Convert every nominal column into dummy (indicator) variables.
  step_dummy(all_nominal())

7 / 24

Pre-processing with {recipes}

library(tidymodels)
library(AmesHousing)
# Ames housing data, built by make_ames() from the AmesHousing package.
ames <- make_ames()
# Recipe: Sale_Price is the outcome and the ten variables on the right-hand
# side of the formula are the predictors.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  ) %>%
  # Log (base 10) transform the outcome and the two area variables.
  step_log(Sale_Price, Lot_Area, Gr_Liv_Area, base = 10) %>%
  # Pool infrequent Neighborhood levels (threshold = 0.05) into one level.
  step_other(Neighborhood, threshold = 0.05) %>%
  # Convert every nominal column into dummy (indicator) variables.
  step_dummy(all_nominal()) %>%
  # Interact Gr_Liv_Area with every column whose name starts with
  # "Bldg_Type" (the dummy columns produced by the previous step).
  step_interact(~ starts_with("Bldg_Type"):Gr_Liv_Area)

8 / 24

Pre-processing with {recipes}

library(tidymodels)
library(AmesHousing)
# Ames housing data, built by make_ames() from the AmesHousing package.
ames <- make_ames()
# Recipe: Sale_Price is the outcome and the ten variables on the right-hand
# side of the formula are the predictors.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  ) %>%
  # Log (base 10) transform the outcome and the two area variables.
  step_log(Sale_Price, Lot_Area, Gr_Liv_Area, base = 10) %>%
  # Pool infrequent Neighborhood levels (threshold = 0.05) into one level.
  step_other(Neighborhood, threshold = 0.05) %>%
  # Convert every nominal column into dummy (indicator) variables.
  step_dummy(all_nominal()) %>%
  # Interact Gr_Liv_Area with every column whose name starts with
  # "Bldg_Type" (the dummy columns produced by the previous step).
  step_interact(~ starts_with("Bldg_Type"):Gr_Liv_Area) %>%
  # Natural spline terms for both coordinates, 5 degrees of freedom each.
  step_ns(Longitude, Latitude, deg_free = 5)

9 / 24

Creating Models with {parsnip}

# Linear regression model specification; no engine or parameters set yet.
reg_mod <- linear_reg()

10 / 24

Creating Models with {parsnip}

# Linear regression model specification, to be fit with the glmnet engine.
reg_mod <- linear_reg() %>% 
  set_engine("glmnet")
  # Could have been "lm", "stan", "keras", ...

11 / 24

Creating Models with {parsnip}

# Same glmnet specification, now with fixed values for its two tuning
# parameters: penalty = 0.1 and mixture = 0.5.
reg_mod <- linear_reg(penalty = 0.1, mixture = 0.5) %>%
  set_engine("glmnet") 
  # Could have been "lm", "stan", "keras", ...

But how do we know that penalty = 0.1, mixture = 0.5, and splines with 5 degrees of freedom are what we should use?

These are tuning parameters.

The tune package can be used to find good values for these parameters.

How can we alter these objects to "tag" which arguments should be tuned?

12 / 24

I thought about making {tune} with the same intensity that Hilary Parker thinks about making ☕

13 / 24

Tagging Parameters using {tune}

library(tune) # also will be in library(tidymodels)
# Starting point before tagging: every pre-processing and model argument
# still has a fixed value.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  ) %>%
  # Log (base 10) transform the outcome and the two area variables.
  step_log(Sale_Price, Lot_Area, Gr_Liv_Area, base = 10) %>%
  # Pool infrequent Neighborhood levels (threshold = 0.05) into one level.
  step_other(Neighborhood, threshold = 0.05) %>%
  # Convert every nominal column into dummy (indicator) variables.
  step_dummy(all_nominal()) %>%
  # Interact Gr_Liv_Area with every column whose name starts with
  # "Bldg_Type" (the dummy columns produced by the previous step).
  step_interact(~ starts_with("Bldg_Type"):Gr_Liv_Area) %>%
  # Natural spline terms for both coordinates, 5 degrees of freedom each.
  step_ns(Longitude, Latitude, deg_free = 5) 
# glmnet linear regression with fixed penalty and mixture values.
reg_mod <- linear_reg(penalty = 0.1, mixture = 0.5) %>%
  set_engine("glmnet")

14 / 24

Tagging Parameters using {tune}

library(tune) # also will be in library(tidymodels)
# Replace the fixed values with tune() to tag those arguments for tuning.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  ) %>%
  # Log (base 10) transform the outcome and the two area variables.
  step_log(Sale_Price, Lot_Area, Gr_Liv_Area, base = 10) %>%
  # Pool infrequent Neighborhood levels (threshold = 0.05) into one level.
  step_other(Neighborhood, threshold = 0.05) %>%
  # Convert every nominal column into dummy (indicator) variables.
  step_dummy(all_nominal()) %>%
  # Interact Gr_Liv_Area with every column whose name starts with
  # "Bldg_Type" (the dummy columns produced by the previous step).
  step_interact(~ starts_with("Bldg_Type"):Gr_Liv_Area) %>%
  # One spline step for both coordinates; its deg_free is tagged for tuning.
  step_ns(Longitude, Latitude, deg_free = tune())
# glmnet linear regression with penalty and mixture both tagged for tuning.
reg_mod <- linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")

# tune() is only a placeholder: calling it returns the call itself as an
# expression.
tune()

## tune()

15 / 24

Tagging Parameters using {tune}

library(tune) # also will be in library(tidymodels)
# Give each coordinate its own spline step so that each one carries its own
# deg_free placeholder.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  ) %>%
  # Log (base 10) transform the outcome and the two area variables.
  step_log(Sale_Price, Lot_Area, Gr_Liv_Area, base = 10) %>%
  # Pool infrequent Neighborhood levels (threshold = 0.05) into one level.
  step_other(Neighborhood, threshold = 0.05) %>%
  # Convert every nominal column into dummy (indicator) variables.
  step_dummy(all_nominal()) %>%
  # Interact Gr_Liv_Area with every column whose name starts with
  # "Bldg_Type" (the dummy columns produced by the previous step).
  step_interact(~ starts_with("Bldg_Type"):Gr_Liv_Area) %>%
  # Separate spline steps, each with deg_free tagged for tuning.
  step_ns(Longitude, deg_free = tune()) %>%
  step_ns(Latitude, deg_free = tune())
# glmnet linear regression with penalty and mixture both tagged for tuning.
reg_mod <- linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")

16 / 24

Tagging Parameters using {tune}

library(tune) # also will be in library(tidymodels)
# Pass a label to tune() so the two deg_free parameters can be told apart.
ames_rec <- 
  recipe(
    Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + 
      Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area +
      Central_Air + Longitude + Latitude,
    data = ames
  ) %>%
  # Log (base 10) transform the outcome and the two area variables.
  step_log(Sale_Price, Lot_Area, Gr_Liv_Area, base = 10) %>%
  # Pool infrequent Neighborhood levels (threshold = 0.05) into one level.
  step_other(Neighborhood, threshold = 0.05) %>%
  # Convert every nominal column into dummy (indicator) variables.
  step_dummy(all_nominal()) %>%
  # Interact Gr_Liv_Area with every column whose name starts with
  # "Bldg_Type" (the dummy columns produced by the previous step).
  step_interact(~ starts_with("Bldg_Type"):Gr_Liv_Area) %>%
  # Separate spline steps; the labels become the parameter names.
  step_ns(Longitude, deg_free = tune("Longitude df")) %>%
  step_ns(Latitude, deg_free = tune("Latitude df"))
# glmnet linear regression with penalty and mixture both tagged for tuning.
reg_mod <- linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")

17 / 24

Ingredients for Grid Search in tidymodels

Model/recipe specification.
A resampling or validation data specification.
A pre-defined grid of candidate tuning parameters to evaluate.
Performance metrics to calculate.

# Seed the RNG so the fold assignment is reproducible, then set up 10-fold
# cross-validation (v = 10 is the default, spelled out here).
set.seed(214828)
ten_fold <- vfold_cv(data = ames, v = 10)

18 / 24

Grid Search via {tune}

# Grid search: evaluate 10 automatically generated candidate parameter
# combinations (grid = 10) on every resample in ten_fold.
# The model specification goes first and the recipe is passed as
# `preprocessor`; released versions of tune no longer accept a recipe as the
# first argument.
set.seed(70801)
grid_res <- tune_grid(
  reg_mod,
  preprocessor = ames_rec,
  resamples = ten_fold,
  grid = 10
)
grid_res

## #  10-fold cross-validation 
## # A tibble: 10 x 4
##    splits             id     .metrics          .notes          
##  * <list>             <chr>  <list>            <list>          
##  1 <split [2.6K/293]> Fold01 <tibble [20 × 7]> <tibble [1 × 1]>
##  2 <split [2.6K/293]> Fold02 <tibble [20 × 7]> <tibble [1 × 1]>
##  3 <split [2.6K/293]> Fold03 <tibble [20 × 7]> <tibble [1 × 1]>
##  4 <split [2.6K/293]> Fold04 <tibble [20 × 7]> <tibble [1 × 1]>
##  5 <split [2.6K/293]> Fold05 <tibble [20 × 7]> <tibble [1 × 1]>
##  6 <split [2.6K/293]> Fold06 <tibble [20 × 7]> <tibble [1 × 1]>
##  7 <split [2.6K/293]> Fold07 <tibble [20 × 7]> <tibble [1 × 1]>
##  8 <split [2.6K/293]> Fold08 <tibble [20 × 7]> <tibble [1 × 1]>
##  9 <split [2.6K/293]> Fold09 <tibble [20 × 7]> <tibble [1 × 1]>
## 10 <split [2.6K/293]> Fold10 <tibble [20 × 7]> <tibble [1 × 1]>

Kind of looks like the rsample objects with some extra list columns.

We have a bunch of high-level functions that will work with these objects.

19 / 24

Getting the Results that We Want

# Summarise the resampled metrics per candidate and show the first four rows.
grid_res %>%
  collect_metrics() %>%
  slice(1:4)

## # A tibble: 4 x 9
##    penalty mixture `Longitude df` `Latitude df` .metric .estimator   mean     n
##      <dbl>   <dbl>          <int>         <int> <chr>   <chr>       <dbl> <int>
## 1 7.27e-10   0.706              4             1 rmse    standard   0.0796    10
## 2 7.27e-10   0.706              4             1 rsq     standard   0.798     10
## 3 6.92e- 9   0.322              9             3 rmse    standard   0.0792    10
## 4 6.92e- 9   0.322              9             3 rsq     standard   0.800     10
## # … with 1 more variable: std_err <dbl>

# Best candidates ranked by RMSE. The optimisation direction is taken from
# the metric itself, so the deprecated `maximize` argument is not passed.
show_best(grid_res, metric = "rmse") # `select_best()` too

## # A tibble: 5 x 9
##   penalty mixture `Longitude df` `Latitude df` .metric .estimator   mean     n
##     <dbl>   <dbl>          <int>         <int> <chr>   <chr>       <dbl> <int>
## 1 6.45e-6  0.455              11            12 rmse    standard   0.0781    10
## 2 1.00e-8  0.406               2            10 rmse    standard   0.0787    10
## 3 8.47e-3  0.0560             14             8 rmse    standard   0.0787    10
## 4 3.02e-7  0.738               5             5 rmse    standard   0.0788    10
## 5 1.29e-5  0.855              11             7 rmse    standard   0.0789    10
## # … with 1 more variable: std_err <dbl>

20 / 24

Plot Method Shows the Marginal Relationships

# Plot the grid search results for the rsq metric.
# This will improve. Maybe a `shinytune` package? 
autoplot(grid_res, metric = "rsq")

21 / 24

Miscellaneous Notes

The default grid is a space filling design.
We capture all warnings and errors and store them on the .notes column.
You can save the predictions, fitted models/recipes with additional options.
The results using verbose = TRUE are 💯.
foreach is used so that standard parallel processing tools can be used.
You can alter the grid, performance metrics, and other aspects.
tune works well with our new workflows package.

22 / 24

Iterative Search via Bayesian Optimization

# Iterative search via Bayesian optimization, starting from the grid search
# results (initial = grid_res). verbose = TRUE prints a per-iteration log.
# The model specification goes first and the recipe is passed as
# `preprocessor`; released versions of tune no longer accept a recipe as the
# first argument.
ctrl <- control_bayes(verbose = TRUE)
set.seed(70801)
srch_res <- tune_bayes(
  reg_mod,
  preprocessor = ames_rec,
  resamples = ten_fold,
  initial = grid_res,
  control = ctrl
)

Optimizing rmse using the expected improvement
── Iteration 1 ───────────────────────────────────────────────────────────
i Current best:        rmse=0.07797 (@iter 0)
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i penalty=0.000494, mixture=0.394, Longitude df=13, Latitude df=15
i Estimating performance
✓ Estimating performance
ⓧ Newest results:    rmse=0.07823 (+/-0.00217)
── Iteration 2 ───────────────────────────────────────────────────────────
i Current best:        rmse=0.07797 (@iter 0)
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
<snip>
── Iteration 10 ──────────────────────────────────────────────────────────
i Current best:        rmse=0.07788 (@iter 6)
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i penalty=0.943, mixture=0.0825, Longitude df=15, Latitude df=3
i Estimating performance
✓ Estimating performance
ⓧ Newest results:    rmse=0.1664 (+/-0.00406)

23 / 24

Fini

tune is almost on CRAN. For now:

# Install the development version of tune from its GitHub repository.
devtools::install_github("tidymodels/tune")

A good place to learn: https://tidymodels.github.io/tune/

We are working on a tidymodels book (coming soon!) that will discuss this thoroughly.

↑, ←, Pg Up, k	Go to previous slide
↓, →, Pg Dn, Space, j	Go to next slide
Home	Go to first slide
End	Go to last slide
Number + Return	Go to specific slide
b / m / f	Toggle blackout / mirrored / fullscreen mode
c	Clone slideshow
p	Toggle presenter mode
t	Restart the presentation timer
?, h	Toggle this help

Totally Tiny Tuning Tools

Max Kuhn (RStudio)

The Ames Housing Data

Pre-processing with {recipes}

Pre-processing with {recipes}

Pre-processing with {recipes}

Pre-processing with {recipes}

Pre-processing with {recipes}

Pre-processing with {recipes}

Creating Models with {parsnip}

Creating Models with {parsnip}

Creating Models with {parsnip}

I thought about making {tune}

with the same intensity that

Hilary Parker thinks about making ☕

Tagging Parameters using {tune}

Tagging Parameters using {tune}

Tagging Parameters using {tune}

Tagging Parameters using {tune}

Ingredients for Grid Search in tidymodels

Grid Search via {tune}

Getting the Results that We Want

Plot Method Shows the Marginal Relationships

Miscellaneous Notes

Iterative Search via Bayesian Optimization

Fini

Help