tidymodels Workflow Patterns

Overview

Core workflow patterns for building machine learning models using the tidymodels ecosystem. Covers the complete pipeline from data splitting through model deployment.

Core Workflow Components

Data Splitting with rsample

library(tidymodels)

# Two-way split: 75% of rows for training, the rest held out for testing.
# Sampling is stratified on `outcome`; the seed fixes the random draw.
set.seed(123)
data_split <- data |>
  initial_split(prop = 0.75, strata = outcome)

# Pull the two partitions out of the split object
train_data <- data_split |> training()
test_data <- data_split |> testing()

# Validation set approach
# Three-way split: 60% training, 20% validation, remaining 20% testing.
# Seeded and stratified on `outcome`, matching the basic split, so the
# partitions are reproducible and keep the outcome distribution.
set.seed(123)
data_split <- initial_validation_split(
  data,
  prop = c(0.6, 0.2),
  strata = outcome
)
train_data <- training(data_split)
val_data <- validation(data_split)
test_data <- testing(data_split)

Recipe Creation

# Create preprocessing recipe
# Step order matters:
#   1. step_dummy()     - encode nominal predictors as indicator columns
#   2. step_zv()        - drop zero-variance columns, including indicators
#                         for levels that never occur in the training data
#   3. step_normalize() - center and scale what is left
# Running step_zv() before step_normalize() avoids scaling a constant
# column by a standard deviation of zero.
recipe_spec <- recipe(outcome ~ ., data = train_data) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors()) |>
  step_normalize(all_numeric_predictors())

Model Specification with parsnip

# Random forest classifier on the ranger engine.
# `mtry` and `min_n` are left as tune() placeholders to be filled in during
# tuning; the number of trees is fixed at 1000.
model_spec <- rand_forest(
  mode = "classification",
  engine = "ranger",
  mtry = tune(),
  trees = 1000,
  min_n = tune()
)

Workflow Assembly

# Bundle the preprocessing recipe and the model specification into a single
# workflow object
workflow_spec <- workflow(preprocessor = recipe_spec, spec = model_spec)

Resampling Setup

# 10-fold cross-validation on the training data, stratified on `outcome`
cv_folds <- train_data |>
  vfold_cv(v = 10, strata = outcome)

# 25 bootstrap resamples of the training data
boot_samples <- train_data |>
  bootstraps(times = 25)

Hyperparameter Tuning

# Define tuning grid
# Regular grid with 5 levels per parameter (5 x 5 = 25 candidates).
# The object is called `rf_grid` rather than `tune_grid` so it does not
# share a name with the tune_grid() function used below.
rf_grid <- grid_regular(
  mtry(range = c(2, 10)),
  min_n(range = c(2, 20)),
  levels = 5
)

# Tune model
# Evaluates every grid candidate on the cross-validation folds and records
# ROC AUC and accuracy for each.
tune_results <- workflow_spec |>
  tune_grid(
    resamples = cv_folds,
    grid = rf_grid,
    metrics = metric_set(roc_auc, accuracy)
  )

Model Selection

# Pick the candidate with the best ROC AUC from the tuning results
best_params <- tune_results |>
  select_best(metric = "roc_auc")

# Replace the tune() placeholders in the workflow with the chosen values
final_workflow <- finalize_workflow(workflow_spec, best_params)

Final Fit

# One last fit: train on the training portion of the split and evaluate on
# the held-out test portion
final_fit <- last_fit(final_workflow, split = data_split)

# Test-set performance metrics
final_fit |> collect_metrics()

# Test-set predictions
final_fit |> collect_predictions()

Model Extraction and Deployment

# Extract fitted workflow
fitted_wf <- extract_workflow(final_fit)

# Save model
# saveRDS() does not create missing directories, so make sure the target
# folder exists first (no-op if it is already there).
model_path <- "output/models/final_model.rds"
dir.create(dirname(model_path), recursive = TRUE, showWarnings = FALSE)
saveRDS(fitted_wf, model_path)

# Predict on new data
# The fitted workflow applies the recipe's preprocessing before predicting.
predictions <- predict(fitted_wf, new_data = new_data)

Complete Workflow Example

library(tidymodels)
tidymodels_prefer()

# 1. Load and split data
# The outcome is log10-transformed here, before splitting, instead of with
# step_log() inside the recipe. A non-skipped outcome step in the recipe
# makes predict() fail on new data that has no Sale_Price column.
ames_log <- ames |>
  mutate(Sale_Price = log10(Sale_Price))

set.seed(123)
data_split <- initial_split(ames_log, prop = 0.75, strata = Sale_Price)
ames_train <- training(data_split)

# 2. Create recipe
# Pool rare neighborhoods, encode nominal predictors, drop zero-variance
# columns, then normalize (zv before normalize avoids scaling by sd = 0).
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_other(Neighborhood, threshold = 0.05) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors()) |>
  step_normalize(all_numeric_predictors())

# 3. Specify model
xgb_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  learn_rate = tune()
) |>
  set_engine("xgboost") |>
  set_mode("regression")

# 4. Create workflow
xgb_wf <- workflow(ames_recipe, xgb_spec)

# 5. Setup resampling
# Seeded for reproducible folds; stratified like the initial split.
set.seed(234)
cv_folds <- vfold_cv(ames_train, v = 5, strata = Sale_Price)

# 6. Tune hyperparameters
# grid = 20 asks tune to generate 20 candidate combinations; the seed keeps
# that generated grid and the model fits reproducible.
set.seed(345)
xgb_tune <- xgb_wf |>
  tune_grid(
    resamples = cv_folds,
    grid = 20,
    metrics = metric_set(rmse, rsq)
  )

# 7. Select best and finalize
best_xgb <- select_best(xgb_tune, metric = "rmse")
final_wf <- finalize_workflow(xgb_wf, best_xgb)

# 8. Final evaluation
# Metrics are on the log10(Sale_Price) scale.
final_fit <- last_fit(final_wf, data_split)
collect_metrics(final_fit)

Workflow Sets for Model Comparison

# Create multiple preprocessing recipes
# Uses `train_data`, the training set name used throughout this guide.
# Baseline: normalize all numeric predictors.
basic_recipe <- recipe(outcome ~ ., data = train_data) |>
  step_normalize(all_numeric_predictors())

# Variant: baseline plus PCA, keeping 5 components of the numeric predictors
pca_recipe <- basic_recipe |>
  step_pca(all_numeric_predictors(), num_comp = 5)

# Three regression model specifications to compare:
# linear regression (lm), random forest (ranger, 500 trees), and
# boosted trees (xgboost)
lm_spec <- linear_reg(engine = "lm")
rf_spec <- rand_forest(mode = "regression", engine = "ranger", trees = 500)
xgb_spec <- boost_tree(mode = "regression", engine = "xgboost")

# Build the workflow set from named lists of preprocessors and models;
# the list names are used to label the resulting workflows
preprocessors <- list(basic = basic_recipe, pca = pca_recipe)
candidate_models <- list(lm = lm_spec, rf = rf_spec, xgb = xgb_spec)

wf_set <- workflow_set(preproc = preprocessors, models = candidate_models)

# Run every workflow in the set over the same resamples, using a grid of
# size 10 and printing progress as it goes
wf_results <- workflow_map(
  wf_set,
  fn = "tune_grid",
  resamples = cv_folds,
  grid = 10,
  verbose = TRUE
)

# Plot the results, then rank the workflows by RMSE
wf_results |> autoplot()
wf_results |> rank_results(rank_metric = "rmse")

Key Packages

• rsample: Data splitting and resampling
• recipes: Feature engineering
• parsnip: Model specification
• workflows: Combine preprocessing and models
• tune: Hyperparameter optimization
• yardstick: Model evaluation metrics
• workflowsets: Compare multiple workflows
• broom: Tidy model outputs

Best Practices

• Always set a seed before splitting data
• Use stratified sampling for imbalanced outcomes
• Keep test data completely separate until final evaluation
• Use cross-validation for honest performance estimates
• Tune hyperparameters on training data only
• Use last_fit() for final evaluation on test set
• Save the complete workflow object for deployment