Recipes Feature Engineering Patterns

Overview

Comprehensive patterns for feature engineering using the recipes package. Covers preprocessing steps for numeric, categorical, and text data while preventing information leakage.

Recipe Fundamentals

Basic Recipe Structure

library(recipes)

# Initialize recipe with formula
rec <- recipe(outcome ~ ., data = training_data)

# Or with explicit roles
rec <- recipe(training_data) |>
  update_role(outcome, new_role = "outcome") |>
  update_role(id_column, new_role = "ID") |>
  update_role(-outcome, -id_column, new_role = "predictor")

Selector Functions

# Type-based selectors
all_predictors()
all_outcomes()
all_numeric_predictors()
all_nominal_predictors()
all_numeric()
all_nominal()

# Name-based selectors
starts_with("prefix_")
ends_with("_suffix")
contains("pattern")
matches("regex")
one_of(c("var1", "var2"))

Numeric Preprocessing

Normalization and Scaling

rec <- recipe(outcome ~ ., data = train) |>
  # Center and scale (z-score)
  step_normalize(all_numeric_predictors()) |>

  # Scale to [0, 1]
  step_range(all_numeric_predictors(), min = 0, max = 1) |>

  # Center only
  step_center(all_numeric_predictors()) |>

  # Scale only
  step_scale(all_numeric_predictors())

Transformations for Normality

rec <- recipe(outcome ~ ., data = train) |>
  # Yeo-Johnson (handles zero and negative values)
  step_YeoJohnson(all_numeric_predictors()) |>

  # Box-Cox (positive values only)
  step_BoxCox(positive_vars) |>

  # Log transformation
  step_log(skewed_vars, base = 10) |>

  # Square root
  step_sqrt(count_vars)

Spline and Polynomial Features

rec <- recipe(outcome ~ ., data = train) |>
  # Natural splines
  step_ns(continuous_var, deg_free = 5) |>

  # B-splines
  step_bs(continuous_var, deg_free = 5, degree = 3) |>

  # Polynomial features
  step_poly(continuous_var, degree = 3)

Categorical Encoding

Dummy Variables

rec <- recipe(outcome ~ ., data = train) |>
  # One-hot encoding (drop first level)
  step_dummy(all_nominal_predictors()) |>

  # Keep all levels
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

Handling Rare Categories

rec <- recipe(outcome ~ ., data = train) |>
  # Pool infrequent levels
  step_other(categorical_var, threshold = 0.05, other = "other") |>

  # Handle novel levels in new data
  step_novel(all_nominal_predictors()) |>

  # Convert NA to explicit level
  step_unknown(all_nominal_predictors())

Target Encoding (embed package)

library(embed)

rec <- recipe(outcome ~ ., data = train) |>
  # Likelihood encoding
  step_lencode_glm(high_cardinality_var, outcome = vars(outcome)) |>

  # Mixed model encoding (for hierarchical data)
  step_lencode_mixed(category, outcome = vars(outcome)) |>

  # Weight of evidence
  step_woe(categorical_var, outcome = vars(binary_outcome))

Missing Data Handling

Imputation Methods

rec <- recipe(outcome ~ ., data = train) |>
  # Simple imputation
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>

  # KNN imputation
  step_impute_knn(all_predictors(), neighbors = 5) |>

  # Bagged tree imputation
  step_impute_bag(all_predictors()) |>

  # Linear model imputation
  step_impute_linear(numeric_var, impute_with = imp_vars(predictor1, predictor2))

Missing Indicators

rec <- recipe(outcome ~ ., data = train) |>
  # Create indicator for missingness
  step_indicate_na(all_predictors()) |>

  # Then impute

  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors())

Dimensionality Reduction

PCA and Related Methods

rec <- recipe(outcome ~ ., data = train) |>
  step_normalize(all_numeric_predictors()) |>

  # Principal Component Analysis
  step_pca(all_numeric_predictors(), num_comp = 5) |>

  # Or keep components explaining variance threshold
  step_pca(all_numeric_predictors(), threshold = 0.95)

Other Reduction Methods

library(embed)

rec <- recipe(outcome ~ ., data = train) |>
  # UMAP
  step_umap(all_numeric_predictors(), num_comp = 2) |>

  # Kernel PCA
  step_kpca(all_numeric_predictors(), num_comp = 5)

Interactions and Combinations

rec <- recipe(outcome ~ ., data = train) |>
  # Create interaction terms
  step_interact(terms = ~ var1:var2) |>

  # Multiple interactions
  step_interact(terms = ~ starts_with("x"):starts_with("z")) |>

  # Ratios
  step_ratio(numerator = denom_vars(var1), denom = denom_vars(var2))

Feature Selection

rec <- recipe(outcome ~ ., data = train) |>
  # Remove zero variance
  step_zv(all_predictors()) |>

  # Remove near-zero variance
  step_nzv(all_predictors(), freq_cut = 95/5, unique_cut = 10) |>

  # Remove highly correlated
  step_corr(all_numeric_predictors(), threshold = 0.9) |>

  # Remove linear combinations
  step_lincomb(all_numeric_predictors())

Class Imbalance (themis)

library(themis)

rec <- recipe(outcome ~ ., data = train) |>
  # Downsample majority class
  step_downsample(outcome) |>

  # Upsample minority class
  step_upsample(outcome) |>

  # SMOTE
  step_smote(outcome) |>

  # ADASYN
  step_adasyn(outcome)

Date/Time Features

rec <- recipe(outcome ~ ., data = train) |>
  # Extract date components
  step_date(date_var, features = c("year", "month", "dow", "doy")) |>

  # Holiday indicators
  step_holiday(date_var, holidays = c("USChristmasDay", "USNewYearsDay")) |>

  # Time components
  step_time(datetime_var, features = c("hour", "minute"))

Text Features (textrecipes)

library(textrecipes)

rec <- recipe(outcome ~ ., data = train) |>
  step_tokenize(text_var) |>
  step_stopwords(text_var) |>
  step_stem(text_var) |>
  step_ngram(text_var, num_tokens = 2) |>
  step_tfidf(text_var, max_tokens = 100)

Recipe Execution

# Prepare recipe (estimate parameters from training data)
prepped_rec <- prep(rec, training = train_data)

# Apply to training data
train_processed <- bake(prepped_rec, new_data = NULL)  # or juice(prepped_rec)

# Apply to new data
test_processed <- bake(prepped_rec, new_data = test_data)

# Inspect recipe
tidy(prepped_rec)
tidy(prepped_rec, number = 1)  # specific step

Step Ordering Best Practices

recipe(outcome ~ ., data = train) |>
  # 1. Handle roles and IDs
  update_role(id, new_role = "ID") |>


  # 2. Impute missing values first
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>

  # 3. Handle individual variable issues
  step_other(all_nominal_predictors(), threshold = 0.05) |>
  step_novel(all_nominal_predictors()) |>

  # 4. Transform numeric variables
  step_YeoJohnson(all_numeric_predictors()) |>

  # 5. Create interactions before encoding
  step_interact(terms = ~ var1:var2) |>

  # 6. Encode categorical variables
  step_dummy(all_nominal_predictors()) |>

  # 7. Normalize (after dummy coding)
  step_normalize(all_numeric_predictors()) |>

  # 8. Feature selection last
  step_zv(all_predictors()) |>
  step_corr(all_numeric_predictors())

Key Principles

•Prevent leakage: All statistics computed from training data only
•Order matters: Impute → Transform → Encode → Normalize → Select
•Use selectors: More maintainable than listing variable names
•Document decisions: Comment why each step is included
•Test on holdout: Verify recipe generalizes to new data