ml-models-exercise

Author

Andrew Ruiz

Module 11: Machine Learning Models Exercise

Load libraries

library(here)
here() starts at /Users/andrewruiz/andrew_ruiz-MADA-portfolio
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
✔ broom        1.0.5      ✔ recipes      1.0.10
✔ dials        1.2.1      ✔ rsample      1.2.0 
✔ dplyr        1.1.4      ✔ tibble       3.2.1 
✔ ggplot2      3.5.0      ✔ tidyr        1.3.1 
✔ infer        1.0.6      ✔ tune         1.1.2 
✔ modeldata    1.3.0      ✔ workflows    1.1.4 
✔ parsnip      1.2.0      ✔ workflowsets 1.0.1 
✔ purrr        1.0.2      ✔ yardstick    1.3.1 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Learn how to get started at https://www.tidymodels.org/start/
library(ggplot2)
library(GGally)
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
library(ggcorrplot)
library(glmnet)
Loading required package: Matrix

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
Loaded glmnet 4.1-8
library(ranger)
library(dplyr)
library(workflows)
library(yardstick)

Set the seed for reproducibility

# Set the random seed to 1234
ex11_seed <- 1234
set.seed(ex11_seed)

Load rds from the fitting exercise

# Construct the path to the RDS file using here()
file_path_mav <- here("ml-models-exercise", "mav_clean_ex11.rds")

# Load the mav_clean dataframe from the specified RDS file
mav_ex11 <- readRDS(file_path_mav)

# examine the dataset
head(mav_ex11)
# A tibble: 6 × 7
      Y DOSE    AGE SEX   RACE     WT    HT
  <dbl> <fct> <int> <fct> <fct> <dbl> <dbl>
1 2691. 25       42 1     2      94.3  1.77
2 2639. 25       24 1     2      80.4  1.76
3 2150. 25       31 1     1      71.8  1.81
4 1789. 25       46 2     1      77.4  1.65
5 3126. 25       41 2     2      64.3  1.56
6 2337. 25       27 1     2      74.1  1.83
dim(mav_ex11)
[1] 120   7

Examine the RACE variable

# Count occurrences of each category in the RACE variable
race_counts <- table(mav_ex11$RACE)

# Calculate percentages
race_percentages <- (race_counts / sum(race_counts)) * 100

# Combine counts and percentages into a data frame for better readability
race_summary <- data.frame(
  Race = names(race_counts),
  Counts = as.integer(race_counts),           # Ensure counts are in integer format
  Percentages = as.numeric(race_percentages)  # Drop the table class so this is a plain numeric column
)
# Print the summary data frame
print(race_summary)
  Race Counts Percentages
1    1     74   61.666667
2    2     36   30.000000
3    7      2    1.666667
4   88      8    6.666667

Recode the RACE variable

According to Table 1 of Wendling, T. et al., Model-Based Evaluation of the Impact of Formulation and Food Intake on the Complex Oral Absorption of Mavoglurant in Healthy Subjects. Pharm Res 32, 1764–1778 (2015), https://doi.org/10.1007/s11095-014-1574-1, the race breakdown was: Caucasian (61.7%), Black (30%), Native American (1.7%), and Other (6.7%).

So, for RACE:

1 = Caucasian, 2 = Black, 7 = Native American, 88 = Other

However, for this exercise, we will collapse these into 3 categories.

# Recode RACE
mav_ex11 <- mav_ex11 %>%
  mutate(RACE = as.character(RACE)) %>% # Convert to character so recode() operates on the raw codes
  mutate(RACE = recode(RACE, '7' = '3', '88' = '3')) %>%
  mutate(RACE = factor(RACE, levels = c('1', '2', '3'), 
                       labels = c("Caucasian", "Black", "Other")))

# Now, print the levels of RACE after recoding:
print(levels(mav_ex11$RACE))
[1] "Caucasian" "Black"     "Other"    

Make a pairwise correlation plot for the continuous variables

If we find any very strong correlations, we might want to remove one of the correlated variables.

# Selecting only continuous variables
continuous_vars <- mav_ex11[, c("AGE", "WT", "HT")]

# Computing the correlation matrix
cor_matrix <- cor(continuous_vars, use = "complete.obs")

# Visualizing the correlation matrix
ggcorrplot(cor_matrix, method = "circle", hc.order = TRUE, type = "lower",
           lab = TRUE, lab_size = 3, colors = c("gold", "snow", "tomato"))

print(cor_matrix)
           AGE        WT         HT
AGE  1.0000000 0.1196740 -0.3518581
WT   0.1196740 1.0000000  0.5997505
HT  -0.3518581 0.5997505  1.0000000

While the correlation between HT and WT is only moderate (about 0.60), we will combine the two into a single new variable, BMI, calculated from WT (kg) and HT (m):

BMI = WT(kg) / HT(m)^2

# Calculate BMI
mav_ex11$BMI <- mav_ex11$WT / (mav_ex11$HT^2)

str(mav_ex11)
tibble [120 × 8] (S3: tbl_df/tbl/data.frame)
 $ Y   : num [1:120] 2691 2639 2150 1789 3126 ...
 $ DOSE: Factor w/ 3 levels "25","37.5","50": 1 1 1 1 1 1 1 1 1 1 ...
 $ AGE : int [1:120] 42 24 31 46 41 27 23 20 23 28 ...
 $ SEX : Factor w/ 2 levels "1","2": 1 1 1 2 2 1 1 1 1 1 ...
 $ RACE: Factor w/ 3 levels "Caucasian","Black",..: 2 2 1 1 2 2 1 3 2 1 ...
 $ WT  : num [1:120] 94.3 80.4 71.8 77.4 64.3 ...
 $ HT  : num [1:120] 1.77 1.76 1.81 1.65 1.56 ...
 $ BMI : num [1:120] 30.1 26 21.9 28.4 26.4 ...
# Ensure DOSE, RACE, and SEX are factors (str() shows they already are, so this is a safeguard)
mav_ex11 <- mav_ex11 %>%
  mutate(
    DOSE = as.factor(DOSE),
    RACE = as.factor(RACE),
    SEX = as.factor(SEX)
  )
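As a quick arithmetic check on the BMI formula, we can recompute it by hand for the first subject (WT = 94.3 kg, HT = 1.77 m, values as displayed above); a minimal sketch:

# Hand-check of BMI for row 1: 94.3 / 1.77^2 is roughly 30.1,
# matching the first BMI value shown by str() above
94.3 / (1.77^2)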

Create linear model with all predictors.

# Define a linear regression model
linear_model_spec <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

# Prepare the recipe for the linear model, specifying categorical variables
linear_model_recipe <- recipe(Y ~ ., data = mav_ex11) %>%
  step_dummy(all_nominal(), -all_outcomes()) # Convert categorical variables to dummy variables

# Combine the model and recipe into a workflow, then fit it to the data
linear_model_workflow <- workflow() %>%
  add_model(linear_model_spec) %>%
  add_recipe(linear_model_recipe) %>%
  fit(data = mav_ex11)
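To inspect the fitted coefficients, we can pull the parsnip fit back out of the workflow; a minimal sketch using the standard workflows/broom helpers:

# Extract the underlying lm fit from the workflow and tidy its coefficients
linear_model_workflow %>%
  extract_fit_parsnip() %>%
  tidy()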

LASSO Regression Model Specification with glmnet engine

# LASSO model spec with the penalty fixed at 0.1 for now (we tune this value below)
lasso_model_spec <- linear_reg(penalty = 0.1, mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("regression")

# Recipe for LASSO, converting categorical variables to dummy variables
lasso_model_recipe <- recipe(Y ~ ., data = mav_ex11) %>%
  step_dummy(all_nominal(), -all_outcomes())

# Workflow for LASSO, combining model and recipe, then fitting to data
lasso_model_workflow <- workflow() %>%
  add_model(lasso_model_spec) %>%
  add_recipe(lasso_model_recipe) %>%
  fit(data = mav_ex11)

Random Forest Model Specification with ranger engine

# Random Forest Model Specification with ranger engine
rf_model_spec <- rand_forest() %>%
  set_engine("ranger", seed = ex11_seed) %>%
  set_mode("regression")

# Recipe for Random Forest, ensuring categorical variables are treated correctly
# No need for step_dummy() as random forest can handle categorical variables directly
rf_model_recipe <- recipe(Y ~ ., data = mav_ex11)

# Workflow for Random Forest, combining model and recipe, then fitting to data
rf_model_workflow <- workflow() %>%
  add_model(rf_model_spec) %>%
  add_recipe(rf_model_recipe) %>%
  fit(data = mav_ex11)

Linear Model Predictions

# Predictions
lm_predictions_ex11 <- predict(linear_model_workflow, new_data = mav_ex11) %>%
  bind_cols(mav_ex11)

# Calculate RMSE
lm_rmse_ex11 <- rmse(lm_predictions_ex11, truth = Y, estimate = .pred)

# Observed vs Predicted Plot
ggplot(lm_predictions_ex11, aes(x = Y, y = .pred)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(x = "Observed", y = "Predicted", title = "Linear Model: Observed vs Predicted")

Lasso Model Predictions

# Predictions
lasso_predictions_ex11 <- predict(lasso_model_workflow, new_data = mav_ex11) %>%
  bind_cols(mav_ex11)

# Calculate RMSE
lasso_rmse_ex11 <- rmse(lasso_predictions_ex11, truth = Y, estimate = .pred)

# Observed vs Predicted Plot
ggplot(lasso_predictions_ex11, aes(x = Y, y = .pred)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(x = "Observed", y = "Predicted", title = "LASSO Model: Observed vs Predicted")

Random Forest Model Predictions

# Predictions
rf_predictions_ex11 <- predict(rf_model_workflow, new_data = mav_ex11) %>%
  bind_cols(mav_ex11)

# Calculate RMSE
rf_rmse_ex11 <- rmse(rf_predictions_ex11, truth = Y, estimate = .pred)

# Observed vs Predicted Plot
ggplot(rf_predictions_ex11, aes(x = Y, y = .pred)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(x = "Observed", y = "Predicted", title = "Random Forest Model: Observed vs Predicted")

Examine the RMSE values from the models

#RMSE
# Create a dataframe to hold the RMSE values
rmse_summary_ex11 <- tibble(
  Model = c("Linear", "LASSO", "Random Forest"),
  RMSE = c(lm_rmse_ex11$.estimate, lasso_rmse_ex11$.estimate, rf_rmse_ex11$.estimate)
)

# Print the summary table
print(rmse_summary_ex11)
# A tibble: 3 × 2
  Model          RMSE
  <chr>         <dbl>
1 Linear         570.
2 LASSO          571.
3 Random Forest  358.
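The random forest's much lower RMSE here is an in-sample figure: the model is predicting the same data it was trained on. For a less optimistic view, we could look at ranger's out-of-bag error, which is stored in the fitted engine object; a minimal sketch:

# Extract the underlying ranger fit; for regression, prediction.error is the
# out-of-bag MSE, so its square root gives an OOB RMSE
rf_fit_engine <- extract_fit_engine(rf_model_workflow)
sqrt(rf_fit_engine$prediction.error)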

Tuning the LASSO model without CV

Tuning and evaluating on the same data is not a good idea; we do it here only for illustration.
# Define the range of penalty values
penalty_grid <- 10^seq(-5, 2, length.out = 50)

# Update the LASSO model spec to use tune() for the penalty
lasso_model_spec_tuned <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("regression")

# Use the same recipe as before
lasso_model_recipe <- recipe(Y ~ ., data = mav_ex11) %>%
  step_dummy(all_nominal(), -all_outcomes())

# Define the tuning workflow
lasso_tuning_workflow <- workflow() %>%
  add_model(lasso_model_spec_tuned) %>%
  add_recipe(lasso_model_recipe)

# Create a tibble with penalty values for tuning
penalty_tibble <- tibble(penalty = penalty_grid)

# Use apparent resampling for tuning (not recommended in practice)
lasso_resamples <- apparent(mav_ex11)

# Tune the model
lasso_tuned_results <- tune_grid(
  lasso_tuning_workflow,
  resamples = lasso_resamples,
  grid = penalty_tibble
)

# Evaluate the tuned model
best_lasso <- select_best(lasso_tuned_results, metric = "rmse")

# Print the best penalty value
print(best_lasso)
# A tibble: 1 × 2
  penalty .config              
    <dbl> <chr>                
1 0.00001 Preprocessor1_Model01
# Visualize tuning diagnostics for LASSO
lasso_tuned_results %>% autoplot()

As the penalty parameter increases, LASSO regression can drive more coefficients to zero, effectively removing them from the model. If the penalty is too large, it may remove too many features, leading the model towards a null model, which is a model with no predictors.

The unpenalized linear model is optimized to fit the training data without restraint, possibly capturing noise and overfitting. When LASSO introduces a penalty for complexity, it trades away some training-data fit to obtain a model that should generalize better. However, since we are evaluating on the same data used to tune the penalty, we do not see the benefit of this trade-off; instead, RMSE rises as the penalty grows and the model is overly simplified, potentially to the point of underfitting.
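To see this shrinkage directly, we could refit the tuning workflow at a deliberately large penalty and inspect the coefficients; a minimal sketch (the penalty of 100 is an arbitrary illustrative value, not part of the analysis above):

# Finalize the tuning workflow at a large penalty and refit on the full data
large_penalty_fit <- lasso_tuning_workflow %>%
  finalize_workflow(tibble(penalty = 100)) %>%
  fit(data = mav_ex11)

# tidy() reports the glmnet coefficients at the stored penalty; with a penalty
# this large, most estimates should be at or near zero
large_penalty_fit %>%
  extract_fit_parsnip() %>%
  tidy()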

Now for the Random Forest model

# Define the Random Forest model with tuning parameters for mtry and min_n
rf_model_spec <- rand_forest(trees = 300, mtry = tune(), min_n = tune()) %>%
  set_engine("ranger", seed = ex11_seed) %>% # use the shared seed for reproducibility
  set_mode("regression")

# Define the recipe
rf_recipe <- recipe(Y ~ ., data = mav_ex11)

# Define the tuning grid
tuning_grid <- grid_regular(
  mtry(range = c(1, 7)),
  min_n(range = c(1, 21)),
  levels = 7
)

# Define resampling method - using the full dataset through apparent resampling
resamples <- apparent(data = mav_ex11)

# Combine the model and recipe into a workflow
rf_workflow <- workflow() %>%
  add_model(rf_model_spec) %>%
  add_recipe(rf_recipe)

# Tune the model using the workflow
rf_tuned_results <- tune_grid(
  rf_workflow,
  resamples = resamples,
  grid = tuning_grid
)

# Visualize the tuning results
autoplot(rf_tuned_results)

Now we will repeat the tuning with cross-validation (CV)

LASSO

# Set the seed for reproducibility
set.seed(ex11_seed)

# Define the range of penalty values for the grid
penalty_grid <- 10^seq(-5, 2, length.out = 50)
penalty_tibble <- tibble(penalty = penalty_grid)

# Update the LASSO model spec to use tune() for the penalty
lasso_model_spec_tuned <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("regression")

# Define the recipe, including dummy variables as needed
lasso_model_recipe <- recipe(Y ~ ., data = mav_ex11) %>%
  step_dummy(all_nominal(), -all_outcomes())

# Define a workflow that includes the model spec and the recipe
lasso_tuning_workflow <- workflow() %>%
  add_model(lasso_model_spec_tuned) %>%
  add_recipe(lasso_model_recipe)

# Create the cross-validation resamples
cv_resamples <- vfold_cv(mav_ex11, v = 5, repeats = 5)

# Tune the model with cross-validation
lasso_tuned_cv_results <- tune_grid(
  lasso_tuning_workflow,
  resamples = cv_resamples,
  grid = penalty_tibble
)

# Visualize tuning diagnostics
autoplot(lasso_tuned_cv_results)
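To carry this forward, we could pick the penalty with the best cross-validated RMSE and refit the finalized workflow on the full data; a minimal sketch:

# Select the penalty with the lowest cross-validated RMSE
best_cv_penalty <- select_best(lasso_tuned_cv_results, metric = "rmse")

# Finalize the workflow with that penalty and refit
final_lasso_cv <- lasso_tuning_workflow %>%
  finalize_workflow(best_cv_penalty) %>%
  fit(data = mav_ex11)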

Random Forest

rf_model_spec <- rand_forest(trees = 300, mtry = tune(), min_n = tune()) %>%
  set_engine("ranger", seed = ex11_seed) %>%
  set_mode("regression")

# Define the recipe
rf_recipe <- recipe(Y ~ ., data = mav_ex11)

# Define the tuning grid
tuning_grid <- grid_regular(
  mtry(range = c(1, 7)),
  min_n(range = c(1, 21)),
  levels = 7
)

# Combine the model and recipe into a workflow
rf_workflow <- workflow() %>%
  add_model(rf_model_spec) %>%
  add_recipe(rf_recipe)

# Set the seed for reproducibility
set.seed(ex11_seed)

# Create 5-fold CV resamples, repeated 5 times
rf_cv_resamples <- vfold_cv(data = mav_ex11, v = 5, repeats = 5)

# Tune the model using the workflow and the CV resamples
rf_cvtuned_results <- tune_grid(
  rf_workflow,
  resamples = rf_cv_resamples,
  grid = tuning_grid
)

# Visualize the tuning results
autoplot(rf_cvtuned_results)
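To see which mtry/min_n combinations performed best under CV, we can rank them; a minimal sketch using tune's show_best():

# Show the five best parameter combinations by cross-validated RMSE
show_best(rf_cvtuned_results, metric = "rmse", n = 5)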

Without external validation data, it is hard to fully assess which model generalizes better. A lower training error (RMSE or MSE) does not guarantee better performance on unseen data. Random forests can sometimes be more robust to overfitting than linear models, depending on the data and how the tuning is done.
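If we did want an honest estimate of generalization, the standard approach would be to hold out a test set before any tuning; a minimal sketch with rsample (the 80/20 split proportion is an illustrative choice, not part of this exercise):

# Hold out 20% of the data before fitting or tuning anything
set.seed(ex11_seed)
mav_split <- initial_split(mav_ex11, prop = 0.8)
mav_train <- training(mav_split)
mav_test  <- testing(mav_split)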

Conclusion:

Based on the cross-validated tuning results, the LASSO model appears to offer better predictive accuracy, with a lower RMSE than the random forest model. However, the final decision on which model to use should also weigh model interpretability, computational cost, and how well the model is expected to generalize to new, unseen data.