From 6956b45e759219e1bafd852b82c93ad56a85d55e Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Thu, 16 Apr 2020 23:47:39 -0300 Subject: [PATCH 01/12] vip S3 method for workflow class object --- R/vip.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/vip.R b/R/vip.R index 5cc13523..888c5e0a 100644 --- a/R/vip.R +++ b/R/vip.R @@ -308,7 +308,6 @@ vip.default <- function( } - #' @rdname vip #' #' @export @@ -316,3 +315,9 @@ vip.model_fit <- function(object, ...) { vip(object$fit, ...) } +#' @rdname vip +#' +#' @export +vip.workflow <- function(object, ...) { + vip(object$fit$fit$fit, ...) +} From 04b0b9480159514460b08faa797cc4cd0559a3ae Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Fri, 17 Apr 2020 00:08:41 -0300 Subject: [PATCH 02/12] vi S3 method for workflow class --- R/vi.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/vi.R b/R/vi.R index 555cf7b9..89fa64bf 100644 --- a/R/vi.R +++ b/R/vi.R @@ -188,6 +188,12 @@ vi.model_fit <- function(object, ...) { # package: parsnip vi(object$fit, ...) } +#' @rdname vi +#' +#' @export +vi.workflow <- function(object, ...) { # package: workflows + vi(object$fit$fit$fit, ...) +} #' @rdname vi #' From 8ca9adc74d484adc7a95503f6343600bfad9b15f Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Fri, 17 Apr 2020 00:08:53 -0300 Subject: [PATCH 03/12] vi_model S3 method for workflow class --- R/vi_model.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/vi_model.R b/R/vi_model.R index 4d317b97..563d80a6 100644 --- a/R/vi_model.R +++ b/R/vi_model.R @@ -724,6 +724,14 @@ vi_model.model_fit <- function(object, ...) { # package: parsnip vi_model(object$fit, ...) } +# Package: parsnip ------------------------------------------------------------- + +#' @rdname vi_model +#' +#' @export +vi_model.workflow <- function(object, ...) { # package: workflows + vi_model(object$fit$fit$fit, ...) +} # Package: party --------------------------------------------------------------- From 2f8a59d14f99204c2937188473bc863dfc400416 Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Fri, 17 Apr 2020 00:09:46 -0300 Subject: [PATCH 04/12] docs for S3 methods for workflow class --- NAMESPACE | 3 +++ man/vi.Rd | 3 +++ man/vi_model.Rd | 3 +++ man/vip.Rd | 3 +++ 4 files changed, 12 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index ad138732..ff3e8a3d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ S3method(vi,Learner) S3method(vi,WrappedModel) S3method(vi,default) S3method(vi,model_fit) +S3method(vi,workflow) S3method(vi_firm,default) S3method(vi_model,C5.0) S3method(vi_model,H2OBinomialModel) @@ -39,11 +40,13 @@ S3method(vi_model,randomForest) S3method(vi_model,ranger) S3method(vi_model,rpart) S3method(vi_model,train) +S3method(vi_model,workflow) S3method(vi_model,xgb.Booster) S3method(vi_permute,default) S3method(vi_shap,default) S3method(vip,default) S3method(vip,model_fit) +S3method(vip,workflow) export("%>%") export("%T>%") export(add_sparklines) diff --git a/man/vi.Rd b/man/vi.Rd index 0fe265f9..47320242 100644 --- a/man/vi.Rd +++ b/man/vi.Rd @@ -4,6 +4,7 @@ \alias{vi} \alias{vi.default} \alias{vi.model_fit} +\alias{vi.workflow} \alias{vi.WrappedModel} \alias{vi.Learner} \title{Variable importance} @@ -27,6 +28,8 @@ vi(object, ...) \method{vi}{model_fit}(object, ...) +\method{vi}{workflow}(object, ...) + \method{vi}{WrappedModel}(object, ...) \method{vi}{Learner}(object, ...) diff --git a/man/vi_model.Rd b/man/vi_model.Rd index eb298bdf..991e7aec 100644 --- a/man/vi_model.Rd +++ b/man/vi_model.Rd @@ -18,6 +18,7 @@ \alias{vi_model.nn} \alias{vi_model.nnet} \alias{vi_model.model_fit} +\alias{vi_model.workflow} \alias{vi_model.RandomForest} \alias{vi_model.constparty} \alias{vi_model.cforest} @@ -72,6 +73,8 @@ vi_model(object, ...) \method{vi_model}{model_fit}(object, ...) +\method{vi_model}{workflow}(object, ...) + \method{vi_model}{RandomForest}(object, type = c("accuracy", "auc"), ...) \method{vi_model}{constparty}(object, ...) diff --git a/man/vip.Rd b/man/vip.Rd index 3d78b426..c260adbd 100644 --- a/man/vip.Rd +++ b/man/vip.Rd @@ -4,6 +4,7 @@ \alias{vip} \alias{vip.default} \alias{vip.model_fit} +\alias{vip.workflow} \title{Variable importance plots} \usage{ vip(object, ...) @@ -29,6 +30,8 @@ vip(object, ...) ) \method{vip}{model_fit}(object, ...) + +\method{vip}{workflow}(object, ...) } \arguments{ \item{object}{A fitted model object (e.g., a \code{"randomForest"} object) or From 342d028fed6808f251b8c7369a123dff86bd9644 Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Tue, 21 Apr 2020 18:31:23 -0300 Subject: [PATCH 05/12] tinytests for workflows --- inst/tinytest/test_pkg_workflows.R | 59 ++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 inst/tinytest/test_pkg_workflows.R diff --git a/inst/tinytest/test_pkg_workflows.R b/inst/tinytest/test_pkg_workflows.R new file mode 100644 index 00000000..94108b61 --- /dev/null +++ b/inst/tinytest/test_pkg_workflows.R @@ -0,0 +1,59 @@ +# Exits +if (!requireNamespace("workflows", quietly = TRUE)) { + exit_file("Package workflows missing") +} + +# Load required packages +suppressMessages({ + library(workflows) +}) + +# Generate Friedman benchmark data +friedman1 <- gen_friedman(seed = 101) + +# Fit a linear model +lin <- parsnip::linear_reg() %>% + parsnip::set_engine("lm") + +wf <- workflows::workflow() %>% + workflows::add_model(lin) %>% + workflows::add_formula(y ~ .) + +lin_fit <- wf %>% + parsnip::fit(data = friedman1) + +# Compute model-based VI scores +vis <- vi(lin_fit, scale = TRUE) + +# Expect `vi()` and `vi_model()` to both work +expect_identical( + current = vi(lin_fit, sort = FALSE), + target = vi_model(lin_fit) +) + +# Check class +expect_identical(class(vis), target = c("vi", "tbl_df", "tbl", "data.frame")) + +# Check dimensions (should be one row for each feature) +expect_identical(ncol(friedman1) - 1L, target = nrow(vis)) + +# Display VIP +vip(vis, geom = "point") + +# Try permutation importance +set.seed(953) # for reproducibility +p <- vip( + object = lin_fit, + method = "permute", + train = friedman1, + target = "y", + pred_wrapper = predict, + metric = "rmse", + nsim = 30, + geom = "violin", + jitter = TRUE, + all_permutation = TRUE, + mapping = ggplot2::aes(color = Variable) +) +expect_true(inherits(p, what = "ggplot")) +p # display VIP From a6156c78d5ac7fe50545995dc5599e029029706d Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Wed, 22 Apr 2020 18:47:08 -0300 Subject: [PATCH 06/12] catboost_info in gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 083bae16..fa5d6213 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ inst/doc .DS_Store logs derby.log +catboost_info From 57c37cce906d4f1885f9d853d9d0ea1a1711fe25 Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Wed, 22 Apr 2020 18:47:26 -0300 Subject: [PATCH 07/12] catboost in suggests --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0866f8c3..05ca0eb5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -77,5 +77,6 @@ Suggests: sparklyr (>= 0.8.0), tinytest, varImp, - xgboost + xgboost, + catboost RoxygenNote: 7.1.0 From 6e9a5daa0c768b198ef7dbf7027fc8605fd1c823 Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Wed, 22 Apr 2020 22:34:04 -0300 Subject: [PATCH 08/12] get_feature_names for catboost.Model --- NAMESPACE | 1 + R/get_feature_names.R | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index ff3e8a3d..2dbd4b6c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ S3method(vi_model,H2ORegressionModel) S3method(vi_model,Learner) S3method(vi_model,RandomForest) S3method(vi_model,WrappedModel) +S3method(vi_model,catboost.Model) S3method(vi_model,cforest) S3method(vi_model,constparty) S3method(vi_model,cubist) diff --git a/R/get_feature_names.R b/R/get_feature_names.R index b1bb21cf..25f96865 100644 --- a/R/get_feature_names.R +++ b/R/get_feature_names.R @@ -282,3 +282,14 @@ get_feature_names.xgb.Booster <- function(object, ...) { object$feature_names } } + +# Package: catboost ------------------------------------------------------------- + +#' @keywords internal +get_feature_names.catboost.Model <- function(object, ...) { + if (is.null(rownames(fit$feature_importances))) { + get_feature_names.default(object) + } else { + rownames(fit$feature_importances) + } +} From 8ffd975d1721886bc8aac8944ed51831df69ce1c Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Wed, 22 Apr 2020 22:34:31 -0300 Subject: [PATCH 09/12] vi_model for catboost.Model --- R/vi_model.R | 36 ++++++++++++++++++++++++++++++++++++ man/vi_model.Rd | 8 ++++++++ 2 files changed, 44 insertions(+) diff --git a/R/vi_model.R b/R/vi_model.R index 563d80a6..31614df9 100644 --- a/R/vi_model.R +++ b/R/vi_model.R @@ -1343,3 +1343,39 @@ vi_model.xgb.Booster <- function(object, type = c("gain", "cover", "frequency"), tib } + +# Package: catboost ------------------------------------------------------------- + +#' @rdname vi_model +#' +#' @export +vi_model.catboost.Model <- function(object, type = c("FeatureImportance", "PredictionValuesChange", "LossFunctionChange", "Interaction", "ShapValues"), ...) { + + # Determine which type of variable importance to compute + type <- match.arg(type) + + # Construct model-specific variable importance scores + imp <- catboost::catboost.get_feature_importance(model = object, type = type, ...) + var_names <- rownames(object$feature_importances) + + if(type %in% c("LossFunctionChange", "FeatureImportance", "PredictionValuesChange")) { + tib <- tibble::enframe(imp[,1], name = "Variable", value = "Importance") + } else if(type == "Interaction") { + tib <- tibble::as_tibble(imp) + tib <- setNames(tib, c("Variable1", "Variable2", "Importance")) + tib$Variable1 <- var_names[tib$Variable1 + 1] + tib$Variable2 <- var_names[tib$Variable2 + 1] + } else if(type == "ShapValues") { + tib <- tibble::as_tibble(imp) + tib <- setNames(tib, c(var_names, ".pred")) + } + + # Add variable importance type attribute + attr(tib, which = "type") <- type + + # Add "vi" class + class(tib) <- c("vi", class(tib)) + + # Return results + tib +} diff --git a/man/vi_model.Rd b/man/vi_model.Rd index 991e7aec..3bf2101c 100644 --- a/man/vi_model.Rd +++ b/man/vi_model.Rd @@ -37,6 +37,7 @@ \alias{vi_model.ml_model_random_forest_classification} \alias{vi_model.lm} \alias{vi_model.xgb.Booster} +\alias{vi_model.catboost.Model} \title{Model-specific variable importance} \usage{ vi_model(object, ...) @@ -110,6 +111,13 @@ vi_model(object, ...) \method{vi_model}{lm}(object, type = c("stat", "raw"), ...) \method{vi_model}{xgb.Booster}(object, type = c("gain", "cover", "frequency"), ...) + +\method{vi_model}{catboost.Model}( + object, + type = c("FeatureImportance", "PredictionValuesChange", "LossFunctionChange", + "Interaction", "ShapValues"), + ... +) } \arguments{ \item{object}{A fitted model object (e.g., a \code{"randomForest"} object).} From de885c8526331dd0e73c22da7a09100d6e60ca8f Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Wed, 22 Apr 2020 22:50:07 -0300 Subject: [PATCH 10/12] vi_model document update with catboost info --- R/vi_model.R | 6 +++++- man/vi_model.Rd | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/R/vi_model.R b/R/vi_model.R index 31614df9..e5edc824 100644 --- a/R/vi_model.R +++ b/R/vi_model.R @@ -238,6 +238,10 @@ #' #' }} #' +#' \item{\code{\link[catboost]{catboost}}}{See \code{\link[catboost]{catboost.get_feature_importance}} or visit +#' \url{https://catboost.ai/docs/concepts/r-reference_catboost-get_feature_importance.html} +#' for details.} +#' #' } #' #' @note Inspired by the \code{\link[caret]{varImp}} function. @@ -1356,7 +1360,7 @@ vi_model.catboost.Model <- function(object, type = c("FeatureImportance", "Predi # Construct model-specific variable importance scores imp <- catboost::catboost.get_feature_importance(model = object, type = type, ...) - var_names <- rownames(object$feature_importances) + var_names <- get_feature_names.catboost.Model(object) if(type %in% c("LossFunctionChange", "FeatureImportance", "PredictionValuesChange")) { tib <- tibble::enframe(imp[,1], name = "Variable", value = "Importance") diff --git a/man/vi_model.Rd b/man/vi_model.Rd index 3bf2101c..d56a8065 100644 --- a/man/vi_model.Rd +++ b/man/vi_model.Rd @@ -132,7 +132,9 @@ argument applies to.} A tidy data frame (i.e., a \code{"tibble"} object) with two columns: \code{Variable} and \code{Importance}. For \code{"lm"/"glm"}-like object, an additional column, called \code{Sign}, is also included which includes the -sign (i.e., POS/NEG) of the original coefficient. +sign (i.e., POS/NEG) of the original coefficient. For \code{"catboost.Model"} +object, type \code{Interaction} returns two Variable columns ("Variable1", +"Variable2"). } \description{ Compute model-specific variable importance scores for the predictors in a @@ -359,6 +361,10 @@ obtain three different types of variable importance: }} +\item{\code{\link[catboost]{catboost}}}{See \code{\link[catboost]{catboost.get_feature_importance}} or visit +\url{https://catboost.ai/docs/concepts/r-reference_catboost-get_feature_importance.html} +for details.} + } } \note{ From 50c3f7a71b26c0d82936febe0ed11e8b83cdab53 Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Wed, 22 Apr 2020 23:23:52 -0300 Subject: [PATCH 11/12] shap removed from catboost vi_model --- R/vi_model.R | 5 +---- R/vi_shap.R | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/R/vi_model.R b/R/vi_model.R index e5edc824..4dc865b3 100644 --- a/R/vi_model.R +++ b/R/vi_model.R @@ -1353,7 +1353,7 @@ vi_model.xgb.Booster <- function(object, type = c("gain", "cover", "frequency"), #' @rdname vi_model #' #' @export -vi_model.catboost.Model <- function(object, type = c("FeatureImportance", "PredictionValuesChange", "LossFunctionChange", "Interaction", "ShapValues"), ...) { +vi_model.catboost.Model <- function(object, type = c("FeatureImportance", "PredictionValuesChange", "LossFunctionChange", "Interaction"), ...) { # Determine which type of variable importance to compute type <- match.arg(type) @@ -1369,9 +1369,6 @@ vi_model.catboost.Model <- function(object, type = c("FeatureImportance", "Predi tib <- setNames(tib, c("Variable1", "Variable2", "Importance")) tib$Variable1 <- var_names[tib$Variable1 + 1] tib$Variable2 <- var_names[tib$Variable2 + 1] - } else if(type == "ShapValues") { - tib <- tibble::as_tibble(imp) - tib <- setNames(tib, c(var_names, ".pred")) } # Add variable importance type attribute diff --git a/R/vi_shap.R b/R/vi_shap.R index 48562bdb..18d5b160 100644 --- a/R/vi_shap.R +++ b/R/vi_shap.R @@ -85,3 +85,19 @@ vi_shap.default <- function(object, feature_names = NULL, train = NULL, ...) { tib } + +#' @rdname vi_shap +#' +#' @export +vi_shap.catboost.Model <- function(object, feature_names = NULL, train = NULL, ...) { + # Try to extract feature names if not supplied + if (is.null(feature_names)) { + feature_names <- get_feature_names(object) + } + + # catboost do not give access to the training data directly from the model object. + if (is.null(train)) { + stop("Please provide a `catboost.Pool` object to the train argument. See `catboost::catboost.load_pool()`.") + } + +} From 0c8f0a2b143128394106db6cbe0015e3c574751a Mon Sep 17 00:00:00 2001 From: Athos Damiani Date: Wed, 22 Apr 2020 23:27:25 -0300 Subject: [PATCH 12/12] catboost tests --- inst/tinytest/test_pkg_catboost.R | 60 +++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 inst/tinytest/test_pkg_catboost.R diff --git a/inst/tinytest/test_pkg_catboost.R b/inst/tinytest/test_pkg_catboost.R new file mode 100644 index 00000000..cf455aad --- /dev/null +++ b/inst/tinytest/test_pkg_catboost.R @@ -0,0 +1,60 @@ +# Exits +if (!requireNamespace("catboost", quietly = TRUE)) { + exit_file("Package catboost missing") +} + +# # Load required packages +# suppressMessages({ +# library(catboost) +# }) + +# Generate Friedman benchmark data +friedman1 <- gen_friedman(seed = 101) + +# Fit model(s) +set.seed(101) +fit <- catboost::catboost.train( + learn_pool = catboost::catboost.load_pool(friedman1[,-1], friedman1[,1, drop = TRUE]), + params = list(logging_level = "Silent", iterations = 10) +) + +# Compute VI scores +vis_FeatureImportance_default <- vi_model(fit) +vis_FeatureImportance <- vi_model(fit, type = "FeatureImportance") +vis_PredictionValuesChange <- vi_model(fit, type = "PredictionValuesChange") +vis_LossFunctionChange <- vi_model(fit, type = "LossFunctionChange", pool = catboost::catboost.load_pool(friedman1[,-1], friedman1[,1, drop = TRUE])) +vis_Interaction <- vi_model(fit, type = "Interaction") + +# Expectations for `vi_model()` +expect_identical() +expect_identical() +expect_identical() + +# Expectations for `get_training_data()` +expect_error(vip:::get_training_data.default(fit)) + +# Expectations for `get_feature_names()` +expect_identical( + current = vip:::get_feature_names.catboost.Model(fit), + target = paste0("x", 1L:10L) +) + +# Call `vip::vip()` directly +p <- vip(fit, method = "model", include_type = TRUE) + +# Expect `p` to be a `"gg" "ggplot"` object +expect_identical( + current = class(p), + target = c("gg", "ggplot") +) + +# Display VIPs side by side +grid.arrange( + vip(vis_FeatureImportance_default, include_type = TRUE), + vip(vis_FeatureImportance, include_type = TRUE), + vip(vis_PredictionValuesChange, include_type = TRUE), + vip(vis_LossFunctionChange, include_type = TRUE), + # vip(vis_Interaction, include_type = TRUE), + p, + nrow = 1 +)