Chapter 9 Customer churn and deep learning
drake is designed for workflows with long runtimes, and a major use case is deep learning. This chapter demonstrates how to use drake to manage a deep learning workflow. The original example comes from a blog post by Matt Dancho, and the chapter's content itself comes directly from this R notebook, part of an RStudio Solutions Engineering example demonstrating TensorFlow in R. The notebook is modified and redistributed under the terms of the Apache 2.0 license, copyright RStudio (details here).
9.1 Churn packages
First, we load our packages into a fresh R session.
library(drake)
library(keras)
library(tidyverse)
library(rsample)
library(recipes)
library(yardstick)
9.2 Churn functions
drake is R-focused and function-oriented. We create functions to preprocess the data,
prepare_recipe <- function(data) {
  data %>%
    training() %>%
    recipe(Churn ~ .) %>%
    step_rm(customerID) %>%                               # Drop the ID column.
    step_naomit(all_outcomes(), all_predictors()) %>%     # Drop rows with missing values.
    step_discretize(tenure, options = list(cuts = 6)) %>% # Bin tenure into 6 groups.
    step_log(TotalCharges) %>%
    step_mutate(Churn = ifelse(Churn == "Yes", 1, 0)) %>%
    step_dummy(all_nominal(), -all_outcomes()) %>%        # One-hot encode categorical predictors.
    step_center(all_predictors(), -all_outcomes()) %>%
    step_scale(all_predictors(), -all_outcomes()) %>%
    prep()
}
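To sanity-check the recipe interactively, we can run it outside drake. Here is a hypothetical sketch, assuming customer_churn.csv is in the working directory:

# Hypothetical interactive check, not part of the plan.
data <- read_csv("customer_churn.csv", col_types = cols()) %>%
  initial_split(prop = 0.3)
rec <- prepare_recipe(data)
juice(rec) # The preprocessed training data: all predictors numeric.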
define a keras model, exposing arguments to set the dimensionality and activation functions of the layers,
define_model <- function(rec, units1, units2, act1, act2, act3) {
  input_shape <- ncol(
    juice(rec, all_predictors(), composition = "matrix")
  )
  keras_model_sequential() %>%
    layer_dense(
      units = units1,
      kernel_initializer = "uniform",
      activation = act1,
      input_shape = input_shape
    ) %>%
    layer_dropout(rate = 0.1) %>%
    layer_dense(
      units = units2,
      kernel_initializer = "uniform",
      activation = act2
    ) %>%
    layer_dropout(rate = 0.1) %>%
    layer_dense(
      units = 1,
      kernel_initializer = "uniform",
      activation = act3
    )
}
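With a prepped recipe in hand, we can inspect the architecture before any training happens. A minimal sketch, reusing the rec object from the check above:

model <- define_model(
  rec = rec,
  units1 = 16,
  units2 = 16,
  act1 = "relu",
  act2 = "relu",
  act3 = "sigmoid"
)
summary(model) # Layer shapes and parameter counts.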
train a model,
train_model <- function(
  rec,
  units1 = 16,
  units2 = 16,
  act1 = "relu",
  act2 = "relu",
  act3 = "sigmoid"
) {
  model <- define_model(
    rec = rec,
    units1 = units1,
    units2 = units2,
    act1 = act1,
    act2 = act2,
    act3 = act3
  )
  compile(
    model,
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = c("accuracy")
  )
  x_train_tbl <- juice(
    rec,
    all_predictors(),
    composition = "matrix"
  )
  y_train_vec <- juice(rec, all_outcomes()) %>%
    pull()
  fit(
    object = model,
    x = x_train_tbl,
    y = y_train_vec,
    batch_size = 32,
    epochs = 32,
    validation_split = 0.3,
    verbose = 0
  )
  model
}
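Note that keras models have reference semantics: compile() and fit() modify model in place, which is why train_model() calls them without reassignment and simply returns model at the end. A hypothetical standalone run, reusing the rec from the earlier sketch:

model <- train_model(rec, act1 = "relu") # Trains for 32 epochs, silently (verbose = 0).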
compare predictions against reality,
confusion_matrix <- function(data, rec, model) {
  testing_data <- bake(rec, testing(data))
  x_test_tbl <- testing_data %>%
    select(-Churn) %>%
    as.matrix()
  y_test_vec <- testing_data %>%
    select(Churn) %>%
    pull()
  # predict_classes() and predict_proba() come from older keras releases;
  # newer keras drops them in favor of predict() plus thresholding.
  yhat_keras_class_vec <- model %>%
    predict_classes(x_test_tbl) %>%
    as.factor() %>%
    fct_recode(yes = "1", no = "0")
  yhat_keras_prob_vec <- model %>%
    predict_proba(x_test_tbl) %>%
    as.vector()
  test_truth <- y_test_vec %>%
    as.factor() %>%
    fct_recode(yes = "1", no = "0")
  estimates_keras_tbl <- tibble(
    truth = test_truth,
    estimate = yhat_keras_class_vec,
    class_prob = yhat_keras_prob_vec
  )
  estimates_keras_tbl %>%
    conf_mat(truth, estimate)
}
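The return value is a yardstick conf_mat object, and summary() turns it into a tibble with .metric, .estimator, and .estimate columns. The comparison function below leans on exactly this. A small sketch, assuming conf holds one of these confusion matrices:

summary(conf) %>%
  filter(.metric %in% c("accuracy", "sens", "spec")) # Accuracy, sensitivity, specificity.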
and compare the performance of multiple models.
compare_models <- function(...) {
  name <- match.call()[-1] %>%
    as.character() # Capture the names of the conf_* arguments for the legend.
  df <- map_df(list(...), summary) %>%
    filter(.metric %in% c("accuracy", "sens", "spec")) %>%
    mutate(name = rep(name, each = n() / length(name))) %>%
    rename(metric = .metric, estimate = .estimate)
  ggplot(df) +
    geom_line(aes(x = metric, y = estimate, color = name, group = name)) +
    theme_gray(24)
}
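Because compare_models() captures argument names with match.call(), the plot legend shows the literal names of the targets you pass in. A hypothetical direct call:

compare_models(conf_relu, conf_sigmoid) # Legend entries: conf_relu, conf_sigmoid.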
9.3 Churn plan
Next, we define our workflow in a drake plan. We will prepare the data, train models with different activation functions, and compare the models in terms of performance.
<- c("relu", "sigmoid")
activations
<- drake_plan(
plan data = read_csv(file_in("customer_churn.csv"), col_types = cols()) %>%
initial_split(prop = 0.3),
rec = prepare_recipe(data),
model = target(
train_model(rec, act1 = act),
format = "keras", # Supported in drake > 7.5.2 to store models properly.
transform = map(act = !!activations)
),conf = target(
confusion_matrix(data, rec, model),
transform = map(model, .id = act)
),metrics = target(
compare_models(conf),
transform = combine(conf)
) )
The plan is a data frame with one row for each step of the workflow.
plan
#> # A tibble: 7 x 3
#> target command format
#> <chr> <expr_lst> <chr>
#> 1 conf_relu confusion_matrix(data, rec, model_relu) … <NA>
#> 2 conf_sigmoid confusion_matrix(data, rec, model_sigmoid) … <NA>
#> 3 data read_csv(file_in("customer_churn.csv"), col_types = cols(… <NA>
#> 4 metrics compare_models(conf_relu, conf_sigmoid) … <NA>
#> 5 model_relu train_model(rec, act1 = "relu") … keras
#> 6 model_sigmo… train_model(rec, act1 = "sigmoid") … keras
#> 7 rec prepare_recipe(data) … <NA>
9.4 Churn dependency graph
The graph visualizes the dependency relationships among the steps of the workflow.
vis_drake_graph(plan)
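vis_drake_graph() renders an interactive visNetwork widget. For a static image, say in a report, drake also has a ggplot2-based view; a minimal sketch, assuming the ggraph package is installed:

drake_ggraph(plan) # Static ggplot2 rendering of the same dependency graph.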
9.5 Run the Keras models
Call make() to actually run the workflow.
make(plan)
#> ▶ target data
#> ▶ target rec
#> ▶ target model_relu
#> ▶ target model_sigmoid
#> ▶ target conf_relu
#> ▶ target conf_sigmoid
#> ▶ target metrics
9.6 Inspect the Keras results
The two models performed about the same.
readd(metrics) # see also loadd()
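readd() returns a single target's value, and loadd() assigns one or more targets into the calling environment. Both read from drake's cache (the hidden .drake/ folder), so they work even in a fresh R session. For example:

loadd(metrics) # Assigns `metrics` into the environment.
metrics        # The ggplot comparing accuracy, sens, and spec across models.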
9.7 Add Keras models
Let’s try the softmax activation function.
<- c("relu", "sigmoid", "softmax")
activations
<- drake_plan(
plan data = read_csv(file_in("customer_churn.csv"), col_types = cols()) %>%
initial_split(prop = 0.3),
rec = prepare_recipe(data),
model = target(
train_model(rec, act1 = act),
format = "keras", # Supported in drake > 7.5.2 to store models properly.
transform = map(act = !!activations)
),conf = target(
confusion_matrix(data, rec, model),
transform = map(model, .id = act)
),metrics = target(
compare_models(conf),
transform = combine(conf)
) )
vis_drake_graph(plan) # see also outdated() and predict_runtime()
make() skips the relu and sigmoid models because they are already up to date. (Their dependencies did not change.) Only the softmax model needs to run.
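We can confirm this before running anything: outdated() lists the targets whose dependencies have changed. A quick sketch:

outdated(plan) # Should list only model_softmax, conf_softmax, and metrics.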
make(plan)
#> ▶ target model_softmax
#> ▶ target conf_softmax
#> ▶ target metrics
9.8 Inspect the Churn results again
readd(metrics) # see also loadd()
9.9 Update the Churn code
If you change upstream functions, even nested ones, drake automatically refits the affected models. Let's increase dropout in both layers.
define_model <- function(rec, units1, units2, act1, act2, act3) {
  input_shape <- ncol(
    juice(rec, all_predictors(), composition = "matrix")
  )
  keras_model_sequential() %>%
    layer_dense(
      units = units1,
      kernel_initializer = "uniform",
      activation = act1,
      input_shape = input_shape
    ) %>%
    layer_dropout(rate = 0.15) %>% # Changed from 0.1 to 0.15.
    layer_dense(
      units = units2,
      kernel_initializer = "uniform",
      activation = act2
    ) %>%
    layer_dropout(rate = 0.15) %>% # Changed from 0.1 to 0.15.
    layer_dense(
      units = 1,
      kernel_initializer = "uniform",
      activation = act3
    )
}
All the models and downstream results are affected.
make(plan)
#> ▶ target model_relu
#> ▶ target model_sigmoid
#> ▶ target model_softmax
#> ▶ target conf_relu
#> ▶ target conf_sigmoid
#> ▶ target conf_softmax
#> ▶ target metrics
9.10 Churn history and provenance
drake tracks history and provenance. You can see which models you ran, when you ran them, how long they took, and which settings you tried (i.e. named arguments to function calls in your commands).
history <- drake_history()

history
#> # A tibble: 17 x 10
#> target current built exists hash command seed runtime prop act1
#> <chr> <lgl> <chr> <lgl> <chr> <chr> <int> <dbl> <dbl> <chr>
#> 1 conf_r… FALSE 2021-02… TRUE 625c… "confusion_… 4.05e8 0.455 NA <NA>
#> 2 conf_r… TRUE 2021-02… TRUE 0e43… "confusion_… 4.05e8 0.385 NA <NA>
#> 3 conf_s… FALSE 2021-02… TRUE 6d39… "confusion_… 1.93e9 0.39 NA <NA>
#> 4 conf_s… TRUE 2021-02… TRUE edd4… "confusion_… 1.93e9 0.344 NA <NA>
#> 5 conf_s… FALSE 2021-02… TRUE 6813… "confusion_… 1.80e9 0.370 NA <NA>
#> 6 conf_s… TRUE 2021-02… TRUE 5079… "confusion_… 1.80e9 0.351 NA <NA>
#> 7 data TRUE 2021-02… TRUE 62d3… "read_csv(f… 1.29e9 0.085 0.3 <NA>
#> 8 metrics FALSE 2021-02… TRUE 00c6… "compare_mo… 1.21e9 0.053 NA <NA>
#> 9 metrics FALSE 2021-02… TRUE d3e7… "compare_mo… 1.21e9 0.075 NA <NA>
#> 10 metrics TRUE 2021-02… TRUE 3a8e… "compare_mo… 1.21e9 0.061 NA <NA>
#> 11 model_… FALSE 2021-02… TRUE a52a… "train_mode… 1.47e9 10.2 NA relu
#> 12 model_… TRUE 2021-02… TRUE 056b… "train_mode… 1.47e9 4.99 NA relu
#> 13 model_… FALSE 2021-02… TRUE 8b5b… "train_mode… 1.26e9 4.93 NA sigm…
#> 14 model_… TRUE 2021-02… TRUE 36e7… "train_mode… 1.26e9 5.02 NA sigm…
#> 15 model_… FALSE 2021-02… TRUE 220e… "train_mode… 8.05e8 5.00 NA soft…
#> 16 model_… TRUE 2021-02… TRUE 4e0c… "train_mode… 8.05e8 5.04 NA soft…
#> 17 rec TRUE 2021-02… TRUE 71f6… "prepare_re… 6.29e8 0.298 NA <NA>
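Since the history is an ordinary tibble, dplyr verbs apply directly. For instance, to compare fitting times across the model runs:

history %>%
  filter(grepl("model_", target)) %>%
  select(target, built, runtime, act1)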
And as long as you did not run clean(garbage_collection = TRUE), you can get the old data back. Let's find the oldest run of the relu model.
hash <- history %>%
  filter(act1 == "relu") %>%
  pull(hash) %>%
  head(n = 1)
drake_cache()$get_value(hash)
#> Model
#> Model: "sequential"
#> ________________________________________________________________________________
#> Layer (type) Output Shape Param #
#> ================================================================================
#> dense_2 (Dense) (None, 16) 576
#> ________________________________________________________________________________
#> dropout_1 (Dropout) (None, 16) 0
#> ________________________________________________________________________________
#> dense_1 (Dense) (None, 16) 272
#> ________________________________________________________________________________
#> dropout (Dropout) (None, 16) 0
#> ________________________________________________________________________________
#> dense (Dense) (None, 1) 17
#> ================================================================================
#> Total params: 865
#> Trainable params: 865
#> Non-trainable params: 0
#> ________________________________________________________________________________