Chapter 5 Static branching
5.1 Why static branching?
Static branching helps us write large plans compactly. Instead of typing out every single target by hand, we use a special shorthand to declare entire batches of similar targets. To practice static branching in a controlled setting, try the interactive exercises at https://wlandau.shinyapps.io/learndrakeplans (from the workshop at https://github.com/wlandau/learndrake).
Without static branching, plans like this one become too cumbersome to type by hand.
# Without static branching:
# every analysis, summary, and combined target is typed out by hand.
drake_plan(
  data = get_data(),
  # One analysis target per (model function, mean, tuning) combination.
  analysis_fast_1_main = main(data, mean = 1, tuning = "fast"),
  analysis_slow_1_main = main(data, mean = 1, tuning = "slow"),
  analysis_fast_2_main = main(data, mean = 2, tuning = "fast"),
  analysis_slow_2_main = main(data, mean = 2, tuning = "slow"),
  analysis_fast_3_main = main(data, mean = 3, tuning = "fast"),
  analysis_slow_3_main = main(data, mean = 3, tuning = "slow"),
  analysis_fast_4_main = main(data, mean = 4, tuning = "fast"),
  analysis_slow_4_main = main(data, mean = 4, tuning = "slow"),
  analysis_fast_1_altv = altv(data, mean = 1, tuning = "fast"),
  analysis_slow_1_altv = altv(data, mean = 1, tuning = "slow"),
  analysis_fast_2_altv = altv(data, mean = 2, tuning = "fast"),
  analysis_slow_2_altv = altv(data, mean = 2, tuning = "slow"),
  analysis_fast_3_altv = altv(data, mean = 3, tuning = "fast"),
  analysis_slow_3_altv = altv(data, mean = 3, tuning = "slow"),
  analysis_fast_4_altv = altv(data, mean = 4, tuning = "fast"),
  analysis_slow_4_altv = altv(data, mean = 4, tuning = "slow"),
  # One summary target per analysis target.
  summary_analysis_fast_1_main = summarize_model(analysis_fast_1_main),
  summary_analysis_slow_1_main = summarize_model(analysis_slow_1_main),
  summary_analysis_fast_2_main = summarize_model(analysis_fast_2_main),
  summary_analysis_slow_2_main = summarize_model(analysis_slow_2_main),
  summary_analysis_fast_3_main = summarize_model(analysis_fast_3_main),
  summary_analysis_slow_3_main = summarize_model(analysis_slow_3_main),
  summary_analysis_fast_4_main = summarize_model(analysis_fast_4_main),
  summary_analysis_slow_4_main = summarize_model(analysis_slow_4_main),
  summary_analysis_fast_1_altv = summarize_model(analysis_fast_1_altv),
  summary_analysis_slow_1_altv = summarize_model(analysis_slow_1_altv),
  summary_analysis_fast_2_altv = summarize_model(analysis_fast_2_altv),
  summary_analysis_slow_2_altv = summarize_model(analysis_slow_2_altv),
  summary_analysis_fast_3_altv = summarize_model(analysis_fast_3_altv),
  summary_analysis_slow_3_altv = summarize_model(analysis_slow_3_altv),
  summary_analysis_fast_4_altv = summarize_model(analysis_fast_4_altv),
  summary_analysis_slow_4_altv = summarize_model(analysis_slow_4_altv),
  # One combined summary per model function.
  model_summary_altv = dplyr::bind_rows(
    summary_analysis_fast_1_altv,
    summary_analysis_slow_1_altv,
    summary_analysis_fast_2_altv,
    summary_analysis_slow_2_altv,
    summary_analysis_fast_3_altv,
    summary_analysis_slow_3_altv,
    summary_analysis_fast_4_altv,
    summary_analysis_slow_4_altv
  ),
  model_summary_main = dplyr::bind_rows(
    summary_analysis_fast_1_main,
    summary_analysis_slow_1_main,
    summary_analysis_fast_2_main,
    summary_analysis_slow_2_main,
    summary_analysis_fast_3_main,
    summary_analysis_slow_3_main,
    summary_analysis_fast_4_main,
    summary_analysis_slow_4_main
  )
)
Static branching makes it easier to write and understand plans. To activate static branching, use the `transform` argument of `target()`.
# With static branching:
model_functions <- rlang::syms(c("main", "altv")) # We need symbols.

model_functions # List of symbols.
#> [[1]]
#> main
#>
#> [[2]]
#> altv

plan <- drake_plan(
  data = get_data(),
  analysis = target(
    model_function(data, mean = mean_value, tuning = tuning_setting),
    # Define an analysis target for each combination of
    # tuning_setting, mean_value, and model_function.
    transform = cross(
      tuning_setting = c("fast", "slow"),
      mean_value = !!(1:4), # Why `!!`? See "Tidy Evaluation" below.
      model_function = !!model_functions # Why `!!`? See "Tidy Evaluation" below.
    )
  ),
  # Define a new summary target for each analysis target defined previously.
  summary = target(
    summarize_model(analysis),
    transform = map(analysis)
  ),
  # Group together the summary targets by the corresponding value
  # of model_function.
  model_summary = target(
    dplyr::bind_rows(summary),
    transform = combine(summary, .by = model_function)
  )
)

plan
#> # A tibble: 35 x 2
#> target command
#> <chr> <expr_lst>
#> 1 analysis_fast_1L_main main(data, mean = 1L, tuning = "fast")
#> 2 analysis_slow_1L_main main(data, mean = 1L, tuning = "slow")
#> 3 analysis_fast_2L_main main(data, mean = 2L, tuning = "fast")
#> 4 analysis_slow_2L_main main(data, mean = 2L, tuning = "slow")
#> 5 analysis_fast_3L_main main(data, mean = 3L, tuning = "fast")
#> 6 analysis_slow_3L_main main(data, mean = 3L, tuning = "slow")
#> 7 analysis_fast_4L_main main(data, mean = 4L, tuning = "fast")
#> 8 analysis_slow_4L_main main(data, mean = 4L, tuning = "slow")
#> 9 analysis_fast_1L_altv altv(data, mean = 1L, tuning = "fast")
#> 10 analysis_slow_1L_altv altv(data, mean = 1L, tuning = "slow")
#> # … with 25 more rows
Always check the graph to make sure the plan makes sense.
# Draw the dependency graph of the plan to check that it makes sense.
plot(plan) # a quick and dirty alternative to vis_drake_graph()
If the graph is too complicated to look at or too slow to load, downsize the plan with `max_expand`. Then, when you are done debugging and testing, remove `max_expand` to scale back up to the full plan.
model_functions <- rlang::syms(c("main", "altv"))

plan <- drake_plan(
  # Temporarily downsize the plan for debugging; remove to get the full plan.
  max_expand = 2,
  data = get_data(),
  analysis = target(
    model_function(data, mean = mean_value, tuning = tuning_setting),
    transform = cross(
      tuning_setting = c("fast", "slow"),
      mean_value = !!(1:4), # Why `!!`? See "Tidy Evaluation" below.
      model_function = !!model_functions # Why `!!`? See "Tidy Evaluation" below.
    )
  ),
  summary = target(
    summarize_model(analysis),
    transform = map(analysis)
  ),
  model_summary = target(
    dplyr::bind_rows(summary),
    transform = combine(summary, .by = model_function) # defined in "analysis"
  )
)

# Click and drag the nodes in the graph to improve the view.
plot(plan)
5.2 Grouping variables
A grouping variable contains iterated values for a single instance of `map()` or `cross()`. `mean_value` and `tuning_par` are grouping variables below. Notice how they are defined inside `cross()`. Grouping variables are not targets, and they must be declared inside static transformations.
drake_plan(
  data = get_data(),
  model = target(
    fit_model(data, mean_value, tuning_par),
    # mean_value and tuning_par are grouping variables declared inside cross().
    transform = cross(
      mean_value = c(1, 2),
      tuning_par = c("fast", "slow")
    )
  )
)
#> # A tibble: 5 x 2
#> target command
#> <chr> <expr_lst>
#> 1 data get_data()
#> 2 model_1_fast fit_model(data, 1, "fast")
#> 3 model_2_fast fit_model(data, 2, "fast")
#> 4 model_1_slow fit_model(data, 1, "slow")
#> 5 model_2_slow fit_model(data, 2, "slow")
Each model has its own `mean_value` and `tuning_par`. To see this correspondence, set `trace = TRUE`.
drake_plan(
  # Add columns showing which grouping-variable values produced each target.
  trace = TRUE,
  data = get_data(),
  model = target(
    fit_model(data, mean_value, tuning_par),
    transform = cross(
      mean_value = c(1, 2),
      tuning_par = c("fast", "slow")
    )
  )
)
#> # A tibble: 5 x 5
#> target command mean_value tuning_par model
#> <chr> <expr_lst> <chr> <chr> <chr>
#> 1 data get_data() <NA> <NA> <NA>
#> 2 model_1_fast fit_model(data, 1, "fast") 1 "\"fast\"" model_1_fast
#> 3 model_2_fast fit_model(data, 2, "fast") 2 "\"fast\"" model_2_fast
#> 4 model_1_slow fit_model(data, 1, "slow") 1 "\"slow\"" model_1_slow
#> 5 model_2_slow fit_model(data, 2, "slow") 2 "\"slow\"" model_2_slow
If we summarize those models, each summary has its own `mean_value` and `tuning_par`. In other words, grouping variables have a natural nesting, and they propagate forward so we can use them in downstream targets. Notice how `mean_value` and `tuning_par` appear in `summarize_model()` and `combine()` below.
plan <- drake_plan(
  trace = TRUE,
  data = get_data(),
  model = target(
    fit_model(data, mean_value, tuning_par),
    transform = cross(
      mean_value = c(1, 2),
      tuning_par = c("fast", "slow")
    )
  ),
  summary = target(
    # mean_value and tuning_par are old grouping variables from the models
    summarize_model(model, mean_value, tuning_par),
    transform = map(model)
  ),
  summary_by_tuning = target(
    dplyr::bind_rows(summary),
    # tuning_par is an old grouping variable from the models.
    transform = combine(summary, .by = tuning_par)
  )
)

plot(plan)
5.2.1 Limitations of grouping variables
Each grouping variable should be defined only once. In the plan below, there are multiple conflicting definitions of `a1`, `a2`, and `a3` in the dependencies of `c1`, so `drake` does not know which definitions to use.
# Deliberately invalid: a1, a2, and a3 are each defined in more than one
# of b1, b2, and b3, so the map() for c1 cannot be resolved.
drake_plan(
  b1 = target(1, transform = map(a1 = 1, a2 = 1, .id = FALSE)),
  b2 = target(1, transform = map(a1 = 1, a3 = 1, .id = FALSE)),
  b3 = target(1, transform = map(a2 = 1, a3 = 1, .id = FALSE)),
  c1 = target(1, transform = map(a1, a2, a3, .id = FALSE)),
  trace = TRUE
)
#> Warning in min(vapply(out, length, FUN.VALUE = integer(1))): no non-missing
#> arguments to min; returning Inf
#> Error: A grouping variable for target c1 is either undefined or improperly invoked. Details: https://books.ropensci.org/drake/static.html#grouping-variables
To avoid this error, define each grouping variable only once. Other workarounds include `bind_plans()` (on separate sub-plans) and dynamic branching. Always check your plans before you run them (with `vis_drake_graph()`, etc.).
5.3 Tidy evaluation
In earlier plans, we used the “bang-bang” operator `!!` from tidy evaluation, e.g. `model_function = !!model_functions` in `cross()`. But why? Why not just type `model_function = model_functions`? Consider the following incorrect plan.
model_functions <- rlang::syms(c("main", "altv"))

# Intentionally incorrect: the arguments to cross() are not unquoted with `!!`.
plan <- drake_plan(
  data = get_data(),
  analysis = target(
    model_function(data, mean = mean_value, tuning = tuning_setting),
    transform = cross(
      tuning_setting = c("fast", "slow"),
      mean_value = 1:4, # without !!
      model_function = model_functions # without !!
    )
  )
)

drake_plan_source(plan)
#> drake_plan(
#> analysis_fast_1_model_functions = model_functions(data, mean = 1, tuning = "fast"),
#> analysis_slow_1_model_functions = model_functions(data, mean = 1, tuning = "slow"),
#> analysis_fast_4_model_functions = model_functions(data, mean = 4, tuning = "fast"),
#> analysis_slow_4_model_functions = model_functions(data, mean = 4, tuning = "slow"),
#> data = get_data()
#> )
Because we omit `!!`, we create two problems:
- The commands use `model_functions()` instead of the desired `main()` and `altv()`.
- We are missing the targets with `mean = 2` and `mean = 3`.
Why? To make static branching work properly, `drake` does not actually evaluate the arguments to `cross()`. It just uses the raw symbols and expressions. To force `drake` to use the values instead, we need `!!`.
model_functions <- rlang::syms(c("main", "altv"))

# Correct: `!!` makes drake use the values of 1:4 and model_functions
# instead of the raw expressions.
plan <- drake_plan(
  data = get_data(),
  analysis = target(
    model_function(data, mean = mean_value, tuning = tuning_setting),
    transform = cross(
      tuning_setting = c("fast", "slow"),
      mean_value = !!(1:4), # with !!
      model_function = !!model_functions # with !!
    )
  )
)

drake_plan_source(plan)
#> drake_plan(
#> analysis_fast_1L_main = main(data, mean = 1L, tuning = "fast"),
#> analysis_slow_1L_main = main(data, mean = 1L, tuning = "slow"),
#> analysis_fast_2L_main = main(data, mean = 2L, tuning = "fast"),
#> analysis_slow_2L_main = main(data, mean = 2L, tuning = "slow"),
#> analysis_fast_3L_main = main(data, mean = 3L, tuning = "fast"),
#> analysis_slow_3L_main = main(data, mean = 3L, tuning = "slow"),
#> analysis_fast_4L_main = main(data, mean = 4L, tuning = "fast"),
#> analysis_slow_4L_main = main(data, mean = 4L, tuning = "slow"),
#> analysis_fast_1L_altv = altv(data, mean = 1L, tuning = "fast"),
#> analysis_slow_1L_altv = altv(data, mean = 1L, tuning = "slow"),
#> analysis_fast_2L_altv = altv(data, mean = 2L, tuning = "fast"),
#> analysis_slow_2L_altv = altv(data, mean = 2L, tuning = "slow"),
#> analysis_fast_3L_altv = altv(data, mean = 3L, tuning = "fast"),
#> analysis_slow_3L_altv = altv(data, mean = 3L, tuning = "slow"),
#> analysis_fast_4L_altv = altv(data, mean = 4L, tuning = "fast"),
#> analysis_slow_4L_altv = altv(data, mean = 4L, tuning = "slow"),
#> data = get_data()
#> )
5.4 Static transformations
There are four transformations in static branching: `map()`, `cross()`, `split()`, and `combine()`. They are not actual functions, just special language to supply to the `transform` argument of `target()` in `drake_plan()`. Each transformation is similar to a function from the Tidyverse.
| `drake` | Tidyverse analogue |
|---|---|
| `map()` | `pmap()` from `purrr` |
| `cross()` | `crossing()` from `tidyr` |
| `split()` | `group_map()` from `dplyr` |
| `combine()` | `summarize()` from `dplyr` |
5.4.1 map()
`map()` creates a new target for each row in a grid.
drake_plan(
  x = target(
    simulate_data(center, scale),
    # center and scale are paired element by element, one target per pair.
    transform = map(center = c(2, 1, 0), scale = c(3, 2, 1))
  )
)
#> # A tibble: 3 x 2
#> target command
#> <chr> <expr_lst>
#> 1 x_2_3 simulate_data(2, 3)
#> 2 x_1_2 simulate_data(1, 2)
#> 3 x_0_1 simulate_data(0, 1)
You can supply the grid directly with the `.data` argument. Note the use of `!!` below. (See the tidy evaluation section.)
my_grid <- tibble(
  sim_function = c("rnorm", "rt", "rcauchy"),
  title = c("Normal", "Student t", "Cauchy")
)
# Convert the function names to symbols so they appear unquoted in commands.
my_grid$sim_function <- rlang::syms(my_grid$sim_function)

drake_plan(
  x = target(
    simulate_data(sim_function, title, center, scale),
    transform = map(
      center = c(2, 1, 0),
      scale = c(3, 2, 1),
      .data = !!my_grid,
      # In `.id`, you can select one or more grouping variables
      # for pretty target names.
      # Set to FALSE to use short numeric suffixes.
      .id = sim_function # Try `.id = c(sim_function, center)` yourself.
    )
  )
)
#> # A tibble: 3 x 2
#> target command
#> <chr> <expr_lst>
#> 1 x_rnorm simulate_data(rnorm, "Normal", 2, 3)
#> 2 x_rt simulate_data(rt, "Student t", 1, 2)
#> 3 x_rcauchy simulate_data(rcauchy, "Cauchy", 0, 1)
5.4.2 cross()
`cross()` creates a new target for each combination of argument values.
drake_plan(
  x = target(
    simulate_data(nrow, ncol),
    # 3 values of nrow times 2 values of ncol gives 6 targets.
    transform = cross(nrow = c(1, 2, 3), ncol = c(4, 5))
  )
)
#> # A tibble: 6 x 2
#> target command
#> <chr> <expr_lst>
#> 1 x_1_4 simulate_data(1, 4)
#> 2 x_2_4 simulate_data(2, 4)
#> 3 x_3_4 simulate_data(3, 4)
#> 4 x_1_5 simulate_data(1, 5)
#> 5 x_2_5 simulate_data(2, 5)
#> 6 x_3_5 simulate_data(3, 5)
5.4.3 split()
The `split()` transformation distributes a dataset as uniformly as possible across multiple targets.
plan <- drake_plan(
  large_data = get_data(),
  slice_analysis = target(
    large_data %>%
      analyze(),
    # Create one slice_analysis target for each of 4 slices of large_data.
    transform = split(large_data, slices = 4)
  ),
  results = target(
    dplyr::bind_rows(slice_analysis),
    transform = combine(slice_analysis)
  )
)

plan
#> # A tibble: 6 x 2
#> target command
#> <chr> <expr_lst>
#> 1 large_data get_data() …
#> 2 results dplyr::bind_rows(slice_analysis_1, slice_analysis_2, slice_ana…
#> 3 slice_analysi… drake_slice(data = large_data, slices = 4, index = 1) %>% anal…
#> 4 slice_analysi… drake_slice(data = large_data, slices = 4, index = 2) %>% anal…
#> 5 slice_analysi… drake_slice(data = large_data, slices = 4, index = 3) %>% anal…
#> 6 slice_analysi… drake_slice(data = large_data, slices = 4, index = 4) %>% anal…

plot(plan)
At runtime, `drake_slice()` takes a single subset of the data. It supports data frames, matrices, and arbitrary arrays.
# Request slice 1 of 32 from mtcars; the printed result is a single row.
drake_slice(mtcars, slices = 32, index = 1)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4
# Request slice 2 of 32; the printed result is the next row.
drake_slice(mtcars, slices = 32, index = 2)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
5.4.4 combine()
`combine()` aggregates targets. The closest comparison is the unquote-splice operator `!!!` from tidy evaluation.
plan <- drake_plan(
  data_group1 = target(
    sim_data(mean = x, sd = y),
    transform = map(x = c(1, 2), y = c(3, 4))
  ),
  data_group2 = target(
    pull_data(url),
    transform = map(url = c("example1.com", "example2.com"))
  ),
  larger = target(
    # data_group1 and data_group2 are each replaced by all of their targets.
    bind_rows(data_group1, data_group2, .id = "id") %>%
      arrange(sd) %>%
      head(n = 400),
    transform = combine(data_group1, data_group2)
  )
)

drake_plan_source(plan)
#> drake_plan(
#> data_group1_1_3 = sim_data(mean = 1, sd = 3),
#> data_group1_2_4 = sim_data(mean = 2, sd = 4),
#> data_group2_example1.com = pull_data("example1.com"),
#> data_group2_example2.com = pull_data("example2.com"),
#> larger = bind_rows(data_group1_1_3, data_group1_2_4, data_group2_example1.com,
#> data_group2_example2.com,
#> .id = "id"
#> ) %>%
#> arrange(sd) %>%
#> head(n = 400)
#> )
To create multiple combined groups, use the `.by` argument.
plan <- drake_plan(
  data = target(
    sim_data(mean = x, sd = y, skew = z),
    transform = cross(x = c(1, 2), y = c(3, 4), z = c(5, 6))
  ),
  combined = target(
    bind_rows(data, .id = "id") %>%
      arrange(sd) %>%
      head(n = 400),
    # One combined target for each distinct (x, y) pair.
    transform = combine(data, .by = c(x, y))
  )
)

drake_plan_source(plan)
#> drake_plan(
#> combined_1_3 = bind_rows(data_1_3_5, data_1_3_6, .id = "id") %>%
#> arrange(sd) %>%
#> head(n = 400),
#> combined_2_3 = bind_rows(data_2_3_5, data_2_3_6, .id = "id") %>%
#> arrange(sd) %>%
#> head(n = 400),
#> combined_1_4 = bind_rows(data_1_4_5, data_1_4_6, .id = "id") %>%
#> arrange(sd) %>%
#> head(n = 400),
#> combined_2_4 = bind_rows(data_2_4_5, data_2_4_6, .id = "id") %>%
#> arrange(sd) %>%
#> head(n = 400),
#> data_1_3_5 = sim_data(mean = 1, sd = 3, skew = 5),
#> data_2_3_5 = sim_data(mean = 2, sd = 3, skew = 5),
#> data_1_4_5 = sim_data(mean = 1, sd = 4, skew = 5),
#> data_2_4_5 = sim_data(mean = 2, sd = 4, skew = 5),
#> data_1_3_6 = sim_data(mean = 1, sd = 3, skew = 6),
#> data_2_3_6 = sim_data(mean = 2, sd = 3, skew = 6),
#> data_1_4_6 = sim_data(mean = 1, sd = 4, skew = 6),
#> data_2_4_6 = sim_data(mean = 2, sd = 4, skew = 6)
#> )
5.5 Target names
`drake` releases after 7.12.0 let you define your own custom names with the optional `.names` argument of transformations.
analysis_names <- c("experimental", "thorough", "minimal", "naive")

plan <- drake_plan(
  dataset = target(
    get_dataset(data_index),
    transform = map(data_index = !!seq_len(2), .names = c("new", "old"))
  ),
  analysis = target(
    apply_method(method_name, dataset),
    transform = cross(
      method_name = c("method1", "method2"),
      dataset,
      # One name per generated target: 2 methods x 2 datasets = 4 names.
      .names = !!analysis_names
    )
  ),
  summary = target(
    summarize(analysis),
    transform = combine(analysis, .by = dataset, .names = c("table1", "table2"))
  )
)

plan
#> # A tibble: 8 x 2
#> target command
#> <chr> <expr_lst>
#> 1 experimental apply_method("method1", new)
#> 2 thorough apply_method("method2", new)
#> 3 minimal apply_method("method1", old)
#> 4 naive apply_method("method2", old)
#> 5 new get_dataset(1L)
#> 6 old get_dataset(2L)
#> 7 table1 summarize(experimental, thorough)
#> 8 table2 summarize(minimal, naive)

plot(plan)
The disadvantage of `.names` is that you need to know in advance the number of targets a transformation will generate. As an alternative, all transformations have an optional `.id` argument to control the names of targets. Use it to select the grouping variables that go into the names, as well as the order they appear in the suffixes.
drake_plan(
  data = target(
    get_data(param1, param2),
    transform = map(
      param1 = c(123, 456),
      param2 = c(7, 9),
      # Third grouping variable, given its own name so that param2
      # is defined only once.
      param3 = c("abc", "xyz"),
      # Only param2 goes into the target names.
      .id = param2
    )
  )
)
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr_lst>
#> 1 data_7 get_data(123, 7)
#> 2 data_9 get_data(456, 9)
drake_plan(
  data = target(
    get_data(param1, param2),
    transform = map(
      param1 = c(123, 456),
      param2 = c(7, 9),
      # Third grouping variable, given its own name so that param2
      # is defined only once.
      param3 = c("abc", "xyz"),
      # Suffix order follows `.id`: param2 first, then param1.
      .id = c(param2, param1)
    )
  )
)
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr_lst>
#> 1 data_7_123 get_data(123, 7)
#> 2 data_9_456 get_data(456, 9)
drake_plan(
  data = target(
    get_data(param1, param2),
    transform = map(
      param1 = c(123, 456),
      param2 = c(7, 9),
      # Third grouping variable, given its own name so that param2
      # is defined only once.
      param3 = c("abc", "xyz"),
      # Suffix order follows `.id`: param1 first, then param2.
      .id = c(param1, param2)
    )
  )
)
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr_lst>
#> 1 data_123_7 get_data(123, 7)
#> 2 data_456_9 get_data(456, 9)
Set `.id` to `FALSE` to ignore the grouping variables altogether.
drake_plan(
  data = target(
    get_data(param1, param2),
    transform = map(
      param1 = c(123, 456),
      param2 = c(7, 9),
      # Third grouping variable, given its own name so that param2
      # is defined only once.
      param3 = c("abc", "xyz"),
      # No grouping variables in the target names.
      .id = FALSE
    )
  )
)
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr_lst>
#> 1 data get_data(123, 7)
#> 2 data_2 get_data(456, 9)
Finally, `drake` supports a special `.id_chr` symbol in commands to let you refer to the name of the current target as a character string.
# Return the unevaluated expression supplied as `x`, deparsed to character.
as_chr <- function(x) {
  deparse(substitute(x))
}

plan <- drake_plan(
  data = target(
    get_data(param),
    transform = map(param = c(123, 456))
  ),
  keras_model = target(
    # .id_chr stands for the name of the current target as a string.
    save_model_hdf5(fit_model(data), file_out(!!sprintf("%s.h5", .id_chr))),
    transform = map(data, .id = param)
  ),
  result = target(
    # as_chr(keras_model) gives the name of the upstream keras_model target.
    predict(load_model_hdf5(file_in(!!sprintf("%s.h5", as_chr(keras_model))))),
    transform = map(keras_model, .id = param)
  )
)

plan
#> # A tibble: 6 x 2
#> target command
#> <chr> <expr_lst>
#> 1 data_123 get_data(123) …
#> 2 data_456 get_data(456) …
#> 3 keras_model_123 save_model_hdf5(fit_model(data_123), file_out("keras_model_12…
#> 4 keras_model_456 save_model_hdf5(fit_model(data_456), file_out("keras_model_45…
#> 5 result_123 predict(load_model_hdf5(file_in("keras_model_123.h5"))) …
#> 6 result_456 predict(load_model_hdf5(file_in("keras_model_456.h5"))) …

drake_plan_source(plan)
#> drake_plan(
#> data_123 = get_data(123),
#> data_456 = get_data(456),
#> keras_model_123 = save_model_hdf5(fit_model(data_123), file_out("keras_model_123.h5")),
#> keras_model_456 = save_model_hdf5(fit_model(data_456), file_out("keras_model_456.h5")),
#> result_123 = predict(load_model_hdf5(file_in("keras_model_123.h5"))),
#> result_456 = predict(load_model_hdf5(file_in("keras_model_456.h5")))
#> )