ClaytonJY · May 23, 2023 15:41 · ClaytonJY · Mar 23, 2018
diff --git a/tidy-parallel-timing.R b/tidy-parallel-timing.R
 library(dplyr)
 library(purrr)

 # helper to make examples
 new_tbl <- function(n_groups, row_range = 5:10) {

  1:n_groups %>%
    map_df(~tibble(
      group_id = .x,
      value    = rnorm(sample.int(row_range, 1L))
    ))
 }

 # take the row with biggest value in some column
 get_max_slicer <- function(col, time) {
  
  col <- enquo(col)
  
  function(tbl) {

    Sys.sleep(time)
    
    tbl %>%
      arrange(desc(!!col)) %>%
      slice(1L)
  }
 }

 #### functions ####

 # sequential
 do_sequential <- function(tbl, func) {
  
  tbl %>%
    group_by(group_id) %>%
    do(func(.))
 }

 # naive-but-easy parallel
 library(future)
 plan(multiprocess)

 do_small_groups <- function(tbl, func) {
  
  tbl %>%
    split(.$group_id) %>%
    map(~future(
      func(.x)
    )) %>%
    values() %>%
    bind_rows()
 }

 # smarter, probably
 do_big_groups <- function(tbl, func, cores = availableCores()) {
  
  tbl %>%
    split(.$group_id %% cores) %>%  # relies on group_id being contiguous ints
    map(~future(
      do_sequential(.x, func)
    )) %>%
    values() %>%
    bind_rows()
 }

 #### validate ####

 tbl    <- new_tbl(10L)
 slicer <- get_max_slicer(value, 0.1)

 funcs <- lst(
  do_sequential,
  do_small_groups,
  do_big_groups
 )

 results <- map(funcs, ~.x(tbl, slicer))

 # compare each result to the one before
 map2_lgl(
  results[-1], results[-length(results)],
  all_equal
 ) %>%
  all() %>%
  stopifnot()


 #### benchmarking ####

 library(microbenchmark)
 library(tidyr)
 library(ggplot2)

 timer <- function(groups, time, functions, repeats = 3L) {
  
  tbl <- new_tbl(groups)
  f   <- get_max_slicer(value, time)
  
  functions %>%
    map_dfr(
      ~microbenchmark(.x(tbl, f), times = repeats, unit = "s") %>% summary(),
      .id = "func"
    ) %>%
    select(-expr)
 }

 results <- list(
  iter_time = c(0.01, 0.1, 1),
  n_groups  = c(10L, 20L, 40L)
 ) %>%
  cross_df() %>%
  mutate(timings = map2(n_groups, iter_time, timer, funcs)) %>%
  unnest(timings)

 plot_times <- function(tbl) {
  
  tbl %>%
    ggplot(aes(x = n_groups, y = mean, color = func)) +
    facet_wrap(~iter_time) +
    scale_x_continuous(breaks = unique(tbl$n_groups)) +  # must be a simpler way?
    geom_line() + geom_point()
 }

 results %>%
  mutate(func = factor(func, names(funcs))) %>%
  plot_times()

 # more time, parallel options only

 p_results <- list(
  iter_time = c(0.05, 1, 2),
  n_groups  = c(10L, 100L, 1000L)
 ) %>%
  cross_df() %>%
  mutate(timings = map2(n_groups, iter_time, timer, funcs[-1])) %>%
  unnest(timings)

 p_results %>%
  mutate(func = factor(func, names(funcs))) %>%
  plot_times()
	library(dplyr)
	library(purrr)

	# helper to make examples
	new_tbl <- function(n_groups, row_range = 5:10) {

	1:n_groups %>%
	map_df(~tibble(
	group_id = .x,
	value = rnorm(sample.int(row_range, 1L))
	))
	}

	# take the row with biggest value in some column
	get_max_slicer <- function(col, time) {

	col <- enquo(col)

	function(tbl) {

	Sys.sleep(time)

	tbl %>%
	arrange(desc(!!col)) %>%
	slice(1L)
	}
	}

	#### functions ####

	# sequential
	do_sequential <- function(tbl, func) {

	tbl %>%
	group_by(group_id) %>%
	do(func(.))
	}

	# naive-but-easy parallel
	library(future)
	plan(multiprocess)

	do_small_groups <- function(tbl, func) {

	tbl %>%
	split(.$group_id) %>%
	map(~future(
	func(.x)
	)) %>%
	values() %>%
	bind_rows()
	}

	# smarter, probably
	do_big_groups <- function(tbl, func, cores = availableCores()) {

	tbl %>%
	split(.$group_id %% cores) %>% # relies on group_id being contiguous ints
	map(~future(
	do_sequential(.x, func)
	)) %>%
	values() %>%
	bind_rows()
	}

	#### validate ####

	tbl <- new_tbl(10L)
	slicer <- get_max_slicer(value, 0.1)

	funcs <- lst(
	do_sequential,
	do_small_groups,
	do_big_groups
	)

	results <- map(funcs, ~.x(tbl, slicer))

	# compare each result to the one before
	map2_lgl(
	results[-1], results[-length(results)],
	all_equal
	) %>%
	all() %>%
	stopifnot()


	#### benchmarking ####

	library(microbenchmark)
	library(tidyr)
	library(ggplot2)

	timer <- function(groups, time, functions, repeats = 3L) {

	tbl <- new_tbl(groups)
	f <- get_max_slicer(value, time)

	functions %>%
	map_dfr(
	~microbenchmark(.x(tbl, f), times = repeats, unit = "s") %>% summary(),
	.id = "func"
	) %>%
	select(-expr)
	}

	results <- list(
	iter_time = c(0.01, 0.1, 1),
	n_groups = c(10L, 20L, 40L)
	) %>%
	cross_df() %>%
	mutate(timings = map2(n_groups, iter_time, timer, funcs)) %>%
	unnest(timings)

	plot_times <- function(tbl) {

	tbl %>%
	ggplot(aes(x = n_groups, y = mean, color = func)) +
	facet_wrap(~iter_time) +
	scale_x_continuous(breaks = unique(tbl$n_groups)) + # must be a simpler way?
	geom_line() + geom_point()
	}

	results %>%
	mutate(func = factor(func, names(funcs))) %>%
	plot_times()

	# more time, parallel options only

	p_results <- list(
	iter_time = c(0.05, 1, 2),
	n_groups = c(10L, 100L, 1000L)
	) %>%
	cross_df() %>%
	mutate(timings = map2(n_groups, iter_time, timer, funcs[-1])) %>%
	unnest(timings)

	p_results %>%
	mutate(func = factor(func, names(funcs))) %>%
	plot_times()