# jhofman · July 31, 2018 13:37 · jhofman · Jul 31, 2018
# diff --git a/filter_by_group_id.R b/filter_by_group_id.R
 library(tidyverse)
 library(digest)

 # create a dummy dataframe with 10,000 groups and 1,000,000 rows
 # where group ids are md5 hash of integers from 1 to 10,000
 # (the seed is fixed so the sampled rows are reproducible between runs)
 set.seed(42)
 # digest() hashes a single object, so wrap it to hash element by element
 md5 <- Vectorize(function(x) digest(x, algo = "md5"))
 # spell out TRUE: `T` is an ordinary variable that can be reassigned
 df <- data.frame(group_id = sample(md5(1:1e4), 1e6, replace = TRUE),
                  val = sample(1:100, 1e6, replace = TRUE))

 # group observations by group_id, creating an index on group_id in the background
 df <- df %>%
   group_by(group_id)

 ########################################
 # bad: filter by group id, the naive way
 ########################################

 # this is slow for two reasons
 #   the first is that it's a linear scan over all rows
 #   and the second is that there's overhead created by the grouping

 # time a plain dplyr filter on the grouped data frame; df1 is the baseline result
 system.time({
   df1 <- filter(df, group_id == "4b5630ee914e848e8d07221556b0a2fb")
 })
 #   user  system elapsed 
 #  1.416   0.485   1.957 
                 
 ########################################
 # better: filter by group id, the smart way
 ########################################

 # this is faster than the above because it uses the group index created by dplyr
 # as a result it's linear in the total number of groups + the length of the requested group

 # create a function that uses the group indices to filter more efficiently
 # Filter a grouped data frame by a condition on its group keys.
 #
 # The condition is evaluated against the one-row-per-group key table rather
 # than against every row of `df`; the matching groups' rows are then pulled
 # out using the row indices dplyr stores for each group.
 #
 # Arguments:
 #   df              a data frame grouped with group_by()
 #   filter_formula  an unquoted condition on the grouping columns; a `rowid`
 #                   column holding each group's position is also available
 # Returns:
 #   the rows of `df` belonging to the matching groups, or an empty
 #   data.frame() when `df` is not a grouped data frame
 filter_groups <- function(df, filter_formula) {
   # make sure we're given a grouped data frame
   # (checked first; this does not force the lazily-evaluated filter_formula)
   if (!inherits(df, "grouped_df")) {
     return(data.frame())
   }

   # quosure magic for tidy evaluation
   filter_formula <- enquo(filter_formula)

   if (!is.null(attr(df, "indices", exact = TRUE))) {
     # legacy layout (dplyr < 0.8): group keys live in the "labels" attribute
     # and the per-group row positions in "indices"
     labels <- attr(df, "labels") %>%
       rowid_to_column() %>%
       filter(!!filter_formula)

     # find the indices of all rows in the matching groups
     ndx <- unlist(attr(df, "indices")[labels$rowid])

     # return the rows for these groups, adjusting for 0-based indexing
     return(df[ndx + 1, ])
   }

   # current layout (dplyr >= 0.8): the "labels"/"indices" attributes no longer
   # exist; group_data() returns the keys plus a `.rows` list-column of row
   # positions, which are already 1-based
   matched <- group_data(df) %>%
     rowid_to_column() %>%
     filter(!!filter_formula)

   # as.integer() turns the NULL produced by "no matching group" into
   # integer(0), so an empty match yields a zero-row result
   ndx <- as.integer(unlist(matched[[".rows"]], use.names = FALSE))

   df[ndx, ]
 }

 # time the index-based lookup for the same group id
 system.time({
   df2 <- df %>%
     filter_groups(group_id == "4b5630ee914e848e8d07221556b0a2fb")
 })
 #   user  system elapsed 
 #  0.002   0.000   0.001 

 # sanity check: the fast path must return exactly what the naive filter did
 all(df1 == df2)

 ########################################
 # much cleaner, slightly slower: create a nested data frame, then filter
 ########################################

 # h/t to @hadleywickham for this solution

 # one-off cost: collapse each group's rows into a list-column named `data`
 system.time( df_nested <- df %>% nest() )
 #   user  system elapsed 
 #  0.607   0.017   0.630 

 # filtering now touches one row per group; unnest() expands the match again
 # (the column is named explicitly because calling unnest() with no columns
 #  is deprecated in tidyr >= 1.0; a positional `data` works on old and new)
 system.time(
   df3 <- df_nested %>%
     filter(group_id == "4b5630ee914e848e8d07221556b0a2fb") %>%
     unnest(data)
 )
 #   user  system elapsed 
 #  0.005   0.000   0.005 

 # check that the nested approach returns the same rows as the naive filter
 all(df1 == df3)
	library(tidyverse)
	library(digest)

	# create a dummy dataframe with 10,000 groups and 1,000,000 rows
	# where group ids are md5 hash of integers from 1 to 10,000
	# (the seed is fixed so the sampled rows are reproducible between runs)
	set.seed(42)
	# digest() hashes a single object, so wrap it to hash element by element
	md5 <- Vectorize(function(x) digest(x, algo = "md5"))
	# spell out TRUE: `T` is an ordinary variable that can be reassigned
	df <- data.frame(group_id = sample(md5(1:1e4), 1e6, replace = TRUE),
	                 val = sample(1:100, 1e6, replace = TRUE))

	# group observations by group_id, creating an index on group_id in the background
	df <- df %>%
	  group_by(group_id)

	########################################
	# bad: filter by group id, the naive way
	########################################

	# this is slow for two reasons
	# the first is that it's a linear scan over all rows
	# and the second is that there's overhead created by the grouping

	# time a plain dplyr filter on the grouped data frame; df1 is the baseline result
	system.time({
	  df1 <- filter(df, group_id == "4b5630ee914e848e8d07221556b0a2fb")
	})
	# user system elapsed
	# 1.416 0.485 1.957

	########################################
	# better: filter by group id, the smart way
	########################################

	# this is faster than the above because it uses the group index created by dplyr
	# as a result it's linear in the total number of groups + the length of the requested group

	# create a function that uses the group indices to filter more efficiently
	# Filter a grouped data frame by a condition on its group keys.
	#
	# The condition is evaluated against the one-row-per-group key table rather
	# than against every row of `df`; the matching groups' rows are then pulled
	# out using the row indices dplyr stores for each group.
	#
	# Arguments:
	#   df              a data frame grouped with group_by()
	#   filter_formula  an unquoted condition on the grouping columns; a `rowid`
	#                   column holding each group's position is also available
	# Returns:
	#   the rows of `df` belonging to the matching groups, or an empty
	#   data.frame() when `df` is not a grouped data frame
	filter_groups <- function(df, filter_formula) {
	  # make sure we're given a grouped data frame
	  # (checked first; this does not force the lazily-evaluated filter_formula)
	  if (!inherits(df, "grouped_df")) {
	    return(data.frame())
	  }

	  # quosure magic for tidy evaluation
	  filter_formula <- enquo(filter_formula)

	  if (!is.null(attr(df, "indices", exact = TRUE))) {
	    # legacy layout (dplyr < 0.8): group keys live in the "labels" attribute
	    # and the per-group row positions in "indices"
	    labels <- attr(df, "labels") %>%
	      rowid_to_column() %>%
	      filter(!!filter_formula)

	    # find the indices of all rows in the matching groups
	    ndx <- unlist(attr(df, "indices")[labels$rowid])

	    # return the rows for these groups, adjusting for 0-based indexing
	    return(df[ndx + 1, ])
	  }

	  # current layout (dplyr >= 0.8): the "labels"/"indices" attributes no longer
	  # exist; group_data() returns the keys plus a `.rows` list-column of row
	  # positions, which are already 1-based
	  matched <- group_data(df) %>%
	    rowid_to_column() %>%
	    filter(!!filter_formula)

	  # as.integer() turns the NULL produced by "no matching group" into
	  # integer(0), so an empty match yields a zero-row result
	  ndx <- as.integer(unlist(matched[[".rows"]], use.names = FALSE))

	  df[ndx, ]
	}

	# time the index-based lookup for the same group id
	system.time({
	  df2 <- df %>%
	    filter_groups(group_id == "4b5630ee914e848e8d07221556b0a2fb")
	})
	# user system elapsed
	# 0.002 0.000 0.001

	# sanity check: the fast path must return exactly what the naive filter did
	all(df1 == df2)

	########################################
	# much cleaner, slightly slower: create a nested data frame, then filter
	########################################

	# h/t to @hadleywickham for this solution

	# one-off cost: collapse each group's rows into a list-column named `data`
	system.time( df_nested <- df %>% nest() )
	# user system elapsed
	# 0.607 0.017 0.630

	# filtering now touches one row per group; unnest() expands the match again
	# (the column is named explicitly because calling unnest() with no columns
	#  is deprecated in tidyr >= 1.0; a positional `data` works on old and new)
	system.time(
	  df3 <- df_nested %>%
	    filter(group_id == "4b5630ee914e848e8d07221556b0a2fb") %>%
	    unnest(data)
	)
	# user system elapsed
	# 0.005 0.000 0.005

	# check that the nested approach returns the same rows as the naive filter
	all(df1 == df3)