This document captures current best practices for R development, emphasizing modern tidyverse patterns, performance, and style. Last updated: August 2025
- Use modern tidyverse patterns - Prioritize dplyr 1.1+ features, native pipe, and current APIs
- Profile before optimizing - Use profvis and bench to identify real bottlenecks
- Write readable code first - Optimize only when necessary and after profiling
- Follow tidyverse style guide - Consistent naming, spacing, and structure
- Always use the native pipe `|>` instead of magrittr's `%>%` - R 4.3+ provides all needed features
# Good - Modern native pipe
data |>
filter(year >= 2020) |>
summarise(mean_value = mean(value))
# Avoid - Legacy magrittr pipe
data %>%
filter(year >= 2020) %>%
summarise(mean_value = mean(value))
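To see why R 4.3+ covers the old magrittr use cases, here is a minimal sketch of the `_` placeholder (the toy calls are illustrative):
# Placeholder must appear as a named argument (R 4.2+)
mtcars |> lm(mpg ~ cyl, data = _)
# Extraction with $ or [[ on the placeholder works from R 4.3
mtcars |> _$cyl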
- Use `join_by()` instead of character vectors for joins - Support for inequality, rolling, and overlap joins
# Good - Modern join syntax
transactions |>
inner_join(companies, by = join_by(company == id))
# Good - Inequality joins
transactions |>
inner_join(companies, join_by(company == id, year >= since))
# Good - Rolling joins (closest match)
transactions |>
inner_join(companies, join_by(company == id, closest(year >= since)))
# Avoid - Old character vector syntax
transactions |>
inner_join(companies, by = c("company" = "id"))
- Use the `multiple` and `unmatched` arguments for quality control
# Expect 1:1 matches, error on multiple
inner_join(x, y, by = join_by(id), multiple = "error")
# Allow multiple matches explicitly
inner_join(x, y, by = join_by(id), multiple = "all")
# Ensure all rows match
inner_join(x, y, by = join_by(id), unmatched = "error")
- Understand the difference between data masking and tidy selection
- Use `{{ }}` (embrace) for function arguments
- Use `.data[[ ]]` for character vectors
# Data masking functions: arrange(), filter(), mutate(), summarise()
# Tidy selection functions: select(), relocate(), across()
# Function arguments - embrace with {{}}
my_summary <- function(data, group_var, summary_var) {
data |>
group_by({{ group_var }}) |>
summarise(mean_val = mean({{ summary_var }}))
}
# Character vectors - use .data[[]]
for (var in names(mtcars)) {
mtcars |> count(.data[[var]]) |> print()
}
# Multiple columns - use across()
data |>
summarise(across({{ summary_vars }}, ~ mean(.x, na.rm = TRUE)))
- Use `.by` for per-operation grouping (dplyr 1.1+)
- Use `pick()` for column selection inside data-masking functions
- Use `across()` for applying functions to multiple columns
- Use `reframe()` for multi-row summaries
# Good - Per-operation grouping (always returns ungrouped)
data |>
summarise(mean_value = mean(value), .by = category)
# Good - Multiple grouping variables
data |>
summarise(total = sum(revenue), .by = c(company, year))
# Good - pick() for column selection
data |>
summarise(
n_x_cols = ncol(pick(starts_with("x"))),
n_y_cols = ncol(pick(starts_with("y")))
)
# Good - across() for applying functions
data |>
summarise(across(where(is.numeric), mean, .names = "mean_{.col}"), .by = group)
# Good - reframe() for multi-row results
data |>
reframe(quantiles = quantile(x, c(0.25, 0.5, 0.75)), .by = group)
# Avoid - Old persistent grouping pattern
data |>
group_by(category) |>
summarise(mean_value = mean(value)) |>
ungroup()
Data-masking allows R expressions to refer to data frame columns as if they were variables in the environment. rlang provides the metaprogramming framework that powers tidyverse data-masking.
- Embracing `{{ }}` - Forward function arguments to data-masking functions
- Injection `!!` - Inject single expressions or values
- Splicing `!!!` - Inject multiple arguments from a list
- Dynamic dots - Programmable `...` with injection support
- Pronouns `.data` / `.env` - Explicit disambiguation between data and environment variables
Use `{{ }}` to forward function arguments to data-masking functions:
# Single argument forwarding
my_summarise <- function(data, var) {
data |> dplyr::summarise(mean = mean({{ var }}))
}
# Works with any data-masking expression
mtcars |> my_summarise(cyl)
mtcars |> my_summarise(cyl * am)
mtcars |> my_summarise(.data$cyl) # pronoun syntax supported
# Simple dots forwarding
my_group_by <- function(.data, ...) {
.data |> dplyr::group_by(...)
}
# Works with tidy selections too
my_select <- function(.data, ...) {
.data |> dplyr::select(...)
}
# For single-argument tidy selections, wrap in c()
my_pivot_longer <- function(.data, ...) {
.data |> tidyr::pivot_longer(c(...))
}
Use the `.data` pronoun for programmatic column access:
# Single column by name
my_mean <- function(data, var) {
data |> dplyr::summarise(mean = mean(.data[[var]]))
}
# Usage - completely insulated from data-masking
mtcars |> my_mean("cyl") # No ambiguity, works like regular function
# Multiple columns with all_of()
my_select_vars <- function(data, vars) {
data |> dplyr::select(all_of(vars))
}
mtcars |> my_select_vars(c("cyl", "am"))

| Operator | Use Case | Example |
|---|---|---|
| `{{ }}` | Forward function arguments | `summarise(mean = mean({{ var }}))` |
| `!!` | Inject single expression/value | `summarise(mean = mean(!!sym(var)))` |
| `!!!` | Inject multiple arguments | `group_by(!!!syms(vars))` |
| `.data[[ ]]` | Access columns by name | `mean(.data[[var]])` |
# Create symbols from strings
var <- "cyl"
mtcars |> dplyr::summarise(mean = mean(!!sym(var)))
# Inject values to avoid name collisions
df <- data.frame(x = 1:3)
x <- 100
df |> dplyr::mutate(scaled = x / !!x) # Uses both data and env x
# Use data_sym() for tidyeval contexts (more robust)
mtcars |> dplyr::summarise(mean = mean(!!data_sym(var)))
# Multiple symbols from character vector
vars <- c("cyl", "am")
mtcars |> dplyr::group_by(!!!syms(vars))
# Or use data_syms() for tidy contexts
mtcars |> dplyr::group_by(!!!data_syms(vars))
# Splice lists of arguments
args <- list(na.rm = TRUE, trim = 0.1)
mtcars |> dplyr::summarise(mean = mean(cyl, !!!args))
my_function <- function(...) {
# Collect with list2() instead of list() for dynamic features
dots <- list2(...)
# Process dots...
}
# Enables these features:
my_function(a = 1, b = 2) # Normal usage
my_function(!!!list(a = 1, b = 2)) # Splice a list
my_function("{name}" := value) # Name injection
my_function(a = 1, ) # Trailing commas OK
# Basic name injection
name <- "result"
list2("{name}" := 1) # Creates list(result = 1)
# In function arguments with {{
my_mean <- function(data, var) {
data |> dplyr::summarise("mean_{{ var }}" := mean({{ var }}))
}
mtcars |> my_mean(cyl) # Creates column "mean_cyl"
mtcars |> my_mean(cyl * am) # Creates column "mean_cyl * am"
# Allow custom names with englue()
my_mean <- function(data, var, name = englue("mean_{{ var }}")) {
data |> dplyr::summarise("{name}" := mean({{ var }}))
}
# User can override default
mtcars |> my_mean(cyl, name = "cylinder_mean")
# Explicit disambiguation prevents masking issues
cyl <- 1000 # Environment variable
mtcars |> dplyr::summarise(
data_cyl = mean(.data$cyl), # Data frame column
env_cyl = mean(.env$cyl), # Environment variable
ambiguous = mean(cyl) # Ambiguous; the data column wins when it exists
)
# Use in loops and programmatic contexts
vars <- c("cyl", "am")
for (var in vars) {
result <- mtcars |> dplyr::summarise(mean = mean(.data[[var]]))
print(result)
}
Converting between data-masking and tidy selection behaviors:
# across() as selection-to-data-mask bridge
my_group_by <- function(data, vars) {
data |> dplyr::group_by(across({{ vars }}))
}
# Works with tidy selection
mtcars |> my_group_by(starts_with("c"))
# across(all_of()) as names-to-data-mask bridge
my_group_by <- function(data, vars) {
data |> dplyr::group_by(across(all_of(vars)))
}
mtcars |> my_group_by(c("cyl", "am"))
# Transform single arguments by wrapping
my_mean <- function(data, var) {
data |> dplyr::summarise(mean = mean({{ var }}, na.rm = TRUE))
}
# Transform dots with across()
my_means <- function(data, ...) {
data |> dplyr::summarise(across(c(...), ~ mean(.x, na.rm = TRUE)))
}
# Manual transformation (advanced)
my_means_manual <- function(.data, ...) {
vars <- enquos(..., .named = TRUE)
vars <- purrr::map(vars, ~ expr(mean(!!.x, na.rm = TRUE)))
.data |> dplyr::summarise(!!!vars)
}
# Avoid - String parsing and eval (security risk)
var <- "cyl"
code <- paste("mean(", var, ")")
eval(parse(text = code)) # Dangerous!
# Good - Symbol creation and injection
!!sym(var) # Safe symbol injection
# Avoid - get() in data mask (name collisions)
with(mtcars, mean(get(var))) # Collision-prone
# Good - Explicit injection (via inject(), since !! only works in tidy eval contexts) or .data
rlang::inject(with(mtcars, mean(!!sym(var)))) # Safe
# or
mtcars |> summarise(mean(.data[[var]])) # Even safer
# Don't use {{ }} on non-arguments
my_func <- function(x) {
x <- force(x) # x is now a value, not an argument
quo(mean({{ x }})) # Wrong! Captures value, not expression
}
# Don't mix injection styles unnecessarily
# Pick one approach and stick with it:
# Either: embrace pattern
my_func <- function(data, var) data |> summarise(mean = mean({{ var }}))
# Or: defuse-and-inject pattern
my_func <- function(data, var) {
var <- enquo(var)
data |> summarise(mean = mean(!!var))
}
# In DESCRIPTION:
Imports: rlang
# In NAMESPACE, import specific functions:
importFrom(rlang, enquo, enquos, expr, ":=")
# Or import key functions:
#' @importFrom rlang := enquo enquos
#' @param var <[`data-masked`][dplyr::dplyr_data_masking]> Column to summarize
#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Additional grouping variables
#' @param cols <[`tidy-select`][dplyr::dplyr_tidy_select]> Columns to select
# Test data-masking behavior
test_that("function supports data masking", {
result <- my_function(mtcars, cyl)
expect_equal(names(result), "mean_cyl")
# Test with expressions
result2 <- my_function(mtcars, cyl * 2)
expect_true("mean_cyl * 2" %in% names(result2))
})
# Test injection behavior
test_that("function supports injection", {
var <- "cyl"
result <- my_function(mtcars, !!sym(var))
expect_true(nrow(result) > 0)
})
This modern rlang approach enables clean, safe metaprogramming while maintaining the intuitive data-masking experience users expect from tidyverse functions.
| Tool | Use When | Don't Use When | What It Shows |
|---|---|---|---|
| `profvis` | Complex code, unknown bottlenecks | Simple functions, known issues | Time per line, call stack |
| `bench::mark()` | Comparing alternatives | Single approach | Relative performance, memory |
| `system.time()` | Quick checks | Detailed analysis | Total runtime only |
| `Rprof()` | Base-R-only environments | When profvis is available | Raw profiling data |
# 1. Profile first - find the actual bottlenecks
library(profvis)
profvis({
# Your slow code here
})
# 2. Focus on the slowest parts (80/20 rule)
# Don't optimize until you know where time is spent
# 3. Benchmark alternatives for hot spots
library(bench)
bench::mark(
current = current_approach(data),
vectorized = vectorized_approach(data),
parallel = map(data, in_parallel(func))
)
# 4. Consider tool trade-offs based on bottleneck type
Parallel Processing (in_parallel())
# Helps when:
✓ CPU-intensive computations
✓ Embarrassingly parallel problems
✓ Large datasets with independent operations
✓ I/O bound operations (file reading, API calls)
# Hurts when:
✗ Simple, fast operations (overhead > benefit)
✗ Memory-intensive operations (may cause thrashing)
✗ Operations requiring shared state
✗ Small datasets
# Example decision point:
expensive_func <- function(x) Sys.sleep(0.1) # 100ms per call
fast_func <- function(x) x^2 # microseconds per call
# Good for parallel
map(1:100, in_parallel(expensive_func)) # ~10s -> ~2.5s on 4 cores
# Bad for parallel (overhead > benefit)
map(1:100, in_parallel(fast_func)) # 100μs -> 50ms (500x slower!)
vctrs Backend Tools
# Use vctrs when:
✓ Type safety matters more than raw speed
✓ Building reusable package functions
✓ Complex coercion/combination logic
✓ Consistent behavior across edge cases
# Avoid vctrs when:
✗ One-off scripts where speed matters most
✗ Simple operations where base R is sufficient
✗ Memory is extremely constrained
# Decision point:
simple_combine <- function(x, y) c(x, y) # Fast, simple
robust_combine <- function(x, y) vec_c(x, y) # Safer, slight overhead
# Use simple for hot loops, robust for package APIs
Data Backend Selection
# Use data.table when:
✓ Very large datasets (>1GB)
✓ Complex grouping operations
✓ Reference semantics desired
✓ Maximum performance critical
# Use dplyr when:
✓ Readability and maintainability priority
✓ Complex joins and window functions
✓ Team familiarity with tidyverse
✓ Moderate sized data (<100MB)
# Use base R when:
✓ No dependencies allowed
✓ Simple operations
✓ Teaching/learning contexts
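A minimal sketch contrasting the three backends on the same grouped mean (toy data for illustration; benchmark at realistic sizes before choosing):
library(dplyr)
library(data.table)
df <- data.frame(g = c("a", "a", "b"), x = c(1, 2, 3))
# dplyr - readable, pipe-friendly
df |> summarise(mean_x = mean(x), .by = g)
# data.table - compact syntax, reference semantics, scales to large data
dt <- as.data.table(df)
dt[, .(mean_x = mean(x)), by = g]
# base R - zero dependencies
aggregate(x ~ g, data = df, FUN = mean)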
# 1. Profile realistic data sizes
profvis({
# Use actual data size, not toy examples
real_data |> your_analysis()
})
# 2. Profile multiple runs for stability
bench::mark(
your_function(data),
min_iterations = 10, # Multiple runs
max_iterations = 100
)
# 3. Check memory usage too
bench::mark(
approach1 = method1(data),
approach2 = method2(data),
check = FALSE, # If outputs differ slightly
filter_gc = FALSE # Include GC time
)
# 4. Profile with realistic usage patterns
# Not just isolated function calls
# Don't optimize without measuring
# ✗ "This looks slow" -> immediately rewrite
# ✓ Profile first, optimize bottlenecks
# Don't over-engineer for performance
# ✗ Complex optimizations for 1% gains
# ✓ Focus on algorithmic improvements
# Don't assume - measure
# ✗ "for loops are always slow in R"
# ✓ Benchmark your specific use case
# Don't ignore readability costs
# ✗ Unreadable code for minor speedups
# ✓ Readable code with targeted optimizations
- Consider lower-level tools when speed is critical
- Use vctrs, rlang backends when appropriate
- Profile to identify true bottlenecks
# For packages - consider backend tools
# vctrs for type-stable vector operations
# rlang for metaprogramming
# data.table for large data operations
- Type stability - Predictable output types regardless of input values
- Size stability - Predictable output sizes from input sizes
- Consistent coercion rules - Single set of rules applied everywhere
- Robust class design - Proper S3 vector infrastructure
# Good - vctrs-based vector class
new_percent <- function(x = double()) {
vec_assert(x, double())
new_vctr(x, class = "pkg_percent")
}
# Automatic data frame compatibility, subsetting, etc.
# Good - Guaranteed output type
my_function <- function(x, y) {
  result <- x + y
  # Always returns double, regardless of input values
  vec_cast(result, double())
}
# Avoid - Type depends on data
sapply(x, function(i) if (i > 0) 1L else 1.0)
# Good - Explicit casting with clear rules
vec_cast(x, double()) # Clear intent, predictable behavior
# Good - Common type finding
vec_ptype_common(x, y, z) # Finds richest compatible type
# Avoid - Base R inconsistencies
c(factor("a"), "b") # Unpredictable behavior
# Good - Predictable sizing
vec_c(x, y) # size = vec_size(x) + vec_size(y)
vec_rbind(df1, df2) # size = sum of input sizes
# Avoid - Unpredictable sizing
c(env_object, function_object) # Unpredictable length

| Use Case | Base R | vctrs | When to Choose vctrs |
|---|---|---|---|
| Simple combining | `c()` | `vec_c()` | Need type stability, consistent rules |
| Custom classes | S3 manually | `new_vctr()` | Want data frame compatibility, subsetting |
| Type conversion | `as.*()` | `vec_cast()` | Need explicit, safe casting |
| Finding common type | Not available | `vec_ptype_common()` | Combining heterogeneous inputs |
| Size operations | `length()` | `vec_size()` | Working with non-vector objects |
# Constructor (low-level)
new_percent <- function(x = double()) {
vec_assert(x, double())
new_vctr(x, class = "pkg_percent")
}
# Helper (user-facing)
percent <- function(x = double()) {
x <- vec_cast(x, double())
new_percent(x)
}
# Format method
format.pkg_percent <- function(x, ...) {
paste0(vec_data(x) * 100, "%")
}
# Self-coercion
vec_ptype2.pkg_percent.pkg_percent <- function(x, y, ...) {
new_percent()
}
# With double
vec_ptype2.pkg_percent.double <- function(x, y, ...) double()
vec_ptype2.double.pkg_percent <- function(x, y, ...) double()
# Casting
vec_cast.pkg_percent.double <- function(x, to, ...) {
new_percent(x)
}
vec_cast.double.pkg_percent <- function(x, to, ...) {
vec_data(x)
}
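A short usage sketch of the pkg_percent class, assuming the constructor, format, ptype2, and cast methods above are all registered:
library(vctrs)
p <- percent(c(0.1, 0.25))
format(p) # "10%" "25%"
vec_c(p, 0.5) # falls back to double via the ptype2 rules above
data.frame(pct = p) # behaves as a normal data frame column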
Skip the vctrs overhead for:
- Simple operations - `vec_c(1, 2)` vs `c(1, 2)` for basic atomic vectors
- One-off scripts - Type safety less critical than speed
- Small vectors - Overhead may outweigh benefits
Worth the vctrs overhead for:
- Package functions - Type stability prevents expensive re-computation
- Complex classes - Consistent behavior reduces debugging
- Data frame operations - Robust column type handling
- Repeated operations - Predictable types enable optimization
# DESCRIPTION - Import specific functions
Imports: vctrs
# NAMESPACE - Import what you need
importFrom(vctrs, vec_assert, new_vctr, vec_cast, vec_ptype_common)
# Or if using extensively
import(vctrs)
# Test type stability
test_that("my_function is type stable", {
expect_equal(vec_ptype(my_function(1:3)), vec_ptype(double()))
expect_equal(vec_ptype(my_function(integer())), vec_ptype(double()))
})
# Test coercion
test_that("coercion works", {
expect_equal(vec_ptype_common(new_percent(), 1.0), double())
expect_error(vec_ptype_common(new_percent(), "a"))
})
When vctrs isn't worth it:
- Simple one-off analyses - Base R is sufficient
- No custom classes needed - Standard types work fine
- Performance critical + simple operations - Base R may be faster
- External API constraints - Must return base R types
The key insight: vctrs is most valuable in package development where type safety, consistency, and extensibility matter more than raw speed for simple operations.
- Use `map() |> list_rbind()` instead of superseded `map_dfr()`
- Use `walk()` for side effects (file writing, plotting)
- Use `in_parallel()` for scaling across cores
# Modern data frame row binding (purrr 1.0+)
models <- data_splits |>
map(\(split) train_model(split)) |>
list_rbind() # Replaces map_dfr()
# Column binding
summaries <- data_list |>
map(\(df) get_summary_stats(df)) |>
list_cbind() # Replaces map_dfc()
# Side effects with walk()
walk2(data_list, plot_names, \(df, name) {
p <- ggplot(df, aes(x, y)) + geom_point()
ggsave(name, p)
})
# Parallel processing (purrr 1.1.0+)
library(mirai)
daemons(4)
results <- large_datasets |>
map(in_parallel(expensive_computation))
daemons(0)
- Use stringr over base R string functions
- Consistent `str_` prefix and string-first argument order
- Pipe-friendly and vectorized by design
# Good - stringr (consistent, pipe-friendly)
text |>
str_to_lower() |>
str_trim() |>
str_replace_all("pattern", "replacement") |>
str_extract("\\d+")
# Common patterns
str_detect(text, "pattern") # vs grepl("pattern", text)
str_extract(text, "pattern") # vs complex regmatches()
str_replace_all(text, "a", "b") # vs gsub("a", "b", text)
str_split(text, ",") # vs strsplit(text, ",")
str_length(text) # vs nchar(text)
str_sub(text, 1, 5) # vs substr(text, 1, 5)
# String combination and formatting
str_c("a", "b", "c") # vs paste0()
str_glue("Hello {name}!") # templating
str_pad(text, 10, "left") # padding
str_wrap(text, width = 80) # text wrapping
# Case conversion
str_to_lower(text) # vs tolower()
str_to_upper(text) # vs toupper()
str_to_title(text) # vs tools::toTitleCase()
# Pattern helpers for clarity
str_detect(text, fixed("$")) # literal match
str_detect(text, regex("\\d+")) # explicit regex
str_detect(text, coll("é", locale = "fr")) # collation
# Avoid - inconsistent base R functions
grepl("pattern", text) # argument order varies
regmatches(text, regexpr(...)) # complex extraction
gsub("a", "b", text) # different arg order
# Good - vectorized operations
result <- x + y
# Good - Type-stable purrr functions
map_dbl(data, mean) # always returns double
map_chr(data, class) # always returns character
# Avoid - Type-unstable base functions
sapply(data, mean) # might return list or vector
# Avoid - explicit loops for simple operations
result <- numeric(length(x))
for(i in seq_along(x)) {
result[i] <- x[i] + y[i]
}
# Good function structure
rescale01 <- function(x) {
rng <- range(x, na.rm = TRUE, finite = TRUE)
(x - rng[1]) / (rng[2] - rng[1])
}
# Use type-stable outputs
map_dbl() # returns numeric vector
map_chr() # returns character vector
map_lgl() # returns logical vector
# Good naming: snake_case for variables/functions
calculate_mean_score <- function(data, score_col) {
# Function body
}
# Prefix non-standard arguments with .
my_function <- function(.data, ...) {
# Reduces argument conflicts
}
- Use snake_case for all names
- Variable names = nouns, function names = verbs
- Avoid dots except for S3 methods
# Good
day_one
calculate_mean
user_data
# Avoid
DayOne
calculate.mean
userData
# Good spacing
x[, 1]
mean(x, na.rm = TRUE)
if (condition) {
action()
}
# Pipe formatting
data |>
filter(year >= 2020) |>
group_by(category) |>
summarise(
mean_value = mean(value),
count = n()
)
# Avoid - Old pipe
data %>% f()
# Avoid - Old join syntax
inner_join(x, y, by = c("a" = "b"))
# Avoid - Implicit type conversion
sapply() # Use map_*() instead
# Avoid - String manipulation in data masking
mutate(data, !!paste0("new_", var) := value)
# Use across() or other approaches instead
# Avoid - Growing objects in loops
result <- c()
for(i in 1:n) {
result <- c(result, compute(i)) # Slow!
}
# Good - Pre-allocate
result <- vector("list", n)
for(i in 1:n) {
result[[i]] <- compute(i)
}
# Better - Use purrr
result <- map(1:n, compute)
- S7 combines S3 simplicity with S4 structure
- Formal class definitions with automatic validation
- Compatible with existing S3 code
# S7 class definition
Range <- new_class("Range",
properties = list(
start = class_double,
end = class_double
),
validator = function(self) {
if (self@end < self@start) {
"@end must be >= @start"
}
}
)
# Usage - constructor and property access
x <- Range(start = 1, end = 10)
x@start # 1
x@end <- 20 # automatic validation
# Methods
inside <- new_generic("inside", "x")
method(inside, Range) <- function(x, y) {
y >= x@start & y <= x@end
}
Start here: What are you building?
Use vctrs when:
✓ Need data frame integration (columns/rows)
✓ Want type-stable vector operations
✓ Building factor-like, date-like, or numeric-like classes
✓ Need consistent coercion/casting behavior
✓ Working with existing tidyverse infrastructure
Examples: custom date classes, units, categorical data
Use S7 when:
✓ NEW projects that need formal classes
✓ Want property validation and safe property access (@)
✓ Need multiple dispatch (beyond S3's double dispatch)
✓ Converting from S3 and want better structure
✓ Building class hierarchies with inheritance
✓ Want better error messages and discoverability
Use S3 when:
✓ Simple classes with minimal structure needs
✓ Maximum compatibility and minimal dependencies
✓ Quick prototyping or internal classes
✓ Contributing to existing S3-based ecosystems
✓ Performance is absolutely critical (minimal overhead)
Use S4 when:
✓ Working in Bioconductor ecosystem
✓ Need complex multiple inheritance (S7 doesn't support this)
✓ Existing S4 codebase that works well
| Feature | S3 | S7 | When S7 wins |
|---|---|---|---|
| Class definition | Informal (convention) | Formal (`new_class()`) | Need guaranteed structure |
| Property access | `$` or `attr()` (unsafe) | `@` (safe, validated) | Property validation matters |
| Validation | Manual, inconsistent | Built-in validators | Data integrity important |
| Method discovery | Hard to find methods | Clear method printing | Developer experience matters |
| Multiple dispatch | Limited (base generics) | Full multiple dispatch | Complex method dispatch needed |
| Inheritance | Informal, `NextMethod()` | Explicit `super()` | Predictable inheritance needed |
| Migration cost | - | Low (1-2 hours) | Want better structure |
| Performance | Fastest | ~Same as S3 | Performance difference negligible |
| Compatibility | Full S3 | Full S3 + S7 | Need both old and new patterns |
# Complex validation needs
Range <- new_class("Range",
properties = list(start = class_double, end = class_double),
validator = function(self) {
if (self@end < self@start) "@end must be >= @start"
}
)
# Multiple dispatch needs
method(generic, list(ClassA, ClassB)) <- function(x, y) ...
# Class hierarchies with clear inheritance
Child <- new_class("Child", parent = Parent)
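A minimal sketch of explicit inheritance with super(), using hypothetical Parent/Child classes and a describe generic:
library(S7)
Parent <- new_class("Parent")
Child <- new_class("Child", parent = Parent)
describe <- new_generic("describe", "x")
method(describe, Parent) <- function(x) "a parent"
method(describe, Child) <- function(x) {
  # super() re-dispatches to the Parent method - no fragile NextMethod()
  paste("a child of", describe(super(x, to = Parent)))
}
describe(Child()) # "a child of a parent"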
# Vector-like behavior in data frames
percent <- function(x = double()) new_vctr(x, class = "percentage")
data.frame(x = 1:3, pct = percent(c(0.1, 0.2, 0.3))) # works seamlessly
# Type-stable operations
vec_c(percent(0.1), percent(0.2)) # predictable behavior
vec_cast(0.5, percent()) # explicit, safe casting
# Simple classes without complex needs
new_simple <- function(x) structure(x, class = "simple")
print.simple <- function(x, ...) cat("Simple:", x)
# Maximum performance needs (rare)
# Existing S3 ecosystem contributions
- S3 → S7: Usually 1-2 hours of work, keeps full compatibility (see the sketch after this list)
- S4 → S7: More complex, evaluate if S4 features are actually needed
- Base R → vctrs: For vector-like classes, significant benefits
- Combining approaches: S7 classes can use vctrs principles internally
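A minimal sketch of what an S3 → S7 migration looks like, assuming a hypothetical "meters" class:
library(S7)
# Before: informal S3 constructor, no validation
meters_s3 <- function(x) structure(x, class = "meters")
# After: formal S7 class; validation is automatic, and S3 dispatch still works
meters <- new_class("meters",
  properties = list(value = class_double),
  validator = function(self) {
    if (any(self@value < 0)) "@value must be non-negative"
  }
)
m <- meters(value = c(1.5, 3))
m@value # safe, validated property access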
# Add dependency when:
✓ Significant functionality gain
✓ Maintenance burden reduction
✓ User experience improvement
✓ Complex implementation (regex, dates, web)
# Use base R when:
✓ Simple utility functions
✓ Package will be widely used (minimize deps)
✓ Dependency is large for small benefit
✓ Base R solution is straightforward
# Example decisions:
str_detect(x, "pattern") # Worth stringr dependency
length(x) > 0 # Don't need purrr for this
parse_dates(x) # Worth lubridate dependency
x + 1 # Don't need dplyr for this# Core tidyverse (usually worth it):
dplyr # Complex data manipulation
purrr # Functional programming, parallel
stringr # String manipulation
tidyr # Data reshaping
# Specialized tidyverse (evaluate carefully):
lubridate # If heavy date manipulation
forcats # If many categorical operations
readr # If specific file reading needs
ggplot2 # If package creates visualizations
# Heavy dependencies (use sparingly):
tidyverse # Meta-package, very heavy
shiny # Only for interactive apps
# Modern tidyverse API patterns
# 1. Use .by for per-operation grouping
my_summarise <- function(.data, ..., .by = NULL) {
# Support modern grouped operations
}
# 2. Use {{ }} for user-provided columns
my_select <- function(.data, cols) {
.data |> select({{ cols }})
}
# 3. Use ... for flexible arguments
my_mutate <- function(.data, ..., .by = NULL) {
.data |> mutate(..., .by = {{ .by }})
}
# 4. Return consistent types (tibbles, not data.frames)
my_function <- function(.data) {
  # ... compute `result` from .data ...
  result |> tibble::as_tibble()
}
# Validation level by function type:
# User-facing functions - comprehensive validation
user_function <- function(x, threshold = 0.5) {
# Check all inputs thoroughly
if (!is.numeric(x)) stop("x must be numeric")
if (!is.numeric(threshold) || length(threshold) != 1) {
stop("threshold must be a single number")
}
# ... function body
}
# Internal functions - minimal validation
.internal_function <- function(x, threshold) {
# Assume inputs are valid (document assumptions)
# Only check critical invariants
# ... function body
}
# Package functions with vctrs - type-stable validation
safe_function <- function(x, y) {
x <- vec_cast(x, double())
y <- vec_cast(y, double())
# Automatic type checking and coercion
}
# Good error messages - specific and actionable
if (length(x) == 0) {
  cli::cli_abort(c(
    "Input {.arg x} cannot be empty.",
    "i" = "Provide a non-empty vector."
  ))
}
# Include function name in errors
validate_input <- function(x, call = caller_env()) {
if (!is.numeric(x)) {
cli::cli_abort("Input must be numeric", call = call)
}
}
# Use consistent error styling
# cli package for user-friendly messages
# rlang for developer tools
Export when:
✓ Users will call it directly
✓ Other packages might want to extend it
✓ Part of the core package functionality
✓ Stable API that won't change often
# Example: main data processing functions
export_these <- function(.data, ...) {
# Comprehensive input validation
# Full documentation required
# Stable API contract
}
Keep internal when:
✓ Implementation detail that may change
✓ Only used within package
✓ Complex implementation helpers
✓ Would clutter user-facing API
# Example: helper functions
.internal_helper <- function(x, y) {
# Minimal documentation
# Can change without breaking users
# Assume inputs are pre-validated
}
# Unit tests - individual functions
test_that("function handles edge cases", {
expect_equal(my_func(c()), expected_empty_result)
expect_error(my_func(NULL), class = "my_error_class")
})
# Integration tests - workflow combinations
test_that("pipeline works end-to-end", {
result <- data |>
step1() |>
step2() |>
step3()
expect_s3_class(result, "expected_class")
})
# Property-based tests for package functions
test_that("function properties hold", {
# Test invariants across many inputs
})
# Must document:
✓ All exported functions
✓ Complex algorithms or formulas
✓ Non-obvious parameter interactions
✓ Examples of typical usage
# Can skip documentation:
✗ Simple internal helpers
✗ Obvious parameter meanings
✗ Functions that just call other functions
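A minimal roxygen2 sketch for an exported function (hypothetical my_mean(); adapt the tags to the actual function):
#' Summarise a column's mean
#'
#' @param data A data frame.
#' @param var <[`data-masked`][dplyr::dplyr_data_masking]> Column to summarize.
#' @return A one-row tibble.
#' @examples
#' my_mean(mtcars, cyl)
#' @export
my_mean <- function(data, var) {
  data |> dplyr::summarise(mean = mean({{ var }}))
}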
# Data manipulation
subset(data, condition) -> filter(data, condition)
data[order(data$x), ] -> arrange(data, x)
aggregate(x ~ y, data, mean) -> summarise(data, mean(x), .by = y)
# Functional programming
sapply(x, f) -> map(x, f) # type-stable
lapply(x, f) -> map(x, f)
# String manipulation
grepl("pattern", text) -> str_detect(text, "pattern")
gsub("old", "new", text) -> str_replace_all(text, "old", "new")
substr(text, 1, 5) -> str_sub(text, 1, 5)
nchar(text) -> str_length(text)
strsplit(text, ",") -> str_split(text, ",")
paste0(a, b) -> str_c(a, b)
tolower(text) -> str_to_lower(text)
# Pipes
data %>% f() -> data |> f()
# Grouping (dplyr 1.1+)
group_by(data, x) |>
summarise(mean(y)) |>
ungroup() -> summarise(data, mean(y), .by = x)
# Column selection
across(starts_with("x")) -> pick(starts_with("x")) # for selection only
# Joins
by = c("a" = "b") -> by = join_by(a == b)
# Multi-row summaries
summarise(data, x, .groups = "drop") -> reframe(data, x)
# Data reshaping
gather()/spread() -> pivot_longer()/pivot_wider()
# String separation (tidyr 1.3+)
separate(col, into = c("a", "b")) -> separate_wider_delim(col, delim = "_", names = c("a", "b"))
extract(col, into = "x", regex) -> separate_wider_regex(col, patterns = c(x = regex))
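A quick sketch of the tidyr 1.3+ replacements on toy data (hypothetical column names):
library(tidyr)
df <- data.frame(code = c("a_1", "b_2"))
# Delimiter-based: one new column per piece
df |> separate_wider_delim(code, delim = "_", names = c("letter", "num"))
# Regex-based: named patterns become columns, unnamed ones are matched and dropped
df |> separate_wider_regex(code, patterns = c(letter = "[a-z]+", "_", num = "\\d+"))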
# Old -> New performance patterns
for loops for parallelizable work -> map(data, in_parallel(f))
Manual type checking -> vec_assert() / vec_cast()
Inconsistent coercion -> vec_ptype_common() / vec_c()
# Superseded purrr functions (purrr 1.0+)
map_dfr(x, f) -> map(x, f) |> list_rbind()
map_dfc(x, f) -> map(x, f) |> list_cbind()
map2_dfr(x, y, f) -> map2(x, y, f) |> list_rbind()
pmap_dfr(list, f) -> pmap(list, f) |> list_rbind()
imap_dfr(x, f) -> imap(x, f) |> list_rbind()
# For side effects
walk(x, write_file) # instead of for loops
walk2(data, paths, write_csv) # multiple arguments
This document should be referenced for all R development to ensure modern, performant, and maintainable code.