Skip to content

Instantly share code, notes, and snippets.

@rjpower
Created March 11, 2026 00:23
Show Gist options
  • Select an option

  • Save rjpower/9f5672ae2c3ce32b69ae8f075e7164ec to your computer and use it in GitHub Desktop.

Select an option

Save rjpower/9f5672ae2c3ce32b69ae8f075e7164ec to your computer and use it in GitHub Desktop.
PyArrow concat_tables benchmark (uv script)
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "pyarrow",
# ]
# ///
"""Benchmark: create N PyArrow tables with varied column types and concat them."""
import time
import random
import string
import argparse
import pyarrow as pa
def random_text(n: int = 10_000) -> str:
    """Return a string of ``n`` randomly chosen characters.

    Characters are drawn (with replacement) from ASCII letters, digits,
    the space character and the newline character.
    """
    alphabet = string.ascii_letters + string.digits + " \n"
    picked = random.choices(alphabet, k=n)
    return "".join(picked)
def make_table(rows: int = 1_000, large_text_every: int = 10) -> pa.Table:
    """Build one synthetic 10-column table with ``rows`` rows.

    The columns cover int64, string, float64, int32, bool, a string column
    that may contain nulls, large_string, float32, string and int16. In the
    ``notes`` column, every row whose index is a multiple of
    ``large_text_every`` gets a 10,000-character random string; all other
    rows get a short placeholder.
    """
    tag_pool = ["alpha", "beta", "gamma", "delta", None]

    # Values are generated column by column in a fixed order, so seeding
    # ``random`` beforehand reproduces the same table.
    ids = list(range(rows))
    names = [f"name_{idx}" for idx in ids]
    scores = [random.uniform(0.0, 1.0) for _ in ids]
    counts = [random.randint(0, 1_000_000) for _ in ids]
    flags = [random.choice([True, False]) for _ in ids]
    tags = [random.choice(tag_pool) for _ in ids]

    # Large text in every Nth cell, short text otherwise.
    notes = []
    for idx in ids:
        if idx % large_text_every == 0:
            notes.append(random_text(10_000))
        else:
            notes.append(f"short note {idx}")

    prices = [random.uniform(0.01, 9_999.99) for _ in ids]
    codes = [f"CODE-{random.randint(1000, 9999)}" for _ in ids]
    ranks = [random.randint(1, 100) for _ in ids]

    # (column name, python values, arrow type) in output column order.
    column_spec = (
        ("id", ids, pa.int64()),
        ("name", names, pa.string()),
        ("score", scores, pa.float64()),
        ("count", counts, pa.int32()),
        ("flag", flags, pa.bool_()),
        ("tag", tags, pa.string()),
        ("notes", notes, pa.large_string()),
        ("price", prices, pa.float32()),
        ("code", codes, pa.string()),
        ("rank", ranks, pa.int16()),
    )
    return pa.table(
        {label: pa.array(values, type=dtype) for label, values, dtype in column_spec}
    )
def bench(n: int, rows: int) -> dict:
    """Time building ``n`` tables of ``rows`` rows and concatenating them.

    Returns a dict with the inputs (``n``, ``rows``), the row count of the
    concatenated table (``total_rows``), the summed size of the input tables
    in MiB (``total_mb``), and the build and concat durations in
    milliseconds (``build_ms``, ``concat_ms``).
    """
    bytes_per_mb = 1024 * 1024

    build_start = time.perf_counter()
    tables = [make_table(rows) for _ in range(n)]
    build_elapsed = time.perf_counter() - build_start

    # Size is measured outside both timed sections.
    input_bytes = sum(tbl.nbytes for tbl in tables)

    concat_start = time.perf_counter()
    combined = pa.concat_tables(tables)
    concat_elapsed = time.perf_counter() - concat_start

    stats = {}
    stats["n"] = n
    stats["rows"] = rows
    stats["total_rows"] = combined.num_rows
    stats["total_mb"] = input_bytes / bytes_per_mb
    stats["build_ms"] = build_elapsed * 1000
    stats["concat_ms"] = concat_elapsed * 1000
    return stats
def main(n: int, rows: int) -> None:
    """Build ``n`` tables one by one, concatenate them, and print timings.

    Prints the build time and size of each table, the total build time,
    the ``concat_tables()`` time, the shape and size of the result, and the
    overall wall time.
    """
    bytes_per_mb = 1024 * 1024

    print(f"Creating {n} tables with {rows} rows each...")
    print(" 10 columns: int64, string, float64, int32, bool, string(nullable), large_string, float32, string, int16")
    print(" Large text (~10KB) every 10th row in 'notes' column\n")

    built = []
    run_start = time.perf_counter()
    for index in range(n):
        table_start = time.perf_counter()
        current = make_table(rows)
        table_ms = (time.perf_counter() - table_start) * 1000
        current_mb = current.nbytes / bytes_per_mb
        print(f" table[{index:3d}]: {table_ms:7.1f}ms {current_mb:.1f} MB ({current.num_rows} rows)")
        built.append(current)
    build_ms = (time.perf_counter() - run_start) * 1000

    input_mb = sum(tbl.nbytes for tbl in built) / bytes_per_mb
    print(f"\nBuild total: {build_ms:.1f}ms ({input_mb:.1f} MB across all tables)")

    print(f"\nConcatenating {n} tables with concat_tables()...")
    concat_start = time.perf_counter()
    combined = pa.concat_tables(built)
    concat_ms = (time.perf_counter() - concat_start) * 1000
    combined_mb = combined.nbytes / bytes_per_mb
    print(f" concat_tables: {concat_ms:.1f}ms")
    print(f" result: {combined.num_rows} rows, {combined.num_columns} cols, {combined_mb:.1f} MB")

    # Wall time spans everything since the first table build started,
    # including the printing above.
    wall_ms = (time.perf_counter() - run_start) * 1000
    print(f"\nTotal wall time: {wall_ms:.1f}ms")
def grid() -> None:
    """Run ``bench`` over a fixed grid of sizes and print a summary table.

    Rows of the printed table are table counts (``n``); columns are rows per
    table. Each cell shows build time, ``concat_tables()`` time and the total
    size of the input tables.
    """
    ns = [1, 5, 10, 25, 50, 100]
    row_sizes = [100, 250, 500, 1_000]
    col_w = 28

    # The corner label contains a backslash. Before Python 3.12 a backslash
    # inside an f-string replacement field is a SyntaxError, and this script
    # declares ``requires-python = ">=3.11"``, so the label is built outside
    # the f-string and only referenced by name inside it.
    corner_label = "n \\ rows"
    header = f"{corner_label:>10}" + "".join(f"{r:>{col_w}}" for r in row_sizes)
    sep = "-" * len(header)
    subheader = " " * 10 + "".join(f"{'build / concat / MB':>{col_w}}" for _ in row_sizes)

    print(header)
    print(subheader)
    print(sep)

    for n in ns:
        row_parts = [f"{n:>10}"]
        for rows in row_sizes:
            r = bench(n, rows)
            cell = f"{r['build_ms']:.1f}ms / {r['concat_ms']:.2f}ms / {r['total_mb']:.0f}MB"
            row_parts.append(f"{cell:>{col_w}}")
        # Emit each grid row as soon as it is complete.
        print("".join(row_parts))

    print("\n(build = table construction time, concat = concat_tables() time)")
if __name__ == "__main__":
    # Command-line entry point: either run the fixed size grid (--grid) or a
    # single verbose benchmark with the requested table count and row count.
    parser = argparse.ArgumentParser(description="PyArrow concat_tables benchmark")
    parser.add_argument("-n", type=int, default=10, help="Number of tables (default: 10)")
    parser.add_argument("--rows", type=int, default=1_000, help="Rows per table (default: 1000)")
    parser.add_argument("--grid", action="store_true", help="Run a grid of n x rows sizes and print a summary table")
    args = parser.parse_args()
    if args.grid:
        # Grid mode uses its own fixed sizes; -n and --rows are ignored.
        grid()
    elif args.n < 1:
        # With no tables built, main() would hand concat_tables() an empty
        # list and fail with a low-level Arrow error; report a usage error
        # instead.
        parser.error("-n must be at least 1")
    else:
        main(args.n, args.rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment