Created
March 11, 2026 00:23
-
-
Save rjpower/9f5672ae2c3ce32b69ae8f075e7164ec to your computer and use it in GitHub Desktop.
PyArrow concat_tables benchmark (uv script)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env -S uv run --script | |
| # /// script | |
| # requires-python = ">=3.11" | |
| # dependencies = [ | |
| # "pyarrow", | |
| # ] | |
| # /// | |
| """Benchmark: create N PyArrow tables with varied column types and concat them.""" | |
| import time | |
| import random | |
| import string | |
| import argparse | |
| import pyarrow as pa | |
def random_text(n: int = 10_000) -> str:
    """Return a random string of ``n`` characters.

    Characters are drawn independently (with replacement) from ASCII
    letters, digits, the space character and the newline character.
    """
    alphabet = string.ascii_letters + string.digits + " \n"
    picked = random.choices(alphabet, k=n)
    return "".join(picked)
def make_table(rows: int = 1_000, large_text_every: int = 10) -> pa.Table:
    """Build one ten-column table of ``rows`` rows filled with synthetic data.

    Args:
        rows: Number of rows to generate.
        large_text_every: Row indices divisible by this value get a
            10,000-character random string in the ``notes`` column; all
            other rows get a short note. Must be non-zero, since it is
            used as a modulus.

    Returns:
        A table with columns ``id``, ``name``, ``score``, ``count``,
        ``flag``, ``tag``, ``notes``, ``price``, ``code`` and ``rank``.
        ``tag`` may contain nulls; ``notes`` is a ``large_string`` column.
    """
    row_ids = list(range(rows))
    tag_pool = ["alpha", "beta", "gamma", "delta", None]

    # The random-valued columns are generated in a fixed order so a seeded
    # ``random`` module always yields the same table.
    name_col = [f"name_{i}" for i in row_ids]
    score_col = [random.uniform(0.0, 1.0) for _ in row_ids]
    count_col = [random.randint(0, 1_000_000) for _ in row_ids]
    flag_col = [random.choice([True, False]) for _ in row_ids]
    tag_col = [random.choice(tag_pool) for _ in row_ids]
    # Large text in every Nth cell, short otherwise.
    note_col = []
    for i in row_ids:
        if i % large_text_every == 0:
            note_col.append(random_text(10_000))
        else:
            note_col.append(f"short note {i}")
    price_col = [random.uniform(0.01, 9_999.99) for _ in row_ids]
    code_col = [f"CODE-{random.randint(1000, 9999)}" for _ in row_ids]
    rank_col = [random.randint(1, 100) for _ in row_ids]

    # (column name, python values, arrow type), in output column order.
    spec = [
        ("id", row_ids, pa.int64()),
        ("name", name_col, pa.string()),
        ("score", score_col, pa.float64()),
        ("count", count_col, pa.int32()),
        ("flag", flag_col, pa.bool_()),
        ("tag", tag_col, pa.string()),
        ("notes", note_col, pa.large_string()),
        ("price", price_col, pa.float32()),
        ("code", code_col, pa.string()),
        ("rank", rank_col, pa.int16()),
    ]
    return pa.table(
        {col: pa.array(values, type=dtype) for col, values, dtype in spec}
    )
def bench(n: int, rows: int) -> dict:
    """Time building ``n`` tables of ``rows`` rows and concatenating them.

    Returns:
        A dict with keys ``n``, ``rows``, ``total_rows`` (row count of the
        concatenated table), ``total_mb`` (summed ``nbytes`` of the input
        tables, in MiB), ``build_ms`` and ``concat_ms`` (wall-clock
        milliseconds for each phase).
    """
    build_started = time.perf_counter()
    parts = [make_table(rows) for _ in range(n)]
    build_ms = 1000 * (time.perf_counter() - build_started)

    # Size accounting is kept outside both timed sections.
    input_bytes = sum(part.nbytes for part in parts)
    total_mb = input_bytes / 1024 / 1024

    concat_started = time.perf_counter()
    combined = pa.concat_tables(parts)
    concat_ms = 1000 * (time.perf_counter() - concat_started)

    return {
        "n": n,
        "rows": rows,
        "total_rows": combined.num_rows,
        "total_mb": total_mb,
        "build_ms": build_ms,
        "concat_ms": concat_ms,
    }
def main(n: int, rows: int) -> None:
    """Build ``n`` tables one by one, concatenate them, and print a report.

    Prints a line per table (build time, size, row count), the total build
    time and size, the ``concat_tables()`` time with the shape and size of
    the result, and finally the overall wall time.
    """
    print(f"Creating {n} tables with {rows} rows each...")
    print(" 10 columns: int64, string, float64, int32, bool, string(nullable), large_string, float32, string, int16")
    print(" Large text (~10KB) every 10th row in 'notes' column\n")

    built = []
    run_started = time.perf_counter()
    for index in range(n):
        table_started = time.perf_counter()
        tbl = make_table(rows)
        table_ms = (time.perf_counter() - table_started) * 1000
        table_mb = tbl.nbytes / 1024 / 1024
        print(f" table[{index:3d}]: {table_ms:7.1f}ms {table_mb:.1f} MB ({tbl.num_rows} rows)")
        built.append(tbl)
    build_ms = (time.perf_counter() - run_started) * 1000

    total_mb = sum(tbl.nbytes for tbl in built) / 1024 / 1024
    print(f"\nBuild total: {build_ms:.1f}ms ({total_mb:.1f} MB across all tables)")

    print(f"\nConcatenating {n} tables with concat_tables()...")
    concat_started = time.perf_counter()
    combined = pa.concat_tables(built)
    concat_ms = (time.perf_counter() - concat_started) * 1000
    combined_mb = combined.nbytes / 1024 / 1024
    print(f" concat_tables: {concat_ms:.1f}ms")
    print(f" result: {combined.num_rows} rows, {combined.num_columns} cols, {combined_mb:.1f} MB")

    # Wall time spans everything since the first table build, printing included.
    wall_ms = (time.perf_counter() - run_started) * 1000
    print(f"\nTotal wall time: {wall_ms:.1f}ms")
def grid(
    ns: tuple[int, ...] = (1, 5, 10, 25, 50, 100),
    row_sizes: tuple[int, ...] = (100, 250, 500, 1_000),
) -> None:
    """Run ``bench`` over an ``n`` x ``rows`` grid and print a summary table.

    Each printed row corresponds to one table count from ``ns``; each
    column corresponds to one rows-per-table value from ``row_sizes``.
    Every cell shows ``build ms / concat ms / total MB`` for that pair.

    Args:
        ns: Table counts to benchmark (one output row each).
        row_sizes: Rows-per-table values to benchmark (one column each).
    """
    label_w = 10
    col_w = 28

    # The corner label contains a backslash, so it is kept out of the
    # f-string replacement field: before Python 3.12 (PEP 701) a backslash
    # inside ``{...}`` is a SyntaxError, and this script declares
    # ``requires-python = ">=3.11"``.
    corner = "n \\ rows"
    header = f"{corner:>{label_w}}" + "".join(f"{r:>{col_w}}" for r in row_sizes)
    sep = "-" * len(header)
    subheader = " " * label_w + "".join(
        f"{'build / concat / MB':>{col_w}}" for _ in row_sizes
    )

    print(header)
    print(subheader)
    print(sep)

    for n in ns:
        row_parts = [f"{n:>{label_w}}"]
        for rows in row_sizes:
            r = bench(n, rows)
            cell = f"{r['build_ms']:.1f}ms / {r['concat_ms']:.2f}ms / {r['total_mb']:.0f}MB"
            row_parts.append(f"{cell:>{col_w}}")
        print("".join(row_parts))

    print("\n(build = table construction time, concat = concat_tables() time)")
if __name__ == "__main__":
    # Command-line entry point: either run the full n x rows grid summary
    # (--grid) or a single verbose run with the given -n / --rows.
    parser = argparse.ArgumentParser(description="PyArrow concat_tables benchmark")
    parser.add_argument("-n", type=int, default=10, help="Number of tables (default: 10)")
    parser.add_argument("--rows", type=int, default=1_000, help="Rows per table (default: 1000)")
    parser.add_argument("--grid", action="store_true", help="Run a grid of n x rows sizes and print a summary table")
    args = parser.parse_args()
    if args.grid:
        grid()
    else:
        # With n < 1 no tables are built and concat_tables() is handed an
        # empty list, which it rejects; fail fast with a usage error instead
        # of a traceback after the report header has already been printed.
        if args.n < 1:
            parser.error("-n must be at least 1")
        main(args.n, args.rows)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment