rjpower · March 11, 2026 00:23
diff --git a/pyarrow_concat_bench.py b/pyarrow_concat_bench.py
 #!/usr/bin/env -S uv run --script
 # /// script
 # requires-python = ">=3.11"
 # dependencies = [
 #   "pyarrow",
 # ]
 # ///

 """Benchmark: create N PyArrow tables with varied column types and concat them."""

 import time
 import random
 import string
 import argparse
 import pyarrow as pa


 def random_text(n: int = 10_000) -> str:
    """Return a string of ``n`` characters picked at random, with replacement.

    The alphabet is the ASCII letters, the digits, the space and the newline.
    """
    alphabet = string.ascii_letters + string.digits + " \n"
    picked = random.choices(alphabet, k=n)
    return "".join(picked)


 def make_table(rows: int = 1_000, large_text_every: int = 10) -> pa.Table:
    """Build a ten-column table of ``rows`` rows filled with generated data.

    Row ``i`` gets a 10,000-character random ``notes`` value when
    ``i % large_text_every == 0`` and the text ``"short note {i}"`` otherwise.
    ``tag`` entries are picked from four labels plus ``None``.
    """
    tag_pool = ["alpha", "beta", "gamma", "delta", None]
    bool_pool = [True, False]
    index = range(rows)

    # Each column is generated completely before the next one starts, so the
    # order of calls into ``random`` is column by column.
    id_vals = list(index)
    name_vals = [f"name_{i}" for i in index]
    score_vals = [random.uniform(0.0, 1.0) for _ in index]
    count_vals = [random.randint(0, 1_000_000) for _ in index]
    flag_vals = [random.choice(bool_pool) for _ in index]
    tag_vals = [random.choice(tag_pool) for _ in index]

    # Large text in every Nth cell, a short note in all the others.
    note_vals = []
    for i in index:
        if i % large_text_every == 0:
            note_vals.append(random_text(10_000))
        else:
            note_vals.append(f"short note {i}")

    price_vals = [random.uniform(0.01, 9_999.99) for _ in index]
    code_vals = [f"CODE-{random.randint(1000, 9999)}" for _ in index]
    rank_vals = [random.randint(1, 100) for _ in index]

    # (column name, python values, arrow type) in output column order.
    typed_columns = [
        ("id", id_vals, pa.int64()),
        ("name", name_vals, pa.string()),
        ("score", score_vals, pa.float64()),
        ("count", count_vals, pa.int32()),
        ("flag", flag_vals, pa.bool_()),
        ("tag", tag_vals, pa.string()),
        ("notes", note_vals, pa.large_string()),
        ("price", price_vals, pa.float32()),
        ("code", code_vals, pa.string()),
        ("rank", rank_vals, pa.int16()),
    ]
    return pa.table(
        {label: pa.array(vals, type=dtype) for label, vals, dtype in typed_columns}
    )


 def bench(n: int, rows: int) -> dict:
    """Time building ``n`` tables of ``rows`` rows and concatenating them.

    Returns a dict with the keys ``n``, ``rows``, ``total_rows`` (row count of
    the concatenated table), ``total_mb`` (summed ``nbytes`` of the input
    tables, in MiB), ``build_ms`` and ``concat_ms``.
    """
    build_start = time.perf_counter()
    built = [make_table(rows) for _ in range(n)]
    build_ms = (time.perf_counter() - build_start) * 1000

    # Size is measured outside both timed sections.
    input_bytes = sum(tbl.nbytes for tbl in built)
    total_mb = input_bytes / 1024 / 1024

    concat_start = time.perf_counter()
    merged = pa.concat_tables(built)
    concat_ms = (time.perf_counter() - concat_start) * 1000

    return dict(
        n=n,
        rows=rows,
        total_rows=merged.num_rows,
        total_mb=total_mb,
        build_ms=build_ms,
        concat_ms=concat_ms,
    )


 def main(n: int, rows: int) -> None:
    """Build ``n`` tables of ``rows`` rows, concatenate them, and print timings.

    Prints one line per table (build time, size, row count), then the total
    build time and size, the ``concat_tables`` time, the shape and size of the
    result, and the wall time measured from just before the first build.
    """
    print(f"Creating {n} tables with {rows} rows each...")
    print("  10 columns: int64, string, float64, int32, bool, string(nullable), large_string, float32, string, int16")
    print("  Large text (~10KB) every 10th row in 'notes' column\n")

    built = []
    run_start = time.perf_counter()
    for idx in range(n):
        tick = time.perf_counter()
        tbl = make_table(rows)
        took_ms = (time.perf_counter() - tick) * 1000
        mb = tbl.nbytes / 1024 / 1024
        print(f"  table[{idx:3d}]: {took_ms:7.1f}ms  {mb:.1f} MB  ({tbl.num_rows} rows)")
        built.append(tbl)

    build_ms = (time.perf_counter() - run_start) * 1000
    input_mb = sum(tbl.nbytes for tbl in built) / 1024 / 1024
    print(f"\nBuild total: {build_ms:.1f}ms  ({input_mb:.1f} MB across all tables)")

    print(f"\nConcatenating {n} tables with concat_tables()...")
    concat_start = time.perf_counter()
    merged = pa.concat_tables(built)
    concat_ms = (time.perf_counter() - concat_start) * 1000

    merged_mb = merged.nbytes / 1024 / 1024
    print(f"  concat_tables: {concat_ms:.1f}ms")
    print(f"  result: {merged.num_rows} rows, {merged.num_columns} cols, {merged_mb:.1f} MB")

    # Wall time spans the build loop, the concat and all printing in between.
    wall_ms = (time.perf_counter() - run_start) * 1000
    print(f"\nTotal wall time: {wall_ms:.1f}ms")


 def grid() -> None:
    """Print a summary table of ``bench`` results over a fixed grid.

    Rows of the table are table counts (1, 5, 10, 25, 50, 100) and columns are
    rows-per-table (100, 250, 500, 1000). Every cell comes from one ``bench``
    call and shows build time, concat time and the summed size of the input
    tables.
    """
    ns = [1, 5, 10, 25, 50, 100]
    row_sizes = [100, 250, 500, 1_000]

    col_w = 28
    # The corner label contains a backslash, so it is kept in a plain string:
    # before Python 3.12 a backslash inside an f-string replacement field is a
    # SyntaxError, and this script declares requires-python >= 3.11.
    corner = "n \\ rows"
    cell_legend = "build / concat / MB"
    header = f"{corner:>10}" + "".join(f"{r:>{col_w}}" for r in row_sizes)
    sep = "-" * len(header)
    subheader = " " * 10 + "".join(f"{cell_legend:>{col_w}}" for _ in row_sizes)
    print(header)
    print(subheader)
    print(sep)

    for n in ns:
        row_parts = [f"{n:>10}"]
        for rows in row_sizes:
            r = bench(n, rows)
            cell = f"{r['build_ms']:.1f}ms / {r['concat_ms']:.2f}ms / {r['total_mb']:.0f}MB"
            row_parts.append(f"{cell:>{col_w}}")
        print("".join(row_parts))

    print("\n(build = table construction time, concat = concat_tables() time)")


 if __name__ == "__main__":
    # Command-line entry point: --grid runs the fixed n x rows sweep,
    # otherwise a single verbose run is done with -n tables of --rows rows.
    cli = argparse.ArgumentParser(description="PyArrow concat_tables benchmark")
    cli.add_argument(
        "-n",
        type=int,
        default=10,
        help="Number of tables (default: 10)",
    )
    cli.add_argument(
        "--rows",
        type=int,
        default=1_000,
        help="Rows per table (default: 1000)",
    )
    cli.add_argument(
        "--grid",
        action="store_true",
        help="Run a grid of n x rows sizes and print a summary table",
    )
    opts = cli.parse_args()

    if not opts.grid:
        main(opts.n, opts.rows)
    else:
        grid()
	#!/usr/bin/env -S uv run --script
	# /// script
	# requires-python = ">=3.11"
	# dependencies = [
	# "pyarrow",
	# ]
	# ///

	"""Benchmark: create N PyArrow tables with varied column types and concat them."""

	import time
	import random
	import string
	import argparse
	import pyarrow as pa


	def random_text(n: int = 10_000) -> str:
	    """Return ``n`` random characters joined into one string.

	    Characters are drawn with replacement from ASCII letters, digits, the
	    space and the newline.
	    """
	    return "".join(random.choices(string.ascii_letters + string.digits + " \n", k=n))


	def make_table(rows: int = 1_000, large_text_every: int = 10) -> pa.Table:
	    """Build a ten-column table of ``rows`` rows filled with generated data.

	    The ``notes`` column holds a 10,000-character random string in every row
	    whose index is a multiple of ``large_text_every`` and a short fixed-form
	    note in the others. ``tag`` values are drawn from four labels and ``None``.
	    """
	    ids = list(range(rows))
	    names = [f"name_{i}" for i in range(rows)]
	    scores = [random.uniform(0.0, 1.0) for _ in range(rows)]
	    counts = [random.randint(0, 1_000_000) for _ in range(rows)]
	    flags = [random.choice([True, False]) for _ in range(rows)]
	    tags = [random.choice(["alpha", "beta", "gamma", "delta", None]) for _ in range(rows)]
	    # large text in every Nth cell, short otherwise
	    notes = [
	        random_text(10_000) if i % large_text_every == 0 else f"short note {i}"
	        for i in range(rows)
	    ]
	    prices = [random.uniform(0.01, 9_999.99) for _ in range(rows)]
	    codes = [f"CODE-{random.randint(1000, 9999)}" for _ in range(rows)]
	    ranks = [random.randint(1, 100) for _ in range(rows)]

	    return pa.table(
	        {
	            "id": pa.array(ids, type=pa.int64()),
	            "name": pa.array(names, type=pa.string()),
	            "score": pa.array(scores, type=pa.float64()),
	            "count": pa.array(counts, type=pa.int32()),
	            "flag": pa.array(flags, type=pa.bool_()),
	            "tag": pa.array(tags, type=pa.string()),
	            "notes": pa.array(notes, type=pa.large_string()),
	            "price": pa.array(prices, type=pa.float32()),
	            "code": pa.array(codes, type=pa.string()),
	            "rank": pa.array(ranks, type=pa.int16()),
	        }
	    )


	def bench(n: int, rows: int) -> dict:
	    """Time building ``n`` tables of ``rows`` rows and concatenating them.

	    Returns a dict with ``n``, ``rows``, ``total_rows`` (rows in the
	    concatenated table), ``total_mb`` (summed ``nbytes`` of the inputs in
	    MiB), ``build_ms`` and ``concat_ms``.
	    """
	    t0 = time.perf_counter()
	    tables = [make_table(rows) for _ in range(n)]
	    build_ms = (time.perf_counter() - t0) * 1000

	    # Measured between the two timed sections so it affects neither.
	    total_mb = sum(t.nbytes for t in tables) / 1024 / 1024

	    t1 = time.perf_counter()
	    result = pa.concat_tables(tables)
	    concat_ms = (time.perf_counter() - t1) * 1000

	    return {
	        "n": n,
	        "rows": rows,
	        "total_rows": result.num_rows,
	        "total_mb": total_mb,
	        "build_ms": build_ms,
	        "concat_ms": concat_ms,
	    }


	def main(n: int, rows: int) -> None:
	    """Build ``n`` tables of ``rows`` rows, concatenate them, and print timings.

	    Output is one line per built table, the build total, the
	    ``concat_tables`` time, the result's shape and size, and the wall time
	    measured from just before the first table is built.
	    """
	    print(f"Creating {n} tables with {rows} rows each...")
	    print(" 10 columns: int64, string, float64, int32, bool, string(nullable), large_string, float32, string, int16")
	    print(" Large text (~10KB) every 10th row in 'notes' column\n")

	    tables = []
	    t0 = time.perf_counter()
	    for i in range(n):
	        t = time.perf_counter()
	        table = make_table(rows)
	        elapsed = time.perf_counter() - t
	        size_mb = table.nbytes / 1024 / 1024
	        print(f" table[{i:3d}]: {elapsed*1000:7.1f}ms {size_mb:.1f} MB ({table.num_rows} rows)")
	        tables.append(table)

	    build_total = time.perf_counter() - t0
	    total_mb = sum(t.nbytes for t in tables) / 1024 / 1024
	    print(f"\nBuild total: {build_total*1000:.1f}ms ({total_mb:.1f} MB across all tables)")

	    print(f"\nConcatenating {n} tables with concat_tables()...")
	    t1 = time.perf_counter()
	    result = pa.concat_tables(tables)
	    concat_elapsed = time.perf_counter() - t1

	    result_mb = result.nbytes / 1024 / 1024
	    print(f" concat_tables: {concat_elapsed*1000:.1f}ms")
	    print(f" result: {result.num_rows} rows, {result.num_columns} cols, {result_mb:.1f} MB")

	    # Measured from t0, so it covers building, concatenating and printing.
	    grand_total = time.perf_counter() - t0
	    print(f"\nTotal wall time: {grand_total*1000:.1f}ms")


	def grid() -> None:
	    """Print a summary table of ``bench`` results over a fixed grid.

	    One output row per table count and one column per rows-per-table value;
	    each cell is produced by a single ``bench`` call and shows build time,
	    concat time and the summed size of the input tables.
	    """
	    ns = [1, 5, 10, 25, 50, 100]
	    row_sizes = [100, 250, 500, 1_000]

	    col_w = 28
	    # Keep the backslash out of the f-string replacement field: before
	    # Python 3.12 that is a SyntaxError, and the script header asks only
	    # for Python >= 3.11.
	    corner = "n \\ rows"
	    header = f"{corner:>10}" + "".join(f"{r:>{col_w}}" for r in row_sizes)
	    sep = "-" * len(header)
	    subheader = " " * 10 + "".join(f"{'build / concat / MB':>{col_w}}" for _ in row_sizes)
	    print(header)
	    print(subheader)
	    print(sep)

	    for n in ns:
	        row_parts = [f"{n:>10}"]
	        for rows in row_sizes:
	            r = bench(n, rows)
	            cell = f"{r['build_ms']:.1f}ms / {r['concat_ms']:.2f}ms / {r['total_mb']:.0f}MB"
	            row_parts.append(f"{cell:>{col_w}}")
	        print("".join(row_parts))

	    print("\n(build = table construction time, concat = concat_tables() time)")


	if __name__ == "__main__":
	    # Command-line entry point: --grid runs the fixed sweep, otherwise one
	    # verbose run is done with the requested table count and row count.
	    parser = argparse.ArgumentParser(description="PyArrow concat_tables benchmark")
	    parser.add_argument("-n", type=int, default=10, help="Number of tables (default: 10)")
	    parser.add_argument("--rows", type=int, default=1_000, help="Rows per table (default: 1000)")
	    parser.add_argument("--grid", action="store_true", help="Run a grid of n x rows sizes and print a summary table")
	    args = parser.parse_args()

	    if args.grid:
	        grid()
	    else:
	        main(args.n, args.rows)
No results found