Skip to content

Instantly share code, notes, and snippets.

@Bvngee
Last active March 2, 2025 10:20
Show Gist options
  • Save Bvngee/4402c52a406344b3462aab9375d21d85 to your computer and use it in GitHub Desktop.
Save Bvngee/4402c52a406344b3462aab9375d21d85 to your computer and use it in GitHub Desktop.
JS Quick Array Performance Tests

Quick JS Array Performance Tests

WARNING:

These tests are not actually indicative of any real performance metrics. I mostly put these together to get a sense of how these different approaches work, and to get a very rough speed comparison (I am fully aware of all the pitfalls of microbenchmarks).

I am only archiving them so in the future I can reference the code again if need be.

// this is roughly copied from my original implementation
// A fixed-shape row of 100 numeric columns, keyed "1".."100" to match the
// arrow table schema used elsewhere in this gist. Field order matters:
// Object.keys() on an instance yields integer-like keys in ascending
// numeric order, so values[0] lands in "1", values[1] in "2", and so on.
export default class DataFrame {
  "1": number = 0;
  "2": number = 0;
  "3": number = 0;
  "4": number = 0;
  "5": number = 0;
  "6": number = 0;
  "7": number = 0;
  "8": number = 0;
  "9": number = 0;
  "10": number = 0;
  "11": number = 0;
  "12": number = 0;
  "13": number = 0;
  "14": number = 0;
  "15": number = 0;
  "16": number = 0;
  "17": number = 0;
  "18": number = 0;
  "19": number = 0;
  "20": number = 0;
  "21": number = 0;
  "22": number = 0;
  "23": number = 0;
  "24": number = 0;
  "25": number = 0;
  "26": number = 0;
  "27": number = 0;
  "28": number = 0;
  "29": number = 0;
  "30": number = 0;
  "31": number = 0;
  "32": number = 0;
  "33": number = 0;
  "34": number = 0;
  "35": number = 0;
  "36": number = 0;
  "37": number = 0;
  "38": number = 0;
  "39": number = 0;
  "40": number = 0;
  "41": number = 0;
  "42": number = 0;
  "43": number = 0;
  "44": number = 0;
  "45": number = 0;
  "46": number = 0;
  "47": number = 0;
  "48": number = 0;
  "49": number = 0;
  "50": number = 0;
  "51": number = 0;
  "52": number = 0;
  "53": number = 0;
  "54": number = 0;
  "55": number = 0;
  "56": number = 0;
  "57": number = 0;
  "58": number = 0;
  "59": number = 0;
  "60": number = 0;
  "61": number = 0;
  "62": number = 0;
  "63": number = 0;
  "64": number = 0;
  "65": number = 0;
  "66": number = 0;
  "67": number = 0;
  "68": number = 0;
  "69": number = 0;
  "70": number = 0;
  "71": number = 0;
  "72": number = 0;
  "73": number = 0;
  "74": number = 0;
  "75": number = 0;
  "76": number = 0;
  "77": number = 0;
  "78": number = 0;
  "79": number = 0;
  "80": number = 0;
  "81": number = 0;
  "82": number = 0;
  "83": number = 0;
  "84": number = 0;
  "85": number = 0;
  "86": number = 0;
  "87": number = 0;
  "88": number = 0;
  "89": number = 0;
  "90": number = 0;
  "91": number = 0;
  "92": number = 0;
  "93": number = 0;
  "94": number = 0;
  "95": number = 0;
  "96": number = 0;
  "97": number = 0;
  "98": number = 0;
  "99": number = 0;
  "100": number = 0;
  /**
   * Populates the 100 column fields from a positional array of values.
   *
   * @param values - one value per column; values[0] maps to field "1".
   *   If the array is shorter than the field count, a warning is logged
   *   and the missing trailing fields keep their declared default of 0
   *   (previously they were set to null, contradicting the `number` type).
   */
  constructor(values: number[]) {
    // Field initializers have already run, so `this` exposes all 100 keys.
    const keys = Object.keys(this) as (keyof DataFrame)[];
    if (values.length !== keys.length) {
      console.warn(`WARN: ${values.length} columns received instead of ${keys.length}!`);
    }
    // Assign values from the array to the fields, positionally.
    keys.forEach((key, index) => {
      if (index < values.length) {
        this[key] = values[index];
      }
    });
  }
}
// The original goal of this file was to test the performance of various
// different methods of reading record batches from the arrow IPC stream and
// storing the values for later use. My main error, which I discovered later,
// was that I was creating *separate* ipc streams for each record batch
// (including the start/end length metadata), which would prevent the
// RecordBatchStreamReader from reading past the first batch (see
// https://github.com/apache/arrow/issues/32593#issuecomment-1378117262). I
// never fixed that in this test file as I learned plenty about the performance
// of arrow-js objects in the process, but if I wanted to I'd likely just have
// to use a single long-standing RecordBatchStreamWriter like I did in the
// python backend.
"use client"
import React, { useEffect } from "react";
import { Float, RecordBatch, RecordBatchStreamReader, RecordBatchStreamWriter, Table, tableFromArrays, tableFromIPC } from "apache-arrow";
import assert from "assert";
import DataFrame from "./dataframe";
// Schema type for the generated test tables: 100 Float columns keyed
// "1".."100", matching both the tableFromArrays() data built in Home()
// and the positional fields of the DataFrame class. Used as the type
// parameter for Table<Temp> / tableFromIPC<Temp>.
type Temp = {
"1": Float, "2": Float, "3": Float, "4": Float, "5": Float, "6": Float, "7": Float, "8": Float, "9": Float, "10": Float,
"11": Float, "12": Float, "13": Float, "14": Float, "15": Float, "16": Float, "17": Float, "18": Float, "19": Float, "20": Float,
"21": Float, "22": Float, "23": Float, "24": Float, "25": Float, "26": Float, "27": Float, "28": Float, "29": Float, "30": Float,
"31": Float, "32": Float, "33": Float, "34": Float, "35": Float, "36": Float, "37": Float, "38": Float, "39": Float, "40": Float,
"41": Float, "42": Float, "43": Float, "44": Float, "45": Float, "46": Float, "47": Float, "48": Float, "49": Float, "50": Float,
"51": Float, "52": Float, "53": Float, "54": Float, "55": Float, "56": Float, "57": Float, "58": Float, "59": Float, "60": Float,
"61": Float, "62": Float, "63": Float, "64": Float, "65": Float, "66": Float, "67": Float, "68": Float, "69": Float, "70": Float,
"71": Float, "72": Float, "73": Float, "74": Float, "75": Float, "76": Float, "77": Float, "78": Float, "79": Float, "80": Float,
"81": Float, "82": Float, "83": Float, "84": Float, "85": Float, "86": Float, "87": Float, "88": Float, "89": Float, "90": Float,
"91": Float, "92": Float, "93": Float, "94": Float, "95": Float, "96": Float, "97": Float, "98": Float, "99": Float, "100": Float,
}
/**
 * Benchmark page: builds 1000 arrow tables (100 cols x 10 rows each),
 * serializes each to its own IPC byte stream, then times several strategies
 * for reading/accumulating the data client-side, logging results to the
 * console. NOTE: each batch is serialized as a *separate* IPC stream
 * (including start/end metadata), which — per the file-header comment —
 * prevents stream readers from reading past the first batch; kept as-is
 * since this file is archived reference code.
 */
export default function Home() {
  // Empty dependency array added: without it the effect (and the whole
  // multi-second benchmark) re-ran after every render.
  useEffect(() => {
    const arrs: Uint8Array[] = [];
    const tbls: Table[] = [];
    // build base tables/batches; 1000 batches with 100 columns and 10 rows each
    console.log("generating test data...");
    for (const n of Array.from({ length: 1000 }, (_, i) => i + 1)) {
      let obj: Record<string, number[]> = {};
      for (const k of Array.from({ length: 100 }, (_, i) => i + 1)) {
        obj[k.toString()] = [];
        for (const v of Array.from({ length: 10 }, (_, i) => i + 1)) {
          obj[k.toString()].push(Math.random() * v);
        }
      }
      const tbl = tableFromArrays(obj);
      assert(tbl.numCols === 100);
      assert(tbl.numRows === 10);
      tbls.push(tbl);
      // // This does the same as:
      // // const arr = RecordBatchStreamWriter.writeAll(tbl).toUint8Array(true)
      // // arrs.push(arr);
      // const arr = tableToIPC(tbl);
      // arrs.push(arr);
      // // Which is likely also the same as:
      // // const smallRBWriter = new RecordBatchStreamWriter<Temp>();
      // // smallRBWriter.writeAll(tbl);
      // // const arr = smallRBWriter.toUint8Array(true);
    }
    console.log("finished generating test data!");
    // const rbw = RecordBatchFileWriter.writeAll(tbls.map((t) => t.batches).flat());
    // downloadBlob(rbw.toUint8Array(true), "t1.arrow", "application/vnd.apache.arrow.file");
    // const bigRBWriter = new RecordBatchStreamWriter(); // {autoDestroy: true} is automatically passed
    // tbls.forEach((t) => {
    //   bigRBWriter.write(t)
    // }); // Write each table (recordbatch) to a big Uint8Array
    // const arrbig = bigRBWriter.toUint8Array(true)
    // Serialize every table to its own (standalone) IPC stream.
    tbls.forEach((t) => {
      const writer = RecordBatchStreamWriter.writeAll(t);
      const arr = writer.toUint8Array(true);
      arrs.push(arr);
    });
    // Concatenate all per-table streams into one contiguous buffer; every
    // stream has identical length since the schema and row count are fixed.
    const arrbig = new Uint8Array(1000 * arrs[0].byteLength);
    arrs.forEach((arr, i) => arrbig.set(arr, i * arrs[0].byteLength));
    assert(arrbig.length === arrs.length * arrs[0].byteLength);
    assert(arrs.length === 1000);
    assert(tbls.length === 1000);
    // Strategy 1: rebuild a Table from a growing Uint8Array[] of batches
    // using one RecordBatchStreamReader per stream.
    {
      const start1 = performance.now();
      const arrs2: Uint8Array[] = [];
      let tbl: Table<Temp> | null = null;
      for (let i = 0; i < arrs.length; i++) {
        arrs2.push(arrs[i]);
        // const rbr: RecordBatchStreamReader = RecordBatchStreamReader.from(arrs2);
        // rbr.open();
        // const batches = rbr.readAll();
        // console.log(arrs2.length, rbr.numRecordBatches, batches.length, rbr.isSync(), rbr.isStream());
        const batches: RecordBatch[] = []
        // NOTE(review): slice(0, i) excludes the stream just pushed, so
        // batches.length lags arrs2.length by one — confirm whether
        // slice(0, i + 1) was intended.
        const readers = RecordBatchStreamReader.readAll(arrs.slice(0, i));
        let q = 0;
        for (const reader of readers) {
          batches.push(reader.next().value);
          q++;
        }
        tbl = new Table<Temp>(batches);
        console.log(arrs2.length, q, batches.length);
      }
      const end1 = performance.now();
      console.log(`1: tableFromIPC on increasing Uint8Array[] of record batches using RecordBatchStreamReader: ${end1 - start1}ms`);
      console.log("tbl: ", tbl!.numRows, tbl!.numCols, arrbig.length, arrs.length, arrs[0].length);
      assert(tbl!.numRows === 10000);
      const start2 = performance.now();
      const vec = tbl!.getChild("37");
      assert(vec!.length === 10000);
      const end2 = performance.now();
      console.log(`1: tbl.getChild("37"): ${end2 - start2}ms`);
    }
    // Strategy 1.5: re-parse an ever-growing prefix of the big buffer.
    {
      const start1 = performance.now();
      let tbl: Table<Temp> | null = null;
      for (let i = 0; i < arrs.length; i++) {
        tbl = tableFromIPC<Temp>(arrbig.subarray(0, (i + 1) * arrs[0].length));
      }
      const end1 = performance.now();
      console.log(`1.5: tableFromIPC on slice of arrbig: ${end1 - start1}ms`);
      console.log("tbl: ", tbl!.numRows, tbl!.numCols, arrbig.length, arrs.length, arrs[0].length);
      assert(tbl!.numRows === 10000);
      const start2 = performance.now();
      const vec = tbl!.getChild("37");
      assert(vec!.length === 10000);
      const end2 = performance.now();
      console.log(`1.5: tbl.getChild("37"): ${end2 - start2}ms`);
    }
    // Strategy 2: materialize every row into a DataFrame object (AoS layout).
    {
      const start1 = performance.now();
      const dataFrames: DataFrame[] = []
      for (let i = 0; i < arrs.length; i++) {
        const tbl = tableFromIPC<Temp>(arrs[i]);
        // `r` (not `i`) to avoid shadowing the outer stream index.
        const newFrames = Array.from({ length: tbl.numRows }, (_, r) => {
          const row = tbl.get(r)!.toArray() as number[];
          return new DataFrame(row);
        });
        dataFrames.push(...newFrames);
      }
      assert(dataFrames.length === 10000);
      const end1 = performance.now();
      console.log(`2: Storing a DataFrame[]: ${end1 - start1}ms`);
      const start2 = performance.now();
      const vec2 = dataFrames.map((d) => d["37"]);
      assert(vec2.length === 10000);
      const end2 = performance.now();
      console.log(`2: dataFrames.map((d) => d["37"]): ${end2 - start2}ms`);
    }
    // Strategy 3: copy each column into preallocated Float32Arrays (SoA layout).
    {
      const start1 = performance.now();
      let soa: Record<string, Float32Array> = {};
      for (let k = 0; k < 100; k++) {
        soa[k.toString()] = new Float32Array(10000);
      }
      for (let i = 0; i < arrs.length; i++) {
        const tbl = tableFromIPC<Temp>(arrs[i]);
        for (let k = 0; k < 100; k++) {
          const arr = tbl.getChildAt(k)!.toArray();
          // Each stream contributes 10 rows, written at its own offset.
          soa[k.toString()].set(arr, i * 10);
        }
      }
      const end1 = performance.now();
      console.log(`3: Creating a Record<string, Float32Array>: ${end1 - start1}ms`);
      const start2 = performance.now();
      const vec3 = soa["37"];
      assert(vec3.length === 10000);
      const end2 = performance.now();
      console.log(`3: soa["37"]: ${end2 - start2}ms`);
    }
  }, []);
  return (
    <>
      <p>
        Performance tests...
      </p>
      <br />
      {/* <p>1: tbl.getChild("37"): {end1_2 - start1_2}ms</p> */}
      {/* <p>2: Storing a DataFrame[]: {end2_1 - start2_1}ms</p> */}
      {/* <p>2: dataFrames.map((d) = d["37"]): {end2_2 - start2_2}ms</p> */}
      {/* <p>3: Storing a DataFrame[]: {end3_1 - start3_1}ms</p> */}
      {/* <p>1: tableFromIPC on increasing Uint8Array[] of record batches: {end1_1 - start1_1}ms</p> */}
    </>
  );
}
// This script is intended to be run standalone with something like `ts-node`.
// From the testing below, it seems that TypedArrays (Float32Array), at
// least for our use case (keeping track of a few thousand rows of frequent
// small batches of number-like data), are only very slightly faster than
// regular arrays (number[]). See the top answer here for some more interesting
// information:
// https://stackoverflow.com/questions/24853686/javascript-typedarray-performance
// Minimal strict-equality assertion: throws an Error on mismatch,
// otherwise logs a debug confirmation that the values are equal.
function assert_eq(val1: any, val2: any) {
  if (val1 === val2) {
    console.debug(`Assert passed! (${val1} === ${val2})`);
    return;
  }
  throw Error(`AssertionError: ${val1} !== ${val2}!`);
}
// Benchmark parameters: maintain a sliding window of the latest NUM_ROWS
// values, fed in BATCH_SIZE-row chunks, NUM_BATCHES times.
const NUM_ROWS = 2000;
const BATCH_SIZE = 10;
const NUM_BATCHES = 100_000;
// Double-length backing store: when the top half fills up, the newest
// NUM_ROWS values are shifted into the bottom half, so a full NUM_ROWS
// window is always addressable as one contiguous subarray.
const floatArr = new Float32Array(NUM_ROWS * 2);
let idxTyped = 0;
const arr = new Array<number>(NUM_ROWS);

// Variant 1: Float32Array with manual shift-down compaction.
const start1 = performance.now();
for (let batch = 1; batch <= NUM_BATCHES; batch++) {
  // Synthesize one batch of BATCH_SIZE rows: batch + 0.0, batch + 0.1, ...
  const tenRows: number[] = [];
  for (let r = 0; r < BATCH_SIZE; r++) {
    tenRows.push(batch + r / 10);
  }
  if (idxTyped + tenRows.length >= floatArr.length) {
    // Compact: copy the newest NUM_ROWS values down to index 0,
    // zero the vacated top half, and resume writing at NUM_ROWS.
    // assert_eq(floatArr.subarray(NUM_ROWS).length, NUM_ROWS);
    floatArr.set(floatArr.subarray(NUM_ROWS));
    floatArr.fill(0, NUM_ROWS);
    idxTyped = NUM_ROWS;
  }
  floatArr.set(tenRows, idxTyped);
  idxTyped += tenRows.length;
  // console.table(floatArr);
  // The current window is the NUM_ROWS values ending at idxTyped.
  const lhs = Math.max(0, idxTyped - NUM_ROWS);
  const sub = floatArr.subarray(lhs, lhs + NUM_ROWS);
  // assert_eq(sub.length, NUM_ROWS);
}
const end1 = performance.now();

// Variant 2: plain number[] with push + splice-from-front trimming.
const start2 = performance.now();
for (let batch = 1; batch <= NUM_BATCHES; batch++) {
  const tenRows: number[] = [];
  for (let r = 0; r < BATCH_SIZE; r++) {
    tenRows.push(batch + r / 10);
  }
  for (const v of tenRows) {
    arr.push(v);
  }
  // Drop as many from the front of arr as needed to keep NUM_ROWS entries.
  arr.splice(0, Math.max(0, arr.length - NUM_ROWS))
  // console.table(arr);
  // assert_eq(arr.length, NUM_ROWS);
}
const end2 = performance.now();

// assert_eq(floatArr.length, NUM_ROWS*2);
// assert_eq(arr.length, NUM_ROWS);
console.log(`floatArr: ${(end1 - start1).toFixed(2)}ms`);
console.log(`arr: ${(end2 - start2).toFixed(2)}ms`);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment