Created
August 14, 2024 20:58
-
-
Save ivirshup/018bb8ae1ea7746db768c3672b8a007b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n" | |
] | |
} | |
], | |
"source": [ | |
"import multiprocessing\n", | |
"multiprocessing.set_start_method(\"spawn\")\n", | |
"\n", | |
"import dask.array as da\n", | |
"import tiledb\n", | |
"import numpy as np\n", | |
"from scipy import sparse\n", | |
"\n", | |
"import dask.distributed as dd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <h3 style=\"margin-bottom: 0px;\">Client</h3>\n", | |
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-51d833f8-5a7f-11ef-82a2-023ca6c22285</p>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
"\n", | |
" <tr>\n", | |
" \n", | |
" <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n", | |
" <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
" <details>\n", | |
" <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n", | |
" <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n", | |
" </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n", | |
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">fbd898ae</p>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Workers:</strong> 4\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads:</strong> 4\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total memory:</strong> 123.85 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n", | |
" <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n", | |
"</tr>\n", | |
"\n", | |
" \n", | |
" </table>\n", | |
"\n", | |
" <details>\n", | |
" <summary style=\"margin-bottom: 20px;\">\n", | |
" <h3 style=\"display: inline;\">Scheduler Info</h3>\n", | |
" </summary>\n", | |
"\n", | |
" <div style=\"\">\n", | |
" <div>\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n", | |
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-d772a300-8a73-47cb-8d51-b233ae112baa</p>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm:</strong> tcp://127.0.0.1:45619\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Workers:</strong> 4\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads:</strong> 4\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Started:</strong> Just now\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total memory:</strong> 123.85 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" </table>\n", | |
" </div>\n", | |
" </div>\n", | |
"\n", | |
" <details style=\"margin-left: 48px;\">\n", | |
" <summary style=\"margin-bottom: 20px;\">\n", | |
" <h3 style=\"display: inline;\">Workers</h3>\n", | |
" </summary>\n", | |
"\n", | |
" \n", | |
" <div style=\"margin-bottom: 20px;\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <details>\n", | |
" <summary>\n", | |
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n", | |
" </summary>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm: </strong> tcp://127.0.0.1:39161\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads: </strong> 1\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:36119/status\" target=\"_blank\">http://127.0.0.1:36119/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Memory: </strong> 30.96 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Nanny: </strong> tcp://127.0.0.1:43707\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td colspan=\"2\" style=\"text-align: left;\">\n", | |
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-y140q1k2\n", | |
" </td>\n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
" </details>\n", | |
" </div>\n", | |
" </div>\n", | |
" \n", | |
" <div style=\"margin-bottom: 20px;\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <details>\n", | |
" <summary>\n", | |
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n", | |
" </summary>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm: </strong> tcp://127.0.0.1:34127\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads: </strong> 1\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:40553/status\" target=\"_blank\">http://127.0.0.1:40553/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Memory: </strong> 30.96 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Nanny: </strong> tcp://127.0.0.1:45867\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td colspan=\"2\" style=\"text-align: left;\">\n", | |
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-qoyj7d6g\n", | |
" </td>\n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
" </details>\n", | |
" </div>\n", | |
" </div>\n", | |
" \n", | |
" <div style=\"margin-bottom: 20px;\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <details>\n", | |
" <summary>\n", | |
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n", | |
" </summary>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm: </strong> tcp://127.0.0.1:37497\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads: </strong> 1\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:45205/status\" target=\"_blank\">http://127.0.0.1:45205/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Memory: </strong> 30.96 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Nanny: </strong> tcp://127.0.0.1:45659\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td colspan=\"2\" style=\"text-align: left;\">\n", | |
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-bzkel24l\n", | |
" </td>\n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
" </details>\n", | |
" </div>\n", | |
" </div>\n", | |
" \n", | |
" <div style=\"margin-bottom: 20px;\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <details>\n", | |
" <summary>\n", | |
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n", | |
" </summary>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm: </strong> tcp://127.0.0.1:36239\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads: </strong> 1\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:41359/status\" target=\"_blank\">http://127.0.0.1:41359/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Memory: </strong> 30.96 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Nanny: </strong> tcp://127.0.0.1:40621\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td colspan=\"2\" style=\"text-align: left;\">\n", | |
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-qxm06rhx\n", | |
" </td>\n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
" </details>\n", | |
" </div>\n", | |
" </div>\n", | |
" \n", | |
"\n", | |
" </details>\n", | |
"</div>\n", | |
"\n", | |
" </details>\n", | |
" </div>\n", | |
"</div>\n", | |
" </details>\n", | |
" \n", | |
"\n", | |
" </div>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"<Client: 'tcp://127.0.0.1:45619' processes=4 threads=4, memory=123.85 GiB>" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Local cluster with 4 workers, one thread per worker.\n", | |
"cluster = dd.LocalCluster(n_workers=4, threads_per_worker=1)\n", | |
"client = dd.Client(cluster)\n", | |
"# Last expression of the cell: display the client summary.\n", | |
"client" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class TileDBSparseArrayWriteWrapper():\n", | |
" def __init__(self, uri):\n", | |
" self.uri = uri\n", | |
" # self.tiledb_array = tiledb_array\n", | |
" \n", | |
" # @property\n", | |
" # def tiledb_array(self):\n", | |
" # return tiledb.open(self.uri, mode=\"w\")\n", | |
"\n", | |
" def __setitem__(self, k: tuple[slice, ...], v: sparse.spmatrix):\n", | |
" assert all(isinstance(k_i, slice) for k_i in k)\n", | |
" row_slice, col_slice = k\n", | |
" row_offset = row_slice.start if row_slice.start is not None else 0\n", | |
" col_offset = col_slice.start if col_slice.start is not None else 0\n", | |
" v_coo = v.tocoo()\n", | |
" tiledb_array = tiledb.open(self.uri, mode=\"w\")\n", | |
" tiledb_array[v_coo.row + row_offset, v_coo.col + col_offset] = v.data\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def write_dask_array_as_tiledb(output_path, X, *, filters=tiledb.FilterList([])):\n", | |
" # Create output array\n", | |
" attrs = [tiledb.Attr(dtype=X.dtype, filters=filters)]\n", | |
" domain = tiledb.Domain(\n", | |
" tiledb.Dim(\n", | |
" name=\"obs\",\n", | |
" domain=(0, X.shape[0] - 1),\n", | |
" tile=min(X.shape[0], 256),\n", | |
" dtype=np.uint32,\n", | |
" filters=filters,\n", | |
" ),\n", | |
" tiledb.Dim(\n", | |
" name=\"var\",\n", | |
" domain=(0, X.shape[1] - 1),\n", | |
" tile=min(X.shape[1], 2048),\n", | |
" dtype=np.uint32,\n", | |
" filters=filters,\n", | |
" ),\n", | |
" )\n", | |
" schema = tiledb.ArraySchema(\n", | |
" domain=domain,\n", | |
" sparse=True,\n", | |
" allows_duplicates=True,\n", | |
" attrs=attrs,\n", | |
" cell_order=\"row-major\",\n", | |
" tile_order=\"row-major\",\n", | |
" capacity=1024000,\n", | |
" )\n", | |
" tiledb.Array.create(output_path, schema)\n", | |
" # X_write = tiledb.open(\"X\", mode=\"w\")\n", | |
"\n", | |
" # Write\n", | |
" X_write = TileDBSparseArrayWriteWrapper(output_path)\n", | |
" X.store(X_write, lock=False, compute=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def to_sparse_matrix(tdb_array: tiledb.SparseArray) -> sparse.spmatrix:\n", | |
" data, row, col = tdb_array[:].values()\n", | |
" return sparse.coo_matrix((data, (row, col)), shape=tdb_array.shape)\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Proof of concept with generated data:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <td>\n", | |
" <table style=\"border-collapse: collapse;\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <td> </td>\n", | |
" <th> Array </th>\n", | |
" <th> Chunk </th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" <th> Shape </th>\n", | |
" <td> (10000, 10000) </td>\n", | |
" <td> (1000, 1000) </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Dask graph </th>\n", | |
" <td colspan=\"2\"> 100 chunks in 2 graph layers </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Data type </th>\n", | |
" <td colspan=\"2\"> int64 scipy.sparse._csr.csr_matrix </td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
" </table>\n", | |
" </td>\n", | |
" <td>\n", | |
" <svg width=\"170\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n", | |
"\n", | |
" <!-- Horizontal lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"120\" y2=\"0\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"0\" y1=\"12\" x2=\"120\" y2=\"12\" />\n", | |
" <line x1=\"0\" y1=\"24\" x2=\"120\" y2=\"24\" />\n", | |
" <line x1=\"0\" y1=\"36\" x2=\"120\" y2=\"36\" />\n", | |
" <line x1=\"0\" y1=\"48\" x2=\"120\" y2=\"48\" />\n", | |
" <line x1=\"0\" y1=\"60\" x2=\"120\" y2=\"60\" />\n", | |
" <line x1=\"0\" y1=\"72\" x2=\"120\" y2=\"72\" />\n", | |
" <line x1=\"0\" y1=\"84\" x2=\"120\" y2=\"84\" />\n", | |
" <line x1=\"0\" y1=\"96\" x2=\"120\" y2=\"96\" />\n", | |
" <line x1=\"0\" y1=\"108\" x2=\"120\" y2=\"108\" />\n", | |
" <line x1=\"0\" y1=\"120\" x2=\"120\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Vertical lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"12\" y1=\"0\" x2=\"12\" y2=\"120\" />\n", | |
" <line x1=\"24\" y1=\"0\" x2=\"24\" y2=\"120\" />\n", | |
" <line x1=\"36\" y1=\"0\" x2=\"36\" y2=\"120\" />\n", | |
" <line x1=\"48\" y1=\"0\" x2=\"48\" y2=\"120\" />\n", | |
" <line x1=\"60\" y1=\"0\" x2=\"60\" y2=\"120\" />\n", | |
" <line x1=\"72\" y1=\"0\" x2=\"72\" y2=\"120\" />\n", | |
" <line x1=\"84\" y1=\"0\" x2=\"84\" y2=\"120\" />\n", | |
" <line x1=\"96\" y1=\"0\" x2=\"96\" y2=\"120\" />\n", | |
" <line x1=\"108\" y1=\"0\" x2=\"108\" y2=\"120\" />\n", | |
" <line x1=\"120\" y1=\"0\" x2=\"120\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Colored Rectangle -->\n", | |
" <polygon points=\"0.0,0.0 120.0,0.0 120.0,120.0 0.0,120.0\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n", | |
"\n", | |
" <!-- Text -->\n", | |
" <text x=\"60.000000\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >10000</text>\n", | |
" <text x=\"140.000000\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,140.000000,60.000000)\">10000</text>\n", | |
"</svg>\n", | |
" </td>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"dask.array<csr_matrix, shape=(10000, 10000), dtype=int64, chunksize=(1000, 1000), chunktype=scipy.csr_matrix>" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Dense Poisson(0.01) counts, 10_000 x 10_000, split into 1000 x 1000 chunks.\n", | |
"# NOTE(review): no seed is passed here - confirm whether reproducible values are needed.\n", | |
"X = da.random.poisson(.01, size=(10_000, 10_000), chunks=(1000,1000))\n", | |
"# Convert every chunk to a CSR matrix; `meta` is an empty CSR matrix with X's dtype.\n", | |
"X = X.map_blocks(sparse.csr_matrix, meta=sparse.csr_matrix((0, 0), dtype=X.dtype))\n", | |
"X" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.009800000000000001" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Checking level of sparsity\n", | |
"tmp = X[:1000, :1000].compute()\n", | |
"(tmp != 0).mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!rm -r X_poisson" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n" | |
] | |
} | |
], | |
"source": [ | |
"# Create the TileDB array \"X_poisson\" and write all chunks of X into it (default: empty filter list).\n", | |
"write_dask_array_as_tiledb(\"X_poisson\", X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Round trip test\n", | |
"\n", | |
"from_disk = to_sparse_matrix(tiledb.open(\"X_poisson\"))\n", | |
"from_dask = X.compute()\n", | |
"\n", | |
"assert (from_disk != from_dask).nnz == 0" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Test with real data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import h5py\n", | |
"from anndata.experimental import read_elem_as_dask # This will be in anndata 0.11, which is currently unreleased" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.\n", | |
"# The handle is kept in `f`; a later cell builds X from f[\"X\"].\n", | |
"f = h5py.File(\"/home/ubuntu/github/cellxgene-census/b58c5c69-cb34-443d-a273-e440e130649a.h5ad\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <td>\n", | |
" <table style=\"border-collapse: collapse;\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <td> </td>\n", | |
" <th> Array </th>\n", | |
" <th> Chunk </th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" <th> Shape </th>\n", | |
" <td> (1033409, 45854) </td>\n", | |
" <td> (1000, 45854) </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Dask graph </th>\n", | |
" <td colspan=\"2\"> 1034 chunks in 2 graph layers </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Data type </th>\n", | |
" <td colspan=\"2\"> float32 scipy.sparse._csr.csr_matrix </td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
" </table>\n", | |
" </td>\n", | |
" <td>\n", | |
" <svg width=\"83\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n", | |
"\n", | |
" <!-- Horizontal lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"33\" y2=\"0\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"0\" y1=\"6\" x2=\"33\" y2=\"6\" />\n", | |
" <line x1=\"0\" y1=\"12\" x2=\"33\" y2=\"12\" />\n", | |
" <line x1=\"0\" y1=\"18\" x2=\"33\" y2=\"18\" />\n", | |
" <line x1=\"0\" y1=\"25\" x2=\"33\" y2=\"25\" />\n", | |
" <line x1=\"0\" y1=\"31\" x2=\"33\" y2=\"31\" />\n", | |
" <line x1=\"0\" y1=\"37\" x2=\"33\" y2=\"37\" />\n", | |
" <line x1=\"0\" y1=\"44\" x2=\"33\" y2=\"44\" />\n", | |
" <line x1=\"0\" y1=\"50\" x2=\"33\" y2=\"50\" />\n", | |
" <line x1=\"0\" y1=\"56\" x2=\"33\" y2=\"56\" />\n", | |
" <line x1=\"0\" y1=\"63\" x2=\"33\" y2=\"63\" />\n", | |
" <line x1=\"0\" y1=\"69\" x2=\"33\" y2=\"69\" />\n", | |
" <line x1=\"0\" y1=\"75\" x2=\"33\" y2=\"75\" />\n", | |
" <line x1=\"0\" y1=\"82\" x2=\"33\" y2=\"82\" />\n", | |
" <line x1=\"0\" y1=\"88\" x2=\"33\" y2=\"88\" />\n", | |
" <line x1=\"0\" y1=\"94\" x2=\"33\" y2=\"94\" />\n", | |
" <line x1=\"0\" y1=\"101\" x2=\"33\" y2=\"101\" />\n", | |
" <line x1=\"0\" y1=\"107\" x2=\"33\" y2=\"107\" />\n", | |
" <line x1=\"0\" y1=\"113\" x2=\"33\" y2=\"113\" />\n", | |
" <line x1=\"0\" y1=\"120\" x2=\"33\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Vertical lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"33\" y1=\"0\" x2=\"33\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Colored Rectangle -->\n", | |
" <polygon points=\"0.0,0.0 33.76909709939219,0.0 33.76909709939219,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n", | |
"\n", | |
" <!-- Text -->\n", | |
" <text x=\"16.884549\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >45854</text>\n", | |
" <text x=\"53.769097\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,53.769097,60.000000)\">1033409</text>\n", | |
"</svg>\n", | |
" </td>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"dask.array<make_dask_chunk, shape=(1033409, 45854), dtype=float32, chunksize=(1000, 45854), chunktype=scipy.csr_matrix>" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Rebinds X (until now the generated Poisson array) to the \"X\" element of the h5ad file.\n", | |
"X = read_elem_as_dask(f[\"X\"])\n", | |
"X" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!rm -r X" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 18.1 s, sys: 4.04 s, total: 22.1 s\n", | |
"Wall time: 3min 54s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Same write path as for the generated data, with a Zstd (level 5) filter on both dimensions and the attribute.\n", | |
"write_dask_array_as_tiledb(\"X\", X, filters=tiledb.FilterList([tiledb.ZstdFilter(level=5)]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "cellxgene-census-dev", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment