Last active
July 22, 2024 23:06
-
-
Save ivirshup/dc39029ad439cef4755e45582fc35541 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Dask + census demo" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Setup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%load_ext memory_profiler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n" | |
] | |
} | |
], | |
"source": [ | |
"import multiprocessing\n", | |
"multiprocessing.set_start_method(\"spawn\")\n", | |
"\n", | |
"from pathlib import Path\n", | |
"\n", | |
"import tiledb\n", | |
"import tiledbsoma\n", | |
"import cellxgene_census\n", | |
"from tiledbsoma import SOMATileDBContext\n", | |
"\n", | |
"from dask.array.core import slices_from_chunks\n", | |
"import dask.array as da\n", | |
"from dask import delayed\n", | |
"import dask.distributed as dd\n", | |
"\n", | |
"from scipy import sparse\n", | |
"import numpy as np\n", | |
"\n", | |
"import anndata as ad, scanpy as sc\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"SOMA_URI= \"s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma\"\n", | |
"SPECIES = \"mus_musculus\"\n", | |
"\n", | |
"SPARSE_CHUNK_SIZE = 10_000\n", | |
"DENSE_CHUNK_SIZE = 1_000" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n", | |
"Perhaps you already have a cluster running?\n", | |
"Hosting the HTTP server on port 41035 instead\n", | |
" warnings.warn(\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <h3 style=\"margin-bottom: 0px;\">Client</h3>\n", | |
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-a1a2c73b-45ef-11ef-9341-023ca6c22285</p>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
"\n", | |
" <tr>\n", | |
" \n", | |
" <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n", | |
" <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:41035/status\" target=\"_blank\">http://127.0.0.1:41035/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
" <details>\n", | |
" <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n", | |
" <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n", | |
" </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n", | |
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">8caf40bb</p>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:41035/status\" target=\"_blank\">http://127.0.0.1:41035/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Workers:</strong> 4\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads:</strong> 32\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total memory:</strong> 123.85 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n", | |
" <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n", | |
"</tr>\n", | |
"\n", | |
" \n", | |
" </table>\n", | |
"\n", | |
" <details>\n", | |
" <summary style=\"margin-bottom: 20px;\">\n", | |
" <h3 style=\"display: inline;\">Scheduler Info</h3>\n", | |
" </summary>\n", | |
"\n", | |
" <div style=\"\">\n", | |
" <div>\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n", | |
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-90193f8a-a619-4b49-a4b9-a0f0e888916a</p>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm:</strong> tcp://127.0.0.1:40857\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Workers:</strong> 4\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:41035/status\" target=\"_blank\">http://127.0.0.1:41035/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads:</strong> 32\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Started:</strong> Just now\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total memory:</strong> 123.85 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" </table>\n", | |
" </div>\n", | |
" </div>\n", | |
"\n", | |
" <details style=\"margin-left: 48px;\">\n", | |
" <summary style=\"margin-bottom: 20px;\">\n", | |
" <h3 style=\"display: inline;\">Workers</h3>\n", | |
" </summary>\n", | |
"\n", | |
" \n", | |
" <div style=\"margin-bottom: 20px;\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <details>\n", | |
" <summary>\n", | |
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n", | |
" </summary>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm: </strong> tcp://127.0.0.1:39833\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads: </strong> 8\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:44407/status\" target=\"_blank\">http://127.0.0.1:44407/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Memory: </strong> 30.96 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Nanny: </strong> tcp://127.0.0.1:32809\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td colspan=\"2\" style=\"text-align: left;\">\n", | |
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-66s976ft\n", | |
" </td>\n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
" </details>\n", | |
" </div>\n", | |
" </div>\n", | |
" \n", | |
" <div style=\"margin-bottom: 20px;\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <details>\n", | |
" <summary>\n", | |
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n", | |
" </summary>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm: </strong> tcp://127.0.0.1:46849\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads: </strong> 8\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:44099/status\" target=\"_blank\">http://127.0.0.1:44099/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Memory: </strong> 30.96 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Nanny: </strong> tcp://127.0.0.1:42921\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td colspan=\"2\" style=\"text-align: left;\">\n", | |
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-ucxhk9da\n", | |
" </td>\n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
" </details>\n", | |
" </div>\n", | |
" </div>\n", | |
" \n", | |
" <div style=\"margin-bottom: 20px;\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <details>\n", | |
" <summary>\n", | |
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n", | |
" </summary>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm: </strong> tcp://127.0.0.1:34231\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads: </strong> 8\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:33539/status\" target=\"_blank\">http://127.0.0.1:33539/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Memory: </strong> 30.96 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Nanny: </strong> tcp://127.0.0.1:39305\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td colspan=\"2\" style=\"text-align: left;\">\n", | |
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-fy_h0eq8\n", | |
" </td>\n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
" </details>\n", | |
" </div>\n", | |
" </div>\n", | |
" \n", | |
" <div style=\"margin-bottom: 20px;\">\n", | |
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
" <div style=\"margin-left: 48px;\">\n", | |
" <details>\n", | |
" <summary>\n", | |
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n", | |
" </summary>\n", | |
" <table style=\"width: 100%; text-align: left;\">\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Comm: </strong> tcp://127.0.0.1:41333\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Total threads: </strong> 8\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:33359/status\" target=\"_blank\">http://127.0.0.1:33359/status</a>\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Memory: </strong> 30.96 GiB\n", | |
" </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td style=\"text-align: left;\">\n", | |
" <strong>Nanny: </strong> tcp://127.0.0.1:42953\n", | |
" </td>\n", | |
" <td style=\"text-align: left;\"></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td colspan=\"2\" style=\"text-align: left;\">\n", | |
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-xhino_04\n", | |
" </td>\n", | |
" </tr>\n", | |
"\n", | |
" \n", | |
"\n", | |
" \n", | |
"\n", | |
" </table>\n", | |
" </details>\n", | |
" </div>\n", | |
" </div>\n", | |
" \n", | |
"\n", | |
" </details>\n", | |
"</div>\n", | |
"\n", | |
" </details>\n", | |
" </div>\n", | |
"</div>\n", | |
" </details>\n", | |
" \n", | |
"\n", | |
" </div>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"<Client: 'tcp://127.0.0.1:40857' processes=4 threads=32, memory=123.85 GiB>" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"cluster = dd.LocalCluster(n_workers=4)\n", | |
"client = dd.Client(cluster)\n", | |
"client" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Creating a dask array from census" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def to_listed_chunks(chunk_size: int, dim_size: int) -> list[int]:\n", | |
" n_full, rem = divmod(dim_size, chunk_size)\n", | |
" chunk_list = [chunk_size] * n_full\n", | |
" if rem:\n", | |
" chunk_list += [rem]\n", | |
" return chunk_list\n", | |
"\n", | |
"\n", | |
"def make_sparse_chunk(array: tiledb.Array, tile_slices: list[slice]) -> da.Array:\n", | |
" shape = [(s.stop - s.start) for s in tile_slices]\n", | |
" def _inner(array, tile_slices):\n", | |
" res = array[tile_slices]\n", | |
" offsets = [s.start for s in tile_slices]\n", | |
" res[\"soma_dim_0\"] -= offsets[0]\n", | |
" res[\"soma_dim_1\"] -= offsets[1]\n", | |
" return sparse.csr_matrix((res[\"soma_data\"], (res[\"soma_dim_0\"], res[\"soma_dim_1\"])), shape=shape)\n", | |
" # return sparse.csr_matrix((res[\"soma_data\"], (res[\"soma_dim_0\"] - offsets[0], res[\"soma_dim_1\"] - offsets[1])), shape=shape)\n", | |
" return da.from_delayed(delayed(_inner)(array, tile_slices), shape=shape, meta=sparse.csr_matrix((0, 0), dtype=array.dtype))\n", | |
"\n", | |
"\n", | |
"def tiledb_sparse_as_dask(tdb_array: tiledb.Array, row_chunks: int = SPARSE_CHUNK_SIZE) -> da.Array:\n", | |
" schema = tdb_array.schema\n", | |
" # chunks = list(schema.domain.dim(i).tile for i in range(schema.ndim))\n", | |
" chunks = [row_chunks, tdb_array.shape[1]]\n", | |
" # Simplifying to have complete slices across rows\n", | |
" slices: list[list[slice]] = slices_from_chunks((to_listed_chunks(chunks[0], tdb_array.shape[0]), [tdb_array.shape[1]]))\n", | |
" return da.concatenate(\n", | |
" [make_sparse_chunk(tdb_array, s) for s in slices], axis=0\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Opening the tiledb array and creating a dask array " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 5.31 s, sys: 995 ms, total: 6.31 s\n", | |
"Wall time: 6.07 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <td>\n", | |
" <table style=\"border-collapse: collapse;\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <td> </td>\n", | |
" <th> Array </th>\n", | |
" <th> Chunk </th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" <th> Shape </th>\n", | |
" <td> (5684805, 52417) </td>\n", | |
" <td> (1000, 52417) </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Dask graph </th>\n", | |
" <td colspan=\"2\"> 5685 chunks in 11371 graph layers </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Data type </th>\n", | |
" <td colspan=\"2\"> float32 scipy.sparse._csr.csr_matrix </td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
" </table>\n", | |
" </td>\n", | |
" <td>\n", | |
" <svg width=\"75\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n", | |
"\n", | |
" <!-- Horizontal lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"25\" y2=\"0\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"0\" y1=\"6\" x2=\"25\" y2=\"6\" />\n", | |
" <line x1=\"0\" y1=\"12\" x2=\"25\" y2=\"12\" />\n", | |
" <line x1=\"0\" y1=\"18\" x2=\"25\" y2=\"18\" />\n", | |
" <line x1=\"0\" y1=\"25\" x2=\"25\" y2=\"25\" />\n", | |
" <line x1=\"0\" y1=\"31\" x2=\"25\" y2=\"31\" />\n", | |
" <line x1=\"0\" y1=\"37\" x2=\"25\" y2=\"37\" />\n", | |
" <line x1=\"0\" y1=\"44\" x2=\"25\" y2=\"44\" />\n", | |
" <line x1=\"0\" y1=\"50\" x2=\"25\" y2=\"50\" />\n", | |
" <line x1=\"0\" y1=\"56\" x2=\"25\" y2=\"56\" />\n", | |
" <line x1=\"0\" y1=\"63\" x2=\"25\" y2=\"63\" />\n", | |
" <line x1=\"0\" y1=\"69\" x2=\"25\" y2=\"69\" />\n", | |
" <line x1=\"0\" y1=\"75\" x2=\"25\" y2=\"75\" />\n", | |
" <line x1=\"0\" y1=\"82\" x2=\"25\" y2=\"82\" />\n", | |
" <line x1=\"0\" y1=\"88\" x2=\"25\" y2=\"88\" />\n", | |
" <line x1=\"0\" y1=\"94\" x2=\"25\" y2=\"94\" />\n", | |
" <line x1=\"0\" y1=\"101\" x2=\"25\" y2=\"101\" />\n", | |
" <line x1=\"0\" y1=\"107\" x2=\"25\" y2=\"107\" />\n", | |
" <line x1=\"0\" y1=\"113\" x2=\"25\" y2=\"113\" />\n", | |
" <line x1=\"0\" y1=\"120\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Vertical lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"25\" y1=\"0\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Colored Rectangle -->\n", | |
" <polygon points=\"0.0,0.0 25.412616514582485,0.0 25.412616514582485,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n", | |
"\n", | |
" <!-- Text -->\n", | |
" <text x=\"12.706308\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >52417</text>\n", | |
" <text x=\"45.412617\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,45.412617,60.000000)\">5684805</text>\n", | |
"</svg>\n", | |
" </td>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"dask.array<concatenate, shape=(5684805, 52417), dtype=float32, chunksize=(1000, 52417), chunktype=scipy.csr_matrix>" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# need to specify anonymous access\n", | |
"ctx = {\n", | |
" \"vfs.s3.no_sign_request\": \"true\",\n", | |
" \"vfs.s3.region\": \"us-west-2\"\n", | |
"}\n", | |
"\n", | |
"tiledb_array = tiledb.open(\n", | |
" f\"{SOMA_URI}/census_data/{SPECIES}/ms/RNA/X/raw/\",\n", | |
" ctx=tiledb.Ctx(ctx),\n", | |
")\n", | |
"\n", | |
"X = tiledb_sparse_as_dask(tiledb_array, row_chunks=1_000)\n", | |
"X" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For the sake of time, I am going to slim this down to the first million cells" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X = X[:1_000_000]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2.66 s, sys: 2.78 s, total: 5.44 s\n", | |
"Wall time: 3.58 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Retrieving obs and var as pandas dataframes\n", | |
"census = cellxgene_census.open_soma(uri=SOMA_URI, context=SOMATileDBContext(tiledb_config=ctx))\n", | |
"obs = cellxgene_census.get_obs(census, SPECIES, coords=slice(X.shape[0] - 1))\n", | |
"var = cellxgene_census.get_var(census, SPECIES)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/ubuntu/github/anndata/src/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n", | |
" warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n", | |
"/home/ubuntu/github/anndata/src/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n", | |
" warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 590 ms, sys: 130 ms, total: 720 ms\n", | |
"Wall time: 708 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"AnnData object with n_obs × n_vars = 1000000 × 52417\n", | |
" obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'\n", | |
" var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"adata = ad.AnnData(X=X, obs=obs, var=var)\n", | |
"adata" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Groupby" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"First example is to do a groupby reduction over the entire census. This will be using the `flox` library. This library currently doesn't have support for sparse chunks, so we'll need to densify." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import flox" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X = tiledb_sparse_as_dask(tiledb_array, row_chunks=1_000)[:1_000_000]\n", | |
"X_dense = X.map_blocks(\n", | |
" lambda x: x.toarray(), dtype=X.dtype, meta=np.array([])\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Creating vector to groupby\n", | |
"cell_type = obs[\"cell_type\"].astype(\"category\")\n", | |
"dev_stage = obs[\"development_stage\"].astype(\"category\")\n", | |
"\n", | |
"# encoding cell_type and developmental_stage together:\n", | |
"dev_x_celltype = dev_stage.cat.codes * len(cell_type.cat.categories) + cell_type.cat.codes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 850 ms, sys: 192 ms, total: 1.04 s\n", | |
"Wall time: 926 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <td>\n", | |
" <table style=\"border-collapse: collapse;\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <td> </td>\n", | |
" <th> Array </th>\n", | |
" <th> Chunk </th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" <th> Bytes </th>\n", | |
" <td> 121.77 MiB </td>\n", | |
" <td> 19.20 MiB </td>\n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" <th> Shape </th>\n", | |
" <td> (52417, 609) </td>\n", | |
" <td> (52417, 96) </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Dask graph </th>\n", | |
" <td colspan=\"2\"> 134 chunks in 11474 graph layers </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Data type </th>\n", | |
" <td colspan=\"2\"> float32 numpy.ndarray </td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
" </table>\n", | |
" </td>\n", | |
" <td>\n", | |
" <svg width=\"76\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n", | |
"\n", | |
" <!-- Horizontal lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"26\" y2=\"0\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"0\" y1=\"120\" x2=\"26\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Vertical lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" />\n", | |
" <line x1=\"1\" y1=\"0\" x2=\"1\" y2=\"120\" />\n", | |
" <line x1=\"2\" y1=\"0\" x2=\"2\" y2=\"120\" />\n", | |
" <line x1=\"3\" y1=\"0\" x2=\"3\" y2=\"120\" />\n", | |
" <line x1=\"5\" y1=\"0\" x2=\"5\" y2=\"120\" />\n", | |
" <line x1=\"6\" y1=\"0\" x2=\"6\" y2=\"120\" />\n", | |
" <line x1=\"6\" y1=\"0\" x2=\"6\" y2=\"120\" />\n", | |
" <line x1=\"7\" y1=\"0\" x2=\"7\" y2=\"120\" />\n", | |
" <line x1=\"9\" y1=\"0\" x2=\"9\" y2=\"120\" />\n", | |
" <line x1=\"9\" y1=\"0\" x2=\"9\" y2=\"120\" />\n", | |
" <line x1=\"11\" y1=\"0\" x2=\"11\" y2=\"120\" />\n", | |
" <line x1=\"12\" y1=\"0\" x2=\"12\" y2=\"120\" />\n", | |
" <line x1=\"13\" y1=\"0\" x2=\"13\" y2=\"120\" />\n", | |
" <line x1=\"17\" y1=\"0\" x2=\"17\" y2=\"120\" />\n", | |
" <line x1=\"18\" y1=\"0\" x2=\"18\" y2=\"120\" />\n", | |
" <line x1=\"19\" y1=\"0\" x2=\"19\" y2=\"120\" />\n", | |
" <line x1=\"20\" y1=\"0\" x2=\"20\" y2=\"120\" />\n", | |
" <line x1=\"21\" y1=\"0\" x2=\"21\" y2=\"120\" />\n", | |
" <line x1=\"26\" y1=\"0\" x2=\"26\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Colored Rectangle -->\n", | |
" <polygon points=\"0.0,0.0 26.14516872185634,0.0 26.14516872185634,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n", | |
"\n", | |
" <!-- Text -->\n", | |
" <text x=\"13.072584\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >609</text>\n", | |
" <text x=\"46.145169\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,46.145169,60.000000)\">52417</text>\n", | |
"</svg>\n", | |
" </td>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"dask.array<getitem, shape=(52417, 609), dtype=float32, chunksize=(52417, 96), chunktype=numpy.ndarray>" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"agg_delayed, groups = flox.groupby_reduce(\n", | |
" X_dense.T,\n", | |
" dev_x_celltype,\n", | |
" func=\"sum\",\n", | |
" # method=\"map-reduce\",\n", | |
")\n", | |
"agg_delayed" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 40.7 s, sys: 13.3 s, total: 54 s\n", | |
"Wall time: 4min\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 3.085e+03, 3.400e+01,\n", | |
" 5.900e+02],\n", | |
" [3.000e+00, 0.000e+00, 1.000e+00, ..., 1.000e+00, 0.000e+00,\n", | |
" 1.000e+00],\n", | |
" [1.090e+02, 0.000e+00, 6.700e+01, ..., 1.000e+00, 0.000e+00,\n", | |
" 0.000e+00],\n", | |
" ...,\n", | |
" [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,\n", | |
" 0.000e+00],\n", | |
" [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,\n", | |
" 0.000e+00],\n", | |
" [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,\n", | |
" 0.000e+00]], dtype=float32)" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"agg = agg_delayed.compute()\n", | |
"agg" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Data Loader can easily be implemented with dask, and is a bit faster" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import cellxgene_census.experimental.ml as census_ml\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def list_split(arr_list:list, sublist_len: int) -> list[list]:\n", | |
" \"\"\"Splits a python list into a list of sublists where each sublist is of size `sublist_len`.\n", | |
" TODO: Replace with `itertools.batched` when Python 3.12 becomes the minimum supported version.\n", | |
" \"\"\"\n", | |
" i = 0\n", | |
" result = []\n", | |
" while i < len(arr_list):\n", | |
" if (i + sublist_len) >= len(arr_list):\n", | |
" result.append(arr_list[i:])\n", | |
" else:\n", | |
" result.append(arr_list[i : i + sublist_len])\n", | |
"\n", | |
" i += sublist_len\n", | |
"\n", | |
" return result" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"def shuffle_buffer_batches(X: da.Array, batch_size: int, shuffled_buffer_n_chunks: int, rng=None):\n", | |
" \"\"\"Simplified implementation of shuffling approach in census\"\"\"\n", | |
" if rng is None:\n", | |
" rng = np.random.default_rng()\n", | |
"\n", | |
" chunk_idx = np.arange(len(X.chunks[0]))\n", | |
" shuffled_chunk_idx = rng.permutation(chunk_idx)\n", | |
"\n", | |
" chunks_list = list_split(shuffled_chunk_idx, shuffled_buffer_n_chunks)\n", | |
" for chunk_idxs in chunks_list:\n", | |
" full = X.blocks[chunk_idxs].compute()\n", | |
" iter_order = rng.permutation(np.arange(full.shape[0]))\n", | |
" full = full[iter_order]\n", | |
" # full = rng.permutation(full) # Ideally do this without copying\n", | |
" cur_pos = 0\n", | |
" while True:\n", | |
" next_pos = min(cur_pos + batch_size, full.shape[0])\n", | |
" yield full[cur_pos:next_pos]\n", | |
" if next_pos == full.shape[0]:\n", | |
" break\n", | |
" else:\n", | |
" cur_pos = next_pos" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 10.5 s, sys: 2.11 s, total: 12.6 s\n", | |
"Wall time: 11.8 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time dask_loader_iter = shuffle_buffer_batches(tiledb_sparse_as_dask(tiledb_array, row_chunks=512)[:1_000_000], batch_size=1_000, shuffled_buffer_n_chunks=100)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cluster.scale(16)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1min 27s, sys: 1min 9s, total: 2min 36s\n", | |
"Wall time: 3min 27s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"for batch in dask_loader_iter:\n", | |
" pass" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 3.11 ms, sys: 2.79 ms, total: 5.9 ms\n", | |
"Wall time: 5.2 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"dp = census_ml.ExperimentDataPipe(\n", | |
" census[\"census_data\"][SPECIES],\n", | |
" shuffle=True,\n", | |
" shuffle_chunk_count=100,\n", | |
" batch_size=1000,\n", | |
" soma_chunk_size=512,\n", | |
" obs_query=tiledbsoma.AxisQuery(coords=(slice(0, 1_000_000 - 1),))\n", | |
" # encoders=[DefaultEncoder(\"soma_joinid\"), DefaultEncoder(\"assay\")]\n", | |
")\n", | |
"census_loader_iter = iter(dp)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-07-19 17:08:33,640 - distributed.utils_perf - WARNING - full garbage collections took 10% CPU time recently (threshold: 10%)\n", | |
"2024-07-19 17:08:39,453 - distributed.utils_perf - WARNING - full garbage collections took 10% CPU time recently (threshold: 10%)\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 10min 34s, sys: 9min 23s, total: 19min 57s\n", | |
"Wall time: 5min 48s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"for batch in census_loader_iter:\n", | |
" pass" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Free up memory\n", | |
"del dp, census_loader_iter, dask_loader_iter" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Scanpy (+ memory issue)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cluster.scale(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"adata.layers[\"counts\"] = adata.X.copy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 5.77 s, sys: 161 ms, total: 5.93 s\n", | |
"Wall time: 5.83 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"sc.pp.normalize_total(adata)\n", | |
"sc.pp.log1p(adata)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n", | |
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
" warnings.warn(\n" | |
] | |
}, | |
{ | |
"ename": "KeyboardInterrupt", | |
"evalue": "", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | |
"File \u001b[0;32m<timed eval>:1\u001b[0m\n", | |
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/legacy_api_wrap/__init__.py:80\u001b[0m, in \u001b[0;36mlegacy_api.<locals>.wrapper.<locals>.fn_compatible\u001b[0;34m(*args_all, **kw)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(fn)\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfn_compatible\u001b[39m(\u001b[38;5;241m*\u001b[39margs_all: P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw: P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m R:\n\u001b[1;32m 79\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args_all) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m n_positional:\n\u001b[0;32m---> 80\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs_all\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 82\u001b[0m args_pos: P\u001b[38;5;241m.\u001b[39margs\n\u001b[1;32m 83\u001b[0m args_pos, args_rest \u001b[38;5;241m=\u001b[39m args_all[:n_positional], args_all[n_positional:]\n", | |
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/scanpy/preprocessing/_highly_variable_genes.py:677\u001b[0m, in \u001b[0;36mhighly_variable_genes\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 674\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m min_disp, max_disp, min_mean, max_mean, n_top_genes\n\u001b[1;32m 676\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m batch_key \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 677\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43m_highly_variable_genes_single_batch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 678\u001b[0m \u001b[43m \u001b[49m\u001b[43madata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlayer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlayer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcutoff\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcutoff\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_bins\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_bins\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflavor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mflavor\u001b[49m\n\u001b[1;32m 679\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 680\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 681\u001b[0m df \u001b[38;5;241m=\u001b[39m _highly_variable_genes_batched(\n\u001b[1;32m 682\u001b[0m adata, batch_key, layer\u001b[38;5;241m=\u001b[39mlayer, cutoff\u001b[38;5;241m=\u001b[39mcutoff, n_bins\u001b[38;5;241m=\u001b[39mn_bins, flavor\u001b[38;5;241m=\u001b[39mflavor\n\u001b[1;32m 683\u001b[0m )\n", | |
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/scanpy/preprocessing/_highly_variable_genes.py:299\u001b[0m, in \u001b[0;36m_highly_variable_genes_single_batch\u001b[0;34m(adata, layer, cutoff, n_bins, flavor)\u001b[0m\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 297\u001b[0m X \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mexpm1(X)\n\u001b[0;32m--> 299\u001b[0m mean, var \u001b[38;5;241m=\u001b[39m \u001b[43mmaterialize_as_ndarray\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_get_mean_var\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 300\u001b[0m \u001b[38;5;66;03m# now actually compute the dispersion\u001b[39;00m\n\u001b[1;32m 301\u001b[0m mean[mean \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1e-12\u001b[39m \u001b[38;5;66;03m# set entries equal to zero to small value\u001b[39;00m\n", | |
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/scanpy/preprocessing/_distributed.py:49\u001b[0m, in \u001b[0;36mmaterialize_as_ndarray\u001b[0;34m(a)\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(np\u001b[38;5;241m.\u001b[39masarray(arr) \u001b[38;5;28;01mfor\u001b[39;00m arr \u001b[38;5;129;01min\u001b[39;00m a)\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdask\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01marray\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mda\u001b[39;00m\n\u001b[0;32m---> 49\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mda\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msync\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/dask/base.py:662\u001b[0m, in \u001b[0;36mcompute\u001b[0;34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001b[0m\n\u001b[1;32m 659\u001b[0m postcomputes\u001b[38;5;241m.\u001b[39mappend(x\u001b[38;5;241m.\u001b[39m__dask_postcompute__())\n\u001b[1;32m 661\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m shorten_traceback():\n\u001b[0;32m--> 662\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdsk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 664\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m repack([f(r, \u001b[38;5;241m*\u001b[39ma) \u001b[38;5;28;01mfor\u001b[39;00m r, (f, a) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(results, postcomputes)])\n", | |
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/threading.py:629\u001b[0m, in \u001b[0;36mEvent.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 627\u001b[0m signaled \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_flag\n\u001b[1;32m 628\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m signaled:\n\u001b[0;32m--> 629\u001b[0m signaled \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cond\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 630\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m signaled\n", | |
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/threading.py:331\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 331\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 333\u001b[0m gotit \u001b[38;5;241m=\u001b[39m waiter\u001b[38;5;241m.\u001b[39macquire(\u001b[38;5;28;01mFalse\u001b[39;00m)\n", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m: " | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"sc.pp.highly_variable_genes(adata)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "cellxgene-census-dev", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment