Skip to content

Instantly share code, notes, and snippets.

@ivirshup
Last active July 22, 2024 23:06
Show Gist options
  • Save ivirshup/dc39029ad439cef4755e45582fc35541 to your computer and use it in GitHub Desktop.
Save ivirshup/dc39029ad439cef4755e45582fc35541 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup\n",
"\n",
"Imports, variables, and starts a dask client"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import multiprocessing\n",
"multiprocessing.set_start_method(\"spawn\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import tiledb\n",
"import tiledbsoma\n",
"import cellxgene_census\n",
"from tiledbsoma import SOMATileDBContext\n",
"\n",
"import dask.array as da\n",
"# from dask import delayed\n",
"import dask.distributed as dd\n",
"\n",
"from scipy import sparse\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# import anndata as ad, scanpy as sc\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"CENSUS_VERSION = \"2024-07-01\"\n",
"SOMA_URI= f\"s3://cellxgene-census-public-us-west-2/cell-census/{CENSUS_VERSION}/soma\"\n",
"SPECIES = \"mus_musculus\"\n",
"\n",
"SPARSE_CHUNK_SIZE = 10_000\n",
"DENSE_CHUNK_SIZE = 1_000\n",
"\n",
"CTX = {\n",
" \"vfs.s3.no_sign_request\": \"true\",\n",
" \"vfs.s3.region\": \"us-west-2\"\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-9ccac347-487d-11ef-a2f8-023ca6c22285</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
"\n",
" <tr>\n",
" \n",
" <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
" <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n",
" \n",
" </tr>\n",
"\n",
" \n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" \n",
"\n",
" </table>\n",
"\n",
" \n",
"\n",
" \n",
" <details>\n",
" <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
" <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
" </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">ef0ad759</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Workers:</strong> 8\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads:</strong> 32\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total memory:</strong> 123.85 GiB\n",
" </td>\n",
" </tr>\n",
" \n",
" <tr>\n",
" <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
" <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
"</tr>\n",
"\n",
" \n",
" </table>\n",
"\n",
" <details>\n",
" <summary style=\"margin-bottom: 20px;\">\n",
" <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
" </summary>\n",
"\n",
" <div style=\"\">\n",
" <div>\n",
" <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-6b09d772-2a5a-4906-894d-40300a8824ec</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm:</strong> tcp://127.0.0.1:41895\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Workers:</strong> 8\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads:</strong> 32\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Started:</strong> Just now\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total memory:</strong> 123.85 GiB\n",
" </td>\n",
" </tr>\n",
" </table>\n",
" </div>\n",
" </div>\n",
"\n",
" <details style=\"margin-left: 48px;\">\n",
" <summary style=\"margin-bottom: 20px;\">\n",
" <h3 style=\"display: inline;\">Workers</h3>\n",
" </summary>\n",
"\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:42777\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:34579/status\" target=\"_blank\">http://127.0.0.1:34579/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:39589\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-ksch0uc7\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:33189\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:41809/status\" target=\"_blank\">http://127.0.0.1:41809/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:39211\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-4i_whljq\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:46349\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:35117/status\" target=\"_blank\">http://127.0.0.1:35117/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:37289\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-vrph97p7\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:33619\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:46295/status\" target=\"_blank\">http://127.0.0.1:46295/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:38493\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-pnc8s3td\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 4</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:38525\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:33985/status\" target=\"_blank\">http://127.0.0.1:33985/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:35857\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-9zdfgqqc\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 5</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:39083\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:44377/status\" target=\"_blank\">http://127.0.0.1:44377/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:46067\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-nwbo4mgv\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 6</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:34953\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37703/status\" target=\"_blank\">http://127.0.0.1:37703/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:35035\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-d69pd78e\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 7</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:34655\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37417/status\" target=\"_blank\">http://127.0.0.1:37417/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:42439\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-fyu56ap9\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
"\n",
" </details>\n",
"</div>\n",
"\n",
" </details>\n",
" </div>\n",
"</div>\n",
" </details>\n",
" \n",
"\n",
" </div>\n",
"</div>"
],
"text/plain": [
"<Client: 'tcp://127.0.0.1:41895' processes=8 threads=32, memory=123.85 GiB>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster = dd.LocalCluster()\n",
"client = dd.Client(cluster)\n",
"client"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Functions"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def to_listed_chunks(chunk_size: int, dim_size: int) -> list[int]:\n",
" \"\"\"Go from single integer to list representation of a chunking scheme.\n",
"\n",
" Some rules about how this behaves:\n",
"\n",
" d, r := divmod(n, mod)\n",
" (*((mod,) * d), r) := to_listed_chunks(mod, n)\n",
" map(len, itertools.batched(range(n), mod))) := to_listed_chunks(mod, n)\n",
"\n",
"\n",
" Examples\n",
" --------\n",
" >>> to_listed_chunks(10, 25)\n",
" [10, 10, 5]\n",
" >>> to_listed_chunks(3, 9)\n",
" [3, 3, 3]\n",
" \"\"\"\n",
" n_full, rem = divmod(dim_size, chunk_size)\n",
" chunk_list = [chunk_size] * n_full\n",
" if rem:\n",
" chunk_list += [rem]\n",
" return chunk_list"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"to_listed_chunks(1000, 10_000)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from numba import njit\n",
"\n",
"@njit\n",
"def cmap(x, y, out=None):\n",
" if out is None:\n",
" out = np.empty_like(x)\n",
" d = dict()\n",
" for i, y_i in enumerate(y):\n",
" d[y_i] = i\n",
" # out = np.empty_like(y, shape=x.shape)\n",
" for i, x_i in enumerate(x):\n",
" out[i] = d[x_i]\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def list_split(arr_list:list, sublist_len: int) -> list[list]:\n",
" \"\"\"Splits a python list into a list of sublists where each sublist is of size `sublist_len`.\n",
" TODO: Replace with `itertools.batched` when Python 3.12 becomes the minimum supported version.\n",
" \"\"\"\n",
" i = 0\n",
" result = []\n",
"\n",
" while i < len(arr_list):\n",
" if (i + sublist_len) >= len(arr_list):\n",
" result.append(arr_list[i:])\n",
" else:\n",
" result.append(arr_list[i : i + sublist_len])\n",
"\n",
" i += sublist_len\n",
"\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initial experiment"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 249 ms, sys: 40.7 ms, total: 290 ms\n",
"Wall time: 324 ms\n"
]
}
],
"source": [
"%%time\n",
"# need to specify anonymous access\n",
"tiledb_array = tiledb.open(\n",
" f\"{SOMA_URI}/census_data/{SPECIES}/ms/RNA/X/raw/\",\n",
" ctx=tiledb.Ctx(CTX),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def report_block_id_broadcast(obs_idx, var_idx, block_info):\n",
" print(type(obs_idx))\n",
" print(type(var_idx))\n",
" a = np.empty(block_info[None][\"chunk-shape\"], dtype=object)\n",
" a[:] = block_info\n",
" return a\n",
" # return \n",
" # np.complex(obs_idx)\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'numpy.ndarray'>\n",
"<class 'numpy.ndarray'>\n",
"<class 'numpy.ndarray'>\n",
"<class 'numpy.ndarray'>\n",
"<class 'numpy.ndarray'>\n",
"<class 'numpy.ndarray'>\n"
]
},
{
"data": {
"text/plain": [
"array([{0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n",
" {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n",
" {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n",
" {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n",
" {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n",
" {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n",
" {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n",
" {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n",
" {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n",
" {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}}],\n",
" dtype=object)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"da.map_blocks(\n",
" report_block_id_broadcast, \n",
" da.arange(10, chunks=5),\n",
" da.arange(5, chunks=3),\n",
" # chunks=((5, 5), (3, 2)),\n",
" dtype=object,\n",
").compute()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Demo of how map blocks works"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <td>\n",
" <table style=\"border-collapse: collapse;\">\n",
" <thead>\n",
" <tr>\n",
" <td> </td>\n",
" <th> Array </th>\n",
" <th> Chunk </th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" <th> Bytes </th>\n",
" <td> 400 B </td>\n",
" <td> 80 B </td>\n",
" </tr>\n",
" \n",
" <tr>\n",
" <th> Shape </th>\n",
" <td> (5, 10) </td>\n",
" <td> (2, 5) </td>\n",
" </tr>\n",
" <tr>\n",
" <th> Dask graph </th>\n",
" <td colspan=\"2\"> 6 chunks in 5 graph layers </td>\n",
" </tr>\n",
" <tr>\n",
" <th> Data type </th>\n",
" <td colspan=\"2\"> object numpy.ndarray </td>\n",
" </tr>\n",
" </tbody>\n",
" </table>\n",
" </td>\n",
" <td>\n",
" <svg width=\"170\" height=\"110\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"0\" y1=\"0\" x2=\"120\" y2=\"0\" style=\"stroke-width:2\" />\n",
" <line x1=\"0\" y1=\"24\" x2=\"120\" y2=\"24\" />\n",
" <line x1=\"0\" y1=\"48\" x2=\"120\" y2=\"48\" />\n",
" <line x1=\"0\" y1=\"60\" x2=\"120\" y2=\"60\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"60\" style=\"stroke-width:2\" />\n",
" <line x1=\"60\" y1=\"0\" x2=\"60\" y2=\"60\" />\n",
" <line x1=\"120\" y1=\"0\" x2=\"120\" y2=\"60\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"0.0,0.0 120.0,0.0 120.0,60.0 0.0,60.0\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Text -->\n",
" <text x=\"60.000000\" y=\"80.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >10</text>\n",
" <text x=\"140.000000\" y=\"30.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(0,140.000000,30.000000)\">5</text>\n",
"</svg>\n",
" </td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"dask.array<report_block_id_broadcast, shape=(5, 10), dtype=object, chunksize=(2, 5), chunktype=numpy.ndarray>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{0: {'shape': (10,),\n",
" 'num-chunks': (2,),\n",
" 'array-location': [(0, 5)],\n",
" 'chunk-location': (0,)},\n",
" 1: {'shape': (5, 1),\n",
" 'num-chunks': (3, 1),\n",
" 'array-location': [(0, 2), (0, 1)],\n",
" 'chunk-location': (0, 0)},\n",
" None: {'shape': (5, 10),\n",
" 'num-chunks': (3, 2),\n",
" 'array-location': [(0, 2), (0, 5)],\n",
" 'chunk-location': (0, 0),\n",
" 'chunk-shape': (2, 5),\n",
" 'dtype': object}}"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{0: {'shape': (10,),\n",
" 'num-chunks': (2,),\n",
" 'array-location': [(5, 10)],\n",
" 'chunk-location': (1,)},\n",
" 1: {'shape': (5, 1),\n",
" 'num-chunks': (3, 1),\n",
" 'array-location': [(2, 4), (0, 1)],\n",
" 'chunk-location': (1, 0)},\n",
" None: {'shape': (5, 10),\n",
" 'num-chunks': (3, 2),\n",
" 'array-location': [(2, 4), (5, 10)],\n",
" 'chunk-location': (1, 1),\n",
" 'chunk-shape': (2, 5),\n",
" 'dtype': object}}"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n",
" [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
" [ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],\n",
" [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],\n",
" [ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def report_block_id_broadcast(obs_idx, var_idx, block_info):\n",
" a = np.empty(block_info[None][\"chunk-shape\"], dtype=object)\n",
" a[:] = block_info\n",
" return a\n",
"\n",
"delayed_res = da.map_blocks(\n",
" report_block_id_broadcast,\n",
" da.arange(10, chunks=5),\n",
" da.arange(5, chunks=2).reshape((5, 1)),\n",
" # chunks=((5, 5), (3, 2)),\n",
" # new_axis=[1],\n",
" dtype=object,\n",
")\n",
"res = delayed_res.compute()\n",
"display(delayed_res)\n",
"\n",
"display(res[1, 4])\n",
"display(res[2, 5])\n",
"\n",
"from operator import add\n",
"\n",
"da.map_blocks(\n",
" add,\n",
" da.arange(10, chunks=5),\n",
" da.arange(5, chunks=2).reshape((5, 1)),\n",
" # chunks=((5, 5), (3, 2)),\n",
" # new_axis=[1],\n",
").compute()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Getting tiledb arrays to work with scanpy (proof of concept)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Proof of concept using scanpy on top of a dask array w/ sparse chunks. Gets through highly variable genes using tiledb's python library directly. This form doesn't support queries on the data prior to creating the dask array."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def sparse_chunk(block_id, block_info):\n",
" shape = block_info[None][\"chunk-shape\"]\n",
" array_location = block_info[None][\"array-location\"]\n",
" offsets = array_location[0][0], array_location[1][0]\n",
"\n",
" tiledb_array = tiledb.open(\n",
" f\"{SOMA_URI}/census_data/{SPECIES}/ms/RNA/X/raw/\",\n",
" ctx=tiledb.Ctx(CTX),\n",
" )\n",
"\n",
" res = tiledb_array[slice(*array_location[0]), slice(*array_location[1])]\n",
"\n",
" res[\"soma_dim_0\"] -= offsets[0]\n",
" res[\"soma_dim_1\"] -= offsets[1]\n",
"\n",
" a = sparse.csr_matrix((res[\"soma_data\"], (res[\"soma_dim_0\"], res[\"soma_dim_1\"])), shape=shape)\n",
" return a"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 533 ms, sys: 216 ms, total: 749 ms\n",
"Wall time: 3.16 s\n"
]
},
{
"data": {
"text/plain": [
"<3000x52417 sparse matrix of type '<class 'numpy.float32'>'\n",
"\twith 6164051 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"da.map_blocks(sparse_chunk, chunks=((1000, 1000, 1000), (52417,)), meta=sparse.csr_matrix((0, 0), dtype=tiledb_array.dtype)).compute()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import scanpy as sc, anndata as ad"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 575 ms, sys: 111 ms, total: 686 ms\n",
"Wall time: 679 ms\n"
]
},
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 1000000 × 52437"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"adata = ad.AnnData(\n",
" X=da.map_blocks(\n",
" sparse_chunk,\n",
" chunks=(\n",
" tuple(to_listed_chunks(10_000, 1_000_000)),\n",
" (tiledb_array.shape[1],)\n",
" ),\n",
" meta=sparse.csr_matrix((0, 0), dtype=tiledb_array.dtype)\n",
" )\n",
")\n",
"adata"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 24.2 ms, sys: 5.75 ms, total: 29.9 ms\n",
"Wall time: 28 ms\n"
]
}
],
"source": [
"%%time\n",
"# The results are not reassigned, so both calls presumably update `adata` in place;\n",
"# re-running this cell would then normalize and log-transform a second time.\n",
"sc.pp.normalize_total(adata)\n",
"sc.pp.log1p(adata)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 19.8 s, sys: 2.98 s, total: 22.8 s\n",
"Wall time: 41.3 s\n"
]
},
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 1000000 × 52437\n",
" var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'\n",
" uns: 'log1p', 'hvg'"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Adds the HVG statistics to `adata.var` and an `hvg` entry to `adata.uns`\n",
"# (see the repr below); the wall time here is the number this section measures.\n",
"sc.pp.highly_variable_genes(adata)\n",
"adata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## tiledbsoma"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This section demonstrates building a dask array from a tiledbsoma query and using it. Doing so means replicating the axis filters that the query has applied, though I may choose to specialize on `obs` filters here for performance reasons."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The \"stable\" release is currently 2024-07-01. Specify 'census_version=\"2024-07-01\"' in future calls to open_soma() to ensure data consistency.\n"
]
},
{
"data": {
"text/plain": [
"<somacore.query.query.ExperimentAxisQuery at 0x7e09481ecdd0>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"SparseArray(uri='s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/mus_musculus/ms/RNA/X/raw', mode=r, ndim=2)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(41233630, 52437)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import cellxgene_census\n",
"import tiledbsoma\n",
"import anndata as ad, scanpy as sc\n",
"\n",
"\n",
"\n",
"census = cellxgene_census.open_soma(census_version=\"stable\")\n",
"mouse = census[\"census_data\"][\"mus_musculus\"]\n",
"\n",
"query = mouse.axis_query(\"RNA\", obs_query=tiledbsoma.AxisQuery(value_filter=\"is_primary_data == True\"))\n",
"display(query)\n",
"\n",
"\n",
"# Retrieving the array from a query\n",
"tiledb_array = tiledb.open(\n",
" query.X(\"raw\").array.uri,\n",
" ctx=tiledb.Ctx(CTX),\n",
")\n",
"\n",
"display(tiledb_array)\n",
"display(tiledb_array.shape)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1020"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Combine a value filter with a coordinate slice on obs; `n_obs` (1020 in the recorded output)\n",
"# is far below the slice length, so the two conditions are presumably intersected.\n",
"query = mouse.axis_query(\"RNA\", obs_query=tiledbsoma.AxisQuery(value_filter=\"is_primary_data == True\", coords=(slice(0, 83_000),)))\n",
"query.n_obs\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def query_to_dask(\n",
"    query: tiledbsoma.ExperimentAxisQuery,\n",
"    layer: str = \"raw\",\n",
"    obs_chunks: np.ndarray | int = 10_000,\n",
") -> da.Array:\n",
"    \"\"\"Build a dask array of sparse chunks for the X layer selected by `query`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    query\n",
"        Axis query whose obs/var join ids select the rows/columns to read.\n",
"    layer\n",
"        Name of the X layer to read.\n",
"    obs_chunks\n",
"        Number of observations per chunk. Only an integer is supported for now.\n",
"\n",
"    Returns\n",
"    -------\n",
"    Dask array chunked along obs only (one block spans all selected vars); each\n",
"    block is produced as a float32 `scipy.sparse.csr_matrix` read from TileDB.\n",
"\n",
"    Raises\n",
"    ------\n",
"    NotImplementedError\n",
"        If `obs_chunks` is not an integer.\n",
"    \"\"\"\n",
"    if not isinstance(obs_chunks, (int, np.integer)):\n",
"        raise NotImplementedError(\"Doesn't support arrays just yet\")\n",
"\n",
"    # Materialize once so the very same chunk sizes drive both the input join ids\n",
"    # and the output array (`to_listed_chunks` is defined earlier in the notebook).\n",
"    obs_chunks_listed = tuple(to_listed_chunks(obs_chunks, query.n_obs))\n",
"\n",
"    # Do I have any guarantees about the order of these?\n",
"    # Reshaping to a column so the data is distributed properly for the map_blocks\n",
"    # operation: each obs block is paired with the single block holding all vars.\n",
"    obs_joinids = da.from_array(\n",
"        query.obs_joinids().to_numpy(), chunks=(obs_chunks_listed,)\n",
"    ).reshape((-1, 1))\n",
"    var_joinids = da.from_array(query.var_joinids().to_numpy(), chunks=-1)\n",
"\n",
"    uri = query.X(layer).array.uri\n",
"\n",
"    def sparse_chunk(obs_joinids, var_joinids, block_info):\n",
"        \"\"\"Read one (obs chunk x all vars) block from TileDB as a CSR matrix.\"\"\"\n",
"        shape = block_info[None][\"chunk-shape\"]\n",
"        obs_joinids = obs_joinids.flatten()\n",
"\n",
"        # The array is opened inside the task; close it as soon as the read is\n",
"        # done so long-lived workers do not accumulate open handles.\n",
"        with tiledb.open(uri, ctx=tiledb.Ctx(CTX)) as tiledb_array:\n",
"            res = tiledb_array.multi_index[obs_joinids, var_joinids]\n",
"\n",
"        # Inplace operations sometimes throwing errors, something about read only buffers.\n",
"        # `cmap` (defined earlier in the notebook) presumably maps join ids to\n",
"        # positions within this block - TODO confirm.\n",
"        row = cmap(res[\"soma_dim_0\"], obs_joinids)\n",
"        col = cmap(res[\"soma_dim_1\"], var_joinids)\n",
"\n",
"        return sparse.csr_matrix((res[\"soma_data\"], (row, col)), shape=shape)\n",
"\n",
"    return da.map_blocks(\n",
"        sparse_chunk,\n",
"        obs_joinids,\n",
"        var_joinids,\n",
"        meta=sparse.csr_matrix((0, 0), dtype=np.float32),\n",
"        chunks=(obs_chunks_listed, (len(var_joinids),)),\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.46 s, sys: 2.54 s, total: 4 s\n",
"Wall time: 1.97 s\n",
"CPU times: user 565 ms, sys: 289 ms, total: 854 ms\n",
"Wall time: 425 ms\n",
"CPU times: user 2.03 s, sys: 2.83 s, total: 4.85 s\n",
"Wall time: 2.4 s\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <td>\n",
" <table style=\"border-collapse: collapse;\">\n",
" <thead>\n",
" <tr>\n",
" <td> </td>\n",
" <th> Array </th>\n",
" <th> Chunk </th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" <th> Shape </th>\n",
" <td> (32722, 52437) </td>\n",
" <td> (10000, 52437) </td>\n",
" </tr>\n",
" <tr>\n",
" <th> Dask graph </th>\n",
" <td colspan=\"2\"> 4 chunks in 6 graph layers </td>\n",
" </tr>\n",
" <tr>\n",
" <th> Data type </th>\n",
" <td colspan=\"2\"> float32 scipy.sparse._csr.csr_matrix </td>\n",
" </tr>\n",
" </tbody>\n",
" </table>\n",
" </td>\n",
" <td>\n",
" <svg width=\"170\" height=\"124\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"0\" y1=\"0\" x2=\"120\" y2=\"0\" style=\"stroke-width:2\" />\n",
" <line x1=\"0\" y1=\"22\" x2=\"120\" y2=\"22\" />\n",
" <line x1=\"0\" y1=\"45\" x2=\"120\" y2=\"45\" />\n",
" <line x1=\"0\" y1=\"68\" x2=\"120\" y2=\"68\" />\n",
" <line x1=\"0\" y1=\"74\" x2=\"120\" y2=\"74\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"74\" style=\"stroke-width:2\" />\n",
" <line x1=\"120\" y1=\"0\" x2=\"120\" y2=\"74\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"0.0,0.0 120.0,0.0 120.0,74.88300246009497 0.0,74.88300246009497\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Text -->\n",
" <text x=\"60.000000\" y=\"94.883002\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >52437</text>\n",
" <text x=\"140.000000\" y=\"37.441501\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,140.000000,37.441501)\">32722</text>\n",
"</svg>\n",
" </td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"dask.array<sparse_chunk, shape=(32722, 52437), dtype=float32, chunksize=(10000, 52437), chunktype=scipy.csr_matrix>"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# `%%time` reports the whole cell (the last timing in the output); the two `%time` lines\n",
"# time the obs dataframe fetch and the dask array construction separately.\n",
"query = mouse.axis_query(\"RNA\", obs_query=tiledbsoma.AxisQuery(value_filter=\"is_primary_data == True and tissue == 'limb muscle'\"))\n",
"%time obs = query.obs().concat().to_pandas()\n",
"%time X = query_to_dask(query)\n",
"X"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def query_to_anndata(query, obs_chunks: int = 10_000):\n",
"    \"\"\"Wrap the result of `query` in an AnnData whose X is a dask array.\n",
"\n",
"    X comes from `query_to_dask` (chunked by `obs_chunks` observations), while\n",
"    the obs and var tables are read into pandas dataframes right away.\n",
"    \"\"\"\n",
"    lazy_x = query_to_dask(query, obs_chunks=obs_chunks)\n",
"    obs_df = query.obs().concat().to_pandas()\n",
"    var_df = query.var().concat().to_pandas()\n",
"    return ad.AnnData(X=lazy_x, obs=obs_df, var=var_df)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/github/anndata/src/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n",
" warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n",
"/home/ubuntu/github/anndata/src/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n",
" warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n"
]
},
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 1000000 × 52437\n",
" obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'\n",
" var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The slice end is inclusive here (0..999_999 yields exactly 1_000_000 obs in the\n",
"# recorded output), hence the `- 1`.\n",
"adata_first_million = query_to_anndata(\n",
"    mouse.axis_query(\"RNA\", obs_query=tiledbsoma.AxisQuery(coords=(slice(0, 1_000_000 - 1),)))\n",
")\n",
"adata_first_million"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"sc.pp.normalize_total(adata_first_million)\n",
"sc.pp.log1p(adata_first_million)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There is overhead to going through the query. I strongly suspect this is largely because we are now accessing the array with integer coordinates, which are more expensive to transfer and work with than slices would be."
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 30.5 s, sys: 6.47 s, total: 37 s\n",
"Wall time: 1min 4s\n"
]
},
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 1000000 × 52437\n",
" obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'\n",
" var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'\n",
" uns: 'log1p', 'hvg'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Same HVG call as on `adata` earlier; the wall time is the number to compare\n",
"# between the direct-array and the query-based access paths.\n",
"sc.pp.highly_variable_genes(adata_first_million)\n",
"adata_first_million\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## scratch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query_big = mouse.axis_query(\n",
" \"RNA\",\n",
" obs_query=tiledbsoma.AxisQuery(\n",
" value_filter=\"\"\"is_primary_data == True and assay in [\"10x 3' v3\", \"10x 3' v2\"]\"\"\"\n",
" )\n",
")\n",
"obs_big = query_big.obs().concat().to_pandas()\n",
"var = query_big.var().concat().to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<tiledbsoma._sparse_nd_array.SparseNDArrayRead at 0x75a30d30f050>"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect what `query.X(...)` hands back: a `SparseNDArrayRead` object (see the repr below).\n",
"query.X(\"raw\")"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 0 ns, sys: 514 µs, total: 514 µs\n",
"Wall time: 352 µs\n"
]
},
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4])"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# NOTE(review): `idxr` and `obs_idx` are not defined in the visible part of this notebook -\n",
"# confirm they exist on a fresh kernel (Restart & Run All) or drop this scratch cell.\n",
"idxr.by_obs(obs_idx[:5])"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 0 ns, sys: 498 µs, total: 498 µs\n",
"Wall time: 352 µs\n"
]
},
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4])"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# NOTE(review): exact repeat of the previous cell (same call, same result); it also relies on\n",
"# `idxr` / `obs_idx`, which are not defined in the visible part of this notebook.\n",
"idxr.by_obs(obs_idx[:5])"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OrderedDict([('soma_dim_0', array([10, 10, 10, ..., 18, 18, 18])),\n",
" ('soma_dim_1',\n",
" array([ 7, 21, 24, ..., 18005, 18008, 18022])),\n",
" ('soma_data',\n",
" array([ 1., 1., 1., ..., 46., 1., 1.], dtype=float32))])"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Raw `multi_index` read: the result is a mapping of coordinate/value arrays keyed by\n",
"# `soma_dim_0`, `soma_dim_1` and `soma_data` (see the output below).\n",
"tiledb_array.multi_index[np.arange(10_000), np.arange(tiledb_array.shape[1])]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "cellxgene-census-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment