Created
March 24, 2022 17:28
-
-
Save alimanfoo/7bd8d1ccef74f4aa3c8bdb6215fd1189 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "e77a6e77-2d47-44e9-a7de-f627132c5ccd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Copying gs://vo_agam_production/v3.x/curation/1272-VO-GM-OPONDO-VMF00160/snp_genotypes/VBS47720-6366STDY9854329.zarr.zip...\n", | |
"\\ [1 files][468.4 MiB/468.4 MiB] \n", | |
"Operation completed over 1 objects/468.4 MiB. \n" | |
] | |
} | |
], | |
"source": [ | |
"!gsutil cp gs://vo_agam_production/v3.x/curation/1272-VO-GM-OPONDO-VMF00160/snp_genotypes/VBS47720-6366STDY9854329.zarr.zip ." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "f52388ed-f59a-4667-a56b-c850842250fb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import zarr\n", | |
"from fsspec.implementations.zip import ZipFileSystem\n", | |
"import fsspec" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "de6cfe37-be08-4972-81a2-7c578c562823", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<zarr.hierarchy.Group '/'>" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"expected_sites = zarr.open(fsspec.get_mapper(\"gs://vo_agam_release/v3/snp_genotypes/all/sites\"))\n", | |
"expected_sites" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"id": "407faf0b-273f-489d-89d9-049986cfd1d1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def check_snp_genotype_zarr(path, sample_id, expected_sites):\n", | |
"\n", | |
" fs = ZipFileSystem(path)\n", | |
" store = fs.get_mapper(\"/\")\n", | |
" root = zarr.open(store=store)\n", | |
"\n", | |
" expected_contigs = sorted(expected_sites)\n", | |
" # exception - Mt not included in SNP calling\n", | |
" expected_contigs = [c for c in expected_contigs if c != \"Mt\"]\n", | |
" \n", | |
" for contig in expected_contigs:\n", | |
" \n", | |
" # check arrays exists\n", | |
" mq = root[f\"/{sample_id}/{contig}/variants/MQ\"]\n", | |
" gt = root[f\"/{sample_id}/{contig}/calldata/GT\"]\n", | |
" gq = root[f\"/{sample_id}/{contig}/calldata/GQ\"]\n", | |
" ad = root[f\"/{sample_id}/{contig}/calldata/AD\"]\n", | |
" assert isinstance(mq, zarr.core.Array)\n", | |
" assert isinstance(gt, zarr.core.Array)\n", | |
" assert isinstance(gq, zarr.core.Array)\n", | |
" assert isinstance(ad, zarr.core.Array)\n", | |
" \n", | |
" # check array shapes\n", | |
" expected_n_sites = expected_sites[f\"{contig}/variants/POS\"].shape[0]\n", | |
" assert mq.shape == (expected_n_sites,)\n", | |
" assert gq.shape == (expected_n_sites, 1)\n", | |
" assert gt.shape == (expected_n_sites, 1, 2)\n", | |
" assert ad.shape == (expected_n_sites, 1, 4)\n", | |
" \n", | |
" for x in mq, gq, gt, ad:\n", | |
" \n", | |
" # check all chunks are initialised and present\n", | |
" assert x.nchunks == x.nchunks_initialized\n", | |
" \n", | |
" # check we can read the data\n", | |
" a = x[:]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"id": "44edc657-ac58-4303-82db-05080f26c30a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 26.9 s, sys: 2.66 s, total: 29.6 s\n", | |
"Wall time: 25.6 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"check_snp_genotype_zarr(\n", | |
" path=\"VBS47720-6366STDY9854329.zarr.zip\", \n", | |
" sample_id=\"VBS47720-6366STDY9854329\", \n", | |
" expected_sites=expected_sites\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "e85a1348-6e3f-4cb7-8464-23fc6c555f4c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [conda env:binder-v3.3.0]", | |
"language": "python", | |
"name": "conda-env-binder-v3.3.0-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment