Created
November 9, 2020 10:58
-
-
Save alimanfoo/5b3e86966ac02787723df28c53c19334 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# AgamP4 resource in GCS\n", | |
"\n", | |
"A quick guide to accessing the AgamP4 reference genome and genome annotations on Google Cloud Storage (GCS)." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Reference genome - read from GCS\n", | |
"\n", | |
"The reference genome is available in zarr format and can be read directly from GCS..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import zarr\n", | |
"import fsspec" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<zarr.hierarchy.Group '/'>" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"genome_path_gcs = 'gs://vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.zarr'\n", | |
"genome_store = fsspec.get_mapper(genome_path_gcs)\n", | |
"genome = zarr.open_consolidated(genome_store)\n", | |
"genome" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"/\n", | |
" ├── 2L (49364325,) |S1\n", | |
" ├── 2R (61545105,) |S1\n", | |
" ├── 3L (41963435,) |S1\n", | |
" ├── 3R (53200684,) |S1\n", | |
" ├── Mt (15363,) |S1\n", | |
" ├── UNKN (42389979,) |S1\n", | |
" ├── X (24393108,) |S1\n", | |
" └── Y_unplaced (237045,) |S1\n" | |
] | |
} | |
], | |
"source": [ | |
"print(genome.tree())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"49364325" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(genome['2L'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<zarr.core.Array '/2L' (49364325,) |S1>" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"genome['2L']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([b'a', b'a', b'c', ..., b'a', b'a', b'a'], dtype='|S1')" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"seq = genome['2L'][:]\n", | |
"seq" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Reference genome - local download\n", | |
"\n", | |
"You can also download the reference genome locally if you prefer..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"--2020-11-09 10:57:01-- https://storage.googleapis.com/vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz\n", | |
"Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.191.128, 173.194.192.128, 209.85.146.128, ...\n", | |
"Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.191.128|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 80872688 (77M) [application/gzip]\n", | |
"Saving to: ‘Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz’\n", | |
"\n", | |
"Anopheles-gambiae-P 100%[===================>] 77.13M 111MB/s in 0.7s \n", | |
"\n", | |
"2020-11-09 10:57:02 (111 MB/s) - ‘Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz’ saved [80872688/80872688]\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"!wget --no-clobber https://storage.googleapis.com/vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# --keep leaves the .gz in place so the `wget --no-clobber` cell above skips the download on re-run;\n", | |
"# --force overwrites a .fa left by a previous run instead of stopping because the output already exists\n", | |
"!gunzip --keep --force Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pyfasta" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<pyfasta.fasta.Fasta at 0x7f269a72d750>" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# key_fn keeps only the first whitespace-delimited token of each FASTA header\n", | |
"# line as the sequence key. The result is bound to a new name so that `genome`\n", | |
"# (the zarr group opened from GCS earlier) is not overwritten.\n", | |
"genome_fasta = pyfasta.Fasta('Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa', key_fn=lambda x: x.split()[0])\n", | |
"genome_fasta" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Genome annotations - read from GCS\n", | |
"\n", | |
"Genome annotations (the last version produced before VectorBase migrated to EuPathDB) can be read directly from GCS via petl..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import petl as etl\n", | |
"import petlx" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"geneset_path_gcs = 'gs://vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table class='petl'>\n", | |
"<thead>\n", | |
"<tr>\n", | |
"<th>seqid</th>\n", | |
"<th>source</th>\n", | |
"<th>type</th>\n", | |
"<th>start</th>\n", | |
"<th>end</th>\n", | |
"<th>score</th>\n", | |
"<th>strand</th>\n", | |
"<th>phase</th>\n", | |
"<th>attributes</th>\n", | |
"</tr>\n", | |
"</thead>\n", | |
"<tbody>\n", | |
"<tr>\n", | |
"<td>2L</td>\n", | |
"<td>VectorBase</td>\n", | |
"<td>chromosome</td>\n", | |
"<td style='text-align: right'>1</td>\n", | |
"<td style='text-align: right'>49364325</td>\n", | |
"<td>.</td>\n", | |
"<td>.</td>\n", | |
"<td>.</td>\n", | |
"<td>{'ID': '2L', 'Alias': 'CM000356.1'}</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>2L</td>\n", | |
"<td>VectorBase</td>\n", | |
"<td>gene</td>\n", | |
"<td style='text-align: right'>157348</td>\n", | |
"<td style='text-align: right'>186936</td>\n", | |
"<td>.</td>\n", | |
"<td>-</td>\n", | |
"<td>.</td>\n", | |
"<td>{'ID': 'AGAP004677', 'biotype': 'protein_coding', 'description': 'methylenetetrahydrofolate dehydrogenase(NAD ) / 5,10-methenyltetrahydrofolate [Source:VB Community Annotation]', 'version': '1'}</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>2L</td>\n", | |
"<td>VectorBase</td>\n", | |
"<td>mRNA</td>\n", | |
"<td style='text-align: right'>157348</td>\n", | |
"<td style='text-align: right'>181305</td>\n", | |
"<td>.</td>\n", | |
"<td>-</td>\n", | |
"<td>.</td>\n", | |
"<td>{'ID': 'AGAP004677-RA', 'Parent': 'AGAP004677', 'Dbxref': 'Celera_Pep:agCP1943,KEGG_Enzyme:00670 1.5.1.5 3.5.4.9,KEGG_Enzyme:00720 1.5.1.5 3.5.4.9,RefSeq:XM_001687731.1,RefSeq:XP_001687783.1,STRING:7165.AGAP004677-PA,UniParc:UPI0000020060,UniProtKB:A7UTF7,NCBI_GP:EDO64016.1', 'Ontology_term': 'GO:0003824,GO:0004477,GO:0004487,GO:0004488,GO:0035999,GO:0046653,GO:0055114', 'biotype': 'protein_coding', 'version': '1'}</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>2L</td>\n", | |
"<td>VectorBase</td>\n", | |
"<td>three_prime_UTR</td>\n", | |
"<td style='text-align: right'>157348</td>\n", | |
"<td style='text-align: right'>157495</td>\n", | |
"<td>.</td>\n", | |
"<td>-</td>\n", | |
"<td>.</td>\n", | |
"<td>{'Parent': 'AGAP004677-RA'}</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>2L</td>\n", | |
"<td>VectorBase</td>\n", | |
"<td>exon</td>\n", | |
"<td style='text-align: right'>157348</td>\n", | |
"<td style='text-align: right'>157623</td>\n", | |
"<td>.</td>\n", | |
"<td>-</td>\n", | |
"<td>.</td>\n", | |
"<td>{'Parent': 'AGAP004677-RA', 'Name': 'AGAP004677-RB-E4', 'constitutive': '1', 'rank': '4'}</td>\n", | |
"</tr>\n", | |
"</tbody>\n", | |
"</table>\n", | |
"<p><strong>...</strong></p>" | |
], | |
"text/plain": [ | |
"+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
"| seqid | source | type | start | end | score | strand | phase | attributes |\n", | |
"+=======+==============+===================+========+==========+=======+========+=======+====================================================================================================================================================================================================================================================================================================================================================================================================================================+\n", | |
"| '2L' | 'VectorBase' | 'chromosome' | 1 | 49364325 | '.' | '.' | '.' | {'ID': '2L', 'Alias': 'CM000356.1'} |\n", | |
"+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
"| '2L' | 'VectorBase' | 'gene' | 157348 | 186936 | '.' | '-' | '.' | {'ID': 'AGAP004677', 'biotype': 'protein_coding', 'description': 'methylenetetrahydrofolate dehydrogenase(NAD ) / 5,10-methenyltetrahydrofolate [Source:VB Community Annotation]', 'version': '1'} |\n", | |
"+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
"| '2L' | 'VectorBase' | 'mRNA' | 157348 | 181305 | '.' | '-' | '.' | {'ID': 'AGAP004677-RA', 'Parent': 'AGAP004677', 'Dbxref': 'Celera_Pep:agCP1943,KEGG_Enzyme:00670 1.5.1.5 3.5.4.9,KEGG_Enzyme:00720 1.5.1.5 3.5.4.9,RefSeq:XM_001687731.1,RefSeq:XP_001687783.1,STRING:7165.AGAP004677-PA,UniParc:UPI0000020060,UniProtKB:A7UTF7,NCBI_GP:EDO64016.1', 'Ontology_term': 'GO:0003824,GO:0004477,GO:0004487,GO:0004488,GO:0035999,GO:0046653,GO:0055114', 'biotype': 'protein_coding', 'version': '1'} |\n", | |
"+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
"| '2L' | 'VectorBase' | 'three_prime_UTR' | 157348 | 157495 | '.' | '-' | '.' | {'Parent': 'AGAP004677-RA'} |\n", | |
"+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
"| '2L' | 'VectorBase' | 'exon' | 157348 | 157623 | '.' | '-' | '.' | {'Parent': 'AGAP004677-RA', 'Name': 'AGAP004677-RB-E4', 'constitutive': '1', 'rank': '4'} |\n", | |
"+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
"..." | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tbl_geneset = etl.fromgff3(geneset_path_gcs)\n", | |
"tbl_geneset" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"If you prefer to use pandas, here's how to make a dataframe..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>seqid</th>\n", | |
" <th>source</th>\n", | |
" <th>type</th>\n", | |
" <th>start</th>\n", | |
" <th>end</th>\n", | |
" <th>score</th>\n", | |
" <th>strand</th>\n", | |
" <th>phase</th>\n", | |
" <th>ID</th>\n", | |
" <th>Name</th>\n", | |
" <th>Parent</th>\n", | |
" <th>biotype</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>chromosome</td>\n", | |
" <td>1</td>\n", | |
" <td>49364325</td>\n", | |
" <td>.</td>\n", | |
" <td>.</td>\n", | |
" <td>.</td>\n", | |
" <td>2L</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>gene</td>\n", | |
" <td>157348</td>\n", | |
" <td>186936</td>\n", | |
" <td>.</td>\n", | |
" <td>-</td>\n", | |
" <td>.</td>\n", | |
" <td>AGAP004677</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>protein_coding</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>mRNA</td>\n", | |
" <td>157348</td>\n", | |
" <td>181305</td>\n", | |
" <td>.</td>\n", | |
" <td>-</td>\n", | |
" <td>.</td>\n", | |
" <td>AGAP004677-RA</td>\n", | |
" <td>None</td>\n", | |
" <td>AGAP004677</td>\n", | |
" <td>protein_coding</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>three_prime_UTR</td>\n", | |
" <td>157348</td>\n", | |
" <td>157495</td>\n", | |
" <td>.</td>\n", | |
" <td>-</td>\n", | |
" <td>.</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>AGAP004677-RA</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>exon</td>\n", | |
" <td>157348</td>\n", | |
" <td>157623</td>\n", | |
" <td>.</td>\n", | |
" <td>-</td>\n", | |
" <td>.</td>\n", | |
" <td>None</td>\n", | |
" <td>AGAP004677-RB-E4</td>\n", | |
" <td>AGAP004677-RA</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" seqid source type start end score strand phase \\\n", | |
"0 2L VectorBase chromosome 1 49364325 . . . \n", | |
"1 2L VectorBase gene 157348 186936 . - . \n", | |
"2 2L VectorBase mRNA 157348 181305 . - . \n", | |
"3 2L VectorBase three_prime_UTR 157348 157495 . - . \n", | |
"4 2L VectorBase exon 157348 157623 . - . \n", | |
"\n", | |
" ID Name Parent biotype \n", | |
"0 2L None None None \n", | |
"1 AGAP004677 None None protein_coding \n", | |
"2 AGAP004677-RA None AGAP004677 protein_coding \n", | |
"3 None None AGAP004677-RA None \n", | |
"4 None AGAP004677-RB-E4 AGAP004677-RA None " | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_geneset = (\n", | |
" tbl_geneset\n", | |
" # choose the attributes you want to include as columns\n", | |
" .unpackdict('attributes', ['ID', 'Name', 'Parent', 'biotype'])\n", | |
" .todataframe()\n", | |
")\n", | |
"df_geneset.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Note that scikit-allel has a `gff3_to_dataframe` function, but it does not currently support reading directly from cloud storage (PRs welcome)." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
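"## Genome annotations - local download\n", | |
"\n", | |
"You can also download the genome annotations locally and load them into a dataframe with scikit-allel..." | |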
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"--2020-11-09 10:57:42-- https://storage.googleapis.com/vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz\n", | |
"Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.194.128, 64.233.191.128, 142.250.125.128, ...\n", | |
"Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.194.128|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 2724130 (2.6M) [application/gzip]\n", | |
"Saving to: ‘Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz’\n", | |
"\n", | |
"Anopheles-gambiae-P 100%[===================>] 2.60M --.-KB/s in 0.02s \n", | |
"\n", | |
"2020-11-09 10:57:42 (158 MB/s) - ‘Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz’ saved [2724130/2724130]\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"!wget --no-clobber https://storage.googleapis.com/vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import allel" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>seqid</th>\n", | |
" <th>source</th>\n", | |
" <th>type</th>\n", | |
" <th>start</th>\n", | |
" <th>end</th>\n", | |
" <th>score</th>\n", | |
" <th>strand</th>\n", | |
" <th>phase</th>\n", | |
" <th>ID</th>\n", | |
" <th>Name</th>\n", | |
" <th>Parent</th>\n", | |
" <th>biotype</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>chromosome</td>\n", | |
" <td>1</td>\n", | |
" <td>49364325</td>\n", | |
" <td>-1</td>\n", | |
" <td>.</td>\n", | |
" <td>-1</td>\n", | |
" <td>2L</td>\n", | |
" <td>.</td>\n", | |
" <td>.</td>\n", | |
" <td>.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>gene</td>\n", | |
" <td>157348</td>\n", | |
" <td>186936</td>\n", | |
" <td>-1</td>\n", | |
" <td>-</td>\n", | |
" <td>-1</td>\n", | |
" <td>AGAP004677</td>\n", | |
" <td>.</td>\n", | |
" <td>.</td>\n", | |
" <td>protein_coding</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>mRNA</td>\n", | |
" <td>157348</td>\n", | |
" <td>181305</td>\n", | |
" <td>-1</td>\n", | |
" <td>-</td>\n", | |
" <td>-1</td>\n", | |
" <td>AGAP004677-RA</td>\n", | |
" <td>.</td>\n", | |
" <td>AGAP004677</td>\n", | |
" <td>protein_coding</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>three_prime_UTR</td>\n", | |
" <td>157348</td>\n", | |
" <td>157495</td>\n", | |
" <td>-1</td>\n", | |
" <td>-</td>\n", | |
" <td>-1</td>\n", | |
" <td>.</td>\n", | |
" <td>.</td>\n", | |
" <td>AGAP004677-RA</td>\n", | |
" <td>.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2L</td>\n", | |
" <td>VectorBase</td>\n", | |
" <td>exon</td>\n", | |
" <td>157348</td>\n", | |
" <td>157623</td>\n", | |
" <td>-1</td>\n", | |
" <td>-</td>\n", | |
" <td>-1</td>\n", | |
" <td>.</td>\n", | |
" <td>AGAP004677-RB-E4</td>\n", | |
" <td>AGAP004677-RA</td>\n", | |
" <td>.</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" seqid source type start end score strand phase \\\n", | |
"0 2L VectorBase chromosome 1 49364325 -1 . -1 \n", | |
"1 2L VectorBase gene 157348 186936 -1 - -1 \n", | |
"2 2L VectorBase mRNA 157348 181305 -1 - -1 \n", | |
"3 2L VectorBase three_prime_UTR 157348 157495 -1 - -1 \n", | |
"4 2L VectorBase exon 157348 157623 -1 - -1 \n", | |
"\n", | |
" ID Name Parent biotype \n", | |
"0 2L . . . \n", | |
"1 AGAP004677 . . protein_coding \n", | |
"2 AGAP004677-RA . AGAP004677 protein_coding \n", | |
"3 . . AGAP004677-RA . \n", | |
"4 . AGAP004677-RB-E4 AGAP004677-RA . " | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_geneset = allel.gff3_to_dataframe('Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz',\n", | |
" attributes=['ID', 'Name', 'Parent', 'biotype'])\n", | |
"df_geneset.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment