Last active
December 24, 2015 14:49
-
-
Save akloster/6815711 to your computer and use it in GitHub Desktop.
Script to download all "complete" bacterial genomes from NCBI and prepare GC skew plots from them.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# <nbformat>3.0</nbformat> | |
# <codecell> | |
from numpy import * | |
from pandas import * | |
from scipy.signal import argrelextrema | |
# <codecell> | |
# Download a list of all complete bacterial genomes from NCBI | |
import urllib | |
url = "http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=download&orgn=&report=proks&status=Complete|&group=--%20All%20Bacteria%20--&subgroup=--%20All%20Bacteria%20--" | |
data = read_csv(urllib.urlopen(url), na_values='-', sep="\t") | |
no_refseq = isnull(data['Chromosomes/RefSeq']) | |
print "Without RefSeq: %i/%i" % (sum(no_refseq), len(no_refseq)) | |
data = data[~no_refseq] | |
# <codecell> | |
# Take a peek at the table columns | |
data | |
# <codecell> | |
# Distribution of gene counts | |
data['Genes'].hist() | |
# <codecell> | |
# Distribution of genome sizes | |
data['Size (Mb)'].hist() | |
# <codecell> | |
# How many data do we have to download? | |
sum(data['Size (Mb)']) | |
# <codecell> | |
import os | |
try: | |
os.mkdir('fasta') | |
except OSError: | |
pass | |
try: | |
os.mkdir('skew_plots') | |
except OSError: | |
pass | |
# <codecell> | |
from Bio import Entrez, SeqIO | |
Entrez.email = "" # Always tell NCBI who you are | |
# <codecell> | |
for index, row in data.iterrows(): | |
for refseq in row['Chromosomes/RefSeq'].split(','): | |
filename = "fasta/%s.fasta" % (refseq,) | |
if os.path.exists(filename): | |
continue | |
print "%i/%i" % (index, len(data)) , refseq, row['#Organism/Name'] | |
handle = Entrez.efetch(db="nucleotide", id=refseq, rettype="fasta", retmode="text") | |
seq = SeqIO.read(handle, 'fasta') | |
output_handle = open(filename, "w") | |
SeqIO.write(seq, output_handle, "fasta") | |
output_handle.close() | |
# <codecell> | |
# <codecell> | |
def skew_increments(s): | |
return [{'C': -1, 'A':0, 'T':0, 'G':1, 'N':0, 'S':0,'R':0, 'D':0, 'Y':0, 'K':0, 'M':0, 'W':0, 'B':0, 'V':0, 'H':0}[c] for c in s] | |
def gc_skew(s): | |
return cumsum([0]+skew_increments(s)) | |
# <codecell> | |
# Calculate and plot a simple example | |
seq = SeqIO.read(file("fasta/NC_017492.1.fasta"), "fasta").seq | |
skew = gc_skew(seq)[0::10000] | |
figure(figsize=(3,3)) | |
plot(skew) | |
minima = argrelextrema(skew, less, order=10)[0] | |
# <codecell> | |
# <codecell> | |
# Iterate over each organism | |
for index, row in data.iterrows(): | |
# Each organism can have multiple chromosomes | |
for refseq in row['Chromosomes/RefSeq'].split(','): | |
filename = "fasta/%s.fasta" % (refseq,) | |
if not os.path.exists(filename): | |
continue | |
image_name = "%s(%s)" % (row['#Organism/Name'], refseq) | |
image_name = "skew_plots/" + image_name.replace(" ", "_").replace(".", "").replace("/"," of ") + ".png" | |
if os.path.exists(image_name): | |
continue | |
try: | |
seq = SeqIO.read(file(filename), "fasta").seq | |
except ValueError: | |
print "error:", filename | |
continue | |
skew = gc_skew(seq)[0::10000] | |
minima = argrelextrema(skew, less, order=10)[0] | |
print "%i/%i" % (index, len(data)), refseq, row['#Organism/Name'], len(minima) | |
figure(figsize=(3,3)) | |
plot(skew) | |
title("%s\n%s" % (row['#Organism/Name'], refseq)) | |
savefig(image_name, bbox_inches='tight', pad_inches=0.5) | |
# <codecell> | |
1 | |
# <codecell> | |
# <codecell> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment