The following script takes a space separated list of GenBank numbers as input, and then uses BioPython to download the corresponding sequences from GenBank, strips off all non-coding nucleotides, gives the sequences sensible names, and assembles them into a FASTA file. It is pretty basic, and does not do a lot of fancy error checking, and is probably a little too specific to be useful for most people. I can imagine extending it in a number of ways that would make it much more useful in a number of different contexts. For example, parsing the GenBank data into multiple FASTA files on a gene-by-gene basis. Nonetheless, despite its dubious general utility, I’m still posting it here. This is partly so that other people can use it as a starting point to develop scripts more applicable for their own work, but also so that I myself do not have to wade through several (very useful!) BioPython tutorials if I ever want to do this again.

class Gene(object):

def __init__(self, seq_record, feature):
self.gb_num = seq_record.id
self.type = feature.type
self.start = feature.location.start.position
self.end = feature.location.end.position
self.seq = seq_record.seq[self.start:self.end]
try:
self.name = feature.qualifiers['gene'][0]
except KeyError:
self.name = None
try:
self.codon_start = feature.qualifiers['codon_start'][0]
except KeyError:
self.codon_start = None
try:
self.translation = feature.qualifiers['translation'][0]
except KeyError:
self.translation = None

def __len__(self):
return len(self.seq)

def chars(self):
return str(self.seq)

def main(): """ Main CLI handler. """

parser = OptionParser(usage=_prog_usage,
version=_prog_version,
description=_prog_description)

action='store_const',
dest='output',
metavar='PREFIX',
help='prefix for output file names')

action='store_true',
dest='wrap',
default=False,
help='wrap sequences (slows things down ...)')

(opts, args) = parser.parse_args()

if len(args) == 0:
args = [a for a in args.split("\n") if a]

if opts.output is None:
out = sys.stdout
else:
out = open(opts.output, "w")

gb_recs = Entrez.efetch(db="nucleotide", rettype="gb", id=",".join(args))

for seq_record in SeqIO.parse(gb_recs, "genbank"):
dp = seq_record.description.split(' ')
label = seq_record.id + "_" + dp[0] + "_" + dp[1]
sys.stderr.write("[%s: %s]\n" % (seq_record.id, seq_record.description))
sys.stderr.write("%10s    % 7s    % 7s    % 8s\n" \
% ("Gene",
"Start",
"End",
"Length"))
codons = StringIO.StringIO()
for f in seq_record.features:
if f.type == "CDS":
g = Gene(seq_record, f)
sys.stderr.write("%10s    % 7d    % 7d    % 8d\n" \
% (g.name,
g.start,
g.end,
len(g)))
codons.write(g.chars())

out.write(">%s\n" % label)
codons = codons.getvalue()
if opts.wrap:
out.write("%s\n\n" % textwrap.fill(codons, width=76))
else:
out.write("%s\n\n" % codons)

if __name__ == '__main__':
main()
Share