Mercurial > repos > vipints > fml_gff3togtf
view gbk_to_gff.py @ 11:5c6f33e20fcc default tip
requirement tag added
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Fri, 24 Apr 2015 18:04:27 -0400 |
parents | c42c69aa81f8 |
children |
line wrap: on
line source
#!/usr/bin/env python """ Convert data from Genbank format to GFF. Usage: python gbk_to_gff.py in.gbk > out.gff Requirements: BioPython:- http://biopython.org/ helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. """ import os import re import sys import helper import collections from Bio import SeqIO def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk): """ Write the feature information """ for gname, ginfo in genes.items(): line = [str(chr_id), 'gbk2gff', ginfo[3], str(ginfo[0]), str(ginfo[1]), '.', ginfo[2], '.', 'ID=%s;Name=%s' % (str(gname), str(gname))] sys.stdout.write('\t'.join(line)+"\n") ## construct the transcript line is not defined in the original file t_line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', ginfo[2], '.'] if not transcripts: t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname))) if exons: ## get the entire transcript region from the defined feature t_line[3] = str(exons[gname][0][0]) t_line[4] = str(exons[gname][0][-1]) elif cds: t_line[3] = str(cds[gname][0][0]) t_line[4] = str(cds[gname][0][-1]) if not cds: t_line[2] = 'transcript' else: t_line[2] = 'mRNA' sys.stdout.write('\t'.join(t_line)+"\n") if exons: exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon') if cds: exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS') if not exons: exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon') else: ## transcript is defined for idx in transcripts[gname]: t_line[2] = idx[3] t_line[3] = str(idx[0]) t_line[4] = str(idx[1]) t_line.append('ID='+str(idx[2])+';Parent='+str(gname)) sys.stdout.write('\t'.join(t_line)+"\n") ## feature line print call if exons: exon_line_print(t_line, exons[gname], str(idx[2]), 'exon') if cds: exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS') if not exons: exon_line_print(t_line, cds[gname], str(idx[2]), 'exon') if len(genes) == 0: ## feature entry with fragment information line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', orient, '.'] fStart = fStop = None for eid, ex in cds.items(): fStart = ex[0][0] fStop = ex[0][-1] for eid, ex in exons.items(): fStart = ex[0][0] fStop = ex[0][-1] if fStart or fStart: line[2] = 'gene' line[3] = str(fStart) line[4] = str(fStop) line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk)) sys.stdout.write('\t'.join(line)+"\n") if not cds: line[2] = 'transcript' else: line[2] = 'mRNA' line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk) sys.stdout.write('\t'.join(line)+"\n") if exons: exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') if cds: exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS') if not exons: exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') unk +=1 return unk def exon_line_print(temp_line, trx_exons, parent, ftype): """ Print the EXON feature line """ for ex in trx_exons: temp_line[2] = ftype temp_line[3] = str(ex[0]) temp_line[4] = str(ex[1]) temp_line[8] = 'Parent=%s' % parent sys.stdout.write('\t'.join(temp_line)+"\n") def gbk_parse(fname): """ Extract genome annotation recods from genbank format @args fname: gbk file name @type fname: str """ fhand = helper.open_file(gbkfname) unk = 1 for record in SeqIO.parse(fhand, "genbank"): gene_tags = dict() tx_tags = collections.defaultdict(list) exon = collections.defaultdict(list) cds = collections.defaultdict(list) mol_type, chr_id = None, None for rec in record.features: if rec.type == 'source': try: mol_type = rec.qualifiers['mol_type'][0] except: mol_type = '.' pass try: chr_id = rec.qualifiers['chromosome'][0] except: chr_id = record.name continue strand='-' strand='+' if rec.strand>0 else strand fid = None try: fid = rec.qualifiers['gene'][0] except: pass transcript_id = None try: transcript_id = rec.qualifiers['transcript_id'][0] except: pass if re.search(r'gene', rec.type): gene_tags[fid] = (rec.location._start.position+1, rec.location._end.position, strand, rec.type ) elif rec.type == 'exon': exon[fid].append((rec.location._start.position+1, rec.location._end.position)) elif rec.type=='CDS': cds[fid].append((rec.location._start.position+1, rec.location._end.position)) else: # get all transcripts if transcript_id: tx_tags[fid].append((rec.location._start.position+1, rec.location._end.position, transcript_id, rec.type)) # record extracted, generate feature table unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk) fhand.close() if __name__=='__main__': try: gbkfname = sys.argv[1] except: print __doc__ sys.exit(-1) ## extract gbk records gbk_parse(gbkfname)