comparison select_product.py @ 0:6497d42015aa draft

Uploaded
author brigidar
date Thu, 08 Oct 2015 15:36:12 -0400
parents
children 53f1f83e80a1
comparison
equal deleted inserted replaced
-1:000000000000 0:6497d42015aa
1 #Adapted from gbk_to_gff
2 # Brigida Rusconi, UTSA
3 # October 8 2015
4 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description
5 # adds a given number of bases on each side of the transcript for the exclusion set
6
7
8
9 #!/usr/bin/env python
10 """
11 Convert data from Genbank format to GFF.
12
13 Usage:
14 python gbk_to_gff.py in.gbk > out.gff
15
16 Requirements:
17 BioPython:- http://biopython.org/
18 helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py
19
20 Copyright (C)
21 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
22 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
23 """
24
25 import os
26 import re
27 import sys
28 import helper
29 import collections
30 import pdb
31 import argparse
32 from Bio import SeqIO
33
34 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n):
35 """
36 Write the feature information
37 """
38 for gname, ginfo in genes.items():
39 line = [str(chr_id),
40 ginfo[3],
41 str(ginfo[0]-int(n)),
42 str(ginfo[1]+int(n)),
43 ginfo[2],
44 str(product)]
45
46 # to key on specfic products only
47 for k in keys.split(","):
48 if k in str(product):
49
50 sys.stdout.write('\t'.join(line)+"\n")
51 break
52
53
54 unk +=1
55 return unk
56
57
58
59
60 def gbk_parse(fname,keys,n):
61 """
62 Extract genome annotation recods from genbank format
63
64 @args fname: gbk file name
65 @type fname: str
66 """
67 fhand = helper.open_file(gbkfname)
68 unk = 1
69
70 for record in SeqIO.parse(fhand, "genbank"):
71 gene_tags = dict()
72 tx_tags = collections.defaultdict(list)
73 exon = collections.defaultdict(list)
74 cds = collections.defaultdict(list)
75 mol_type, chr_id = None, None
76
77 for rec in record.features:
78
79
80 product = "NA"
81
82 if rec.qualifiers.has_key('product'):
83
84 product = rec.qualifiers['product'][0]
85
86
87 if rec.type == 'source':
88 try:
89 mol_type = rec.qualifiers['mol_type'][0]
90 except:
91 mol_type = '.'
92 pass
93 try:
94 chr_id = rec.qualifiers['chromosome'][0]
95 except:
96 chr_id = record.name
97 continue
98
99 strand='-'
100 strand='+' if rec.strand>0 else strand
101
102 fid = None
103 try:
104 fid = rec.qualifiers['gene'][0]
105 except:
106 pass
107
108 transcript_id = None
109 try:
110 transcript_id = rec.qualifiers['transcript_id'][0]
111 except:
112 pass
113
114 if re.search(r'gene', rec.type):
115 gene_tags[fid] = (rec.location._start.position+1,
116 rec.location._end.position,
117 strand,
118 rec.type
119 )
120 elif rec.type == 'exon':
121 exon[fid].append((rec.location._start.position+1,
122 rec.location._end.position))
123 elif rec.type=='CDS':
124 cds[fid].append((rec.location._start.position+1,
125 rec.location._end.position))
126 else:
127 # get all transcripts
128 if transcript_id:
129 tx_tags[fid].append((rec.location._start.position+1,
130 rec.location._end.position,
131 transcript_id,
132 rec.type))
133 # record extracted, generate feature table
134 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n)
135
136 fhand.close()
137
138
139 if __name__=='__main__':
140
141 try:
142 parser = argparse.ArgumentParser()
143 parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list")
144 parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
145 parser.add_argument('-n', '--number', help="flanking region to include")
146
147 args = parser.parse_args()
148 gbkfname = args.gbk_file
149 n=args.number
150 keys=args.keys
151
152 except:
153 print __doc__
154 sys.exit(-1)
155
156 ## extract gbk records
157 gbk_parse(gbkfname,keys,n)
158
159
160
161