annotate select_product.py @ 8:53f1f83e80a1 draft

Uploaded
author brigidar
date Fri, 09 Oct 2015 11:52:31 -0400
parents 6497d42015aa
children 82186605ba75
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
1 #Adapted from gbk_to_gff
6497d42015aa Uploaded
brigidar
parents:
diff changeset
2 # Brigida Rusconi, UTSA
6497d42015aa Uploaded
brigidar
parents:
diff changeset
3 # October 8 2015
6497d42015aa Uploaded
brigidar
parents:
diff changeset
4 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description
6497d42015aa Uploaded
brigidar
parents:
diff changeset
5 # adds a given number of bases on each side of the transcript for the exclusion set
6497d42015aa Uploaded
brigidar
parents:
diff changeset
6
6497d42015aa Uploaded
brigidar
parents:
diff changeset
7
6497d42015aa Uploaded
brigidar
parents:
diff changeset
8
6497d42015aa Uploaded
brigidar
parents:
diff changeset
9 #!/usr/bin/env python
6497d42015aa Uploaded
brigidar
parents:
diff changeset
10
6497d42015aa Uploaded
brigidar
parents:
diff changeset
11
6497d42015aa Uploaded
brigidar
parents:
diff changeset
12 import os
6497d42015aa Uploaded
brigidar
parents:
diff changeset
13 import re
6497d42015aa Uploaded
brigidar
parents:
diff changeset
14 import sys
6497d42015aa Uploaded
brigidar
parents:
diff changeset
15 import collections
6497d42015aa Uploaded
brigidar
parents:
diff changeset
16 import pdb
6497d42015aa Uploaded
brigidar
parents:
diff changeset
17 import argparse
6497d42015aa Uploaded
brigidar
parents:
diff changeset
18 from Bio import SeqIO
6497d42015aa Uploaded
brigidar
parents:
diff changeset
19
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
20 def open_file(fname):
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
21 """
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
22 Open the file (supports .gz .bz2) and returns the handler
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
23
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
24 @args fname: input file name for reading
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
25 @type fname: str
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
26 """
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
27
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
28 try:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
29 if os.path.splitext(fname)[1] == ".gz":
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
30 FH = gzip.open(fname, 'rb')
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
31 elif os.path.splitext(fname)[1] == ".bz2":
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
32 FH = bz2.BZ2File(fname, 'rb')
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
33 else:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
34 FH = open(fname, 'rU')
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
35 except Exception as error:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
36 sys.exit(error)
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
37
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
38 return FH
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
39
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
40
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
41 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n):
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
42 """
6497d42015aa Uploaded
brigidar
parents:
diff changeset
43 Write the feature information
6497d42015aa Uploaded
brigidar
parents:
diff changeset
44 """
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
45
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
46 lines = []
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
47
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
48 for gname, ginfo in genes.items():
6497d42015aa Uploaded
brigidar
parents:
diff changeset
49 line = [str(chr_id),
6497d42015aa Uploaded
brigidar
parents:
diff changeset
50 ginfo[3],
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
51 str(ginfo[0]-n),
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
52 str(ginfo[1]+n),
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
53 ginfo[2],
6497d42015aa Uploaded
brigidar
parents:
diff changeset
54 str(product)]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
55
6497d42015aa Uploaded
brigidar
parents:
diff changeset
56 # to key on specfic products only
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
57 for k in keys_1.split(","):
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
58 if k in str(product):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
59
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
60 lines.append('\t'.join(line)+"\n")
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
61 break
6497d42015aa Uploaded
brigidar
parents:
diff changeset
62
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
63 return lines
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
64
6497d42015aa Uploaded
brigidar
parents:
diff changeset
65
6497d42015aa Uploaded
brigidar
parents:
diff changeset
66
6497d42015aa Uploaded
brigidar
parents:
diff changeset
67
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
68 def gbk_parse(fname,keys_1,n,outfile=""):
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
69 """
6497d42015aa Uploaded
brigidar
parents:
diff changeset
70 Extract genome annotation recods from genbank format
6497d42015aa Uploaded
brigidar
parents:
diff changeset
71
6497d42015aa Uploaded
brigidar
parents:
diff changeset
72 @args fname: gbk file name
6497d42015aa Uploaded
brigidar
parents:
diff changeset
73 @type fname: str
6497d42015aa Uploaded
brigidar
parents:
diff changeset
74 """
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
75 fhand = open_file(gbkfname)
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
76 lines = []
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
77
6497d42015aa Uploaded
brigidar
parents:
diff changeset
78 for record in SeqIO.parse(fhand, "genbank"):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
79 gene_tags = dict()
6497d42015aa Uploaded
brigidar
parents:
diff changeset
80 tx_tags = collections.defaultdict(list)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
81 exon = collections.defaultdict(list)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
82 cds = collections.defaultdict(list)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
83 mol_type, chr_id = None, None
6497d42015aa Uploaded
brigidar
parents:
diff changeset
84
6497d42015aa Uploaded
brigidar
parents:
diff changeset
85 for rec in record.features:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
86
6497d42015aa Uploaded
brigidar
parents:
diff changeset
87
6497d42015aa Uploaded
brigidar
parents:
diff changeset
88 product = "NA"
6497d42015aa Uploaded
brigidar
parents:
diff changeset
89
6497d42015aa Uploaded
brigidar
parents:
diff changeset
90 if rec.qualifiers.has_key('product'):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
91
6497d42015aa Uploaded
brigidar
parents:
diff changeset
92 product = rec.qualifiers['product'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
93
6497d42015aa Uploaded
brigidar
parents:
diff changeset
94
6497d42015aa Uploaded
brigidar
parents:
diff changeset
95 if rec.type == 'source':
6497d42015aa Uploaded
brigidar
parents:
diff changeset
96 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
97 mol_type = rec.qualifiers['mol_type'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
98 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
99 mol_type = '.'
6497d42015aa Uploaded
brigidar
parents:
diff changeset
100 pass
6497d42015aa Uploaded
brigidar
parents:
diff changeset
101 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
102 chr_id = rec.qualifiers['chromosome'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
103 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
104 chr_id = record.name
6497d42015aa Uploaded
brigidar
parents:
diff changeset
105 continue
6497d42015aa Uploaded
brigidar
parents:
diff changeset
106
6497d42015aa Uploaded
brigidar
parents:
diff changeset
107 strand='-'
6497d42015aa Uploaded
brigidar
parents:
diff changeset
108 strand='+' if rec.strand>0 else strand
6497d42015aa Uploaded
brigidar
parents:
diff changeset
109
6497d42015aa Uploaded
brigidar
parents:
diff changeset
110 fid = None
6497d42015aa Uploaded
brigidar
parents:
diff changeset
111 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
112 fid = rec.qualifiers['gene'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
113 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
114 pass
6497d42015aa Uploaded
brigidar
parents:
diff changeset
115
6497d42015aa Uploaded
brigidar
parents:
diff changeset
116 transcript_id = None
6497d42015aa Uploaded
brigidar
parents:
diff changeset
117 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
118 transcript_id = rec.qualifiers['transcript_id'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
119 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
120 pass
6497d42015aa Uploaded
brigidar
parents:
diff changeset
121
6497d42015aa Uploaded
brigidar
parents:
diff changeset
122 if re.search(r'gene', rec.type):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
123 gene_tags[fid] = (rec.location._start.position+1,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
124 rec.location._end.position,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
125 strand,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
126 rec.type
6497d42015aa Uploaded
brigidar
parents:
diff changeset
127 )
6497d42015aa Uploaded
brigidar
parents:
diff changeset
128 elif rec.type == 'exon':
6497d42015aa Uploaded
brigidar
parents:
diff changeset
129 exon[fid].append((rec.location._start.position+1,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
130 rec.location._end.position))
6497d42015aa Uploaded
brigidar
parents:
diff changeset
131 elif rec.type=='CDS':
6497d42015aa Uploaded
brigidar
parents:
diff changeset
132 cds[fid].append((rec.location._start.position+1,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
133 rec.location._end.position))
6497d42015aa Uploaded
brigidar
parents:
diff changeset
134 else:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
135 # get all transcripts
6497d42015aa Uploaded
brigidar
parents:
diff changeset
136 if transcript_id:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
137 tx_tags[fid].append((rec.location._start.position+1,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
138 rec.location._end.position,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
139 transcript_id,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
140 rec.type))
6497d42015aa Uploaded
brigidar
parents:
diff changeset
141 # record extracted, generate feature table
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
142 lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n)
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
143
6497d42015aa Uploaded
brigidar
parents:
diff changeset
144 fhand.close()
6497d42015aa Uploaded
brigidar
parents:
diff changeset
145
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
146 return lines
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
147
6497d42015aa Uploaded
brigidar
parents:
diff changeset
148 if __name__=='__main__':
6497d42015aa Uploaded
brigidar
parents:
diff changeset
149
6497d42015aa Uploaded
brigidar
parents:
diff changeset
150 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
151 parser = argparse.ArgumentParser()
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
152 parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list")
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
153 parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
154 parser.add_argument('-n', '--number', type=int, help="flanking region to include")
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
155 parser.add_argument('-o', '--output', help="output file")
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
156
6497d42015aa Uploaded
brigidar
parents:
diff changeset
157 args = parser.parse_args()
6497d42015aa Uploaded
brigidar
parents:
diff changeset
158 gbkfname = args.gbk_file
6497d42015aa Uploaded
brigidar
parents:
diff changeset
159 n=args.number
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
160 keys_1=args.keys_1
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
161 output=args.output
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
162
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
163 except Exception,e:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
164 print "Error: %s" % e
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
165 sys.exit(-1)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
166
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
167 ## extract gbk records
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
168
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
169 print "Filtering on keys: %s" % keys_1
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
170 lines = gbk_parse(gbkfname,keys_1,n)
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
171
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
172 with open(output, 'w') as of:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
173 for l in lines:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
174 of.write(l)
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
175
6497d42015aa Uploaded
brigidar
parents:
diff changeset
176
6497d42015aa Uploaded
brigidar
parents:
diff changeset
177
6497d42015aa Uploaded
brigidar
parents:
diff changeset
178