annotate select_product.py @ 10:82186605ba75 draft default tip

Uploaded
author brigidar
date Fri, 09 Oct 2015 17:30:50 -0400
parents 53f1f83e80a1
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
1 #Adapted from gbk_to_gff
6497d42015aa Uploaded
brigidar
parents:
diff changeset
2 # Brigida Rusconi, UTSA
6497d42015aa Uploaded
brigidar
parents:
diff changeset
3 # October 8 2015
6497d42015aa Uploaded
brigidar
parents:
diff changeset
4 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description
6497d42015aa Uploaded
brigidar
parents:
diff changeset
5 # adds a given number of bases on each side of the transcript for the exclusion set
6497d42015aa Uploaded
brigidar
parents:
diff changeset
6
6497d42015aa Uploaded
brigidar
parents:
diff changeset
7
6497d42015aa Uploaded
brigidar
parents:
diff changeset
8
6497d42015aa Uploaded
brigidar
parents:
diff changeset
9 #!/usr/bin/env python
6497d42015aa Uploaded
brigidar
parents:
diff changeset
10
6497d42015aa Uploaded
brigidar
parents:
diff changeset
11
6497d42015aa Uploaded
brigidar
parents:
diff changeset
12 import os
6497d42015aa Uploaded
brigidar
parents:
diff changeset
13 import re
6497d42015aa Uploaded
brigidar
parents:
diff changeset
14 import sys
6497d42015aa Uploaded
brigidar
parents:
diff changeset
15 import collections
6497d42015aa Uploaded
brigidar
parents:
diff changeset
16 import pdb
6497d42015aa Uploaded
brigidar
parents:
diff changeset
17 import argparse
6497d42015aa Uploaded
brigidar
parents:
diff changeset
18 from Bio import SeqIO
6497d42015aa Uploaded
brigidar
parents:
diff changeset
19
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
20 def open_file(fname):
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
21 """
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
22 Open the file (supports .gz .bz2) and returns the handler
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
23
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
24 @args fname: input file name for reading
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
25 @type fname: str
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
26 """
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
27
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
28 try:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
29 if os.path.splitext(fname)[1] == ".gz":
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
30 FH = gzip.open(fname, 'rb')
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
31 elif os.path.splitext(fname)[1] == ".bz2":
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
32 FH = bz2.BZ2File(fname, 'rb')
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
33 else:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
34 FH = open(fname, 'rU')
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
35 except Exception as error:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
36 sys.exit(error)
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
37
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
38 return FH
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
39
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
40
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
41 def gbk_parse(fname,keys_1,n,outfile=""):
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
42 """
6497d42015aa Uploaded
brigidar
parents:
diff changeset
43 Extract genome annotation recods from genbank format
6497d42015aa Uploaded
brigidar
parents:
diff changeset
44
6497d42015aa Uploaded
brigidar
parents:
diff changeset
45 @args fname: gbk file name
6497d42015aa Uploaded
brigidar
parents:
diff changeset
46 @type fname: str
6497d42015aa Uploaded
brigidar
parents:
diff changeset
47 """
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
48 fhand = open_file(gbkfname)
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
49 lines = []
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
50
6497d42015aa Uploaded
brigidar
parents:
diff changeset
51 for record in SeqIO.parse(fhand, "genbank"):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
52 gene_tags = dict()
10
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
53
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
54 mol_type, chr_id = None, None
6497d42015aa Uploaded
brigidar
parents:
diff changeset
55
6497d42015aa Uploaded
brigidar
parents:
diff changeset
56 for rec in record.features:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
57
6497d42015aa Uploaded
brigidar
parents:
diff changeset
58
6497d42015aa Uploaded
brigidar
parents:
diff changeset
59 product = "NA"
6497d42015aa Uploaded
brigidar
parents:
diff changeset
60
6497d42015aa Uploaded
brigidar
parents:
diff changeset
61 if rec.qualifiers.has_key('product'):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
62
6497d42015aa Uploaded
brigidar
parents:
diff changeset
63 product = rec.qualifiers['product'][0]
10
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
64
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
65
6497d42015aa Uploaded
brigidar
parents:
diff changeset
66 if rec.type == 'source':
6497d42015aa Uploaded
brigidar
parents:
diff changeset
67 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
68 mol_type = rec.qualifiers['mol_type'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
69 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
70 mol_type = '.'
6497d42015aa Uploaded
brigidar
parents:
diff changeset
71 pass
6497d42015aa Uploaded
brigidar
parents:
diff changeset
72 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
73 chr_id = rec.qualifiers['chromosome'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
74 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
75 chr_id = record.name
6497d42015aa Uploaded
brigidar
parents:
diff changeset
76 continue
10
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
77
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
78
10
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
79 start = int(rec.location.start)+1-n
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
80 stop = int(rec.location.end)+n
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
81
10
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
82 if start < 0:
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
83 start = 1
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
84
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
85 for k in keys_1.split(","):
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
86
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
87 if str(k) in str(product):
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
88 lines.append('\t'.join([str(chr_id),str(start), str(stop), product])+"\n")
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
89
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
90
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
91 fhand.close()
6497d42015aa Uploaded
brigidar
parents:
diff changeset
92
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
93 return lines
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
94
10
82186605ba75 Uploaded
brigidar
parents: 8
diff changeset
95 if __name__=='__main__':
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
96
6497d42015aa Uploaded
brigidar
parents:
diff changeset
97 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
98 parser = argparse.ArgumentParser()
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
99 parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list")
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
100 parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
101 parser.add_argument('-n', '--number', type=int, help="flanking region to include")
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
102 parser.add_argument('-o', '--output', help="output file")
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
103
6497d42015aa Uploaded
brigidar
parents:
diff changeset
104 args = parser.parse_args()
6497d42015aa Uploaded
brigidar
parents:
diff changeset
105 gbkfname = args.gbk_file
6497d42015aa Uploaded
brigidar
parents:
diff changeset
106 n=args.number
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
107 keys_1=args.keys_1
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
108 output=args.output
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
109
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
110 except Exception,e:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
111 print "Error: %s" % e
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
112 sys.exit(-1)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
113
8
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
114 ## extract gbk records
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
115
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
116 print "Filtering on keys: %s" % keys_1
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
117 lines = gbk_parse(gbkfname,keys_1,n)
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
118
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
119 with open(output, 'w') as of:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
120 for l in lines:
53f1f83e80a1 Uploaded
brigidar
parents: 0
diff changeset
121 of.write(l)
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
122
6497d42015aa Uploaded
brigidar
parents:
diff changeset
123
6497d42015aa Uploaded
brigidar
parents:
diff changeset
124
6497d42015aa Uploaded
brigidar
parents:
diff changeset
125