0
|
1 #Adapted from gbk_to_gff
|
|
2 # Brigida Rusconi, UTSA
|
|
3 # October 8 2015
|
|
4 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description
|
|
5 # adds a given number of bases on each side of the transcript for the exclusion set
|
|
6
|
|
7
|
|
8
|
|
9 #!/usr/bin/env python
|
|
10
|
|
11
|
|
12 import os
|
|
13 import re
|
|
14 import sys
|
|
15 import collections
|
|
16 import pdb
|
|
17 import argparse
|
|
18 from Bio import SeqIO
|
|
19
|
8
|
20 def open_file(fname):
|
|
21 """
|
|
22 Open the file (supports .gz .bz2) and returns the handler
|
|
23
|
|
24 @args fname: input file name for reading
|
|
25 @type fname: str
|
|
26 """
|
|
27
|
|
28 try:
|
|
29 if os.path.splitext(fname)[1] == ".gz":
|
|
30 FH = gzip.open(fname, 'rb')
|
|
31 elif os.path.splitext(fname)[1] == ".bz2":
|
|
32 FH = bz2.BZ2File(fname, 'rb')
|
|
33 else:
|
|
34 FH = open(fname, 'rU')
|
|
35 except Exception as error:
|
|
36 sys.exit(error)
|
|
37
|
|
38 return FH
|
|
39
|
|
40
|
|
41 def gbk_parse(fname,keys_1,n,outfile=""):
|
0
|
42 """
|
|
43 Extract genome annotation recods from genbank format
|
|
44
|
|
45 @args fname: gbk file name
|
|
46 @type fname: str
|
|
47 """
|
8
|
48 fhand = open_file(gbkfname)
|
|
49 lines = []
|
0
|
50
|
|
51 for record in SeqIO.parse(fhand, "genbank"):
|
|
52 gene_tags = dict()
|
10
|
53
|
0
|
54 mol_type, chr_id = None, None
|
|
55
|
|
56 for rec in record.features:
|
|
57
|
|
58
|
|
59 product = "NA"
|
|
60
|
|
61 if rec.qualifiers.has_key('product'):
|
|
62
|
|
63 product = rec.qualifiers['product'][0]
|
10
|
64
|
0
|
65
|
|
66 if rec.type == 'source':
|
|
67 try:
|
|
68 mol_type = rec.qualifiers['mol_type'][0]
|
|
69 except:
|
|
70 mol_type = '.'
|
|
71 pass
|
|
72 try:
|
|
73 chr_id = rec.qualifiers['chromosome'][0]
|
|
74 except:
|
|
75 chr_id = record.name
|
|
76 continue
|
10
|
77
|
0
|
78
|
10
|
79 start = int(rec.location.start)+1-n
|
|
80 stop = int(rec.location.end)+n
|
0
|
81
|
10
|
82 if start < 0:
|
|
83 start = 1
|
|
84
|
|
85 for k in keys_1.split(","):
|
|
86
|
|
87 if str(k) in str(product):
|
|
88 lines.append('\t'.join([str(chr_id),str(start), str(stop), product])+"\n")
|
|
89
|
|
90
|
0
|
91 fhand.close()
|
|
92
|
8
|
93 return lines
|
0
|
94
|
10
|
95 if __name__=='__main__':
|
0
|
96
|
|
97 try:
|
|
98 parser = argparse.ArgumentParser()
|
8
|
99 parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list")
|
0
|
100 parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
|
8
|
101 parser.add_argument('-n', '--number', type=int, help="flanking region to include")
|
|
102 parser.add_argument('-o', '--output', help="output file")
|
0
|
103
|
|
104 args = parser.parse_args()
|
|
105 gbkfname = args.gbk_file
|
|
106 n=args.number
|
8
|
107 keys_1=args.keys_1
|
|
108 output=args.output
|
0
|
109
|
8
|
110 except Exception,e:
|
|
111 print "Error: %s" % e
|
0
|
112 sys.exit(-1)
|
|
113
|
8
|
114 ## extract gbk records
|
|
115
|
|
116 print "Filtering on keys: %s" % keys_1
|
|
117 lines = gbk_parse(gbkfname,keys_1,n)
|
|
118
|
|
119 with open(output, 'w') as of:
|
|
120 for l in lines:
|
|
121 of.write(l)
|
0
|
122
|
|
123
|
|
124
|
|
125
|