comparison select_product.py @ 8:53f1f83e80a1 draft

Uploaded
author brigidar
date Fri, 09 Oct 2015 11:52:31 -0400
parents 6497d42015aa
children 82186605ba75
comparison
equal deleted inserted replaced
7:7d71f20d949e 8:53f1f83e80a1
5 # adds a given number of bases on each side of the transcript for the exclusion set 5 # adds a given number of bases on each side of the transcript for the exclusion set
6 6
7 7
8 8
9 #!/usr/bin/env python 9 #!/usr/bin/env python
10 """
11 Convert data from Genbank format to GFF.
12 10
13 Usage:
14 python gbk_to_gff.py in.gbk > out.gff
15
16 Requirements:
17 BioPython:- http://biopython.org/
18 helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py
19
20 Copyright (C)
21 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
22 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
23 """
24 11
25 import os 12 import os
26 import re 13 import re
27 import sys 14 import sys
28 import helper
29 import collections 15 import collections
30 import pdb 16 import pdb
31 import argparse 17 import argparse
32 from Bio import SeqIO 18 from Bio import SeqIO
33 19
34 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n): 20 def open_file(fname):
21 """
22 Open the file (supports .gz .bz2) and returns the handler
23
24 @args fname: input file name for reading
25 @type fname: str
26 """
27
28 try:
29 if os.path.splitext(fname)[1] == ".gz":
30 FH = gzip.open(fname, 'rb')
31 elif os.path.splitext(fname)[1] == ".bz2":
32 FH = bz2.BZ2File(fname, 'rb')
33 else:
34 FH = open(fname, 'rU')
35 except Exception as error:
36 sys.exit(error)
37
38 return FH
39
40
41 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n):
35 """ 42 """
36 Write the feature information 43 Write the feature information
37 """ 44 """
45
46 lines = []
47
38 for gname, ginfo in genes.items(): 48 for gname, ginfo in genes.items():
39 line = [str(chr_id), 49 line = [str(chr_id),
40 ginfo[3], 50 ginfo[3],
41 str(ginfo[0]-int(n)), 51 str(ginfo[0]-n),
42 str(ginfo[1]+int(n)), 52 str(ginfo[1]+n),
43 ginfo[2], 53 ginfo[2],
44 str(product)] 54 str(product)]
45 55
46 # to key on specfic products only 56 # to key on specfic products only
47 for k in keys.split(","): 57 for k in keys_1.split(","):
48 if k in str(product): 58 if k in str(product):
49 59
50 sys.stdout.write('\t'.join(line)+"\n") 60 lines.append('\t'.join(line)+"\n")
51 break 61 break
52 62
53 63 return lines
54 unk +=1
55 return unk
56 64
57 65
58 66
59 67
60 def gbk_parse(fname,keys,n): 68 def gbk_parse(fname,keys_1,n,outfile=""):
61 """ 69 """
62 Extract genome annotation recods from genbank format 70 Extract genome annotation recods from genbank format
63 71
64 @args fname: gbk file name 72 @args fname: gbk file name
65 @type fname: str 73 @type fname: str
66 """ 74 """
67 fhand = helper.open_file(gbkfname) 75 fhand = open_file(gbkfname)
68 unk = 1 76 lines = []
69 77
70 for record in SeqIO.parse(fhand, "genbank"): 78 for record in SeqIO.parse(fhand, "genbank"):
71 gene_tags = dict() 79 gene_tags = dict()
72 tx_tags = collections.defaultdict(list) 80 tx_tags = collections.defaultdict(list)
73 exon = collections.defaultdict(list) 81 exon = collections.defaultdict(list)
129 tx_tags[fid].append((rec.location._start.position+1, 137 tx_tags[fid].append((rec.location._start.position+1,
130 rec.location._end.position, 138 rec.location._end.position,
131 transcript_id, 139 transcript_id,
132 rec.type)) 140 rec.type))
133 # record extracted, generate feature table 141 # record extracted, generate feature table
134 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n) 142 lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n)
135 143
136 fhand.close() 144 fhand.close()
137 145
146 return lines
138 147
139 if __name__=='__main__': 148 if __name__=='__main__':
140 149
141 try: 150 try:
142 parser = argparse.ArgumentParser() 151 parser = argparse.ArgumentParser()
143 parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list") 152 parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list")
144 parser.add_argument('-g', '--gbk_file', help="genbank file of reference") 153 parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
145 parser.add_argument('-n', '--number', help="flanking region to include") 154 parser.add_argument('-n', '--number', type=int, help="flanking region to include")
155 parser.add_argument('-o', '--output', help="output file")
146 156
147 args = parser.parse_args() 157 args = parser.parse_args()
148 gbkfname = args.gbk_file 158 gbkfname = args.gbk_file
149 n=args.number 159 n=args.number
150 keys=args.keys 160 keys_1=args.keys_1
161 output=args.output
151 162
152 except: 163 except Exception,e:
153 print __doc__ 164 print "Error: %s" % e
154 sys.exit(-1) 165 sys.exit(-1)
155 166
156 ## extract gbk records 167 ## extract gbk records
157 gbk_parse(gbkfname,keys,n) 168
169 print "Filtering on keys: %s" % keys_1
170 lines = gbk_parse(gbkfname,keys_1,n)
171
172 with open(output, 'w') as of:
173 for l in lines:
174 of.write(l)
158 175
159 176
160 177
161 178