Mercurial > repos > brigidar > select_product
comparison select_product.py @ 8:53f1f83e80a1 draft
Uploaded
author | brigidar |
---|---|
date | Fri, 09 Oct 2015 11:52:31 -0400 |
parents | 6497d42015aa |
children | 82186605ba75 |
comparison
equal
deleted
inserted
replaced
7:7d71f20d949e | 8:53f1f83e80a1 |
---|---|
5 # adds a given number of bases on each side of the transcript for the exclusion set | 5 # adds a given number of bases on each side of the transcript for the exclusion set |
6 | 6 |
7 | 7 |
8 | 8 |
9 #!/usr/bin/env python | 9 #!/usr/bin/env python |
10 """ | |
11 Convert data from Genbank format to GFF. | |
12 | 10 |
13 Usage: | |
14 python gbk_to_gff.py in.gbk > out.gff | |
15 | |
16 Requirements: | |
17 BioPython:- http://biopython.org/ | |
18 helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py | |
19 | |
20 Copyright (C) | |
21 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. | |
22 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. | |
23 """ | |
24 | 11 |
25 import os | 12 import os |
26 import re | 13 import re |
27 import sys | 14 import sys |
28 import helper | |
29 import collections | 15 import collections |
30 import pdb | 16 import pdb |
31 import argparse | 17 import argparse |
32 from Bio import SeqIO | 18 from Bio import SeqIO |
33 | 19 |
34 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n): | 20 def open_file(fname): |
21 """ | |
22 Open the file (supports .gz .bz2) and returns the handler | |
23 | |
24 @args fname: input file name for reading | |
25 @type fname: str | |
26 """ | |
27 | |
28 try: | |
29 if os.path.splitext(fname)[1] == ".gz": | |
30 FH = gzip.open(fname, 'rb') | |
31 elif os.path.splitext(fname)[1] == ".bz2": | |
32 FH = bz2.BZ2File(fname, 'rb') | |
33 else: | |
34 FH = open(fname, 'rU') | |
35 except Exception as error: | |
36 sys.exit(error) | |
37 | |
38 return FH | |
39 | |
40 | |
41 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n): | |
35 """ | 42 """ |
36 Write the feature information | 43 Write the feature information |
37 """ | 44 """ |
45 | |
46 lines = [] | |
47 | |
38 for gname, ginfo in genes.items(): | 48 for gname, ginfo in genes.items(): |
39 line = [str(chr_id), | 49 line = [str(chr_id), |
40 ginfo[3], | 50 ginfo[3], |
41 str(ginfo[0]-int(n)), | 51 str(ginfo[0]-n), |
42 str(ginfo[1]+int(n)), | 52 str(ginfo[1]+n), |
43 ginfo[2], | 53 ginfo[2], |
44 str(product)] | 54 str(product)] |
45 | 55 |
46 # to key on specfic products only | 56 # to key on specfic products only |
47 for k in keys.split(","): | 57 for k in keys_1.split(","): |
48 if k in str(product): | 58 if k in str(product): |
49 | 59 |
50 sys.stdout.write('\t'.join(line)+"\n") | 60 lines.append('\t'.join(line)+"\n") |
51 break | 61 break |
52 | 62 |
53 | 63 return lines |
54 unk +=1 | |
55 return unk | |
56 | 64 |
57 | 65 |
58 | 66 |
59 | 67 |
60 def gbk_parse(fname,keys,n): | 68 def gbk_parse(fname,keys_1,n,outfile=""): |
61 """ | 69 """ |
62 Extract genome annotation recods from genbank format | 70 Extract genome annotation recods from genbank format |
63 | 71 |
64 @args fname: gbk file name | 72 @args fname: gbk file name |
65 @type fname: str | 73 @type fname: str |
66 """ | 74 """ |
67 fhand = helper.open_file(gbkfname) | 75 fhand = open_file(gbkfname) |
68 unk = 1 | 76 lines = [] |
69 | 77 |
70 for record in SeqIO.parse(fhand, "genbank"): | 78 for record in SeqIO.parse(fhand, "genbank"): |
71 gene_tags = dict() | 79 gene_tags = dict() |
72 tx_tags = collections.defaultdict(list) | 80 tx_tags = collections.defaultdict(list) |
73 exon = collections.defaultdict(list) | 81 exon = collections.defaultdict(list) |
129 tx_tags[fid].append((rec.location._start.position+1, | 137 tx_tags[fid].append((rec.location._start.position+1, |
130 rec.location._end.position, | 138 rec.location._end.position, |
131 transcript_id, | 139 transcript_id, |
132 rec.type)) | 140 rec.type)) |
133 # record extracted, generate feature table | 141 # record extracted, generate feature table |
134 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n) | 142 lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n) |
135 | 143 |
136 fhand.close() | 144 fhand.close() |
137 | 145 |
146 return lines | |
138 | 147 |
139 if __name__=='__main__': | 148 if __name__=='__main__': |
140 | 149 |
141 try: | 150 try: |
142 parser = argparse.ArgumentParser() | 151 parser = argparse.ArgumentParser() |
143 parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list") | 152 parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list") |
144 parser.add_argument('-g', '--gbk_file', help="genbank file of reference") | 153 parser.add_argument('-g', '--gbk_file', help="genbank file of reference") |
145 parser.add_argument('-n', '--number', help="flanking region to include") | 154 parser.add_argument('-n', '--number', type=int, help="flanking region to include") |
155 parser.add_argument('-o', '--output', help="output file") | |
146 | 156 |
147 args = parser.parse_args() | 157 args = parser.parse_args() |
148 gbkfname = args.gbk_file | 158 gbkfname = args.gbk_file |
149 n=args.number | 159 n=args.number |
150 keys=args.keys | 160 keys_1=args.keys_1 |
161 output=args.output | |
151 | 162 |
152 except: | 163 except Exception,e: |
153 print __doc__ | 164 print "Error: %s" % e |
154 sys.exit(-1) | 165 sys.exit(-1) |
155 | 166 |
156 ## extract gbk records | 167 ## extract gbk records |
157 gbk_parse(gbkfname,keys,n) | 168 |
169 print "Filtering on keys: %s" % keys_1 | |
170 lines = gbk_parse(gbkfname,keys_1,n) | |
171 | |
172 with open(output, 'w') as of: | |
173 for l in lines: | |
174 of.write(l) | |
158 | 175 |
159 | 176 |
160 | 177 |
161 | 178 |