Mercurial > repos > brigidar > select_product
comparison select_product.py @ 0:6497d42015aa draft
Uploaded
author | brigidar |
---|---|
date | Thu, 08 Oct 2015 15:36:12 -0400 |
parents | |
children | 53f1f83e80a1 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6497d42015aa |
---|---|
1 #Adapted from gbk_to_gff | |
2 # Brigida Rusconi, UTSA | |
3 # October 8 2015 | |
4 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description | |
5 # adds a given number of bases on each side of the transcript for the exclusion set | |
6 | |
7 | |
8 | |
9 #!/usr/bin/env python | |
10 """ | |
11 Convert data from Genbank format to GFF. | |
12 | |
13 Usage: | |
14 python gbk_to_gff.py in.gbk > out.gff | |
15 | |
16 Requirements: | |
17 BioPython:- http://biopython.org/ | |
18 helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py | |
19 | |
20 Copyright (C) | |
21 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. | |
22 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. | |
23 """ | |
24 | |
25 import os | |
26 import re | |
27 import sys | |
28 import helper | |
29 import collections | |
30 import pdb | |
31 import argparse | |
32 from Bio import SeqIO | |
33 | |
34 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n): | |
35 """ | |
36 Write the feature information | |
37 """ | |
38 for gname, ginfo in genes.items(): | |
39 line = [str(chr_id), | |
40 ginfo[3], | |
41 str(ginfo[0]-int(n)), | |
42 str(ginfo[1]+int(n)), | |
43 ginfo[2], | |
44 str(product)] | |
45 | |
46 # to key on specfic products only | |
47 for k in keys.split(","): | |
48 if k in str(product): | |
49 | |
50 sys.stdout.write('\t'.join(line)+"\n") | |
51 break | |
52 | |
53 | |
54 unk +=1 | |
55 return unk | |
56 | |
57 | |
58 | |
59 | |
60 def gbk_parse(fname,keys,n): | |
61 """ | |
62 Extract genome annotation recods from genbank format | |
63 | |
64 @args fname: gbk file name | |
65 @type fname: str | |
66 """ | |
67 fhand = helper.open_file(gbkfname) | |
68 unk = 1 | |
69 | |
70 for record in SeqIO.parse(fhand, "genbank"): | |
71 gene_tags = dict() | |
72 tx_tags = collections.defaultdict(list) | |
73 exon = collections.defaultdict(list) | |
74 cds = collections.defaultdict(list) | |
75 mol_type, chr_id = None, None | |
76 | |
77 for rec in record.features: | |
78 | |
79 | |
80 product = "NA" | |
81 | |
82 if rec.qualifiers.has_key('product'): | |
83 | |
84 product = rec.qualifiers['product'][0] | |
85 | |
86 | |
87 if rec.type == 'source': | |
88 try: | |
89 mol_type = rec.qualifiers['mol_type'][0] | |
90 except: | |
91 mol_type = '.' | |
92 pass | |
93 try: | |
94 chr_id = rec.qualifiers['chromosome'][0] | |
95 except: | |
96 chr_id = record.name | |
97 continue | |
98 | |
99 strand='-' | |
100 strand='+' if rec.strand>0 else strand | |
101 | |
102 fid = None | |
103 try: | |
104 fid = rec.qualifiers['gene'][0] | |
105 except: | |
106 pass | |
107 | |
108 transcript_id = None | |
109 try: | |
110 transcript_id = rec.qualifiers['transcript_id'][0] | |
111 except: | |
112 pass | |
113 | |
114 if re.search(r'gene', rec.type): | |
115 gene_tags[fid] = (rec.location._start.position+1, | |
116 rec.location._end.position, | |
117 strand, | |
118 rec.type | |
119 ) | |
120 elif rec.type == 'exon': | |
121 exon[fid].append((rec.location._start.position+1, | |
122 rec.location._end.position)) | |
123 elif rec.type=='CDS': | |
124 cds[fid].append((rec.location._start.position+1, | |
125 rec.location._end.position)) | |
126 else: | |
127 # get all transcripts | |
128 if transcript_id: | |
129 tx_tags[fid].append((rec.location._start.position+1, | |
130 rec.location._end.position, | |
131 transcript_id, | |
132 rec.type)) | |
133 # record extracted, generate feature table | |
134 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n) | |
135 | |
136 fhand.close() | |
137 | |
138 | |
139 if __name__=='__main__': | |
140 | |
141 try: | |
142 parser = argparse.ArgumentParser() | |
143 parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list") | |
144 parser.add_argument('-g', '--gbk_file', help="genbank file of reference") | |
145 parser.add_argument('-n', '--number', help="flanking region to include") | |
146 | |
147 args = parser.parse_args() | |
148 gbkfname = args.gbk_file | |
149 n=args.number | |
150 keys=args.keys | |
151 | |
152 except: | |
153 print __doc__ | |
154 sys.exit(-1) | |
155 | |
156 ## extract gbk records | |
157 gbk_parse(gbkfname,keys,n) | |
158 | |
159 | |
160 | |
161 |