Mercurial > repos > bjoern-gruening > antismash
comparison multi_antiSMASH_wrapper.py @ 0:6a37d0a4510a default tip
initial uploaded
author | bjoern-gruening |
---|---|
date | Thu, 15 Mar 2012 05:23:03 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6a37d0a4510a |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: UTF-8 -*- | |
3 | |
4 import os, sys, subprocess, commands | |
5 import random, shutil | |
6 import zipfile | |
7 from Bio import SeqIO | |
8 import tempfile | |
9 | |
10 blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db' | |
11 pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db' | |
12 antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py' | |
13 | |
14 | |
15 def anitSMASH(args, sequence_path, glimmer_path): | |
16 #./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullhmm y | |
17 print sequence_path | |
18 rint = random.randint(1,10000000) | |
19 tmp_dir = '/tmp/galaxy_%s' % rint | |
20 os.mkdir(tmp_dir) | |
21 os.mkdir(os.path.join( tmp_dir, 'geneprediction' )) | |
22 os.chdir(tmp_dir) | |
23 print 'IFORMAT:',args.iformat | |
24 if args.iformat == 'fasta': | |
25 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.fasta') | |
26 elif args.iformat == 'genbank': | |
27 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.gbk') | |
28 else: | |
29 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.embl') | |
30 | |
31 | |
32 # try to generate the same name as in antismash.py | |
33 genomename = ".".join( (os.path.basename(sequence_path) + '.'+ args.iformat).split(".")[:-1] ) | |
34 for i in """!"#$%&()*+,./:;=>?@[]^`{|}'""": | |
35 genomename = genomename.replace(i,"") | |
36 result_path = os.path.join( tmp_dir, genomename ) | |
37 | |
38 shutil.copy(sequence_path, new_input_path ) | |
39 | |
40 if args.eukaryotic: | |
41 taxon = '--taxon e' | |
42 else: | |
43 taxon = '--taxon p' | |
44 | |
45 if args.clusterblast: | |
46 clusterblast = '--clusterblast y' | |
47 else: | |
48 clusterblast = '--clusterblast n' | |
49 | |
50 if args.smcogs: | |
51 smcogs = '--smcogs y' | |
52 else: | |
53 smcogs = '--smcogs n' | |
54 | |
55 if args.fullhmm: | |
56 fullhmm = '--fullhmm y' | |
57 else: | |
58 fullhmm = '--fullhmm n' | |
59 | |
60 if args.fullblast: | |
61 fullblast = '--fullblast y' | |
62 else: | |
63 fullblast = '--fullblast n' | |
64 | |
65 h = [antismash_path, new_input_path, | |
66 '--geneclustertypes %s' % args.geneclustertypes, | |
67 taxon, | |
68 clusterblast, | |
69 smcogs, | |
70 fullhmm, | |
71 fullblast, | |
72 '--blastdbpath %s' % blastdbpath, | |
73 '--pfamdbpath %s' % pfamdbpath, | |
74 '--cores 10', | |
75 ] | |
76 if args.iformat == 'fasta': | |
77 h.append('--glimmer_prediction %s' % glimmer_path) | |
78 | |
79 a = ' '.join(h) | |
80 | |
81 subprocess.call(a, shell=True) | |
82 print a | |
83 embl_path = os.path.join(result_path, '%s.final.embl' % genomename) | |
84 if os.path.exists(embl_path) and args.embl_path: | |
85 args.embl_path.write( open(embl_path).read() ) | |
86 | |
87 gff_line = "%s\tantismash\t%s\t%s\t%s\t.\t%s\t.\t%s" | |
88 | |
89 clustername_mapping = {} | |
90 genecluster_path = os.path.join(result_path, 'clusterblast/geneclusters.txt') | |
91 if os.path.exists(genecluster_path): | |
92 for line in open( genecluster_path): | |
93 token = line.split('\t') | |
94 clustername_mapping[token[2]] = token[3] | |
95 old_cluster_name = False | |
96 parent_cluster_name = "" | |
97 child_lines = "" | |
98 starts = [] | |
99 ends = [] | |
100 for line in open( os.path.join(result_path, 'clusterblast/geneclusterprots.fasta') ): | |
101 if line.startswith('>'): | |
102 columns = line[1:].strip().split('|') | |
103 args.geneclusterprots.write( line.replace('|%s|' % columns[1], '|%s|%s|' % (columns[1],clustername_mapping[columns[1]])) ) | |
104 | |
105 """ | |
106 "Handlampe" kann man zu einer Person sagen wenn es nicht mehr zum Armleuchter reicht. Es ist also ein abgeschwächte Form für "Depp". | |
107 """ | |
108 if old_cluster_name == False: | |
109 # inital clustername setting | |
110 # try to catch the line in which the name changes | |
111 old_cluster_name = columns[1] | |
112 parent_cluster_name = columns[1] | |
113 | |
114 seqname = columns[0] | |
115 strand = columns[3] | |
116 (start, end) = columns[2].split('-') | |
117 starts.append(int(start)) | |
118 ends.append(int(end)) | |
119 | |
120 if old_cluster_name != columns[1]: | |
121 # begin of a new cluster, caclulate the parent line for the previous clusterproteins | |
122 # TODO: maybe the assumption is not correct, to take the smallest start and the largest end | |
123 starts.sort() | |
124 ends.sort() | |
125 genecluster_start = starts[0] | |
126 genecluster_end = ends[-1] | |
127 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5]) | |
128 oline = gff_line % ( | |
129 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group | |
130 ) | |
131 args.geneclusterprots_gff.write(oline + child_lines) | |
132 starts = [] | |
133 ends = [] | |
134 old_cluster_name = columns[1] | |
135 parent_cluster_name = columns[1] | |
136 child_lines = '' | |
137 | |
138 group = 'ID=%s;Parent=%s;Name=%s;Clustertype=%s;Note=%s\n' % (columns[4], columns[1], columns[4], clustername_mapping[columns[1]], columns[5]) | |
139 child_lines += gff_line % ( | |
140 seqname, 'gene', start, end, strand, group | |
141 ) | |
142 | |
143 else: | |
144 args.geneclusterprots.write( line ) | |
145 | |
146 starts.sort() | |
147 ends.sort() | |
148 genecluster_start = starts[0] | |
149 genecluster_end = ends[-1] | |
150 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5]) | |
151 oline = gff_line % ( | |
152 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group | |
153 ) | |
154 args.geneclusterprots_gff.write(oline + child_lines) | |
155 | |
156 # remove tmp directory | |
157 shutil.rmtree(tmp_dir) | |
158 | |
159 | |
160 def arg_parse(): | |
161 import argparse | |
162 parser = argparse.ArgumentParser(prog = 'antiSMASH-Wrapper') | |
163 parser.add_argument('--version', action='version', version='%(prog)s 0.01') | |
164 parser.add_argument('--geneclustertypes') | |
165 parser.add_argument('--clusterblast', action='store_true') | |
166 parser.add_argument('--eukaryotic', action='store_true') | |
167 parser.add_argument('--fullhmm', action='store_true') | |
168 parser.add_argument('--smcogs', action='store_true') | |
169 parser.add_argument('--fullblast', action='store_true') | |
170 | |
171 parser.add_argument('--input', '-i', help='FASTA Sequence File') | |
172 parser.add_argument('--glimmer_prediction', help='Glimmer Prediction File') | |
173 | |
174 parser.add_argument('--input_format', '-f', dest="iformat", help='input format, if the input is a fasta file you need a glimmer file as additional input') | |
175 | |
176 parser.add_argument('--zip', help='output: all files as zip file') | |
177 parser.add_argument('--html_file', help='output: the path to the index html file') | |
178 parser.add_argument('--html_path', help='output: the path to the output html dir') | |
179 parser.add_argument('--embl_path', help='output: the path to the embl output file', type=argparse.FileType('w')) | |
180 parser.add_argument('--geneclusterprots', help='output: Genecluster Fasta File', type=argparse.FileType('w')) | |
181 parser.add_argument('--geneclusterprots_gff', help='output: Genecluster GFF', type=argparse.FileType('w')) | |
182 | |
183 | |
184 args = parser.parse_args() | |
185 return args | |
186 | |
187 | |
188 def extract_glimmerHMM_results(sequence_id, glimmerHMM): | |
189 """ | |
190 Returns the given annoations from a sequence_id as complete GFF3 formated string. | |
191 """ | |
192 result = "##gff-version 3\n" | |
193 found_annoations = False | |
194 for line in open(glimmerHMM): | |
195 if line.startswith(sequence_id + '\t'): | |
196 result += line | |
197 found_annoations = True | |
198 if found_annoations: | |
199 return result | |
200 else: | |
201 return False | |
202 | |
203 def extract_glimmer_results(sequence_id, glimmer): | |
204 """ | |
205 Returns the given annoations from a sequence_id as glimmer result file. | |
206 """ | |
207 result = '' | |
208 found_annoations = False | |
209 for line in open(glimmer): | |
210 # that construct matches the pseudo fasta header and the corresponding orfs | |
211 if line.split()[0].startswith(sequence_id): | |
212 result += line | |
213 found_annoations = True | |
214 if found_annoations: | |
215 return result | |
216 else: | |
217 return False | |
218 | |
219 | |
220 if __name__ == '__main__': | |
221 args = arg_parse() | |
222 geneclusterprots = "" | |
223 temp_dir = tempfile.mkdtemp() | |
224 args.geneclusterprots_gff.write("##gff-version 3\n") | |
225 | |
226 | |
227 | |
228 for record in SeqIO.parse(open(args.input, "rU"), args.iformat) : | |
229 # create two tem files and fill it with the information from one sequence | |
230 record.id = record.id or record.name | |
231 tmp_sequence = os.path.join(temp_dir, record.id) #fasta -file | |
232 SeqIO.write(record, open(tmp_sequence, 'w+'), args.iformat) | |
233 | |
234 tmp_glimmer = '' | |
235 if args.iformat == 'fasta': | |
236 # We need to annotate the sequences manually | |
237 if args.eukaryotic: | |
238 #print os.path.join(temp_dir, record.id + '.gff') | |
239 tmp_glimmer = os.path.join(temp_dir, record.id + '.gff') | |
240 annotations = extract_glimmerHMM_results(record.id, args.glimmer_prediction) | |
241 #print annotations, ':', record.id, args.glimmer_prediction | |
242 if not annotations: | |
243 continue | |
244 open(tmp_glimmer, 'w+').write( annotations ) | |
245 else: | |
246 tmp_glimmer = os.path.join(temp_dir, record.id + '.tsv') | |
247 annotations = extract_glimmer_results(record.id, args.glimmer_prediction) | |
248 if not annotations: | |
249 continue | |
250 open(tmp_glimmer, 'w+').write( annotations ) | |
251 | |
252 anitSMASH(args, tmp_sequence, tmp_glimmer) | |
253 shutil.rmtree(temp_dir) | |
254 | |
255 | |
256 | |
257 | |
258 | |
259 | |
260 | |
261 | |
262 | |
263 | |
264 | |
265 |