0
|
1 #!/usr/bin/env python
|
|
2 # -*- coding: UTF-8 -*-
|
|
3
|
|
4 import os, sys, subprocess, commands
|
|
5 import random, shutil
|
|
6 import zipfile
|
|
7 from Bio import SeqIO
|
|
8 import tempfile
|
|
9
|
|
10 blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db'
|
|
11 pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db'
|
|
12 antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py'
|
|
13
|
|
14
|
|
15 def anitSMASH(args, sequence_path, glimmer_path):
|
|
16 #./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullhmm y
|
|
17 print sequence_path
|
|
18 rint = random.randint(1,10000000)
|
|
19 tmp_dir = '/tmp/galaxy_%s' % rint
|
|
20 os.mkdir(tmp_dir)
|
|
21 os.mkdir(os.path.join( tmp_dir, 'geneprediction' ))
|
|
22 os.chdir(tmp_dir)
|
|
23 print 'IFORMAT:',args.iformat
|
|
24 if args.iformat == 'fasta':
|
|
25 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.fasta')
|
|
26 elif args.iformat == 'genbank':
|
|
27 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.gbk')
|
|
28 else:
|
|
29 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.embl')
|
|
30
|
|
31
|
|
32 # try to generate the same name as in antismash.py
|
|
33 genomename = ".".join( (os.path.basename(sequence_path) + '.'+ args.iformat).split(".")[:-1] )
|
|
34 for i in """!"#$%&()*+,./:;=>?@[]^`{|}'""":
|
|
35 genomename = genomename.replace(i,"")
|
|
36 result_path = os.path.join( tmp_dir, genomename )
|
|
37
|
|
38 shutil.copy(sequence_path, new_input_path )
|
|
39
|
|
40 if args.eukaryotic:
|
|
41 taxon = '--taxon e'
|
|
42 else:
|
|
43 taxon = '--taxon p'
|
|
44
|
|
45 if args.clusterblast:
|
|
46 clusterblast = '--clusterblast y'
|
|
47 else:
|
|
48 clusterblast = '--clusterblast n'
|
|
49
|
|
50 if args.smcogs:
|
|
51 smcogs = '--smcogs y'
|
|
52 else:
|
|
53 smcogs = '--smcogs n'
|
|
54
|
|
55 if args.fullhmm:
|
|
56 fullhmm = '--fullhmm y'
|
|
57 else:
|
|
58 fullhmm = '--fullhmm n'
|
|
59
|
|
60 if args.fullblast:
|
|
61 fullblast = '--fullblast y'
|
|
62 else:
|
|
63 fullblast = '--fullblast n'
|
|
64
|
|
65 h = [antismash_path, new_input_path,
|
|
66 '--geneclustertypes %s' % args.geneclustertypes,
|
|
67 taxon,
|
|
68 clusterblast,
|
|
69 smcogs,
|
|
70 fullhmm,
|
|
71 fullblast,
|
|
72 '--blastdbpath %s' % blastdbpath,
|
|
73 '--pfamdbpath %s' % pfamdbpath,
|
|
74 '--cores 10',
|
|
75 ]
|
|
76 if args.iformat == 'fasta':
|
|
77 h.append('--glimmer_prediction %s' % glimmer_path)
|
|
78
|
|
79 a = ' '.join(h)
|
|
80
|
|
81 subprocess.call(a, shell=True)
|
|
82 print a
|
|
83 embl_path = os.path.join(result_path, '%s.final.embl' % genomename)
|
|
84 if os.path.exists(embl_path) and args.embl_path:
|
|
85 args.embl_path.write( open(embl_path).read() )
|
|
86
|
|
87 gff_line = "%s\tantismash\t%s\t%s\t%s\t.\t%s\t.\t%s"
|
|
88
|
|
89 clustername_mapping = {}
|
|
90 genecluster_path = os.path.join(result_path, 'clusterblast/geneclusters.txt')
|
|
91 if os.path.exists(genecluster_path):
|
|
92 for line in open( genecluster_path):
|
|
93 token = line.split('\t')
|
|
94 clustername_mapping[token[2]] = token[3]
|
|
95 old_cluster_name = False
|
|
96 parent_cluster_name = ""
|
|
97 child_lines = ""
|
|
98 starts = []
|
|
99 ends = []
|
|
100 for line in open( os.path.join(result_path, 'clusterblast/geneclusterprots.fasta') ):
|
|
101 if line.startswith('>'):
|
|
102 columns = line[1:].strip().split('|')
|
|
103 args.geneclusterprots.write( line.replace('|%s|' % columns[1], '|%s|%s|' % (columns[1],clustername_mapping[columns[1]])) )
|
|
104
|
|
105 """
|
|
106 "Handlampe" kann man zu einer Person sagen wenn es nicht mehr zum Armleuchter reicht. Es ist also ein abgeschwächte Form für "Depp".
|
|
107 """
|
|
108 if old_cluster_name == False:
|
|
109 # inital clustername setting
|
|
110 # try to catch the line in which the name changes
|
|
111 old_cluster_name = columns[1]
|
|
112 parent_cluster_name = columns[1]
|
|
113
|
|
114 seqname = columns[0]
|
|
115 strand = columns[3]
|
|
116 (start, end) = columns[2].split('-')
|
|
117 starts.append(int(start))
|
|
118 ends.append(int(end))
|
|
119
|
|
120 if old_cluster_name != columns[1]:
|
|
121 # begin of a new cluster, caclulate the parent line for the previous clusterproteins
|
|
122 # TODO: maybe the assumption is not correct, to take the smallest start and the largest end
|
|
123 starts.sort()
|
|
124 ends.sort()
|
|
125 genecluster_start = starts[0]
|
|
126 genecluster_end = ends[-1]
|
|
127 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5])
|
|
128 oline = gff_line % (
|
|
129 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group
|
|
130 )
|
|
131 args.geneclusterprots_gff.write(oline + child_lines)
|
|
132 starts = []
|
|
133 ends = []
|
|
134 old_cluster_name = columns[1]
|
|
135 parent_cluster_name = columns[1]
|
|
136 child_lines = ''
|
|
137
|
|
138 group = 'ID=%s;Parent=%s;Name=%s;Clustertype=%s;Note=%s\n' % (columns[4], columns[1], columns[4], clustername_mapping[columns[1]], columns[5])
|
|
139 child_lines += gff_line % (
|
|
140 seqname, 'gene', start, end, strand, group
|
|
141 )
|
|
142
|
|
143 else:
|
|
144 args.geneclusterprots.write( line )
|
|
145
|
|
146 starts.sort()
|
|
147 ends.sort()
|
|
148 genecluster_start = starts[0]
|
|
149 genecluster_end = ends[-1]
|
|
150 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5])
|
|
151 oline = gff_line % (
|
|
152 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group
|
|
153 )
|
|
154 args.geneclusterprots_gff.write(oline + child_lines)
|
|
155
|
|
156 # remove tmp directory
|
|
157 shutil.rmtree(tmp_dir)
|
|
158
|
|
159
|
|
160 def arg_parse():
|
|
161 import argparse
|
|
162 parser = argparse.ArgumentParser(prog = 'antiSMASH-Wrapper')
|
|
163 parser.add_argument('--version', action='version', version='%(prog)s 0.01')
|
|
164 parser.add_argument('--geneclustertypes')
|
|
165 parser.add_argument('--clusterblast', action='store_true')
|
|
166 parser.add_argument('--eukaryotic', action='store_true')
|
|
167 parser.add_argument('--fullhmm', action='store_true')
|
|
168 parser.add_argument('--smcogs', action='store_true')
|
|
169 parser.add_argument('--fullblast', action='store_true')
|
|
170
|
|
171 parser.add_argument('--input', '-i', help='FASTA Sequence File')
|
|
172 parser.add_argument('--glimmer_prediction', help='Glimmer Prediction File')
|
|
173
|
|
174 parser.add_argument('--input_format', '-f', dest="iformat", help='input format, if the input is a fasta file you need a glimmer file as additional input')
|
|
175
|
|
176 parser.add_argument('--zip', help='output: all files as zip file')
|
|
177 parser.add_argument('--html_file', help='output: the path to the index html file')
|
|
178 parser.add_argument('--html_path', help='output: the path to the output html dir')
|
|
179 parser.add_argument('--embl_path', help='output: the path to the embl output file', type=argparse.FileType('w'))
|
|
180 parser.add_argument('--geneclusterprots', help='output: Genecluster Fasta File', type=argparse.FileType('w'))
|
|
181 parser.add_argument('--geneclusterprots_gff', help='output: Genecluster GFF', type=argparse.FileType('w'))
|
|
182
|
|
183
|
|
184 args = parser.parse_args()
|
|
185 return args
|
|
186
|
|
187
|
|
188 def extract_glimmerHMM_results(sequence_id, glimmerHMM):
|
|
189 """
|
|
190 Returns the given annoations from a sequence_id as complete GFF3 formated string.
|
|
191 """
|
|
192 result = "##gff-version 3\n"
|
|
193 found_annoations = False
|
|
194 for line in open(glimmerHMM):
|
|
195 if line.startswith(sequence_id + '\t'):
|
|
196 result += line
|
|
197 found_annoations = True
|
|
198 if found_annoations:
|
|
199 return result
|
|
200 else:
|
|
201 return False
|
|
202
|
|
203 def extract_glimmer_results(sequence_id, glimmer):
|
|
204 """
|
|
205 Returns the given annoations from a sequence_id as glimmer result file.
|
|
206 """
|
|
207 result = ''
|
|
208 found_annoations = False
|
|
209 for line in open(glimmer):
|
|
210 # that construct matches the pseudo fasta header and the corresponding orfs
|
|
211 if line.split()[0].startswith(sequence_id):
|
|
212 result += line
|
|
213 found_annoations = True
|
|
214 if found_annoations:
|
|
215 return result
|
|
216 else:
|
|
217 return False
|
|
218
|
|
219
|
|
220 if __name__ == '__main__':
|
|
221 args = arg_parse()
|
|
222 geneclusterprots = ""
|
|
223 temp_dir = tempfile.mkdtemp()
|
|
224 args.geneclusterprots_gff.write("##gff-version 3\n")
|
|
225
|
|
226
|
|
227
|
|
228 for record in SeqIO.parse(open(args.input, "rU"), args.iformat) :
|
|
229 # create two tem files and fill it with the information from one sequence
|
|
230 record.id = record.id or record.name
|
|
231 tmp_sequence = os.path.join(temp_dir, record.id) #fasta -file
|
|
232 SeqIO.write(record, open(tmp_sequence, 'w+'), args.iformat)
|
|
233
|
|
234 tmp_glimmer = ''
|
|
235 if args.iformat == 'fasta':
|
|
236 # We need to annotate the sequences manually
|
|
237 if args.eukaryotic:
|
|
238 #print os.path.join(temp_dir, record.id + '.gff')
|
|
239 tmp_glimmer = os.path.join(temp_dir, record.id + '.gff')
|
|
240 annotations = extract_glimmerHMM_results(record.id, args.glimmer_prediction)
|
|
241 #print annotations, ':', record.id, args.glimmer_prediction
|
|
242 if not annotations:
|
|
243 continue
|
|
244 open(tmp_glimmer, 'w+').write( annotations )
|
|
245 else:
|
|
246 tmp_glimmer = os.path.join(temp_dir, record.id + '.tsv')
|
|
247 annotations = extract_glimmer_results(record.id, args.glimmer_prediction)
|
|
248 if not annotations:
|
|
249 continue
|
|
250 open(tmp_glimmer, 'w+').write( annotations )
|
|
251
|
|
252 anitSMASH(args, tmp_sequence, tmp_glimmer)
|
|
253 shutil.rmtree(temp_dir)
|
|
254
|
|
255
|
|
256
|
|
257
|
|
258
|
|
259
|
|
260
|
|
261
|
|
262
|
|
263
|
|
264
|
|
265
|