comparison multi_antiSMASH_wrapper.py @ 0:6a37d0a4510a default tip

initial uploaded
author bjoern-gruening
date Thu, 15 Mar 2012 05:23:03 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6a37d0a4510a
1 #!/usr/bin/env python
2 # -*- coding: UTF-8 -*-
3
4 import os, sys, subprocess, commands
5 import random, shutil
6 import zipfile
7 from Bio import SeqIO
8 import tempfile
9
10 blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db'
11 pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db'
12 antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py'
13
14
15 def anitSMASH(args, sequence_path, glimmer_path):
16 #./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullhmm y
17 print sequence_path
18 rint = random.randint(1,10000000)
19 tmp_dir = '/tmp/galaxy_%s' % rint
20 os.mkdir(tmp_dir)
21 os.mkdir(os.path.join( tmp_dir, 'geneprediction' ))
22 os.chdir(tmp_dir)
23 print 'IFORMAT:',args.iformat
24 if args.iformat == 'fasta':
25 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.fasta')
26 elif args.iformat == 'genbank':
27 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.gbk')
28 else:
29 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.embl')
30
31
32 # try to generate the same name as in antismash.py
33 genomename = ".".join( (os.path.basename(sequence_path) + '.'+ args.iformat).split(".")[:-1] )
34 for i in """!"#$%&()*+,./:;=>?@[]^`{|}'""":
35 genomename = genomename.replace(i,"")
36 result_path = os.path.join( tmp_dir, genomename )
37
38 shutil.copy(sequence_path, new_input_path )
39
40 if args.eukaryotic:
41 taxon = '--taxon e'
42 else:
43 taxon = '--taxon p'
44
45 if args.clusterblast:
46 clusterblast = '--clusterblast y'
47 else:
48 clusterblast = '--clusterblast n'
49
50 if args.smcogs:
51 smcogs = '--smcogs y'
52 else:
53 smcogs = '--smcogs n'
54
55 if args.fullhmm:
56 fullhmm = '--fullhmm y'
57 else:
58 fullhmm = '--fullhmm n'
59
60 if args.fullblast:
61 fullblast = '--fullblast y'
62 else:
63 fullblast = '--fullblast n'
64
65 h = [antismash_path, new_input_path,
66 '--geneclustertypes %s' % args.geneclustertypes,
67 taxon,
68 clusterblast,
69 smcogs,
70 fullhmm,
71 fullblast,
72 '--blastdbpath %s' % blastdbpath,
73 '--pfamdbpath %s' % pfamdbpath,
74 '--cores 10',
75 ]
76 if args.iformat == 'fasta':
77 h.append('--glimmer_prediction %s' % glimmer_path)
78
79 a = ' '.join(h)
80
81 subprocess.call(a, shell=True)
82 print a
83 embl_path = os.path.join(result_path, '%s.final.embl' % genomename)
84 if os.path.exists(embl_path) and args.embl_path:
85 args.embl_path.write( open(embl_path).read() )
86
87 gff_line = "%s\tantismash\t%s\t%s\t%s\t.\t%s\t.\t%s"
88
89 clustername_mapping = {}
90 genecluster_path = os.path.join(result_path, 'clusterblast/geneclusters.txt')
91 if os.path.exists(genecluster_path):
92 for line in open( genecluster_path):
93 token = line.split('\t')
94 clustername_mapping[token[2]] = token[3]
95 old_cluster_name = False
96 parent_cluster_name = ""
97 child_lines = ""
98 starts = []
99 ends = []
100 for line in open( os.path.join(result_path, 'clusterblast/geneclusterprots.fasta') ):
101 if line.startswith('>'):
102 columns = line[1:].strip().split('|')
103 args.geneclusterprots.write( line.replace('|%s|' % columns[1], '|%s|%s|' % (columns[1],clustername_mapping[columns[1]])) )
104
105 """
106 "Handlampe" kann man zu einer Person sagen wenn es nicht mehr zum Armleuchter reicht. Es ist also ein abgeschwächte Form für "Depp".
107 """
108 if old_cluster_name == False:
109 # inital clustername setting
110 # try to catch the line in which the name changes
111 old_cluster_name = columns[1]
112 parent_cluster_name = columns[1]
113
114 seqname = columns[0]
115 strand = columns[3]
116 (start, end) = columns[2].split('-')
117 starts.append(int(start))
118 ends.append(int(end))
119
120 if old_cluster_name != columns[1]:
121 # begin of a new cluster, caclulate the parent line for the previous clusterproteins
122 # TODO: maybe the assumption is not correct, to take the smallest start and the largest end
123 starts.sort()
124 ends.sort()
125 genecluster_start = starts[0]
126 genecluster_end = ends[-1]
127 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5])
128 oline = gff_line % (
129 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group
130 )
131 args.geneclusterprots_gff.write(oline + child_lines)
132 starts = []
133 ends = []
134 old_cluster_name = columns[1]
135 parent_cluster_name = columns[1]
136 child_lines = ''
137
138 group = 'ID=%s;Parent=%s;Name=%s;Clustertype=%s;Note=%s\n' % (columns[4], columns[1], columns[4], clustername_mapping[columns[1]], columns[5])
139 child_lines += gff_line % (
140 seqname, 'gene', start, end, strand, group
141 )
142
143 else:
144 args.geneclusterprots.write( line )
145
146 starts.sort()
147 ends.sort()
148 genecluster_start = starts[0]
149 genecluster_end = ends[-1]
150 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5])
151 oline = gff_line % (
152 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group
153 )
154 args.geneclusterprots_gff.write(oline + child_lines)
155
156 # remove tmp directory
157 shutil.rmtree(tmp_dir)
158
159
160 def arg_parse():
161 import argparse
162 parser = argparse.ArgumentParser(prog = 'antiSMASH-Wrapper')
163 parser.add_argument('--version', action='version', version='%(prog)s 0.01')
164 parser.add_argument('--geneclustertypes')
165 parser.add_argument('--clusterblast', action='store_true')
166 parser.add_argument('--eukaryotic', action='store_true')
167 parser.add_argument('--fullhmm', action='store_true')
168 parser.add_argument('--smcogs', action='store_true')
169 parser.add_argument('--fullblast', action='store_true')
170
171 parser.add_argument('--input', '-i', help='FASTA Sequence File')
172 parser.add_argument('--glimmer_prediction', help='Glimmer Prediction File')
173
174 parser.add_argument('--input_format', '-f', dest="iformat", help='input format, if the input is a fasta file you need a glimmer file as additional input')
175
176 parser.add_argument('--zip', help='output: all files as zip file')
177 parser.add_argument('--html_file', help='output: the path to the index html file')
178 parser.add_argument('--html_path', help='output: the path to the output html dir')
179 parser.add_argument('--embl_path', help='output: the path to the embl output file', type=argparse.FileType('w'))
180 parser.add_argument('--geneclusterprots', help='output: Genecluster Fasta File', type=argparse.FileType('w'))
181 parser.add_argument('--geneclusterprots_gff', help='output: Genecluster GFF', type=argparse.FileType('w'))
182
183
184 args = parser.parse_args()
185 return args
186
187
188 def extract_glimmerHMM_results(sequence_id, glimmerHMM):
189 """
190 Returns the given annoations from a sequence_id as complete GFF3 formated string.
191 """
192 result = "##gff-version 3\n"
193 found_annoations = False
194 for line in open(glimmerHMM):
195 if line.startswith(sequence_id + '\t'):
196 result += line
197 found_annoations = True
198 if found_annoations:
199 return result
200 else:
201 return False
202
203 def extract_glimmer_results(sequence_id, glimmer):
204 """
205 Returns the given annoations from a sequence_id as glimmer result file.
206 """
207 result = ''
208 found_annoations = False
209 for line in open(glimmer):
210 # that construct matches the pseudo fasta header and the corresponding orfs
211 if line.split()[0].startswith(sequence_id):
212 result += line
213 found_annoations = True
214 if found_annoations:
215 return result
216 else:
217 return False
218
219
220 if __name__ == '__main__':
221 args = arg_parse()
222 geneclusterprots = ""
223 temp_dir = tempfile.mkdtemp()
224 args.geneclusterprots_gff.write("##gff-version 3\n")
225
226
227
228 for record in SeqIO.parse(open(args.input, "rU"), args.iformat) :
229 # create two tem files and fill it with the information from one sequence
230 record.id = record.id or record.name
231 tmp_sequence = os.path.join(temp_dir, record.id) #fasta -file
232 SeqIO.write(record, open(tmp_sequence, 'w+'), args.iformat)
233
234 tmp_glimmer = ''
235 if args.iformat == 'fasta':
236 # We need to annotate the sequences manually
237 if args.eukaryotic:
238 #print os.path.join(temp_dir, record.id + '.gff')
239 tmp_glimmer = os.path.join(temp_dir, record.id + '.gff')
240 annotations = extract_glimmerHMM_results(record.id, args.glimmer_prediction)
241 #print annotations, ':', record.id, args.glimmer_prediction
242 if not annotations:
243 continue
244 open(tmp_glimmer, 'w+').write( annotations )
245 else:
246 tmp_glimmer = os.path.join(temp_dir, record.id + '.tsv')
247 annotations = extract_glimmer_results(record.id, args.glimmer_prediction)
248 if not annotations:
249 continue
250 open(tmp_glimmer, 'w+').write( annotations )
251
252 anitSMASH(args, tmp_sequence, tmp_glimmer)
253 shutil.rmtree(temp_dir)
254
255
256
257
258
259
260
261
262
263
264
265