0
|
1 #!/usr/bin/env python
|
|
2 ## Copyright (c) 2010 Marnix H. Medema
|
|
3 ## University of Groningen
|
|
4 ## Department of Microbial Physiology / Groningen Bioinformatics Centre
|
|
5 ## License: GNU General Public License v3 or later
|
|
6 ## A copy of GNU GPL v3 should have been included in this software package in LICENSE.txt.
|
|
7
|
|
8 ##Functions necessary for this script
|
|
9
|
|
10 import linecache, cPickle
|
|
11
|
|
# Module-level debug switch. No function in this part of the file reads it.
# NOTE(review): presumably consulted elsewhere in the project - confirm before removing.
DEBUG = True
|
|
13
|
|
14
|
|
def invalidoptions(argument):
    """Report invalid command-line options, log them and stop the program.

    argument -- the offending option text; it is echoed to stderr only when
                it is non-empty and is always appended to the log message.

    Writes a usage hint to stdout, records the problem in the module-level
    ``logfile`` object, closes that log and terminates via ``sys.exit(1)``.
    """
    if len(argument) >= 1:
        sys.stderr.write("Invalid options input:\n")
        sys.stderr.write("%s\n" % (argument,))
    sys.stdout.write("From the command line, input antismash --help for more information.\n")
    log_message = "Invalid options input: " + argument + "\n"
    logfile.write(log_message)
    logfile.close()
    sys.exit(1)
|
|
23
|
|
def sortdictkeysbyvalues(dict):
    """Return the keys of *dict* ordered by ascending value.

    Entries are compared as (value, key) pairs, so keys with equal values are
    ordered by the keys themselves.
    """
    ordered_pairs = sorted((val, key) for key, val in dict.items())
    return [pair[1] for pair in ordered_pairs]
|
|
28
|
|
def sortdictkeysbyvaluesrev(dict):
    """Return the keys of *dict* ordered by descending value.

    Entries are compared as (value, key) pairs and the ascending order is
    then reversed, so keys sharing a value come out in descending key order.
    """
    ascending = sorted((val, key) for key, val in dict.items())
    return [pair[1] for pair in ascending[::-1]]
|
|
34
|
|
def sortdictkeysbyvaluesrevv(dict):
    """Return the values of *dict* in descending order.

    Sorting is done on (value, key) pairs, exactly as in the key-returning
    variants, and the ascending result is reversed before the values are
    extracted.
    """
    ascending = sorted((val, key) for key, val in dict.items())
    return [pair[0] for pair in ascending[::-1]]
|
|
40
|
|
def get_sequence(fasta):
    """Return the sequence stored in a single-record FASTA file.

    fasta -- path of the FASTA file to read.

    Leading blank (whitespace-only) lines are skipped. The first remaining
    line is taken as the header; every following line has its trailing
    whitespace removed and the pieces are concatenated into one string, which
    is returned. The header itself is only validated, not returned.

    If the header does not contain '>' or the sequence part contains '>'
    (a second record), or if the file holds no header line at all, the
    problem is reported on stderr and in the module-level ``logfile``, the
    log is closed and the program terminates via ``sys.exit(1)``.
    """
    # Function-scope import: the visible import block of this file does not
    # bring sys into scope.
    import sys

    # Read everything up front so the handle is closed deterministically.
    with open(fasta, 'r') as handle:
        lines = handle.readlines()

    # Locate the first non-blank line; it must be the header.
    first = 0
    while first < len(lines) and lines[first].strip() == "":
        first += 1
    # An empty or all-blank file yields an empty header, which fails the
    # format check below instead of raising IndexError.
    if first < len(lines):
        header = lines[first]
    else:
        header = ""
    seq = "".join([line.rstrip() for line in lines[first + 1:]])

    if ">" not in header or ">" in seq:
        sys.stderr.write("FASTA file not properly formatted; should be single sequence starting with '>' and sequence name.\n")
        logfile.write("FASTA file not properly formatted; should started with '>' and sequence name on first line.\n")
        logfile.close()
        sys.exit(1)
    return seq
|
|
65
|
|
def complement(seq):
    """Return the base-by-base complement of *seq* as a list of characters.

    a/t and c/g are swapped with case preserved, n/N map to themselves, and
    any other symbol is replaced by a lowercase 'n'. The order of the bases
    is not changed.
    """
    pairing = {'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'n': 'n',
               'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return [pairing.get(base, 'n') for base in seq]
|
|
77
|
|
def reverse_complement(seq):
    """Return the reverse complement of *seq* as a string.

    seq -- an iterable of single-character bases (normally a string).

    The bases are reversed and then complemented with ``complement``; the
    resulting list is joined in one step rather than by repeated string
    concatenation, which keeps the cost linear for genome-sized input.
    """
    reversed_bases = list(seq)[::-1]
    return "".join(complement(reversed_bases))
|
|
86
|
|
def fastaseqlengths(proteins):
    """Map every sequence name to the length of its sequence.

    proteins -- indexable whose element 0 is the list of names and whose
                element 1 is the list of sequences in the same order.

    Returns a dict {name: len(sequence)}. An IndexError is raised when there
    are fewer sequences than names.
    """
    names, seqs = proteins[0], proteins[1]
    lengths = {}
    for position, name in enumerate(names):
        lengths[name] = len(seqs[position])
    return lengths
|
|
99
|
|
100 # Function that reads the fasta file into a dictionary
|
|
def fastadict(fasta):
    """Read a (multi-)FASTA file into a dict {name: sequence}.

    fasta -- path of the FASTA file.

    Carriage returns are treated as newlines and every space is replaced by
    "_" before the text is split on whitespace. A token starting with ">"
    opens a new record whose name is the (at most 67) characters following
    the ">". Every other token is appended to the sequence of the current
    record. A record only appears in the result once at least one sequence
    token has been seen for it, so header-only records are omitted.

    Raises ValueError when sequence data appears before the first header.
    """
    # Read inside a context manager so the handle is always closed.
    with open(fasta, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r", "\n").strip()
    # Replaces all spaces with "_" to avoid problems
    filetext = filetext.replace(' ', '_')

    chunks_by_name = {}
    current_name = None
    current_chunks = None
    for token in filetext.split():
        if ">" in token[0]:
            current_name = token[1:68]
            # Fresh list per header: a later record with the same name
            # replaces the earlier one as soon as it has sequence data.
            current_chunks = []
        else:
            if current_chunks is None:
                raise ValueError("FASTA file %s contains sequence data before the first '>' header" % fasta)
            current_chunks.append(token)
            chunks_by_name[current_name] = current_chunks

    dictseq = {}
    for name in chunks_by_name:
        dictseq[name] = "".join(chunks_by_name[name])
    return dictseq
|
|
119
|
|
120 # Function that extracts all sequence names from the fasta dictionary
|
|
def lnames(fastadict):
    """Return the sequence names of a FASTA dictionary in sorted order.

    The (name, sequence) pairs are sorted as tuples, so the position of a
    name in the result matches the position of its sequence in ``lseqs``.
    """
    return [name for name, _sequence in sorted(fastadict.items())]
|
|
125
|
|
126 # Function that extracts all sequences from the fasta dictionary
|
|
def lseqs(fastadict):
    """Return the sequences of a FASTA dictionary, ordered by sequence name.

    The (name, sequence) pairs are sorted as tuples, so the position of a
    sequence in the result matches the position of its name in ``lnames``.
    """
    return [sequence for _name, sequence in sorted(fastadict.items())]
|
|
131
|
|
def extractpositions(refmusclefile,newmusclefile,positions,refsequencename,querysequencename):
    """Extract the query residues aligned to given reference positions.

    refmusclefile     -- kept for interface compatibility only. The original
                         code parsed this file but never used the result, so
                         it is no longer read.
    newmusclefile     -- path of the aligned FASTA file holding both the
                         reference and the query sequence.
    positions         -- container of 0-based residue indices in the ungapped
                         reference sequence.
    refsequencename   -- name of the reference sequence in the alignment.
    querysequencename -- name of the query sequence in the alignment.

    Returns a list with the character found in the query sequence at each
    alignment column where the reference holds one of the requested residues.
    Raises ValueError when either sequence name is missing from the
    alignment, and IndexError when the query is shorter than a needed column.
    """
    alignment = fastadict(newmusclefile)
    if refsequencename not in alignment:
        raise ValueError("%r is not in alignment %s" % (refsequencename, newmusclefile))

    # Walk the reference once, translating residue indices (gaps not counted)
    # into alignment column indices (gaps counted).
    columns = []
    residue_index = 0
    for column, char in enumerate(alignment[refsequencename]):
        if char != "-":
            if residue_index in positions:
                columns.append(column)
            residue_index += 1

    #Extract positions from query sequence
    if querysequencename not in alignment:
        raise ValueError("%r is not in alignment %s" % (querysequencename, newmusclefile))
    query_seq = alignment[querysequencename]
    return [query_seq[column] for column in columns]
|
|
162
|
|
def parsegenes(genes):
    """Parse the text of CDS features into gene records.

    genes -- iterable of strings, one per CDS feature. Each string is expected
             to start with the feature location, followed by the qualifier
             lines (protein_id, gene, locus_tag, codon_start, translation,
             product) separated by newlines.

    Returns a five-element list:
      genelist      -- gene names in input order (only names longer than one
                       character are stored),
      genedict      -- {gene name: [start, end, strand, annotation, sequence]}
                       with start and end kept as strings,
      joinlist      -- names of genes whose location is a join(...),
      joindict      -- {gene name: list of "a..b" part strings} for those genes,
      accessiondict -- {gene name: protein_id or "no_accession_number_found"}.

    Features with malformed locations are skipped and collected; if any were
    found, they are reported on stderr and in the module-level ``logfile``
    after the loop and the program terminates via sys.exit(1).
    """
    genedict = {}
    genelist = []
    joinlist = []
    joindict = {}
    accessiondict = {}
    # error is a "y"/"n" flag; errorlocations collects every bad location text.
    error = "n"
    errorlocations = []
    genenr = 0
    for i in genes:
        # Cut off a trailing gene feature so only this CDS's own text is parsed.
        if " gene " in i:
            i = i.split(" gene ")[0]
        elif "FT gene " in i:
            i = i.split("FT gene ")[0]
        join = "no"
        genenr += 1
        #Find gene location info for each gene
        # A location that does not end with ")" on the first line continues on
        # following lines; in that case lines are dropped from the end until
        # the text closes with ")", then newlines and spaces are removed.
        if "complement" in i.split("\n")[0].lower() and i.split("\n")[0][-1] == ")":
            location = i.split("\n")[0]
        elif "complement" in i.split("\n")[0].lower() and i.split("\n")[0][-1] != ")":
            location = i.split(" /")[0]
            while ")" not in location.replace(" ","")[-3:]:
                location = location.rpartition("\n")[0]
            location = location.replace("\n","")
            location = location.replace(" ","")
        elif "join" in i.split("\n")[0].lower() and i.split("\n")[0][-1] == ")":
            location = i.split("\n")[0]
        elif "join" in i.split("\n")[0].lower() and i.split("\n")[0][-1] != ")":
            location = i.split("/")[0]
            while ")" not in location.replace(" ","")[-3:]:
                location = location.rpartition("\n")[0]
            location = location.replace("\n","")
            location = location.replace(" ","")
        else:
            location = i.split("\n")[0]
        original_location = location
        #location info found in gbk/embl file, now extract start and end positions
        # Unbalanced parentheses mark the location as malformed.
        if location.count("(") != location.count(")"):
            error = "y"
            errorlocations.append(original_location)
            continue
        if "join(complement" in location.lower():
            # join(complement(...)) form: only a single complement wrapping
            # all parts is accepted; anything else is recorded as an error.
            location = location.lower()
            join = "yes"
            location2 = location.partition("join(")[2][:-1].replace("<","").replace(">","")
            if ("complement(" in location2[0:12] and location2[-1] != ")") or ")," in location2:
                error = "y"
                errorlocations.append(original_location)
                continue
            elif ("complement(" in location2[0:12] and location2[-1] == ")" and location2[12:-2].count(")") == 0 and location2[12:-2].count("(") == 0):
                location2 = location2.partition("complement(")[2][:-1]
                # start = first coordinate of the first part,
                # end = last coordinate of the last part.
                start = location2.split(",")[0]
                start = start.split("..")[0]
                start = start.replace("<","")
                end = location2.split(",")[-1]
                if ".." in end:
                    end = end.split("..")[1]
                end = end.replace(">","")
                # Keep the individual parts with the "<" / ">" markers removed.
                joinedparts = location2.split(",")
                joinedparts2 = []
                for j in joinedparts:
                    newjoinedpart = j.replace("<","")
                    newjoinedpart = newjoinedpart.replace(">","")
                    joinedparts2.append(newjoinedpart)
                strand = "-"
            else:
                error = "y"
                errorlocations.append(original_location)
                continue
        elif "complement" in location.lower():
            # complement(...) form, optionally wrapping a join(...).
            location = location.lower()
            location = location.partition("complement(")[2][:-1]
            if "join(" in location.lower():
                join = "yes"
                location = location.lower()
                location2 = location.partition("join(")[2][:-1]
                start = location2.split(",")[0]
                start = start.split("..")[0]
                start = start.replace("<","")
                end = location2.split(",")[-1]
                if ".." in end:
                    end = end.split("..")[1]
                end = end.replace(">","")
                joinedparts = location2.split(",")
                joinedparts2 = []
                for j in joinedparts:
                    newjoinedpart = j.replace("<","")
                    newjoinedpart = newjoinedpart.replace(">","")
                    joinedparts2.append(newjoinedpart)
            else:
                start = location.split("..")[0]
                start = start.replace("<","")
                # A location without ".." is a single position used as both ends.
                if ".." in location:
                    end = location.split("..")[1]
                else:
                    end = location
                end = end.replace(">","")
            strand = "-"
        else:
            # Forward strand: either join(...) or a plain "a..b" range.
            if "join(" in location.lower():
                join = "yes"
                location = location.lower()
                location2 = location.partition("join(")[2][:-1]
                start = location2.split(",")[0]
                start = start.split("..")[0]
                start = start.replace("<","")
                end = location2.split(",")[-1]
                if ".." in end:
                    end = end.split("..")[1]
                end = end.replace(">","")
                joinedparts = location2.split(",")
                joinedparts2 = []
                for j in joinedparts:
                    newjoinedpart = j.replace("<","")
                    newjoinedpart = newjoinedpart.replace(">","")
                    joinedparts2.append(newjoinedpart)
            else:
                start = location.split("..")[0]
                start = start.replace("<","")
                if ".." in location:
                    end = location.split("..")[1]
                else:
                    end = location
                end = end.replace(">","")
            strand = "+"
        # Make sure start <= end; non-numeric coordinates mark the location
        # as malformed.
        try:
            if int(start) > int(end):
                start2 = end
                end2 = start
                start = start2
                end = end2
        except ValueError:
            error = "y"
            errorlocations.append(original_location)
            continue
        #Correct for alternative codon start positions
        if "codon_start=" in i.lower():
            temp = i.lower().split("codon_start=")[1].split()[0]
            if '"' in temp:
                # temp is "1" or "2" (quoted); this form comes from Biopython
                temp = temp[1]
            else:
                # without quotation marks ... 1 or 2
                temp = temp[0]
            codonstart = temp
            # Shift the coding start by (codon_start - 1): the start coordinate
            # on the forward strand, the end coordinate on the reverse strand.
            if strand == "+":
                start = str(int(start) + (int(codonstart) - 1))
            elif strand == "-":
                end = str(int(end) - (int(codonstart) - 1))
        #Find gene name for each gene, preferably locus_tag, then gene, then protein_ID
        # The three scans below each walk the feature line by line (index a)
        # until a match is found or the last line is reached (b becomes 1).
        # Later scans overwrite genename, which produces the preference order.
        a = 0
        b = 0
        genename = ""
        nrlines = len(i.split("\n"))
        while b == 0:
            line = i.split("\n")[a]
            if "protein_id=" in line:
                # [1:-1] strips the surrounding quotes of the qualifier value.
                genename = (line.split("protein_id=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif "protein_id=" in line.lower():
                genename = (line.lower().split("protein_id=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif a == (nrlines - 1):
                genename = ""
                b += 1
            else:
                a += 1
        # The protein_id (if any) doubles as the accession number of the gene.
        if len(genename) > 1:
            accnr = genename
        else:
            accnr = "no_accession_number_found"
        a = 0
        b = 0
        nrlines = len(i.split("\n"))
        while b == 0:
            line = i.split("\n")[a]
            if "gene=" in line:
                genename = (line.split("gene=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif "gene=" in line.lower():
                genename = (line.lower().split("gene=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif a == (nrlines - 1):
                b += 1
            else:
                a += 1
        a = 0
        b = 0
        nrlines = len(i.split("\n"))
        while b == 0:
            line = i.split("\n")[a]
            if "locus_tag=" in line:
                genename = (line.split("locus_tag=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif "locus_tag=" in line.lower():
                genename = (line.lower().split("locus_tag=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif a == (nrlines - 1):
                # No qualifier supplied a name: fall back to a numbered one.
                if genename == "":
                    genename = "prot_ID_" + str(genenr)
                b += 1
            else:
                a += 1
        #Find sequence for each gene
        # Not all gbks contain protein sequences as translations, therefore
        # sequences from gene clusters are extracted from the database at a
        # later stage if the sequence is not in the gbk.
        # b is the scan state: 0 = translation not seen yet, 1 = inside a
        # multi-line translation, 2 = finished.
        a = 0
        b = 0
        sequence = ""
        while b < 2:
            line = i.split("\n")[a]
            if "translation=" in line:
                sequence = line.split("translation=")[1][1:]
                b += 1
                a += 1
                # Two quotes on the same line: the translation is complete.
                if line.count('"') > 1:
                    sequence = line.split("translation=")[1][1:-1]
                    b = 2
            elif "translation=" in line.lower():
                sequence = line.lower().split("translation=")[1][1:]
                b += 1
                a += 1
                if line.count('"') > 1:
                    sequence = line.lower().split("translation=")[1][1:-1]
                    b = 2
            elif a == (nrlines - 2) or a == (nrlines - 1):
                # Reaching one of the last two lines here discards whatever
                # was collected; this test runs before the continuation
                # branch below.
                sequence = ""
                b = 2
            elif b == 1:
                # Continuation line: append it, stopping at the closing quote.
                if '"' in line:
                    seqline = line.replace(" ","")
                    seqline = seqline.split('"')[0]
                    sequence = sequence + seqline
                    b += 1
                else:
                    seqline = line.replace(" ","")
                    sequence = sequence + seqline
                    a += 1
            else:
                a += 1
        sequence = sequence.upper()
        #Quality-check sequence
        # Any of these characters in the translation discards it entirely.
        forbiddencharacters = ["'",'"','=',';',':','[',']','>','<','|','\\',"/",'*','-','_','.',',','?',')','(','^','#','!','`','~','+','{','}','@','$','%','&']
        for z in forbiddencharacters:
            if z in sequence:
                sequence = ""
        #Find annotation for each gene
        # Only the first line of the product qualifier is used; spaces become
        # "_" and a closing quote at the end of that line is removed.
        a = 0
        b = 0
        while b == 0:
            line = i.split("\n")[a]
            if "product=" in line:
                annotation = line.split("product=")[1][1:]
                annotation = annotation.replace(" ","_")
                if annotation[-1] == '"':
                    annotation = annotation[:-1]
                b += 1
            elif "product=" in line.lower():
                annotation = line.lower().split("product=")[1][1:]
                annotation = annotation.replace(" ","_")
                if annotation[-1] == '"':
                    annotation = annotation[:-1]
                b += 1
            elif a == (nrlines - 1):
                annotation = "not_annotated"
                b += 1
            else:
                a += 1
        accessiondict[genename] = accnr
        if join == "yes":
            joinlist.append(genename)
            joindict[genename] = joinedparts2
        #Save data to dictionary
        if len(genename) > 1:
            genedict[genename] = [start,end,strand,annotation,sequence]
            genelist.append(genename)
    # Malformed locations are reported together once all features were seen.
    if error == "y":
        errorinfo = "\n".join(errorlocations)
        print >> sys.stderr, "Exit: locations in GBK/EMBL file not properly formatted:\n" + errorinfo
        logfile.write("Exit: GBK file not properly formatted, no sequence found or no CDS annotation found.\n")
        logfile.close()
        sys.exit(1)
    return [genelist, genedict, joinlist, joindict, accessiondict]
|
|
456
|
|
def cleandnaseq(dnaseq):
    """Return *dnaseq* reduced to plain nucleotide characters.

    Spaces, tabs, newlines, digits and "/" are removed, u/U is rewritten as
    t/T, and the ambiguity codes r, y, w, s, m, k, h, b, v and d (either
    case) are replaced by a lowercase "n". All other characters are left
    untouched.
    """
    # Layout characters, digits and "/" are dropped.
    for unwanted in " \t\n0123456789/":
        dnaseq = dnaseq.replace(unwanted, "")
    # Uracil becomes thymine, preserving case.
    dnaseq = dnaseq.replace("u", "t").replace("U", "T")
    # Ambiguity codes of either case collapse to a lowercase "n".
    for ambiguous in "rywsmkhbvd":
        dnaseq = dnaseq.replace(ambiguous, "n")
        dnaseq = dnaseq.replace(ambiguous.upper(), "n")
    return dnaseq
|
|
495
|
|
def extractprotfasta(genelist,genedict,dnaseq,rc_dnaseq,joinlist,joindict,accessiondict):
    """Build FASTA names and protein sequences for all genes.

    genelist      -- gene names in the order they should be emitted.
    genedict      -- {gene name: [start, end, strand, annotation, sequence]}
                     with start/end as strings and strand "+" or "-".
    dnaseq        -- nucleotide sequence of the record.
    rc_dnaseq     -- reverse complement of dnaseq.
    joinlist      -- names of genes whose location consists of joined parts.
    joindict      -- {gene name: list of "a..b" part strings}.
    accessiondict -- passed through unchanged into the result.

    A stored translation longer than 5 characters is used as is; otherwise
    the gene is cut out of dnaseq (forward strand) or rc_dnaseq (reverse
    strand) and passed to ``translate``.

    Returns [names, seqs, genelist, genedict, accessiondict], where each name
    has the form "input|c1|<start>-<end>|<strand>|<gene name>|<annotation>".

    Unlike the original implementation this does not reverse the part lists
    stored in joindict in place, so the caller's data is left untouched and
    repeated calls give the same result.
    """
    names = []
    seqs = []
    rc_length = len(rc_dnaseq)
    for genename in genelist:
        gene = genedict[genename]
        #If suitable translation found in gbk, use that
        if len(gene[4]) > 5:
            protseq = gene[4]
        #If no suitable translation found in gbk, extract from DNA sequence
        else:
            gene_start = int(gene[0])
            gene_end = int(gene[1])
            # NOTE: any strand other than "+" or "-" leaves protseq at the
            # value of the previous gene; behaviour inherited unchanged.
            if gene[2] == "+":
                if genename in joinlist:
                    pieces = []
                    for part in joindict[genename]:
                        partstart = int(part.split("..")[0])
                        if ".." in part:
                            partend = int(part.split("..")[1])
                        else:
                            partend = int(part)
                        pieces.append(dnaseq[(partstart - 1):partend])
                    geneseq = "".join(pieces)
                else:
                    geneseq = dnaseq[(gene_start - 1):gene_end]
                protseq = translate(geneseq)
            elif gene[2] == "-":
                if genename in joinlist:
                    pieces = []
                    # Walk the parts last-to-first on a reversed view; the
                    # list stored in joindict itself is not modified.
                    for part in reversed(joindict[genename]):
                        partstart = int(part.split("..")[0])
                        if ".." in part:
                            partend = int(part.split("..")[1])
                        else:
                            partend = int(part)
                        # Coordinates are mirrored onto the reverse complement.
                        pieces.append(rc_dnaseq[(rc_length - partend):(rc_length - partstart + 1)])
                    geneseq = "".join(pieces)
                else:
                    geneseq = rc_dnaseq[(rc_length - gene_end):(rc_length - gene_start + 1)]
                protseq = translate(geneseq)
        name = "input" + "|" + "c1" + "|" + gene[0] + "-" + gene[1] + "|" + gene[2] + "|" + genename + "|" + gene[3]
        seqs.append(protseq)
        names.append(name)
    proteins = [names,seqs,genelist,genedict,accessiondict]
    return proteins
|
|
545
|
|
def gbk2proteins(gbkfile):
    """Extract proteins, accession and sequence length from a GenBank file.

    gbkfile -- path of the GenBank file.

    The text before "\\nORIGIN" is split into CDS features, which are parsed
    with ``parsegenes``; the text after it is cleaned with ``cleandnaseq`` and
    used as the nucleotide sequence for ``extractprotfasta``.

    Returns [proteins, accession, dnaseqlength] where proteins is the result
    of ``extractprotfasta``, accession is taken from the first LOCUS line
    (or "" when it does not look like an accession number) and dnaseqlength
    is the length of the cleaned nucleotide sequence.

    Terminates via sys.exit(1) when the file lacks CDS features or a
    sequence section, or when fewer than half of the sequence characters are
    A/C/G/T. A ValueError raised by ``parsegenes`` is re-raised after a hint
    has been written to stderr.
    """
    # Function-scope imports: the visible import block of this file does not
    # bring these names into scope.
    import sys
    from string import ascii_letters

    with open(gbkfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r","\n")
    if " CDS " not in filetext or "\nORIGIN" not in filetext:
        sys.stderr.write("Exit: GBK file not properly formatted, no sequence found or no CDS annotation found.\n")
        logfile.write("Exit: GBK file not properly formatted, no sequence found or no CDS annotation found.\n")
        logfile.close()
        sys.exit(1)
    sections = filetext.split("\nORIGIN")
    cdspart = sections[0]
    #Extract DNA sequence and calculate reverse complement of it
    dnaseq = cleandnaseq(sections[1])
    # A sequence that is mostly not A/C/G/T is taken to be a protein record.
    nucleotide_count = sum([dnaseq.count(base) for base in "AaCcGgTt"])
    if nucleotide_count < (0.5 * len(dnaseq)):
        sys.stderr.write("Protein GBK/EMBL file provided. Please provide nucleotide GBK/EMBL file.\n")
        sys.exit(1)
    dnaseqlength = len(dnaseq)
    rc_dnaseq = reverse_complement(dnaseq)
    #Extract genes
    genes = cdspart.split(" CDS ")[1:]
    try:
        genesdetails = parsegenes(genes)
    except ValueError:
        sys.stderr.write("Could not parse genes from GBK/EMBL file. Please check if your GBK/EMBL file is valid.\n")
        # The original re-raised at this point, leaving the statements after
        # it unreachable; the re-raise is kept and the dead code removed.
        raise
    genelist = genesdetails[0]
    genedict = genesdetails[1]
    joinlist = genesdetails[2]
    joindict = genesdetails[3]
    accessiondict = genesdetails[4]
    #Locate all genes on DNA sequence and translate to protein sequence
    proteins = extractprotfasta(genelist,genedict,dnaseq,rc_dnaseq,joinlist,joindict,accessiondict)
    # Take the accession from the first usable LOCUS line of the first record.
    accession = ""
    for textline in filetext.split("\n//")[0].split("\n"):
        if accession == "" and "LOCUS " in textline:
            accession = textline.split("LOCUS ")[1].split(" ")[0]
            if len(accession) < 4:
                accession = ""
    #Test if accession number is probably real GenBank/RefSeq acc nr
    # It must contain at least three digits and at least one ASCII letter.
    nrnumbers = len([char for char in accession if char in "0123456789"])
    nrletters = len([char for char in accession if char in ascii_letters])
    if nrnumbers < 3 or nrletters < 1:
        accession = ""
    return [proteins,accession,dnaseqlength]
|
|
612
|
|
def embl2proteins(emblfile,sequence):
    """Extract proteins, accession and sequence length from an EMBL file.

    emblfile -- path of the EMBL file.
    sequence -- nucleotide sequence to fall back on when the file has no
                "SQ" sequence section; may be empty when the file has one.

    The text before the sequence section is split into CDS features, which
    are parsed with ``parsegenes``. The nucleotide sequence is taken from the
    lines following the SQ header line or, when the file has no such
    section, from *sequence* (the original code accepted that case in its
    format check but then failed with IndexError). The sequence is cleaned
    with ``cleandnaseq`` and handed to ``extractprotfasta``.

    Returns [proteins, accession, dnaseqlength] where accession is taken from
    the first AC line (or "" when it does not look like an accession number).

    Terminates via sys.exit(1) when the file has no CDS features, when
    neither the file nor *sequence* supplies a sequence, when fewer than half
    of the sequence characters are A/C/G/T, or when ``parsegenes`` raises
    ValueError.
    """
    # Function-scope imports: the visible import block of this file does not
    # bring these names into scope.
    import sys
    from string import ascii_letters

    with open(emblfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r","\n")
    sections = filetext.split("\nSQ ")
    has_sequence_section = len(sections) > 1
    if "FT CDS " not in filetext or (not has_sequence_section and len(sequence) < 1):
        logfile.write("Exit: EMBL file not properly formatted, no sequence found or no CDS annotation found.\n")
        sys.stderr.write("Exit: EMBL file not properly formatted, no sequence found or no CDS annotation found.\n\n")
        logfile.close()
        sys.exit(1)
    cdspart = sections[0]
    #Extract DNA sequence and calculate reverse complement of it
    if has_sequence_section:
        # Drop the remainder of the SQ header line, keep the sequence lines.
        dnaseq = "".join(sections[1].split("\n")[1:])
    else:
        dnaseq = sequence
    dnaseq = cleandnaseq(dnaseq)
    # A sequence that is mostly not A/C/G/T is taken to be a protein record.
    nucleotide_count = sum([dnaseq.count(base) for base in "AaCcGgTt"])
    if nucleotide_count < (0.5 * len(dnaseq)):
        sys.stderr.write("Protein GBK/EMBL file provided. Please provide nucleotide GBK/EMBL file.\n")
        sys.exit(1)
    dnaseqlength = len(dnaseq)
    rc_dnaseq = reverse_complement(dnaseq)
    #Extract genes
    genes = cdspart.split("FT CDS ")[1:]
    try:
        genesdetails = parsegenes(genes)
    except ValueError as e:
        sys.stderr.write("Could not parse genes from GBK/EMBL file. Please check if your GBK/EMBL file is valid.\n")
        sys.stderr.write("Error was: %s\n" % e)
        sys.exit(1)
    genelist = genesdetails[0]
    genedict = genesdetails[1]
    joinlist = genesdetails[2]
    joindict = genesdetails[3]
    accessiondict = genesdetails[4]
    #Locate all genes on DNA sequence and translate to protein sequence
    proteins = extractprotfasta(genelist,genedict,dnaseq,rc_dnaseq,joinlist,joindict,accessiondict)
    # Take the accession from the first usable AC line before the sequence.
    accession = ""
    for textline in filetext.split("SQ ")[0].split("\n"):
        if accession == "" and "AC " in textline:
            candidate = textline.split("AC ")[1].replace(" ","")
            accession = candidate.split(";")[0]
            if len(accession) < 4:
                accession = ""
    #Test if accession number is probably real GenBank/RefSeq acc nr
    # It must contain at least three digits and at least one ASCII letter.
    nrnumbers = len([char for char in accession if char in "0123456789"])
    nrletters = len([char for char in accession if char in ascii_letters])
    if nrnumbers < 3 or nrletters < 1:
        accession = ""
    return [proteins,accession,dnaseqlength]
|
|
683
|
|
def translate(sequence):
    """Translate a nucleotide string into a protein string.

    sequence -- coding nucleotide sequence (string), read in frame from its
                first character; an incomplete trailing codon is ignored.

    The first codon is always translated as "M", whatever it is. Every other
    codon is looked up case-insensitively in the standard genetic code;
    codons that are not in the table (for example those containing "n") give
    "X" and stop codons give "*". A "*" at the very end of the result is
    removed.
    """
    #Translation table standard genetic code; according to http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
    # Only uppercase codons are stored; codons are upper-cased before lookup,
    # which also covers mixed-case input.
    transldict = { 'TTT': 'F', 'TCT': 'S', 'TAT': 'Y', 'TGT': 'C',
                   'TTC': 'F', 'TCC': 'S', 'TAC': 'Y', 'TGC': 'C',
                   'TTA': 'L', 'TCA': 'S', 'TAA': '*', 'TGA': '*',
                   'TTG': 'L', 'TCG': 'S', 'TAG': '*', 'TGG': 'W',
                   'CTT': 'L', 'CCT': 'P', 'CAT': 'H', 'CGT': 'R',
                   'CTC': 'L', 'CCC': 'P', 'CAC': 'H', 'CGC': 'R',
                   'CTA': 'L', 'CCA': 'P', 'CAA': 'Q', 'CGA': 'R',
                   'CTG': 'L', 'CCG': 'P', 'CAG': 'Q', 'CGG': 'R',
                   'ATT': 'I', 'ACT': 'T', 'AAT': 'N', 'AGT': 'S',
                   'ATC': 'I', 'ACC': 'T', 'AAC': 'N', 'AGC': 'S',
                   'ATA': 'I', 'ACA': 'T', 'AAA': 'K', 'AGA': 'R',
                   'ATG': 'M', 'ACG': 'T', 'AAG': 'K', 'AGG': 'R',
                   'GTT': 'V', 'GCT': 'A', 'GAT': 'D', 'GGT': 'G',
                   'GTC': 'V', 'GCC': 'A', 'GAC': 'D', 'GGC': 'G',
                   'GTA': 'V', 'GCA': 'A', 'GAA': 'E', 'GGA': 'G',
                   'GTG': 'V', 'GCG': 'A', 'GAG': 'E', 'GGG': 'G'}
    residues = []
    # Step through complete codons only; the stop bound drops a 1-2 base tail.
    for position in range(0, len(sequence) - 2, 3):
        if position == 0:
            # The first codon is reported as methionine regardless of its bases.
            residues.append("M")
        else:
            codon = sequence[position:position + 3].upper()
            residues.append(transldict.get(codon, "X"))
    protseq = "".join(residues)
    if protseq.endswith("*"):
        protseq = protseq[:-1]
    return protseq
|
|
744
|
|
def writefasta(names,seqs,file):
    """Write sequences to a FASTA file.

    names -- list of sequence names (written after ">").
    seqs  -- list of sequences, in the same order as names.
    file  -- path of the output file; an existing file is overwritten.

    Each record is written as ">name", newline, sequence, newline. IOError,
    OSError and NotImplementedError are not propagated: they are reported on
    stderr and in the module-level ``logfile`` instead. The output file is
    closed even when writing fails part-way.
    """
    # Function-scope import: the visible import block of this file does not
    # bring sys into scope.
    import sys

    try:
        with open(file, "w") as out_file:
            for index, name in enumerate(names):
                out_file.write(">%s\n%s\n" % (name, seqs[index]))
    except (IOError, OSError, NotImplementedError):
        sys.stderr.write("FASTA file not created.\n")
        logfile.write("FASTA file not created.\n")
|
|
762
|
|
def parsehmmoutput(cutoff,file):
    """Collect the proteins of an HMM search table that score above a cutoff.

    cutoff -- minimum score; only hits with a strictly higher score are kept.
    file   -- path of the table file. Lines starting with "#" are comments
              and the third line of the file is used to measure the width of
              the first column.

    For every data line the first column is split on "|" and its fifth field
    is taken as the protein identifier; the score is read from the six
    characters starting 76 characters after the end of the first column.

    Returns [proteins, scores]: two parallel lists with the identifiers
    (longer than one character) and their scores as floats.
    """
    # Read the file once and close it; the measuring line is taken from the
    # same read, so a rewritten file of the same name can never yield a
    # stale cached line.
    with open(file, "r") as handle:
        lines = handle.readlines()
    protlines = [line.strip() for line in lines
                 if len(line) > 1 and not line.startswith('#')]
    if len(lines) > 2:
        measuringline = lines[2]
    else:
        measuringline = ""

    # Width of the first column: count characters up to and including the
    # first non-dash character that follows a run of at least two dashes.
    dashes = 0
    width = 0
    for char in measuringline:
        width += 1
        if "-" in char:
            dashes += 1
        elif dashes > 1:
            break

    proteins = []
    scores = []
    for protline in protlines:
        protnameparts = protline[0:width].split("|")
        accession = protnameparts[4]
        score = float(protline[(width + 76):(width + 82)].replace(" ",""))
        if score > cutoff and len(accession) > 1:
            proteins.append(accession)
            scores.append(score)
    return [proteins,scores]
|
|
798
|
|
def sortonsecondvalueoflist(first,second):
    """cmp-style comparator ordering two lists by their second element.

    first, second -- indexable records whose element 1 is an integer or a
                     value accepted by int().

    Returns 1, -1 or 0 when the integer value of first[1] is greater than,
    less than or equal to that of second[1]. Both operands are converted
    with int(); the original converted only the first, which made the
    comparison asymmetric for non-integer values.
    """
    left = int(first[1])
    right = int(second[1])
    if left > right:
        return 1
    if left < right:
        return -1
    return 0
|
|
809
|
|
def hmmlengths(hmmfile):
    """Return the model length of every profile in an HMM file.

    hmmfile -- path of a file holding one or more profiles, each terminated
               by "//" and containing a "NAME " line and a "LENG " line.

    Returns a dict {profile name: length as int}. Text after the last "//"
    is ignored. IndexError is raised for a record without NAME or LENG, and
    ValueError when the length is not an integer.
    """
    # Read inside a context manager so the handle is always closed.
    with open(hmmfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r","\n")

    hmmlengthsdict = {}
    for record in filetext.split("//")[:-1]:
        # Each value is whatever follows its keyword up to the end of the line.
        name = record.split("NAME ")[1].split("\n", 1)[0]
        length = record.split("LENG ")[1].split("\n", 1)[0]
        hmmlengthsdict[name] = int(length)
    return hmmlengthsdict
|
|
830
|
|
def hmmscanparse(hmmscanoutputfile,hmmlengthsdict):
    """Parse the domain hits of an hmmscan text report.

    hmmscanoutputfile -- path of the report. Each query section starts with
                         "Query: " and contains a "Domain annotation for
                         each model:" part in which every model begins
                         with ">> ".
    hmmlengthsdict    -- {model name: model length}, as from ``hmmlengths``.

    Returns a dict {query name: domain list}. Each domain is a list
    [name, start, end, evalue, score] with start and end as ints, taken from
    columns 13 and 14 of the hit row; evalue is the text of column 6 and
    score is column 3 as a float. For merged domains evalue and score are
    strings, as in the original implementation.

    Per query the domains are sorted by start and then, only when more than
    one domain is present, filtered in three passes: of two domains that
    overlap by more than 20 positions the higher scoring one is kept,
    neighbouring fragments of the same model are merged, and domains covering
    60% or less of their model length are dropped.
    """
    maxoverlap = 20
    # Read inside a context manager so the handle is always closed.
    with open(hmmscanoutputfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r","\n")

    domaindict = {}
    for output in filetext.split("Query: ")[1:]:
        protname = output.split("\n", 1)[0].split(" ", 1)[0]
        domainresults = output.split("Domain annotation for each model:\n")[1]
        domainresults = domainresults.split("\n\nInternal pipeline statistics summary:")[0]

        #Find all domains
        domainlist = []
        for domaintext in domainresults.split(">> "):
            tokens = domaintext.split('\n')
            domainname = tokens[0].split(" ", 1)[0]
            # Skip the model line and the two table header lines, and drop
            # the two trailing lines of each model section.
            for row in tokens[3:-2]:
                fields = [field for field in row.split(" ") if field != '']
                domainlist.append([domainname, int(fields[12]), int(fields[13]), fields[5], float(fields[2])])
        # Start positions are already ints, so a key-based stable sort gives
        # the same order as the former cmp-based sort.
        domainlist.sort(key=lambda domain: domain[1])

        #Purify domain list to remove overlapping domains, only keeping those with the highest scores
        if len(domainlist) > 1:
            kept = [domainlist[0]]
            for domain in domainlist[1:]:
                previous = kept[-1]
                if domain[1] < (previous[2] - maxoverlap):
                    # Overlap: the newcomer only replaces the kept domain
                    # when it scores strictly higher.
                    if domain[4] > previous[4]:
                        kept[-1] = domain
                else:
                    kept.append(domain)
            domainlist = kept

        #Merge domain fragments which are really one domain
        if len(domainlist) > 1:
            merged = [domainlist[0]]
            for domain in domainlist[1:]:
                previous = merged[-1]
                alilength1 = int(previous[2]) - int(previous[1])
                alilength2 = int(domain[2]) - int(domain[1])
                domainlength = hmmlengthsdict[domain[0]]
                same_model = domain[0] == previous[0]
                one_is_partial = alilength1 < (0.75 * domainlength) or alilength2 < (0.75 * domainlength)
                fits_one_model = (alilength1 + alilength2) < (1.5 * domainlength)
                if same_model and one_is_partial and fits_one_model:
                    # The merged hit spans both fragments; e-values are
                    # multiplied and scores added, both stored as strings.
                    evalue = str(float(previous[3]) * float(domain[3]))
                    score = str(float(previous[4]) + float(domain[4]))
                    merged[-1] = [domain[0], previous[1], domain[2], evalue, score]
                else:
                    merged.append(domain)
            domainlist = merged

        #Remove incomplete domains (covering less than 60% of total domain hmm length)
        if len(domainlist) > 1:
            complete = []
            for domain in domainlist:
                alilength = int(domain[2]) - int(domain[1])
                if alilength > (0.6 * hmmlengthsdict[domain[0]]):
                    complete.append(domain)
            domainlist = complete

        #Save domainlist to domaindict
        domaindict[protname] = domainlist
    return domaindict
|
|
908
|
|
def blastparse(blasttext,minseqcoverage,minpercidentity,seqlengths,geneclustergenes):
    """Parse tab-separated BLAST output into per-query hit tables.

    Parameters:
      blasttext        -- BLAST tabular text; lines are separated by "\n"
                          and the piece after the final "\n" is discarded,
                          so the text is expected to end with a newline.
      minseqcoverage   -- coverage percentage a hit must exceed, unless its
                          alignment is longer than 40.
      minpercidentity  -- identity percentage (integer part) a hit must exceed.
      seqlengths       -- mapping of query name to query sequence length.
      geneclustergenes -- container of gene names; subjects found in it are
                          stored with an "h_" prefix.

    Returns [blastdict, querylist, hitclusters]:
      blastdict   -- query -> [subjectlist, querydict], where querydict maps
                     subject -> [genecluster, start, end, strand, annotation,
                     perc_ident, blastscore, perc_coverage, evalue, locustag].
      querylist   -- queries with at least one accepted hit, in first-seen order.
      hitclusters -- distinct subject gene clusters, in first-seen order.

    Only the first line of every (query, subject) combination is considered.
    Coverage is computed per line from that line's own query, and every
    accepted hit -- including a lone one or the last one -- is stored under
    its own query.
    """
    blastdict = {}
    querylist = []
    hitclusters = []
    seen_combinations = set()
    for blastline in blasttext.split("\n")[:-1]:
        tabs = blastline.split("\t")
        query = tabs[0]
        #Filter for best blast hits (of one query on each subject)
        combination = (query, tabs[1])
        if combination in seen_combinations:
            continue
        seen_combinations.add(combination)
        #Filter out hits that do not meet criteria
        perc_ident = int(tabs[2].split(".",1)[0])
        alignmentlength = float(tabs[3])
        evalue = str(tabs[10])
        blastscore = int(tabs[11].split(".",1)[0])
        if query in seqlengths:
            perc_coverage = (alignmentlength / seqlengths[query]) * 100
        else:
            # Fallback via the module-level seqdict, keyed by the fifth
            # "|"-separated field of the query name.
            seqlength = len(seqdict[query.split("|")[4]])
            perc_coverage = (alignmentlength / seqlength) * 100
        if perc_ident <= minpercidentity:
            continue
        if not (perc_coverage > minseqcoverage or alignmentlength > 40):
            continue
        #Decode the "|"-separated subject description
        second_column_split = tabs[1].split("|")
        subject = second_column_split[4]
        if subject == "no_locus_tag":
            subject = second_column_split[6]
        if subject in geneclustergenes:
            subject = "h_" + subject
        if len(second_column_split) > 6:
            locustag = second_column_split[6]
        else:
            locustag = ""
        subject_genecluster = second_column_split[0] + "_" + second_column_split[1]
        subject_range = second_column_split[2].split("-")
        subject_start = subject_range[0]
        subject_end = subject_range[1]
        subject_strand = second_column_split[3]
        subject_annotation = second_column_split[5]
        #Store the hit under its query, creating the query entry on first sight
        if query not in blastdict:
            blastdict[query] = [[], {}]
            querylist.append(query)
        subjectlist, querydict = blastdict[query]
        subjectlist.append(subject)
        querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
        if subject_genecluster not in hitclusters:
            hitclusters.append(subject_genecluster)
    return [blastdict,querylist,hitclusters]
|
|
1011
|
|
def getdircontents():
    """Return the names of all entries in the current working directory.

    The result is whatever os.listdir('.') returns.
    """
    return os.listdir('.')
|
|
1029
|
|
def _gene_arrow(start,end,strand,color,base,height):
    """Build the SVG polygon that depicts one gene as an arrow.

    start/end are the horizontal extremes of the gene (swapped if given in
    descending order), strand is "+" or "-" and selects the direction the
    arrow points, color is the fill, base is the y coordinate of the lower
    edge and height the vertical size of the arrow.

    Genes narrower than half the height are drawn as a bare triangle;
    wider ones get a rectangular body with a triangular head half the
    height long.  Returns the polygon created by ShapeBuilder.
    """
    halfheight = height/2
    # Work with ascending coordinates regardless of the order supplied.
    if start > end:
        start, end = end, start
    builder = ShapeBuilder()
    upper = base - height
    middle = base - halfheight
    tiny = (end - start) < halfheight
    if tiny and strand == "+":
        # Triangle pointing right.
        corners = [(start,base), (end,middle), (start,upper), (start,base)]
    elif tiny and strand == "-":
        # Triangle pointing left.
        corners = [(start,middle), (end,upper), (end,base), (start,middle)]
    elif not tiny and strand == "+":
        # Body followed by a head pointing right.
        arrowstart = end - halfheight
        corners = [(start,base),
                   (arrowstart,base),
                   (end,middle),
                   (arrowstart,upper),
                   (start,upper),
                   (start,base)]
    elif not tiny and strand == "-":
        # Head pointing left followed by the body.
        arrowstart = start + halfheight
        corners = [(start,middle),
                   (arrowstart,upper),
                   (end,upper),
                   (end,base),
                   (arrowstart,base),
                   (start,middle)]
    outline = builder.convertTupleArrayToPoints(corners)
    return builder.createPolygon(points=outline,strokewidth=1, stroke='black', fill=color)
|
|
1073
|
|
def _gene_label(start,end,name,y,screenwidth):
    """Build the SVG text element that labels a gene.

    The label shows name, is placed horizontally at the midpoint of start
    and end, and vertically at a fixed position derived from a base of 35
    and a height of 10.  The y and screenwidth arguments are accepted but
    do not influence the result.  Returns the styled text element.
    """
    #Add gene label
    labelstyle = StyleBuilder()
    labelstyle.setFontFamily(fontfamily="Verdana")
    #labelstyle.setFontWeight(fontweight='bold')
    labelstyle.setFontStyle(fontstyle='italic')
    labelstyle.setFontSize('10px')
    labelstyle.setFilling('#600000')
    # Horizontal centre of the gene.
    xcentre = ((start + end)/2)
    # Fixed vertical position: base plus half the arrow height.
    base = 35
    height = 10
    ypos = base + height/2
    label = text(name,xcentre,ypos)
    label.set_style(labelstyle.getStyle())
    return label
|
|
1091
|
|
def relativepositions(starts,ends,largestclustersize):
    """Convert absolute gene coordinates into integer drawing coordinates.

    Every position is taken relative to the first entry of starts, divided
    by largestclustersize, scaled to 75% of the module-level screenwidth
    and truncated to an int.  Returns [rel_starts, rel_ends].
    """
    #Assign relative start and end sites for visualization
    leftboundary = int(starts[0])

    def scaled(position):
        # Fraction of the largest cluster, mapped onto 75% of the screen.
        fraction = float(int(position) - leftboundary) / largestclustersize
        return int(float(fraction) * screenwidth * 0.75)

    rel_starts = [scaled(position) for position in starts]
    rel_ends = [scaled(position) for position in ends]
    return [rel_starts,rel_ends]
|
|
1107
|
|
def startendsitescheck(starts,ends):
    """Make every start site lie at or below its matching end site.

    starts and ends are parallel sequences; wherever int(start) exceeds
    int(end) the two values trade places.  The values themselves are
    passed through unconverted.  Returns [starts, ends] as two new lists.
    """
    #Check whether start sites are always lower than end sites, reverse if necessary
    ordered_starts = []
    ordered_ends = []
    for position, lower in enumerate(starts):
        upper = ends[position]
        if int(lower) > int(upper):
            lower, upper = upper, lower
        ordered_starts.append(lower)
        ordered_ends.append(upper)
    return [ordered_starts,ordered_ends]
|
|
1124
|
|
def RadialGradient(startcolor,stopcolor,gradientname):
    """Build a defs element holding a two-stop radial gradient.

    The gradient gets gradientname as its id and runs from startcolor at
    offset 0% to stopcolor at offset 100%, both fully opaque.  Returns the
    defs element that contains the gradient.
    """
    container = defs()
    gradient = radialGradient()
    gradient.set_id(gradientname)
    for offset, color in (("0%", startcolor), ("100%", stopcolor)):
        colorstop = stop(offset=offset)
        colorstop.set_stop_color(color)
        colorstop.set_stop_opacity(1)
        gradient.addElement(colorstop)
    container.addElement(gradient)
    return container
|
|
1139
|
|
def LinearGradient(startcolor,stopcolor,gradientname):
    """Build a defs element holding a two-stop linear gradient.

    The gradient gets gradientname as its id and runs from startcolor at
    offset 0% to stopcolor at offset 100%, both fully opaque.  Returns the
    defs element that contains the gradient.
    """
    container = defs()
    gradient = linearGradient()
    gradient.set_id(gradientname)
    for offset, color in (("0%", startcolor), ("100%", stopcolor)):
        colorstop = stop(offset=offset)
        colorstop.set_stop_color(color)
        colorstop.set_stop_opacity(1)
        gradient.addElement(colorstop)
    container.addElement(gradient)
    return container
|
|
1154
|
|
def generate_rgbscheme(nr):
    """Return a list of at least nr colour strings usable in SVG output.

    nr is rounded up to the nearest supported palette size
    (1, 2, 4, 8, 12, 18, 24, 32, 48 or 64):
      * sizes 1, 2 and 4 yield the named colours red, green, blue, yellow;
      * sizes 8 to 32 yield "rgb(r,g,b)" strings from an x*y*z grid over
        the colour cube, arranged in a fixed hand-picked order;
      * sizes 48 and 64 yield the grid colours in random order;
      * for nr above 64 the 64-colour palette is repeated (at least 20
        times, more if needed to reach nr entries) and then shuffled, so
        colours repeat.

    Colour components are computed with floor division so they are always
    integers.
    """
    usablenumbers = [1,2,4,8,12,18,24,32,48,64]
    lengthsdict = {1:[1,1,1],2:[1,1,2],4:[1,2,2],8:[2,2,2],12:[2,2,3],18:[2,3,3],24:[3,3,3],32:[3,3,4],48:[3,4,4],64:[4,4,4]}
    # Fixed display orders; each list is a permutation of 0..size-1.
    neworders = {
        8: [4,1,2,5,6,7,3,0],
        12: [6,3,5,9,7,2,11,4,8,1,10,0],
        18: [9,6,2,14,15,8,12,10,3,5,7,11,4,1,16,13,0,17],
        24: [15,12,9,6,5,0,21,1,16,14,8,17,2,23,22,3,13,7,10,4,18,20,19,11],
        32: [21,19,27,6,8,1,14,7,20,13,9,30,4,23,18,12,5,29,24,17,11,31,2,28,22,15,26,3,0,16,10,25],
    }
    toohigh = nr > usablenumbers[-1]
    if toohigh:
        closestnr = usablenumbers[-1]
    else:
        # Smallest supported size that is not below the request.
        closestnr = min([size for size in usablenumbers if size >= nr])
    if closestnr <= 4:
        return ["red","green","blue","yellow"][:closestnr]
    x, y, z = lengthsdict[closestnr]
    # All three axes start at the same offset, which is derived from z.
    offset = (255 // z) // 2

    def axispoints(count):
        # count values starting at offset and spaced 255 // count apart.
        step = 255 // count
        return [offset + index * step for index in range(count)]

    xpoints = axispoints(x)
    ypoints = axispoints(y)
    zpoints = axispoints(z)
    colorlist = ["rgb(%s,%s,%s)" % (i, j, k) for i in xpoints for j in ypoints for k in zpoints]
    if toohigh:
        # Repeat the palette often enough to cover every requested colour.
        repeats = max(20, int(-(-nr // len(colorlist))))
        colorlist = colorlist * repeats
    # The 3x3x3 and 3x3x4 grids give more colours than needed; drop three.
    if closestnr == 24:
        colorlist = colorlist[:15] + colorlist[18:]
    if closestnr == 32:
        colorlist = colorlist[:21] + colorlist[24:]
    if closestnr in neworders:
        return [colorlist[index] for index in neworders[closestnr]]
    # Palettes of 48 colours and more have no fixed order.
    random.shuffle(colorlist)
    return colorlist
|
|
1233
|
|
1234 def geneclustersvg(genes,rel_starts,rel_ends,strands,geneposdict,pksnrpsprots,pksnrpsdomains,qclusternr):
|
|
1235 nrgenes = len(genes)
|
|
1236 #Define relative start and end positions for plotting
|
|
1237 s = svg(x = 0, y = 0, width = (screenwidth * 0.75), height = (259 + 99 * len(pksnrpsprots)))
|
|
1238 viewbox = "0 -30 " + str(screenwidth * 0.8) + " " + str(185 + 70 * len(pksnrpsprots))
|
|
1239 s.set_viewBox(viewbox)
|
|
1240 s.set_preserveAspectRatio("none")
|
|
1241
|
|
1242 #Add line behind gene arrows
|
|
1243 oh = ShapeBuilder()
|
|
1244 group = g()
|
|
1245 group.addElement(oh.createLine(10,60,10 + (screenwidth * 0.75),60, strokewidth = 2, stroke = "grey"))
|
|
1246 s.addElement(group)
|
|
1247 #Add gene arrows
|
|
1248 a = 0
|
|
1249 y = 0
|
|
1250 for x in range(nrgenes):
|
|
1251 group = g()
|
|
1252 #group.addElement(_gene_label(rel_starts[a],rel_ends[a],genes[a],y,screenwidth))
|
|
1253 group.addElement(_gene_arrow(10 + rel_starts[a],10 + rel_ends[a],strands[a],colors[a],65,10))
|
|
1254 #Can be used for domains
|
|
1255 # group.addElement(oh.createRect(rel_starts[a],45,(rel_ends[a]-rel_starts[a]),10, strokewidth = 2, stroke = "black", fill="#237845"))
|
|
1256 group.set_id("a" + str(qclusternr) + "_00%s"%x)
|
|
1257 s.addElement(group)
|
|
1258 if y == 0:
|
|
1259 y = 1
|
|
1260 elif y == 1:
|
|
1261 y = 0
|
|
1262 a += 1
|
|
1263 #Add domain depictions
|
|
1264 oh = ShapeBuilder()
|
|
1265 group = g()
|
|
1266 #Determine longest protein to decide on scaling
|
|
1267 longestprot = 0
|
|
1268 protlengthdict = {}
|
|
1269 for i in pksnrpsprots:
|
|
1270 protlength = (geneposdict[i][1] - geneposdict[i][0]) / 3
|
|
1271 protlengthdict[i] = protlength
|
|
1272 if protlength > longestprot:
|
|
1273 longestprot = protlength
|
|
1274 z = 1
|
|
1275 w = 0
|
|
1276 ksnr = 1
|
|
1277 atnr = 1
|
|
1278 dhnr = 1
|
|
1279 krnr = 1
|
|
1280 ernr = 1
|
|
1281 acpnr = 1
|
|
1282 cnr = 1
|
|
1283 enr = 1
|
|
1284 anr = 1
|
|
1285 pcpnr = 1
|
|
1286 tenr = 1
|
|
1287 othernr = 1
|
|
1288 for i in pksnrpsprots:
|
|
1289 domains = pksnrpsdomains[i][0]
|
|
1290 domainsdict = pksnrpsdomains[i][1]
|
|
1291 protlength = protlengthdict[i]
|
|
1292 group.addElement(oh.createLine(10,(125 + z * 60 ),10 + ((float(protlength) / float(longestprot)) * (screenwidth * 0.75)),(125 + z * 60 ), strokewidth = 1, stroke = "grey"))
|
|
1293 s.addElement(group)
|
|
1294 try:
|
|
1295 aa2pixelratio = longestprot * 0.75 / screenwidth
|
|
1296 except:
|
|
1297 aa2pixelratio = 0.1
|
|
1298 #print 'logestprot', longestprot
|
|
1299 #print 'scrennwidth', screenwidth
|
|
1300 #print aa2pixelratio
|
|
1301 myStyle = StyleBuilder()
|
|
1302 myStyle.setFontFamily(fontfamily="MS Reference Sans Serif")
|
|
1303 myStyle.setFontWeight(fontweight='bold')
|
|
1304 myStyle.setFontSize('12px')
|
|
1305 for j in domains:
|
|
1306 startpos = domainsdict[j][0]
|
|
1307 endpos = domainsdict[j][1]
|
|
1308 if "PKS_KS" in j:
|
|
1309 c = LinearGradient("#08B208","#81F781","KS_domain"+str(qclusternr) + "_" + str(ksnr))
|
|
1310 d = LinearGradient("#81F781","#08B208","KS_line"+str(qclusternr) + "_" + str(ksnr))
|
|
1311 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#KS_line' + str(qclusternr) + "_" + str(ksnr) + ")",fill="url(#KS_domain" + str(qclusternr) + "_" + str(ksnr) + ")")
|
|
1312 f = text("KS",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A2A0A')
|
|
1313 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1314 myStyle.setFontSize('8px')
|
|
1315 f = text("KS",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#3B0B0B')
|
|
1316 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1317 f = "notext"
|
|
1318 if f != "notext":
|
|
1319 f.set_style(myStyle.getStyle())
|
|
1320 myStyle.setFontSize('12px')
|
|
1321 group = g()
|
|
1322 group.addElement(c)
|
|
1323 group.addElement(d)
|
|
1324 group.addElement(e)
|
|
1325 if f != "notext":
|
|
1326 group.addElement(f)
|
|
1327 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1328 s.addElement(group)
|
|
1329 ksnr += 1
|
|
1330 elif "PKS_AT" in j:
|
|
1331 c = LinearGradient("#DC0404","#F78181","AT_domain"+str(qclusternr) + "_" + str(atnr))
|
|
1332 d = LinearGradient("#F78181","#DC0404","AT_line"+str(qclusternr) + "_" + str(atnr))
|
|
1333 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#AT_line' + str(qclusternr) + "_" + str(atnr) + ")",fill="url(#AT_domain" + str(qclusternr) + "_" + str(atnr) + ")")
|
|
1334 f = text("AT",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#2A1B0A')
|
|
1335 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1336 myStyle.setFontSize('8px')
|
|
1337 f = text("AT",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#2A1B0A')
|
|
1338 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1339 f = "notext"
|
|
1340 if f != "notext":
|
|
1341 f.set_style(myStyle.getStyle())
|
|
1342 myStyle.setFontSize('12px')
|
|
1343 group = g()
|
|
1344 group.addElement(c)
|
|
1345 group.addElement(d)
|
|
1346 group.addElement(e)
|
|
1347 if f != "notext":
|
|
1348 group.addElement(f)
|
|
1349 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1350 s.addElement(group)
|
|
1351 atnr += 1
|
|
1352 elif "PKS_DH" in j:
|
|
1353 c = LinearGradient("#B45F04","#F7BE81","DH_domain"+str(qclusternr) + "_" + str(dhnr))
|
|
1354 d = LinearGradient("#F7BE81","#B45F04","DH_line"+str(qclusternr) + "_" + str(dhnr))
|
|
1355 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#DH_line' + str(qclusternr) + "_" + str(dhnr) + ")",fill="url(#DH_domain" + str(qclusternr) + "_" + str(dhnr) + ")")
|
|
1356 f = text("DH",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#3B0B0B')
|
|
1357 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1358 myStyle.setFontSize('8px')
|
|
1359 f = text("DH",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#3B0B0B')
|
|
1360 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1361 f = "notext"
|
|
1362 if f != "notext":
|
|
1363 f.set_style(myStyle.getStyle())
|
|
1364 myStyle.setFontSize('12px')
|
|
1365 group = g()
|
|
1366 group.addElement(c)
|
|
1367 group.addElement(d)
|
|
1368 group.addElement(e)
|
|
1369 if f != "notext":
|
|
1370 group.addElement(f)
|
|
1371 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1372 s.addElement(group)
|
|
1373 dhnr += 1
|
|
1374 elif "PKS_KR" in j:
|
|
1375 c = LinearGradient("#089E4B","#81F781","KR_domain"+str(qclusternr) + "_" + str(krnr))
|
|
1376 d = LinearGradient("#81F781","#089E4B","KR_line"+str(qclusternr) + "_" + str(krnr))
|
|
1377 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#KR_line' + str(qclusternr) + "_" + str(krnr) + ")",fill="url(#KR_domain" + str(qclusternr) + "_" + str(krnr) + ")")
|
|
1378 f = text("KR",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A2A1B')
|
|
1379 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1380 myStyle.setFontSize('8px')
|
|
1381 f = text("KR",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A2A1B')
|
|
1382 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1383 f = "notext"
|
|
1384 if f != "notext":
|
|
1385 f.set_style(myStyle.getStyle())
|
|
1386 myStyle.setFontSize('12px')
|
|
1387 group = g()
|
|
1388 group.addElement(c)
|
|
1389 group.addElement(d)
|
|
1390 group.addElement(e)
|
|
1391 if f != "notext":
|
|
1392 group.addElement(f)
|
|
1393 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1394 s.addElement(group)
|
|
1395 krnr += 1
|
|
1396 elif "PKS_ER" in j:
|
|
1397 c = LinearGradient("#089E85","#81F7F3","ER_domain"+str(qclusternr) + "_" + str(ernr))
|
|
1398 d = LinearGradient("#81F7F3","#089E85","ER_line"+str(qclusternr) + "_" + str(ernr))
|
|
1399 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#ER_line' + str(qclusternr) + "_" + str(ernr) + ")",fill="url(#ER_domain" + str(qclusternr) + "_" + str(ernr) + ")")
|
|
1400 f = text("ER",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A2A29')
|
|
1401 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1402 myStyle.setFontSize('8px')
|
|
1403 f = text("ER",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A2A29')
|
|
1404 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1405 f = "notext"
|
|
1406 if f != "notext":
|
|
1407 f.set_style(myStyle.getStyle())
|
|
1408 myStyle.setFontSize('12px')
|
|
1409 group = g()
|
|
1410 group.addElement(c)
|
|
1411 group.addElement(d)
|
|
1412 group.addElement(e)
|
|
1413 if f != "notext":
|
|
1414 group.addElement(f)
|
|
1415 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1416 s.addElement(group)
|
|
1417 ernr += 1
|
|
1418 elif "ACP" in j:
|
|
1419 c = LinearGradient("#084BC6","#81BEF7","ACP_domain"+str(qclusternr) + "_" + str(acpnr))
|
|
1420 d = LinearGradient("#81BEF7","#084BC6","ACP_line"+str(qclusternr) + "_" + str(acpnr))
|
|
1421 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#ACP_line' + str(qclusternr) + "_" + str(acpnr) + ")",fill="url(#ACP_domain" + str(qclusternr) + "_" + str(acpnr) + ")")
|
|
1422 f = text("ACP",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A1B2A')
|
|
1423 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1424 myStyle.setFontSize('8px')
|
|
1425 f = text("ACP",((-2 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A1B2A')
|
|
1426 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1427 f = "notext"
|
|
1428 if f != "notext":
|
|
1429 f.set_style(myStyle.getStyle())
|
|
1430 myStyle.setFontSize('12px')
|
|
1431 group = g()
|
|
1432 group.addElement(c)
|
|
1433 group.addElement(d)
|
|
1434 group.addElement(e)
|
|
1435 if f != "notext":
|
|
1436 group.addElement(f)
|
|
1437 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1438 s.addElement(group)
|
|
1439 acpnr += 1
|
|
1440 elif ("C" in j or "Heterocyclization" in j ) and "ACP" not in j and "PCP" not in j and "NRPS-COM" not in j and "CAL" not in j:
|
|
1441 c = LinearGradient("#393989","#8181F7","C_domain"+str(qclusternr) + "_" + str(cnr))
|
|
1442 d = LinearGradient("#8181F7","#393989","C_line"+str(qclusternr) + "_" + str(cnr))
|
|
1443 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#C_line' + str(qclusternr) + "_" + str(cnr) + ")",fill="url(#C_domain" + str(qclusternr) + "_" + str(cnr) + ")")
|
|
1444 f = text("C",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A0A2A')
|
|
1445 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1446 myStyle.setFontSize('8px')
|
|
1447 f = text("C",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A0A2A')
|
|
1448 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1449 f = "notext"
|
|
1450 if f != "notext":
|
|
1451 f.set_style(myStyle.getStyle())
|
|
1452 myStyle.setFontSize('12px')
|
|
1453 group = g()
|
|
1454 group.addElement(c)
|
|
1455 group.addElement(d)
|
|
1456 group.addElement(e)
|
|
1457 if f != "notext":
|
|
1458 group.addElement(f)
|
|
1459 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1460 s.addElement(group)
|
|
1461 cnr += 1
|
|
1462 elif "Epimerization" in j and "ER" not in j and "TE" not in j:
|
|
1463 c = LinearGradient("#393989","#8181F7","E_domain"+str(qclusternr) + "_" + str(enr))
|
|
1464 d = LinearGradient("#8181F7","#393989","E_line"+str(qclusternr) + "_" + str(enr))
|
|
1465 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#E_line' + str(qclusternr) + "_" + str(enr) + ")",fill="url(#E_domain" + str(qclusternr) + "_" + str(enr) + ")")
|
|
1466 f = text("E",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A0A2A')
|
|
1467 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1468 myStyle.setFontSize('8px')
|
|
1469 f = text("E",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A0A2A')
|
|
1470 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1471 f = "notext"
|
|
1472 if f != "notext":
|
|
1473 f.set_style(myStyle.getStyle())
|
|
1474 myStyle.setFontSize('12px')
|
|
1475 group = g()
|
|
1476 group.addElement(c)
|
|
1477 group.addElement(d)
|
|
1478 group.addElement(e)
|
|
1479 if f != "notext":
|
|
1480 group.addElement(f)
|
|
1481 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1482 s.addElement(group)
|
|
1483 enr += 1
|
|
1484 elif ("AMP" in j or "A-OX" in j):
|
|
1485 c = LinearGradient("#56157F","#BE81F7","A_domain"+str(qclusternr) + "_" + str(anr))
|
|
1486 d = LinearGradient("#BE81F7","#56157F","A_line"+str(qclusternr) + "_" + str(anr))
|
|
1487 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#A_line' + str(qclusternr) + "_" + str(anr) + ")",fill="url(#A_domain" + str(qclusternr) + "_" + str(anr) + ")")
|
|
1488 f = text("A",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#1B0A2A')
|
|
1489 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1490 myStyle.setFontSize('8px')
|
|
1491 f = text("A",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#1B0A2A')
|
|
1492 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1493 f = "notext"
|
|
1494 if f != "notext":
|
|
1495 f.set_style(myStyle.getStyle())
|
|
1496 myStyle.setFontSize('12px')
|
|
1497 group = g()
|
|
1498 group.addElement(c)
|
|
1499 group.addElement(d)
|
|
1500 group.addElement(e)
|
|
1501 if f != "notext":
|
|
1502 group.addElement(f)
|
|
1503 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1504 s.addElement(group)
|
|
1505 anr += 1
|
|
1506 elif "PCP" in j:
|
|
1507 c = LinearGradient("#084BC6","#81BEF7","PCP_domain"+str(qclusternr) + "_" + str(pcpnr))
|
|
1508 d = LinearGradient("#81BEF7","#084BC6","PCP_line"+str(qclusternr) + "_" + str(pcpnr))
|
|
1509 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#PCP_line' + str(qclusternr) + "_" + str(pcpnr) + ")",fill="url(#PCP_domain" + str(qclusternr) + "_" + str(pcpnr) + ")")
|
|
1510 f = text("PCP",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A1B2A')
|
|
1511 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1512 myStyle.setFontSize('8px')
|
|
1513 f = text("PCP",((-2 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A1B2A')
|
|
1514 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1515 f = "notext"
|
|
1516 if f != "notext":
|
|
1517 f.set_style(myStyle.getStyle())
|
|
1518 myStyle.setFontSize('12px')
|
|
1519 group = g()
|
|
1520 group.addElement(c)
|
|
1521 group.addElement(d)
|
|
1522 group.addElement(e)
|
|
1523 if f != "notext":
|
|
1524 group.addElement(f)
|
|
1525 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1526 s.addElement(group)
|
|
1527 pcpnr += 1
|
|
1528 elif "Thioesterase" in j or "TD" in j:
|
|
1529 c = LinearGradient("#750072","#F5A9F2","TE_domain"+str(qclusternr) + "_" + str(tenr))
|
|
1530 d = LinearGradient("#F5A9F2","#750072","TE_line"+str(qclusternr) + "_" + str(tenr))
|
|
1531 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#TE_line' + str(qclusternr) + "_" + str(tenr) + ")",fill="url(#TE_domain" + str(qclusternr) + "_" + str(tenr) + ")")
|
|
1532 if "Thioesterase" in j:
|
|
1533 f = text("TE",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#2A0A29')
|
|
1534 else:
|
|
1535 f = text("TD",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#2A0A29')
|
|
1536 if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
|
|
1537 myStyle.setFontSize('8px')
|
|
1538 if "Thioesterase" in j:
|
|
1539 f = text("TE",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#2A0A29')
|
|
1540 else:
|
|
1541 f = text("TD",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#2A0A29')
|
|
1542 elif ((endpos-startpos) / aa2pixelratio) < 20:
|
|
1543 f = "notext"
|
|
1544 if f != "notext":
|
|
1545 f.set_style(myStyle.getStyle())
|
|
1546 myStyle.setFontSize('12px')
|
|
1547 group = g()
|
|
1548 group.addElement(c)
|
|
1549 group.addElement(d)
|
|
1550 group.addElement(e)
|
|
1551 if f != "notext":
|
|
1552 group.addElement(f)
|
|
1553 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1554 s.addElement(group)
|
|
1555 tenr += 1
|
|
1556 else:
|
|
1557 c = LinearGradient("#929292","#DBDBDB","other_domain"+str(qclusternr) + "_" + str(othernr))
|
|
1558 d = LinearGradient("#DBDBDB","#929292","other_line"+str(qclusternr) + "_" + str(othernr))
|
|
1559 e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#other_line' + str(qclusternr) + "_" + str(othernr) + ")",fill="url(#other_domain" + str(qclusternr) + "_" + str(othernr) + ")")
|
|
1560 domname = (((((((((j.replace("0","")).replace("1","")).replace("2","")).replace("3","")).replace("4","")).replace("5","")).replace("6","")).replace("7","")).replace("8","")).replace("9","")
|
|
1561 if len(domname) == 1:
|
|
1562 f = text(domname,((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0B0B0B')
|
|
1563 elif len(domname) == 2:
|
|
1564 f = text(domname,((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0B0B0B')
|
|
1565 elif len(domname) == 3:
|
|
1566 f = text(domname,((-12 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0B0B0B')
|
|
1567 if len(domname) > 3 or ((endpos-startpos) / aa2pixelratio) < 100:
|
|
1568 myStyle.setFontSize('8px')
|
|
1569 f = text(domname,((-16 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0B0B0B')
|
|
1570 if len(domname) > 4 and ((endpos-startpos) / aa2pixelratio) < 100:
|
|
1571 myStyle.setFontSize('6px')
|
|
1572 f = text(domname,((-16 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0B0B0B')
|
|
1573 if ((endpos-startpos) / aa2pixelratio) < 60:
|
|
1574 f = "notext"
|
|
1575 if f != "notext":
|
|
1576 f.set_style(myStyle.getStyle())
|
|
1577 myStyle.setFontSize('12px')
|
|
1578 group = g()
|
|
1579 group.addElement(c)
|
|
1580 group.addElement(d)
|
|
1581 group.addElement(e)
|
|
1582 if f != "notext":
|
|
1583 group.addElement(f)
|
|
1584 group.set_id("b" + str(qclusternr) + "_00%s"%w)
|
|
1585 s.addElement(group)
|
|
1586 othernr += 1
|
|
1587 w += 1
|
|
1588 z += 1
|
|
1589 s.addElement(group)
|
|
1590 return s
|
|
1591
|
|
def calculate_colorgroups(queryclusternumber,hitclusternumbers,queryclusterdata,internalhomologygroupsdict):
    """Assign one color index to every group of genes that belong together.

    Parameters:
      queryclusternumber -- key into queryclusterdata and internalhomologygroupsdict
      hitclusternumbers -- hit clusters whose color groups are taken into account
      queryclusterdata -- queryclusterdata[queryclusternumber][1] is read as the
          per-hit data; entry [1][3] of it is the list of query genes and entry
          [nr][0][nr] the list of color groups (lists of gene names) of hit nr
      internalhomologygroupsdict -- per query cluster, an iterable of gene groups

    Returns a tuple (colorschemedict, rgbcolorscheme): colorschemedict maps a gene
    name to an integer color index, rgbcolorscheme is the list produced by
    generate_rgbscheme() with "#FFFFFF" appended as its last entry.
    """
    hitclusterdata = queryclusterdata[queryclusternumber][1]
    queryclustergenes = hitclusterdata[1][3]
    #One list of color groups per requested hit cluster
    colorgroupslist = [hitclusterdata[nr][0][nr] for nr in hitclusternumbers]
    #Merge, per internal homology group, every color group that shares a gene with it
    metacolorgroups = []
    for internalgroup in internalhomologygroupsdict[queryclusternumber]:
        metagroup = []
        for gene in internalgroup:
            for colorgroups in colorgroupslist:
                for colorgroup in colorgroups:
                    if gene in colorgroup:
                        for member in colorgroup:
                            if member not in metagroup:
                                metagroup.append(member)
        if len(metagroup) > 1 and metagroup not in metacolorgroups:
            metacolorgroups.append(metagroup)
    #Generate RGB scheme; the extra last entry is the color used for uncolored genes
    rgbcolorscheme = generate_rgbscheme(len(metacolorgroups))
    rgbcolorscheme.append("#FFFFFF")
    #Create colorschemedict in which all genes that are hits of the same query gene get the same color
    colorschemedict = {}
    colorindex = 0
    for querygene in queryclustergenes:
        #Tracks whether colorindex was handed out for this query gene; this is
        #equivalent to the former "colorindex in colorschemedict.values()" scan
        #because indices only ever increase.
        index_used = False
        for metagroup in metacolorgroups:
            if querygene in metagroup:
                for member in metagroup:
                    #Genes that already have a color keep it
                    if member not in colorschemedict:
                        colorschemedict[member] = colorindex
                        index_used = True
        if index_used:
            colorindex += 1
    return colorschemedict,rgbcolorscheme
|
|
1638
|
|
def _clusterblast_gene_layout(genes, genedetails, colorschemedict):
    """Collect [starts, ends, strands, colors] for the given genes, in gene order."""
    starts = []
    ends = []
    strands = []
    colors = []
    for gene in genes:
        genedata = genedetails[gene]
        #The larger coordinate is stored in 'starts' and the smaller one in 'ends';
        #the lists are handed to startendsitescheck() by the caller afterwards.
        if genedata[0] > genedata[1]:
            starts.append(genedata[0])
            ends.append(genedata[1])
        else:
            starts.append(genedata[1])
            ends.append(genedata[0])
        strands.append(genedata[2])
        #Genes without an assigned color index are marked "white"
        colors.append(colorschemedict.get(gene, "white"))
    return starts, ends, strands, colors

def clusterblastresults(queryclusternumber,hitclusternumbers,queryclusterdata,colorschemedict,rgbcolorscheme):
    """Draw the query gene cluster and the requested hit clusters into one SVG.

    Parameters:
      queryclusternumber -- key of the query cluster in queryclusterdata
      hitclusternumbers -- list of hit cluster numbers to draw, one row each
      queryclusterdata -- queryclusterdata[queryclusternumber][1] is read as the
          per-hit data: [1][3] / [1][4] are the query genes and their details,
          [m][1] / [m][2] the genes and details of hit m, [m][0][m] its color
          groups; gene details are indexed [0], [1] (coordinates) and [2] (strand)
      colorschemedict -- gene name -> index into rgbcolorscheme
      rgbcolorscheme -- list of colors; the last entry is used for uncolored genes

    Returns [s, [qdata, hdata, strandsbalancedict]] where s is the svg object,
    qdata is [rel_starts, rel_ends, strands, colors] of the query (an empty list
    when hitclusternumbers is empty), hdata maps each hit number to the same
    four lists, and strandsbalancedict maps each hit number to its strand balance.

    Uses the module-level 'screenwidth' and the helpers startendsitescheck(),
    relativepositions() and _gene_arrow() defined elsewhere in this file.
    """
    #Extract data
    hitclusterdata = queryclusterdata[queryclusternumber][1]
    queryclustergenes = hitclusterdata[1][3]
    queryclustergenesdetails = hitclusterdata[1][4]
    colorgroupsdict = {}
    for hitclusternumber in hitclusternumbers:
        colorgroupsdict[hitclusternumber] = hitclusterdata[hitclusternumber][0][hitclusternumber]
    #Find out whether hit gene cluster needs to be inverted compared to query gene cluster
    strandsbalancedict = {}
    for m in hitclusternumbers:
        hitclustergenes = hitclusterdata[m][1]
        hitclustergenesdetails = hitclusterdata[m][2]
        strandsbalance = 0
        for querygene in queryclustergenes:
            refstrand = queryclustergenesdetails[querygene][2]
            for colorgroup in colorgroupsdict[m]:
                if querygene not in colorgroup:
                    continue
                for member in colorgroup:
                    if member not in hitclustergenes:
                        continue
                    #+1 for every hit gene on the same strand as the query gene, -1 otherwise
                    if hitclustergenesdetails[member][2] == refstrand:
                        strandsbalance += 1
                    else:
                        strandsbalance -= 1
        strandsbalancedict[m] = strandsbalance
    #Generate coordinates for SVG figure
    qstarts, qends, qstrands, qcolors = _clusterblast_gene_layout(queryclustergenes, queryclustergenesdetails, colorschemedict)
    qstarts_ends = startendsitescheck(qstarts,qends)
    qstarts = qstarts_ends[0]
    qends = qstarts_ends[1]
    hdata = {}
    for m in hitclusternumbers:
        hstarts, hends, hstrands, hcolors = _clusterblast_gene_layout(hitclusterdata[m][1], hitclusterdata[m][2], colorschemedict)
        #Invert gene cluster if needed
        if strandsbalancedict[m] < 0:
            #Mirror the coordinates around a large constant and reverse the gene order
            hstarts = [str(100000000 - int(pos)) for pos in hstarts]
            hstarts.reverse()
            hends = [str(100000000 - int(pos)) for pos in hends]
            hends.reverse()
            flippedstrands = []
            for strand in hstrands:
                #Strand values other than "+" and "-" are dropped, as before
                if strand == "+":
                    flippedstrands.append("-")
                elif strand == "-":
                    flippedstrands.append("+")
            hstrands = flippedstrands
            hstrands.reverse()
            hcolors.reverse()
        hstarts_ends = startendsitescheck(hstarts,hends)
        hdata[m] = [hstarts_ends[0],hstarts_ends[1],hstrands,hcolors]
    #Find cluster size of largest cluster of query & all hit clusters assessed
    clustersizes = [int(hdata[m][1][-1]) - int(hdata[m][0][0]) for m in hitclusternumbers]
    qclustersize = int(qends[-1]) - int(qstarts[0])
    clustersizes.append(qclustersize)
    largestclustersize = max(clustersizes)

    def centering_offset(clustersize):
        #Pixel shift that centers a cluster of the given size relative to the largest one.
        #'//' keeps the floor division the Python 2 '/' performed on these integers.
        return int(float(float((largestclustersize - clustersize) // 2) / largestclustersize) * screenwidth * 0.75)

    #Find relative positions
    qrelpositions = relativepositions(qstarts,qends,largestclustersize)
    qrel_starts = qrelpositions[0]
    qrel_ends = qrelpositions[1]
    hdata2 = {}
    qdata = []
    for m in hitclusternumbers:
        hclustersize = int(hdata[m][1][-1]) - int(hdata[m][0][0])
        hrelpositions = relativepositions(hdata[m][0],hdata[m][1],largestclustersize)
        hrel_starts = hrelpositions[0]
        hrel_ends = hrelpositions[1]
        #Center-align smallest gene cluster
        if largestclustersize == hclustersize:
            #NOTE(review): the query is shifted once for every hit that has the
            #largest size, so ties shift it repeatedly; kept as in the original.
            shift = centering_offset(qclustersize)
            qrel_starts = [int(pos) + shift for pos in qrel_starts]
            qrel_ends = [int(pos) + shift for pos in qrel_ends]
        else:
            shift = centering_offset(hclustersize)
            hrel_starts = [int(pos) + shift for pos in hrel_starts]
            hrel_ends = [int(pos) + shift for pos in hrel_ends]
        hdata2[m] = [hrel_starts,hrel_ends,hdata[m][2],hdata[m][3]]
        qdata = [qrel_starts,qrel_ends,qstrands,qcolors]
    hdata = hdata2
    s = svg(x = 0, y = 0, width = (screenwidth * 0.75), height = (270 + len(hitclusternumbers) * 50))
    viewbox = "0 0 " + str(screenwidth * 0.8) + " " + str(180 + len(hitclusternumbers) * 50)
    s.set_viewBox(viewbox)
    s.set_preserveAspectRatio("none")
    #Add line behind query gene cluster gene arrows
    oh = ShapeBuilder()
    group = g()
    group.addElement(oh.createLine(10,35,10 + (screenwidth * 0.75),35, strokewidth = 1, stroke = "grey"))
    s.addElement(group)
    #Add query gene cluster gene arrows
    for x in range(len(queryclustergenes)):
        group = g()
        if qcolors[x] == "white":
            fillcolor = rgbcolorscheme[-1]
        else:
            fillcolor = rgbcolorscheme[qcolors[x]]
        group.addElement(_gene_arrow(10 + qrel_starts[x],10 + qrel_ends[x],qstrands[x],fillcolor,40,10))
        if len(hitclusternumbers) == 1:
            group.set_id("q" + str(queryclusternumber) + "_" + str(hitclusternumbers[0]) + "_" + "%s"%x)
        else:
            group.set_id("all_" + str(queryclusternumber) + "_0_" + "%s"%x)
        s.addElement(group)
    for m in hitclusternumbers:
        #Row number of this hit; row 0 is the query cluster
        row = hitclusternumbers.index(m) + 1
        #Add line behind hit gene cluster gene arrows.
        #A fresh group is created here: previously the line was appended to the
        #last gene-arrow group of the preceding row and that group was added to
        #the SVG a second time, duplicating the arrow and its id.
        group = g()
        group.addElement(oh.createLine(10,35 + 50 * row,10 + (screenwidth * 0.75),35 + 50 * row, strokewidth = 1, stroke = "grey"))
        s.addElement(group)
        #Add hit gene cluster gene arrows
        hrel_starts = hdata[m][0]
        hrel_ends = hdata[m][1]
        hstrands = hdata[m][2]
        hcolors = hdata[m][3]
        for x in range(len(hitclusterdata[m][1])):
            group = g()
            if hcolors[x] == "white":
                fillcolor = rgbcolorscheme[-1]
            else:
                fillcolor = rgbcolorscheme[hcolors[x]]
            group.addElement(_gene_arrow(10 + hrel_starts[x],10 + hrel_ends[x],hstrands[x],fillcolor,40 + 50 * row,10))
            if len(hitclusternumbers) == 1:
                group.set_id("h" + str(queryclusternumber) + "_" + str(m) + "_" + "%s"%x)
            else:
                group.set_id("all_" + str(queryclusternumber) + "_" + str(m) + "_" + "%s"%x)
            s.addElement(group)
    return [s,[qdata,hdata,strandsbalancedict]]
|
|
1846
|
|
def runblast(query):
    """Run blastp for the file 'query' against the ClusterBlast protein database.

    The database is located below the module-level 'antismash_path'. Results are
    written in tabular format (-outfmt 6) to a file named after the part of
    'query' that precedes its first dot, with ".out" appended.
    """
    outputprefix = query.split(".")[0]
    blastsearch = "blastp -db %sclusterblast/geneclusterprots.fasta -query %s -outfmt 6 -max_target_seqs 1000 -evalue 1e-05 -out %s.out" % (antismash_path, query, outputprefix)
    os.system(blastsearch)
|
|
1850
|
|
def _smcog_read_alignment(path):
    """Read a FASTA alignment and return (names, seqs) with one joined sequence per header."""
    with open(path, "r") as handle:
        filetext = handle.read()
    names = []
    seqs = []
    for line in filetext.replace("\r", "\n").split("\n"):
        if not line:
            continue
        if line[0] == ">":
            names.append(line[1:])
            seqs.append("")
        elif seqs:
            #Sequence text found before the first header is ignored
            seqs[-1] += line
    return names, seqs

def _smcog_conserved_window(seqs):
    """Return (first, last) slice bounds of the conserved part of the aligned sequences."""
    lenseqs = len(seqs[0])
    nrseqs = len(seqs)
    #One residue -> count table per alignment column
    columns = [dict.fromkeys("ABCDEFGHIJKLMNPQRSTUVWXYZ-", 0) for position in range(lenseqs)]
    for alignedseq in seqs:
        for position, residue in enumerate(alignedseq):
            #.get() also counts symbols missing from the table (e.g. "O") instead of raising KeyError
            columns[position][residue] = columns[position].get(residue, 0) + 1

    def is_shared(column):
        ranked = sortdictkeysbyvaluesrev(column)
        #NOTE(review): the count tested is that of the SECOND most common symbol
        #(ranked[1]), not ranked[0]; kept unchanged - confirm this is intended.
        #'//' keeps the floor division the Python 2 '/' performed on integers.
        return ranked[0] != "-" and column[ranked[1]] > (nrseqs // 3)

    firstsharedaa = 0
    for position, column in enumerate(columns):
        if is_shared(column):
            firstsharedaa = position
            break
    lastsharedaa = lenseqs
    for offset, column in enumerate(reversed(columns)):
        if is_shared(column):
            lastsharedaa = lenseqs - offset
            break
    return firstsharedaa, lastsharedaa

def _smcog_run_with_timeout(command, timeout=300):
    """Run a shell command, kill it after 'timeout' seconds, and return its combined output."""
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    processes_starttime = time.time()
    #poll() is None while the process runs; the former "poll() == 0" test kept
    #waiting for the full timeout whenever the command exited with an error code.
    while process.poll() is None:
        if (time.time() - processes_starttime) > timeout:
            if sys.platform == "win32":
                subprocess.Popen("taskkill /F /T /PID %i" % process.pid, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            else:
                try:
                    os.kill(process.pid, signal.SIGKILL)
                except OSError:
                    #The process ended between poll() and kill(); nothing left to kill
                    pass
            break
        time.sleep(2)
    out, err = process.communicate()
    return out

def smcog_analysis(inputgenes,inputnr,accessiondict,seqdict,smcogdict,smcogsoutputfolder):
    """Build a phylogenetic tree image for every input gene that has an smCOG hit.

    For each gene k in inputgenes the sequence seqdict[k] is written to
    input<inputnr>.fasta. If smcogdict[k] is non-empty, the gene is aligned to
    the smCOG seed alignment with muscle, the alignment is trimmed to its
    conserved window, a tree is computed with FastTree, and TreeGraph renders it
    to <tag>.png, which is copied to ../<smcogsoutputfolder>. Intermediate files
    are removed afterwards.

    Parameters:
      inputgenes -- iterable of gene tags to process
      inputnr -- number used to make the temporary file names unique
      accessiondict -- not used by this function; kept for interface compatibility
      seqdict -- gene tag -> sequence
      smcogdict -- gene tag -> list of hits; smcogdict[k][0][0] is a string whose
          part before the first ":" names the smCOG
      smcogsoutputfolder -- output folder, relative to the parent directory

    Returns None. Relies on writefasta() and sortdictkeysbyvaluesrev() defined
    elsewhere in this file and on the external tools muscle, FastTree and java.
    """
    #Everything that is not Windows is driven with the POSIX commands, so other
    #platform strings than the exact 'linux2' no longer leave commands undefined.
    is_windows = sys.platform == "win32"
    nr = str(inputnr)
    for k in inputgenes:
        tag = k
        #create input.fasta file with single query sequence to be used as input for MSA
        writefasta([tag],[seqdict[k]],"input" + nr + ".fasta")
        if len(smcogdict[k]) == 0:
            continue
        smcog = (smcogdict[k][0][0]).split(":")[0]
        #Align to multiple sequence alignment, output as fasta file
        musclecommand = "muscle -quiet -profile -in1 " + str(smcog).lower() + "_muscle.fasta -in2 input" + nr + ".fasta -out muscle" + nr + ".fasta"
        os.system(musclecommand)
        #Trim alignment: remove all positions before the first and after the last shared position
        names, seqs = _smcog_read_alignment("muscle" + nr + ".fasta")
        firstsharedaa, lastsharedaa = _smcog_conserved_window(seqs)
        seqs = [alignedseq[firstsharedaa:lastsharedaa] for alignedseq in seqs]
        writefasta(names,seqs,"trimmed_alignment" + nr + ".fasta")
        #Draw phylogenetic tree with fasttree 2.1.1
        nwkfile = "tree" + nr + ".nwk"
        if is_windows:
            fasttreecommand = "fasttree -quiet -fastest -noml trimmed_alignment" + nr + ".fasta > " + nwkfile
        else:
            fasttreecommand = "./FastTree -quiet -fastest -noml trimmed_alignment" + nr + ".fasta > " + nwkfile
        os.system(fasttreecommand)
        #Convert tree to XTG and draw PNG image using TreeGraph
        pngfile = tag.split(".")[0] + ".png"
        output = _smcog_run_with_timeout("java -Djava.awt.headless=true -jar TreeGraph.jar -convert tree" + nr + ".nwk -xtg tree" + nr + ".xtg")
        if "exception" not in output and "Exception" not in output:
            output = _smcog_run_with_timeout("java -Djava.awt.headless=true -jar TreeGraph.jar -image tree" + nr + ".xtg " + pngfile)
            if "exception" not in output and "Exception" not in output:
                if is_windows:
                    copycommand = 'copy/y ' + pngfile + ' "..\\' + smcogsoutputfolder + '" > nul'
                else:
                    copycommand = 'cp ' + pngfile + ' "../' + smcogsoutputfolder + '" > /dev/null'
                os.system(copycommand)
        #Remove intermediate files
        if is_windows:
            os.system("del " + pngfile)
            os.system("del tree" + nr + ".xtg")
            os.system("del trimmed_alignment" + nr + ".fasta")
        else:
            os.system("rm " + pngfile)
            os.system("rm tree" + nr + ".xtg")
            os.system("rm trimmed_alignment" + nr + ".fasta")
|
|
1994
|
|
def depict_smile(genecluster,structuresfolder):
    """Render genecluster<N>.smi to PNG images with indigo-depict.

    Two images are requested: genecluster<N>_icon.png (200x150) and
    genecluster<N>.png. If the full-size image shows up in the listing returned
    by getdircontents(), both images are copied to ../<structuresfolder> and
    removed from the working directory.

    Parameters:
      genecluster -- cluster number used in the file names
      structuresfolder -- destination folder, relative to the parent directory

    Returns "success" when the full-size image was produced, otherwise "failed".
    """
    #Everything that is not Windows is driven with the POSIX commands; the former
    #exact comparison with 'linux2' left the commands undefined on other
    #platform strings and ended in an UnboundLocalError.
    is_windows = sys.platform == "win32"
    basename = "genecluster" + str(genecluster)
    if is_windows:
        indigo = "indigo-depict "
    else:
        indigo = "./indigo-depict "
    os.system(indigo + basename + ".smi " + basename + "_icon.png -query -w 200 -h 150")
    os.system(indigo + basename + ".smi " + basename + ".png -query")
    if (basename + ".png") not in getdircontents():
        return "failed"
    if is_windows:
        structuresfolder = structuresfolder.replace("/","\\")
        os.system("copy/y " + basename + ".png ..\\" + structuresfolder + ' > nul')
        os.system("copy/y " + basename + "_icon.png ..\\" + structuresfolder + ' > nul')
        os.system("del " + basename + ".png")
        os.system("del " + basename + "_icon.png")
        os.system("del " + basename + ".smi")
    else:
        os.system("cp " + basename + ".png ../" + structuresfolder)
        os.system("cp " + basename + "_icon.png ../" + structuresfolder)
        os.system("rm " + basename + ".png")
        os.system("rm " + basename + "_icon.png")
        #NOTE(review): unlike the Windows branch, the .smi file is not removed
        #here. The original built an "rm ... .smi" command but never ran it;
        #that behavior is kept - confirm whether the file should be deleted.
    return "success"
|
|
2032
|
|
2033 ##Core script
|
|
2034 import os
|
|
2035 from os import system
|
|
2036 import sys
|
|
2037 import multiprocessing
|
|
2038 import time
|
|
2039 from multiprocessing import Process, freeze_support
|
|
2040 import random
|
|
2041 import string
|
|
2042 import itertools
|
|
2043 from pysvg.filter import *
|
|
2044 from pysvg.gradient import *
|
|
2045 from pysvg.linking import *
|
|
2046 from pysvg.script import *
|
|
2047 from pysvg.shape import *
|
|
2048 from pysvg.structure import *
|
|
2049 from pysvg.style import *
|
|
2050 from pysvg.text import *
|
|
2051 from pysvg.builders import *
|
|
2052 from string import ascii_letters
|
|
2053 from pyExcelerator import *
|
|
2054 from pyExcelerator.Workbook import *
|
|
2055 import signal
|
|
2056 import subprocess
|
|
2057 starttime = time.time()
|
|
2058
|
|
2059 os.environ['NRPS2BASEDIR'] = os.path.join(os.getcwd(), 'NRPSPredictor2')
|
|
2060
|
|
2061 #Fix sys.argv input
|
|
2062 options = []
|
|
2063 for i in sys.argv:
|
|
2064 if i.count('"') > 1:
|
|
2065 j = i.split(' ')
|
|
2066 for k in j:
|
|
2067 if k[0] == '"':
|
|
2068 k = k + '"'
|
|
2069 elif k[-1] == '"':
|
|
2070 k = '"' + k
|
|
2071 options.append(k)
|
|
2072 else:
|
|
2073 options.append(i)
|
|
2074 sys.argv = options
|
|
2075 #Redirect stdout and stderr if GUI-executed
|
|
2076 if "--gui" in sys.argv and len(sys.argv) < (sys.argv.index("--gui") + 2):
|
|
2077 print >> sys.stderr, "Invalid options input: --gui without n or y"
|
|
2078 print "From the command line, input antismash --help for more information."
|
|
2079 logfile = open("antismash.log","w")
|
|
2080 logfile.write("Invalid options input: --gui without n or y\n")
|
|
2081 logfile.close()
|
|
2082 sys.exit(1)
|
|
2083 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
|
|
2084 stdoutfile = open("stdout.txt","w")
|
|
2085 sys.stdout = stdoutfile
|
|
2086 sys.stderr = stdoutfile
|
|
2087
|
|
2088 if __name__ == '__main__':
|
|
2089 import shutil
|
|
2090 hmmsearch_path = 'hmmsearch'
|
|
2091 hmmscan_path = 'hmmscan'
|
|
2092 antismash_path = '/home/galaxy/bin/antismash-1.1.0/'
|
|
2093 hmms_path = antismash_path + '/hmms/'
|
|
2094 shutil.copytree(antismash_path + '/NRPSPredictor2/', './NRPSPredictor2/')
|
|
2095 shutil.copytree(antismash_path + '/Minowa/', './Minowa/')
|
|
2096 shutil.copytree(antismash_path + '/pkssignatures/', './pkssignatures/')
|
|
2097 shutil.copytree(antismash_path + '/kr_analysis/', './kr_analysis/')
|
|
2098 shutil.copytree(antismash_path + '/docking_analysis/', './docking_analysis/')
|
|
2099 shutil.copytree(antismash_path + '/NRPeditor/', './NRPeditor/')
|
|
2100 shutil.copy(antismash_path + '/search_form.html', './')
|
|
2101 shutil.copy(antismash_path + '/empty.xhtml', './')
|
|
2102 shutil.copytree(antismash_path + '/vis/', './vis/')
|
|
2103 shutil.copytree(antismash_path + '/smcogtree/', './smcogtree/')
|
|
2104
|
|
2105 # add freeze support
|
|
2106 freeze_support()
|
|
2107
|
|
2108 #Open logfile
|
|
2109 logfile = open("antismash.log","w")
|
|
2110
|
|
2111 #Identify screen width
|
|
2112 if sys.platform == ('win32'):
|
|
2113 import ctypes
|
|
2114 user32 = ctypes.windll.user32
|
|
2115 screenwidth = user32.GetSystemMetrics(0)
|
|
2116 if sys.platform == ('linux2'):
|
|
2117 screenwidth = 1024
|
|
2118 # res = os.popen("xrandr | grep \* | cut -d' ' -f4") ###FOR SERVER USE###
|
|
2119 # res = res.read() ###FOR SERVER USE###
|
|
2120 # screenwidth = int(res.split("x")[0]) ###FOR SERVER USE###
|
|
2121 if screenwidth < 1024:
|
|
2122 screenwidth = 1024
|
|
2123 #temporary for testing
|
|
2124 screenwidth = 1024
|
|
2125
|
|
2126
|
|
2127 #Reads input
|
|
2128 inputinstructions = "antiSMASH 1.1.0 arguments:\n\nUsage: antismash <query fasta/embl/gbk file> [options]\n\nOptions (x is an integer number, list x,y,z is a list of integer numbers separated by commas):\n\n--gtransl <x> : GenBank translation table used for Glimmer (only for FASTA inputs, default: 1)\n1. The Standard Code\n2. The Vertebrate Mitochondrial Code\n3. The Yeast Mitochondrial Code\n4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code\n5. The Invertebrate Mitochondrial Code\n6. The Ciliate, Dasycladacean and Hexamita Nuclear Code\n9. The Echinoderm and Flatworm Mitochondrial Code\n10. The Euplotid Nuclear Code\n11. The Bacterial, Archaeal and Plant Plastid Code\n12. The Alternative Yeast Nuclear Code\n13. The Ascidian Mitochondrial Code\n14. The Alternative Flatworm Mitochondrial Code\n15. Blepharisma Nuclear Code\n16. Chlorophycean Mitochondrial Code\n21. Trematode Mitochondrial Code\n22. Scenedesmus Obliquus Mitochondrial Code\n23. 
Thraustochytrium Mitochondrial Code\n--genomeconf <l/c> : Genome configuration used for Glimmer: linear / circular (only for FASTA inputs, default: l)\n--minglength <x> : Glimmer minimal gene length (range 30-120, only for FASTA inputs, default: 90)\n--taxon <p/e> : Taxonomy: prokaryotic / eukaryotic (default: p)\n--cores <x> : Number of parallel CPUs to use for threading (default: all)\n--clusterblast <y/n> : Include ClusterBlast gene cluster comparison analysis (default:y)\n--smcogs <y/n> : Include smCOG analysis for functional prediction of genes (default:y)\n--fullblast <y/n> : Include genome-wide BLAST analysis (default:n)\n--fullhmm <y/n> : Include genome-wide PFAM HMM analysis (default:n)\n--blastdbpath <path> : Specify folder containing CLUSEAN blast database (default:clusean/db)\n--pfamdbpath <path> : Specify folder containing PFAM database (default:clusean/db)\n--geneclustertypes <x,y,z> : Gene cluster types to scan for (default:1):\n1 = all\n2 = type I polyketide synthases\n3 = type II polyketide synthases\n4 = type III polyketide synthases\n5 = nonribosomal peptide synthetases\n6 = terpene synthases\n7 = lantibiotics\n8 = bacteriocins\n9 = beta-lactams\n10 = aminoglycosides / aminocyclitols\n11 = aminocoumarins\n12 = siderophores\n13 = ectoines\n14 = butyrolactones\n15 = indoles\n16 = nucleosides\n17 = phosphoglycolipids\n18 = melanins\n19 = others\n--help : this help screen\n"
|
|
2129 #Check input file format
|
|
2130 if len(sys.argv) < 2 or len(sys.argv[1]) < 1:
|
|
2131 print >> sys.stderr, "Please supply valid name for input file."
|
|
2132 print "Usage: antismash <query fasta/embl/gbk file> [options]"
|
|
2133 print "From the command line, input antismash --help for more information."
|
|
2134 logfile.write("Input format error. Please supply valid name for infile.\n")
|
|
2135 logfile.write("Usage: antismash <query fasta/embl/gbk file> [options]\n")
|
|
2136 logfile.write("From the command line, input antismash --help for more information.\n")
|
|
2137 logfile.close()
|
|
2138 sys.exit(1)
|
|
2139 if sys.argv[1] != "--help":
|
|
2140 if len(sys.argv[1].split(".")) < 2 or (sys.argv[1].split(".")[-1] != "embl" and sys.argv[1].split(".")[-1] != "EMBL" and sys.argv[1].split(".")[-1] != "emb" and sys.argv[1].split(".")[-1] != "EMB" and sys.argv[1].split(".")[-1] != "genbank" and sys.argv[1].split(".")[-1] != "GENBANK" and sys.argv[1].split(".")[-1] != "gbk" and sys.argv[1].split(".")[-1] != "GBK" and sys.argv[1].split(".")[-1] != "gb" and sys.argv[1].split(".")[-1] != "GB" and sys.argv[1].split(".")[-1] != "fasta" and sys.argv[1].split(".")[-1] != "FASTA" and sys.argv[1].split(".")[-1] != "fas" and sys.argv[1].split(".")[-1] != "FAS" and sys.argv[1].split(".")[-1] != "fa" and sys.argv[1].split(".")[-1] != "FA"):
|
|
2141 print >> sys.stderr, "No EMBL/GBK/FASTA file submitted as input. Please supply a valid file with .embl / .gbk / .fasta extension. "
|
|
2142 print "Usage: antismash <query fasta/embl/gbk file> [options]"
|
|
2143 print "From the command line, input antismash --help for more information."
|
|
2144 logfile.write("Input format error. Please supply a valid file with .embl / .gbk / .fasta extension.\n")
|
|
2145 logfile.write("Usage: antismash <query fasta/embl/gbk file> [options]\n")
|
|
2146 logfile.write("From the command line, input antismash --help for more information.\n")
|
|
2147 logfile.close()
|
|
2148 sys.exit(1)
|
|
2149 #Define input filename and make fixes if necessary
|
|
2150 infile = sys.argv[1]
|
|
2151 try:
|
|
2152 testfile = open(infile,"r").read()
|
|
2153 except(IOError):
|
|
2154 print >> sys.stderr, "Please supply valid name for input file."
|
|
2155 print "Usage: antismash <query fasta/embl/gbk file> [options]"
|
|
2156 print "From the command line, input antismash --help for more information."
|
|
2157 logfile = open("antismash.log","w")
|
|
2158 logfile.write("Input format error. Please supply valid name for infile.\n")
|
|
2159 logfile.write("Usage: antismash <query fasta/embl/gbk file> [options]\n")
|
|
2160 logfile.write("From the command line, input antismash --help for more information.\n")
|
|
2161 logfile.close()
|
|
2162 sys.exit(1)
|
|
#Parse absolute paths if found
# When the input name contains a directory part, split it into `originpath`
# (everything up to and including the last slash or backslash) and the bare
# file name, and copy the file into the current working directory so the rest
# of the run can refer to it by bare name.
#
# The copy used to be a shell command assembled from the raw, user-supplied
# path (unquoted on Linux), which broke on paths containing spaces or shell
# metacharacters and did nothing on platforms other than win32/linux2.
# shutil.copy performs the same copy without going through a shell.
absolutepath = "n"
if "/" in infile or "\\" in infile:
    import shutil
    absolutepath = "y"
    lastpos = max(infile.rfind("\\"), infile.rfind("/"))
    originpath = infile[:lastpos + 1]
    infile = infile[lastpos + 1:]
    _source_path = originpath + infile
    # A path such as "./genome.gbk" already points at the working copy;
    # copying a file onto itself is an error for shutil, so skip it.
    if os.path.abspath(_source_path) != os.path.abspath(infile):
        try:
            shutil.copy(_source_path, infile)
        except (IOError, OSError, shutil.Error):
            # The shell-based copy never stopped the run on failure either;
            # keep going, but leave a trace on stderr.
            sys.stderr.write("Warning: could not copy " + _source_path + " to the working directory.\n")
|
|
# The genome name is the input file name without directory and extension.
# (An older implementation that stripped punctuation character by character
# was commented out here in favour of the os.path call below.)
genomename = os.path.splitext(os.path.basename(infile))[0]
# Linux only: when the genome name differs from the second-to-last
# dot-separated piece of the file name (e.g. a name with more than one dot),
# the file is copied to "<genomename>.<extension>" through the shell.
if sys.platform == ('linux2') and genomename != infile.split(".")[-2]:
    # Backslash-escape shell metacharacters in the source name for `cp`.
    oldinfile = infile
    for _special in "()*&!${}|`'\"?":
        oldinfile = oldinfile.replace(_special, "\\" + _special)
    infile = genomename + "." + infile.split(".")[-1]
    # Drop anything up to a remaining path separator from the genome name.
    if "/" in genomename:
        genomename = genomename.rpartition("/")[2]
    if "\\" in genomename:
        genomename = genomename.rpartition("\\")[2]
    os.system("cp %s %s" % (oldinfile, infile))
|
|
# Pick an output folder name (genomename) that is not already in use: the
# names of existing entries are obtained from shell directory listings,
# flattened into the list `parts`, and a numeric "_<n>" suffix is appended to
# genomename and incremented for as long as the candidate appears in `parts`.
# NOTE(review): indentation is not preserved in this listing, so the nesting of
# the statements below (in particular of the final replace("__","_")) cannot
# be read off here -- confirm against the runnable script.
2196 #Define outputfolder
|
|
# With an absolute input path two listings are combined: one taken without an
# argument and one of originpath.
2197 if absolutepath == "y":
|
|
# Listings are only taken when sys.platform is 'win32' or 'linux2'; no listing
# variable is assigned here for any other value.
2198 if sys.platform == ('win32'):
|
|
2199 dir1 = os.popen("dir/w/ad " + originpath)
|
|
2200 dir2 = os.popen("dir/w/ad")
|
|
2201 dir1 = dir1.read()
|
|
2202 dir2 = dir2.read()
|
|
2203 if sys.platform == ('linux2'):
|
|
2204 dir1 = os.popen("ls")
|
|
2205 dir2 = os.popen("ls " + originpath)
|
|
2206 dir1 = dir1.read()
|
|
2207 dir2 = dir2.read()
|
|
2208 parts = dir1.split(" ") + dir2.split(" ")
|
|
2209 else:
|
|
2210 if sys.platform == ('win32'):
|
|
2211 dir = os.popen("dir/w/ad")
|
|
2212 dir = dir.read()
|
|
2213 if sys.platform == ('linux2'):
|
|
2214 dir = os.popen("ls")
|
|
2215 dir = dir.read()
|
|
2216 parts = dir.split(" ")
|
|
# Split every space-separated chunk on newlines as well and remove any "[" and
# "]" characters, so that `parts` ends up as a flat list of plain names.
# NOTE(review): names that themselves contain spaces are split apart here.
2217 parts2 = []
|
|
2218 for i in parts:
|
|
2219 partparts = i.split("\n")
|
|
2220 for i in partparts:
|
|
2221 i = i.replace("[","")
|
|
2222 i = i.replace("]","")
|
|
2223 parts2.append(i)
|
|
2224 parts = parts2
|
|
# Remember the unsuffixed name, then start the suffix at _0 if it is taken.
2225 oldgenomename = genomename
|
|
2226 if genomename in parts:
|
|
2227 genomename = genomename + "_" + str(0)
|
|
2228 while genomename in parts:
|
|
# Check whether the text after the last underscore consists of digits only.
2229 finalpart = genomename.split("_")[-1]
|
|
2230 allnumbers = "y"
|
|
2231 for i in finalpart:
|
|
2232 if i not in ["0","1","2","3","4","5","6","7","8","9"]:
|
|
2233 allnumbers = "n"
|
|
# Only suffixes 0..999 are incremented: the name is rebuilt from its
# underscore-separated pieces with the last piece replaced by suffix + 1.
# NOTE(review): for any other suffix this branch leaves genomename unchanged --
# check that the while loop still terminates in that case.
2234 if allnumbers == "y" and int(finalpart) in range(0,1000):
|
|
2235 newgenomename = ""
|
|
2236 for i in genomename.split("_")[:-1]:
|
|
2237 newgenomename = newgenomename + "_" + i
|
|
2238 newgenomename = newgenomename + "_" + str(int(finalpart) + 1)
|
|
# [1:] drops the leading underscore introduced by the rebuild loop above.
2239 genomename = newgenomename[1:]
|
|
2240 genomename = genomename.replace("__","_")
|
|
#Output results folder name for output checking by GUI
# resultsfolder.txt receives the path of the results folder: the current
# working directory joined to the genome name (no trailing newline).
with open("resultsfolder.txt", "w") as resultslocfile:
    resultslocfile.write(os.sep.join([os.getcwd(), genomename]))
|
|
#Implement defaults
# Values used when the corresponding command-line option is not supplied.
# Numeric settings are kept as strings because they are later concatenated
# into text; `cores` is the string "all" until an explicit number is given.
glimmertransl_table = "1"
genomeconf = "l"
minglength = "90"
cores = "all"
taxon = "p"
clusterblast = "y"
smcogs = "y"
fullblast = "n"
fullhmm = "n"
# Database locations default to clusean/db below the working directory; on
# Windows the path is wrapped in double quotes. No default is set for any
# other sys.platform value.
if sys.platform == ('win32'):
    blastdbpath = '"%s/clusean/db"' % os.getcwd()
    pfamdbpath = '"%s/clusean/db/"' % os.getcwd()
if sys.platform == ('linux2'):
    blastdbpath = "%s/clusean/db" % os.getcwd()
    pfamdbpath = "%s/clusean/db/" % os.getcwd()
geneclustertypes = [1]
|
|
#Read user-specified options which may override defaults
# Every "--option" on the command line is followed by its value. Each known
# option is validated and overrides the matching default; anything invalid is
# reported through invalidoptions().
#
# Compared with the previous version, values that are not numbers (an empty
# string for --gtransl / --minglength / --cores, or a non-numeric entry such
# as "1,a" for --geneclustertypes) are now reported through invalidoptions()
# instead of ending the run with an uncaught ValueError, and the duplicated
# --blastdbpath / --pfamdbpath handling lives in one helper.

def _is_plain_number(text):
    """Return True when text is non-empty and consists of the digits 0-9 only."""
    return text != "" and all(char in "0123456789" for char in text)


def _resolve_db_path(raw_value):
    """Turn a database path given on the command line into the path to use.

    Relative paths are placed below the current working directory. Returns
    None when sys.platform is neither 'win32' nor 'linux2', in which case the
    caller keeps whatever path it already had.
    """
    if sys.platform == ('win32'):
        # Windows paths are passed around wrapped in double quotes.
        if raw_value[0] != '"':
            quoted = '"' + raw_value + '"'
        else:
            quoted = raw_value
        if ":\\" in quoted:
            return quoted
        # NOTE(review): `quoted` always starts with a double quote at this
        # point, so this test cannot succeed as written -- confirm the intent.
        if quoted[0] == "\\" or quoted[0] == "/":
            return os.getcwd() + quoted
        return os.getcwd() + "\\" + quoted
    if sys.platform == ('linux2'):
        # Accept backslashes as separators; a leading slash means absolute.
        normalised = raw_value.replace("\\", "/")
        if normalised.startswith("/"):
            return normalised
        return os.getcwd() + "/" + normalised
    return None


if len(sys.argv) > 2 or sys.argv[1] == "--help":
    options = sys.argv
    # An option in last position has no value following it.
    if "--" in options[-1] and sys.argv[1] != "--help":
        invalidoptions(options[-1])
    #identify option identifiers
    identifiers = []
    for flag in options:
        if "--" in flag:
            if flag not in identifiers:
                identifiers.append(flag)
            else:
                invalidoptions("No '--' in given options or option given twice.")
    for flag in identifiers:
        if flag != "--help":
            value = options[options.index(flag) + 1].strip()
        if flag == "--gtransl":
            # Translation table: 1-23, excluding 7, 8 and 17-20.
            if not _is_plain_number(value):
                invalidoptions(flag + "input is no number")
            if int(value) in range(1, 24) and int(value) not in (7, 8, 17, 18, 19, 20):
                glimmertransl_table = value
            else:
                invalidoptions(flag)
        elif flag == "--genomeconf":
            if value in ("l", "c"):
                genomeconf = value
            else:
                invalidoptions(flag)
        elif flag == "--minglength":
            if not _is_plain_number(value):
                invalidoptions(flag)
            if int(value) in range(30, 91):
                minglength = value
            else:
                sys.stderr.write("Invalid options input: minimal gene length should be a number between 30-90.\n")
                logfile = open("antismash.log", "w")
                logfile.write("Invalid options input: minimal gene length should be a number between 30-90.\n")
                logfile.close()
                sys.exit(1)
        elif flag == "--cores":
            if not _is_plain_number(value):
                invalidoptions(flag)
            if int(value) in range(1, 1000):
                cores = int(value)
            else:
                invalidoptions(flag)
        elif flag == "--taxon":
            if value in ("p", "e"):
                taxon = value
            else:
                invalidoptions(flag)
        elif flag == "--clusterblast":
            if value in ("y", "n"):
                clusterblast = value
            else:
                invalidoptions(flag)
        elif flag == "--smcogs":
            if value in ("y", "n"):
                smcogs = value
            else:
                invalidoptions(flag)
        elif flag == "--fullblast":
            if value in ("y", "n"):
                fullblast = value
            else:
                invalidoptions(flag)
        elif flag == "--fullhmm":
            if value in ("y", "n"):
                fullhmm = value
            else:
                invalidoptions(flag)
        elif flag == "--glimmer_prediction":
            glimmer_prediction_path = value
        elif flag == "--blastdbpath":
            # The unstripped argument is used for paths, as before.
            _resolved = _resolve_db_path(options[options.index(flag) + 1])
            if _resolved is not None:
                blastdbpath = _resolved
        elif flag == "--pfamdbpath":
            _resolved = _resolve_db_path(options[options.index(flag) + 1])
            if _resolved is not None:
                pfamdbpath = _resolved
        elif flag == "--geneclustertypes":
            # Either a single number 1-19 or a comma-separated list of them.
            if "," not in value and value not in [str(number) for number in range(1, 20)]:
                invalidoptions(flag)
            else:
                types = value.split(",")
                types2 = []
                if "1" in types:
                    types2 = [1]
                for j in types:
                    try:
                        _type_number = int(j)
                    except ValueError:
                        # Not a number at all: falls into the invalid branch below.
                        _type_number = None
                    if _type_number not in range(1, 20):
                        invalidoptions(flag)
                    else:
                        types2.append(_type_number)
                geneclustertypes = types2
        elif flag == "--help":
            print(inputinstructions)
            sys.exit()
        elif flag == "--gui":
            pass
        else:
            invalidoptions(flag)
|
|
2411
|
|
#Determine number of CPUs used
# Start from the number of CPUs the machine reports (1 if that cannot be
# determined); an explicit, smaller --cores value then lowers it.
try:
    nrcpus = multiprocessing.cpu_count()
except (IOError, OSError, NotImplementedError):
    nrcpus = 1
if cores != "all" and cores < nrcpus:
    nrcpus = cores
|
|
2425
|
|
#Create directory structure needed for file storage
# Folder names, all located below the genome's results folder.
hmmoutputfolder = genomename + "/hmmoutput/"
nrpspksoutputfolder = genomename + "/nrpspks/"
nrpspredictoroutputfolder = nrpspksoutputfolder + "nrpspredictor/"
minowanrpsoutputfolder = nrpspksoutputfolder + "minowanrpspred/"
minowapksoutputfolder = nrpspksoutputfolder + "minowapkspred/"
minowacaloutputfolder = nrpspksoutputfolder + "minowacalpred/"
pkssignatureoutputfolder = nrpspksoutputfolder + "pkssignatures/"
kranalysisoutputfolder = nrpspksoutputfolder + "kr_analysis/"
clusterblastoutputfolder = genomename + "/clusterblast/"
smcogsoutputfolder = genomename + "/smcogs/"
substrspecsfolder = genomename + "/substrspecs/"
structuresfolder = genomename + "/structures/"
svgfolder = genomename + "/svg/"
searchgtrfolder = genomename + "/searchgtr/"
htmlfolder = genomename + "/html/"
imagesfolder = genomename + "/images/"

# Create them parents first. A folder that cannot be created (for instance
# because it already exists) is skipped without stopping the run.
for _folder in (genomename,
                hmmoutputfolder,
                nrpspksoutputfolder,
                nrpspredictoroutputfolder,
                minowanrpsoutputfolder,
                minowapksoutputfolder,
                minowacaloutputfolder,
                pkssignatureoutputfolder,
                kranalysisoutputfolder,
                clusterblastoutputfolder,
                smcogsoutputfolder,
                substrspecsfolder,
                structuresfolder,
                svgfolder,
                searchgtrfolder,
                htmlfolder,
                imagesfolder):
    try:
        os.mkdir(_folder)
    except (IOError, OSError):
        pass
|
|
2511
|
|
#If input is unannotated GBK/EMBL file, convert to FASTA and use that as input
# An EMBL or GenBank file without any CDS feature is reduced to its DNA
# sequence, written to "<name>.fasta", and that FASTA file becomes the input.
#
# Fixes compared with the previous version:
#  * the EMBL branch left `infile` bound to the (closed) file object instead
#    of the FASTA file name, so the extension test that follows could not
#    work; both branches now assign the file name;
#  * the input is read once, through a handle that is closed again.

def _looks_like_dna(candidate):
    """Return True when at least half of the characters are A, C, G or T (any case)."""
    acgt_total = sum(candidate.count(base) for base in "AaCcGgTt")
    return acgt_total >= 0.5 * len(candidate)


with open(infile, "r") as _input_handle:
    _input_text = _input_handle.read()
# NOTE(review): the two marker strings are kept exactly as found; as written
# the second test is implied by the first -- confirm the intended spacing.
if " CDS " not in _input_text and "FT CDS " not in _input_text:
    _extension = infile.split(".")[-1]
    # (label used in messages, text that precedes the sequence, header suffix)
    if _extension in ("embl", "EMBL", "emb", "EMB"):
        _conversion = ("EMBL", "\nSQ", "|")
    elif _extension in ("gbk", "GBK", "gb", "GB", "genbank", "GENBANK"):
        _conversion = ("GBK", "\nORIGIN", "")
    else:
        _conversion = None
    if _conversion is not None:
        _label, _marker, _header_suffix = _conversion
        filetext = _input_text
        if _marker not in filetext:
            _problem = "Exit: " + _label + " file not properly formatted, no sequence found."
            sys.stderr.write(_problem + "\n")
            logfile = open("antismash.log", "w")
            logfile.write(_problem + "\n")
            logfile.close()
            sys.exit(1)
        # Everything after the marker is the sequence section.
        dnaseq = filetext.split(_marker)[1]
        dnaseq = cleandnaseq(dnaseq)
        sequence = dnaseq
        if not _looks_like_dna(sequence):
            sys.stderr.write("Protein " + _label + " file provided. Please provide nucleotide " + _label + " file.\n")
            sys.exit(1)
        _fasta_name = infile.rpartition(".")[0] + ".fasta"
        fastafile = open(_fasta_name, "w")
        fastafile.write(">" + infile.rpartition(".")[0] + _header_suffix + "\n")
        fastafile.write(sequence)
        fastafile.close()
        infile = _fasta_name
|
|
2552 #If input is unannotated fasta file, predict genes with Glimmer and create EMBL file. If input is EMBL or GBK file, read input embl/gbk and create input fasta file, read input protein info into memory
|
|
2553 annotated = "y"
|
|
2554 if infile.split(".")[-1] == "fasta" or infile.split(".")[-1] == "FASTA" or infile.split(".")[-1] == "FAS" or infile.split(".")[-1] == "fas" or infile.split(".")[-1] == "FA" or infile.split(".")[-1] == "fa":
|
|
2555 annotated = "n"
|
|
2556 #Check input file formatting
|
|
2557 sequence = get_sequence(infile)
|
|
2558 if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)):
|
|
2559 print >> sys.stderr, "Protein FASTA file provided. Please provide nucleotide FASTA file."
|
|
2560 sys.exit(1)
|
|
2561 nucleotides = ["A","a","C","c","G","g","T","t","N","n"]
|
|
2562 badsequence = "n"
|
|
2563 sequence_name = open(infile,"r").read().partition(">")[2].partition("\n")[0]
|
|
2564 for i in sequence:
|
|
2565 if i not in nucleotides:
|
|
2566 badsequence = "y"
|
|
2567 if badsequence == "y":
|
|
2568 cleaned_sequence = cleandnaseq(sequence)
|
|
2569 badsequence = "n"
|
|
2570 for i in cleaned_sequence:
|
|
2571 if i not in nucleotides:
|
|
2572 badsequence = "y"
|
|
2573 if badsequence == "n":
|
|
2574 writefasta([sequence_name],[cleaned_sequence],infile.rpartition(".")[0] + "_f.fasta")
|
|
2575 infile = infile.rpartition(".")[0] + "_f.fasta"
|
|
2576 else:
|
|
2577 print >>sys.stderr, "Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file."
|
|
2578 logfile = open("antismash.log","w")
|
|
2579 logfile.write("Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file.\n")
|
|
2580 logfile.close()
|
|
2581 sys.exit(1)
|
|
2582 revseq = reverse_complement(sequence)
|
|
2583 seqlength = len(sequence)
|
|
2584
|
|
2585 #Print Glimmer notification
|
|
2586 #if taxon == "p":
|
|
2587 # print "Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome..."
|
|
2588 #elif taxon == "e":
|
|
2589 # print "Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome..."
|
|
2590 logfile = open("antismash.log","w")
|
|
2591 if taxon == "p":
|
|
2592 logfile.write("Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome...\n")
|
|
2593 elif taxon == "e":
|
|
2594 logfile.write("Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome...\n")
|
|
2595 #logfile.close()
|
|
2596 loginfo = open("antismash.log","r").read()
|
|
2597 #logfile.close()
|
|
2598 #Copying file and changing to folder to prepare for Glimmer3 prediction
|
|
2599 os.mkdir( os.path.join(os.getcwd(), genomename, "geneprediction"))
|
|
2600 if sys.platform == ('win32'):
|
|
2601 os.system("copy/y " + infile + " geneprediction > nul")
|
|
2602 if sys.platform == ('linux2'):
|
|
2603 os.system("cp " + infile + " geneprediction > /dev/null")
|
|
2604
|
|
2605 os.chdir( os.path.join(os.getcwd(), genomename, "geneprediction"))
|
|
2606 fastafile = '../../'+infile
|
|
2607
|
|
2608 #Find DNA sequence length
|
|
2609 seq = get_sequence(fastafile)
|
|
2610 dnaseqlength = len(seq)
|
|
2611 #Run Glimmer for prokaryotic sequences, GlimmerHMM for eukaryotic sequences
|
|
2612 if taxon == "p":
|
|
2613 """
|
|
2614 GlimmerPrediction, not needed since we can predict it in galaxy on our own
|
|
2615 if genomeconf == "l":
|
|
2616 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
|
|
2617 os.popen("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
|
|
2618 else:
|
|
2619 os.system("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
|
|
2620 else:
|
|
2621 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
|
|
2622 os.popen("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
|
|
2623 else:
|
|
2624 os.system("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
|
|
2625 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
|
|
2626 os.popen("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train")
|
|
2627 else:
|
|
2628 os.system("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train")
|
|
2629 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
|
|
2630 os.popen("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train")
|
|
2631 else:
|
|
2632 os.system("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train")
|
|
2633 if genomeconf == "l":
|
|
2634 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
|
|
2635 os.popen("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
|
|
2636 else:
|
|
2637 os.system("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
|
|
2638 else:
|
|
2639 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
|
|
2640 os.popen("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
|
|
2641 else:
|
|
2642 os.system("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
|
|
2643 #Convert glimmer predictions into EMBL with sequence
|
|
2644 glfile = fastafile.rpartition(".")[0] + ".predict"
|
|
2645
|
|
2646 Ende der Glimmer-Prediction
|
|
2647 """
|
|
2648 glfile = glimmer_prediction_path
|
|
2649 emblfile = fastafile.rpartition(".")[0] + ".embl"
|
|
2650 try:
|
|
2651 file = open(glfile,"r")
|
|
2652 filetext = file.read()
|
|
2653 except:
|
|
2654 print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11."
|
|
2655 logfile = open("antismash.log","w")
|
|
2656 logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11.\n")
|
|
2657 logfile.close()
|
|
2658 sys.exit(1)
|
|
2659 if "orf" not in filetext:
|
|
2660 print >> sys.stderr, "Glimmer gene prediction failed: no genes found."
|
|
2661 logfile = open("antismash.log","w")
|
|
2662 logfile.write("Glimmer gene prediction failed: no genes found.\n")
|
|
2663 logfile.close()
|
|
2664 sys.exit(1)
|
|
2665 filetext = filetext.replace("\r","\n")
|
|
2666 lines = filetext.split("\n")
|
|
2667 lines = lines[1:-1]
|
|
2668 orfnames = []
|
|
2669 starts = []
|
|
2670 ends = []
|
|
2671 strands = []
|
|
2672 starts2 = []
|
|
2673 ends2 = []
|
|
2674 firstline = "y"
|
|
2675 for i in lines:
|
|
2676 columns = i.split(" ")
|
|
2677 columns2 = []
|
|
2678 for i in columns:
|
|
2679 if i != "":
|
|
2680 columns2.append(i)
|
|
2681 columns = columns2
|
|
2682 if len(columns) > 3:
|
|
2683 frame = columns[3][0]
|
|
2684 strands.append(frame)
|
|
2685 else:
|
|
2686 frame = ""
|
|
2687 if firstline == "y" and frame == "+" and len(columns) > 3:
|
|
2688 orfname = str(columns[0])
|
|
2689 orfnames.append(orfname)
|
|
2690 if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)):
|
|
2691 gstart = (int(columns[2]) % 3) + 1
|
|
2692 if gstart == 3:
|
|
2693 gstart = 0
|
|
2694 starts.append(str(gstart))
|
|
2695 ends.append(columns[2])
|
|
2696 starts.append(columns[1])
|
|
2697 ends.append(str(dnaseqlength))
|
|
2698 else:
|
|
2699 starts.append(columns[1])
|
|
2700 ends.append(columns[2])
|
|
2701 firstline = "n"
|
|
2702 elif firstline == "y" and frame == "-" and len(columns) > 3:
|
|
2703 orfname = str(columns[0])
|
|
2704 orfnames.append(orfname)
|
|
2705 if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)):
|
|
2706 gstart = (int(columns[2]) % 3) + 1
|
|
2707 if gstart == 3:
|
|
2708 gstart = 0
|
|
2709 starts.append("complement(" + str(gstart))
|
|
2710 ends.append(columns[2] + ")")
|
|
2711 starts.append("complement(" + columns[1])
|
|
2712 ends.append(str(dnaseqlength) + ")")
|
|
2713 else:
|
|
2714 complstart = "complement(" + str(columns[1])
|
|
2715 starts.append(complstart)
|
|
2716 complend = str(columns[2]) + ")"
|
|
2717 ends.append(str(complend))
|
|
2718 firstline = "n"
|
|
2719 elif frame == "+" and len(columns) > 3:
|
|
2720 orfname = str(columns[0])
|
|
2721 orfnames.append(orfname)
|
|
2722 starts.append(columns[1])
|
|
2723 ends.append(columns[2])
|
|
2724 elif frame == "-" and len(columns) > 3:
|
|
2725 orfname = str(columns[0])
|
|
2726 orfnames.append(orfname)
|
|
2727 complstart = "complement(" + str(columns[1])
|
|
2728 starts.append(complstart)
|
|
2729 complend = str(columns[2]) + ")"
|
|
2730 ends.append(str(complend))
|
|
2731 if len(orfnames) == 0:
|
|
2732 print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10."
|
|
2733 logfile = open("antismash.log","w")
|
|
2734 logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10.\n")
|
|
2735 logfile.close()
|
|
2736 sys.exit(1)
|
|
2737 out_file = open(emblfile,"w")
|
|
2738 a = 0
|
|
2739 #print "Writing EMBL file with Glimmer-predicted genes..."
|
|
2740 logfile = open("antismash.log","w")
|
|
2741 logfile.write(loginfo)
|
|
2742 logfile.write("Writing EMBL file with Glimmer-predicted genes...\n")
|
|
2743 #logfile.close()
|
|
2744 loginfo = open("antismash.log","r").read()
|
|
2745 #logfile.close()
|
|
2746 if taxon == "p":
|
|
2747 out_file.write("ID A01; SV 1; linear; DNA; STD; PRO; " + str(dnaseqlength) + " BP.\nXX\n")
|
|
2748 else:
|
|
2749 out_file.write("ID A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n")
|
|
2750 out_file.write("AC A01;\nXX\n")
|
|
2751 out_file.write("DE " + genomename + ";\nXX\n")
|
|
2752 out_file.write("KW none;\nXX\n")
|
|
2753 out_file.write("OS unknown;\n")
|
|
2754 if taxon == "p":
|
|
2755 out_file.write("OC Eubacteria;\nXX\n")
|
|
2756 else:
|
|
2757 out_file.write("OC Fungi;\nXX\n")
|
|
2758 out_file.write("RN [1]\n")
|
|
2759 out_file.write("RT ;\n")
|
|
2760 out_file.write("RL Unknown.\nXX\n")
|
|
2761 out_file.write("FH Key Location/Qualifiers\nFH\n")
|
|
2762 out_file.write("FT source 1.." + str(dnaseqlength) + "\n")
|
|
2763 for i in orfnames:
|
|
2764 out_file.write("FT gene ")
|
|
2765 out_file.write(starts[a])
|
|
2766 out_file.write("..")
|
|
2767 out_file.write(ends[a])
|
|
2768 out_file.write("\n")
|
|
2769 out_file.write('FT /gene="' + i + '"\n')
|
|
2770 out_file.write("FT CDS ")
|
|
2771 out_file.write(starts[a])
|
|
2772 out_file.write("..")
|
|
2773 out_file.write(ends[a])
|
|
2774 out_file.write("\n")
|
|
2775 out_file.write('FT /gene="' + i + '"\n')
|
|
2776 a += 1
|
|
2777 elif taxon == "e":
|
|
2778 """
|
|
2779 GlimmerHMM is executed extern ... in galaxy and will be provided through glimmer_prediction_path
|
|
2780
|
|
2781 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
|
|
2782 os.popen("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g")
|
|
2783 else:
|
|
2784 os.system("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g")
|
|
2785 """
|
|
2786 #Convert glimmerhmm predictions into EMBL with sequence
|
|
2787 #glfile = fastafile.rpartition(".")[0] + ".predict"
|
|
2788 glfile = glimmer_prediction_path
|
|
2789 emblfile = fastafile.rpartition(".")[0] + ".embl"
|
|
2790 try:
|
|
2791 file = open(glfile,"r")
|
|
2792 filetext = file.read().replace("\r","")
|
|
2793 except:
|
|
2794 print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9."
|
|
2795 logfile = open("antismash.log","w")
|
|
2796 logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9.\n")
|
|
2797 logfile.close()
|
|
2798 sys.exit(1)
|
|
2799 if "CDS" not in filetext:
|
|
2800 print >> sys.stderr, "GlimmerHMM gene prediction failed: no genes found."
|
|
2801 logfile = open("antismash.log","w")
|
|
2802 logfile.write("GlimmerHMM gene prediction failed: no genes found.\n")
|
|
2803 logfile.close()
|
|
2804 sys.exit(1)
|
|
2805 filetext = filetext.replace("\r","\n")
|
|
2806 lines = filetext.split("\n")
|
|
2807 lines = lines[2:-1]
|
|
2808 orfnames = []
|
|
2809 positions = []
|
|
2810 firstline = "y"
|
|
2811 x = 0
|
|
2812 orfnr = 0
|
|
2813 starts = []
|
|
2814 ends = []
|
|
2815 for i in lines:
|
|
2816 columns = i.split("\t")
|
|
2817 if len(columns) > 1:
|
|
2818 if x == 0:
|
|
2819 strand = columns[6]
|
|
2820 if "mRNA" not in i:
|
|
2821 starts.append(columns[3])
|
|
2822 ends.append(columns[4])
|
|
2823 elif x == (len(lines) - 1) or "mRNA" in lines[x + 1]:
|
|
2824 strand = columns[6]
|
|
2825 starts.append(columns[3])
|
|
2826 ends.append(columns[4])
|
|
2827 orfnr += 1
|
|
2828 if len(str(orfnr)) == 1:
|
|
2829 orfname = "orf0000" + str(orfnr)
|
|
2830 elif len(str(orfnr)) == 2:
|
|
2831 orfname = "orf000" + str(orfnr)
|
|
2832 elif len(str(orfnr)) == 3:
|
|
2833 orfname = "orf00" + str(orfnr)
|
|
2834 elif len(str(orfnr)) == 4:
|
|
2835 orfname = "orf0" + str(orfnr)
|
|
2836 elif len(str(orfnr)) == 5:
|
|
2837 orfname = "orf" + str(orfnr)
|
|
2838 orfnames.append(orfname)
|
|
2839 if strand == "+":
|
|
2840 if len(starts) == 1:
|
|
2841 pos = starts[0] + ".." + ends[0]
|
|
2842 positions.append(pos)
|
|
2843 else:
|
|
2844 pos = "join("
|
|
2845 y = 0
|
|
2846 for i in starts:
|
|
2847 pos = pos + i + ".." + ends[y]
|
|
2848 if i != starts[-1]:
|
|
2849 pos = pos + ","
|
|
2850 y += 1
|
|
2851 pos = pos + ")"
|
|
2852 positions.append(pos)
|
|
2853 elif strand == "-":
|
|
2854 if len(starts) == 1:
|
|
2855 pos = "complement(" + starts[0] + ".." + ends[0] + ")"
|
|
2856 positions.append(pos)
|
|
2857 else:
|
|
2858 pos = "complement(join("
|
|
2859 y = 0
|
|
2860 for i in starts:
|
|
2861 pos = pos + i + ".." + ends[y]
|
|
2862 if i != starts[-1]:
|
|
2863 pos = pos + ","
|
|
2864 y += 1
|
|
2865 pos = pos + "))"
|
|
2866 positions.append(pos)
|
|
2867 starts = []
|
|
2868 ends = []
|
|
2869 elif "mRNA" not in i:
|
|
2870 starts.append(columns[3])
|
|
2871 ends.append(columns[4])
|
|
2872 x += 1
|
|
2873 if len(orfnames) == 0:
|
|
2874 print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error: 12"
|
|
2875 logfile = open("antismash.log","w")
|
|
2876 logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 12\n")
|
|
2877 logfile.close()
|
|
2878 sys.exit(1)
|
|
2879 out_file = open(emblfile,"w")
|
|
2880 a = 0
|
|
2881 #print "Writing EMBL file with GlimmerHMM-predicted genes..."
|
|
2882 logfile = open("antismash.log","w")
|
|
2883 logfile.write(loginfo)
|
|
2884 logfile.write("Writing EMBL file with GlimmerHMM-predicted genes...\n")
|
|
2885 #logfile.close()
|
|
2886 loginfo = open("antismash.log","r").read()
|
|
2887 #logfile.close()
|
|
2888 out_file.write("ID A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n")
|
|
2889 out_file.write("AC A01;\nXX\n")
|
|
2890 out_file.write("DE " + genomename + ";\nXX\n")
|
|
2891 out_file.write("KW none;\nXX\n")
|
|
2892 out_file.write("OS unknown;\n")
|
|
2893 out_file.write("OC Fungi;\nXX\n")
|
|
2894 out_file.write("RN [1]\n")
|
|
2895 out_file.write("RT ;\n")
|
|
2896 out_file.write("RL Unknown.\nXX\n")
|
|
2897 out_file.write("FH Key Location/Qualifiers\nFH\n")
|
|
2898 out_file.write("FT source 1.." + str(dnaseqlength) + "\n")
|
|
2899 for i in orfnames:
|
|
2900 out_file.write("FT gene ")
|
|
2901 out_file.write(positions[a])
|
|
2902 out_file.write("\n")
|
|
2903 out_file.write('FT /gene="' + i + '"\n')
|
|
2904 out_file.write("FT CDS ")
|
|
2905 out_file.write(positions[a])
|
|
2906 out_file.write("\n")
|
|
2907 out_file.write('FT /gene="' + i + '"\n')
|
|
2908 a += 1
|
|
2909 out_file.write("XX\nSQ Sequence " + str(dnaseqlength) + " BP; " + str(seq.count("a") + seq.count("A")) + " A; " + str(seq.count("c") + seq.count("C")) + " C; " + str(seq.count("g") + seq.count("G")) + " G; " + str(seq.count("t") + seq.count("T")) + " T; " + str(dnaseqlength - (seq.count("a") + seq.count("A") + seq.count("c") + seq.count("C") + seq.count("g") + seq.count("G") + seq.count("t") + seq.count("T"))) + " other;\n")
|
|
2910 seq2 = seq
|
|
2911 out_file.write(" ")
|
|
2912 grouplen=10
|
|
2913 textlen = len(seq)
|
|
2914 end = textlen - (textlen % grouplen)
|
|
2915 repeated_iterator = [iter(itertools.islice(seq, 0, end))] * grouplen
|
|
2916 parts = list(itertools.imap(lambda *chars: ''.join(chars),*repeated_iterator))
|
|
2917 if dnaseqlength%grouplen != 0:
|
|
2918 parts.append(seq[-1 * (dnaseqlength%grouplen):])
|
|
2919 w = 1
|
|
2920 for l in parts:
|
|
2921 out_file.write(l + " ")
|
|
2922 if w == len(parts):
|
|
2923 if w%6 == 0 and dnaseqlength%60 != 0:
|
|
2924 out_file.write((" " * (10 - dnaseqlength%grouplen) + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
|
|
2925 elif dnaseqlength%60 == 0:
|
|
2926 out_file.write((" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
|
|
2927 elif w%6 == 5 and dnaseqlength%grouplen == 0:
|
|
2928 out_file.write((" " + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
|
|
2929 elif dnaseqlength%grouplen != 0:
|
|
2930 out_file.write(" " * (10 - dnaseqlength%grouplen) + " " * (6 - len(parts)%6) + " " * (6 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
|
|
2931 else:
|
|
2932 out_file.write(" " * (6 - len(parts)%6) + " " * (5 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
|
|
2933 elif w%6 == 0:
|
|
2934 out_file.write((" " * (10 - len(str(w * 10)))) + str(w * 10) + "\n ")
|
|
2935 w += 1
|
|
2936 out_file.close()
|
|
2937 os.chdir("../../")
|
|
2938 infile = emblfile[6:]
|
|
2939 emblfile = emblfile[6:]
|
|
2940 if taxon == "p":
|
|
2941 glimmeroutputfolder = genomename + "/glimmer/"
|
|
2942 elif taxon == "e":
|
|
2943 glimmeroutputfolder = genomename + "/glimmerhmm/"
|
|
2944 try:
|
|
2945 os.mkdir(glimmeroutputfolder)
|
|
2946 except(IOError,OSError):
|
|
2947 pass
|
|
2948 proteins = embl2proteins(infile,sequence)
|
|
2949 genomic_accnr = proteins[1]
|
|
2950 dnaseqlength = proteins[2]
|
|
2951 proteins = proteins[0]
|
|
2952 writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
|
|
2953 else:
|
|
2954 #print "Reading embl/gbk file and creating input FASTA file for gene cluster detection..."
|
|
2955 logfile.write("Reading embl/gbk file and creating input FASTA file for gene cluster detection...\n")
|
|
2956 if infile.split(".")[-1] == "embl" or infile.split(".")[-1] == "EMBL" or infile.split(".")[-1] == "emb" or infile.split(".")[-1] == "EMB":
|
|
2957 sequence = ""
|
|
2958 proteins = embl2proteins(infile,sequence)
|
|
2959 genomic_accnr = proteins[1]
|
|
2960 dnaseqlength = proteins[2]
|
|
2961 proteins = proteins[0]
|
|
2962 writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
|
|
2963 elif infile.split(".")[-1] == "gbk" or infile.split(".")[-1] == "GBK" or infile.split(".")[-1] == "gb" or infile.split(".")[-1] == "GB" or infile.split(".")[-1] == "genbank" or infile.split(".")[-1] == "GENBANK":
|
|
2964 proteins = gbk2proteins(infile)
|
|
2965 genomic_accnr = proteins[1]
|
|
2966 dnaseqlength = proteins[2]
|
|
2967 proteins = proteins[0]
|
|
2968 writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
|
|
2969 accessiondict = proteins[4]
|
|
# Index every predicted protein by gene name. Each header in proteins[0] is
# split on "|": field 4 is stored as the name and field 3 as the strand, while
# proteins[1] supplies the sequence found at the same list position.
seqdict = {}
fullnamedict = {}
strandsdict = {}
for z, i in enumerate(proteins[0]):
    headerfields = i.split("|")
    name = headerfields[4]
    seq = proteins[1][z]
    strand = headerfields[3]
    seqdict[name] = seq
    strandsdict[name] = strand
    fullnamedict[name] = i
# Leave the counter at the number of headers processed, exactly where a
# manually incremented counter would end up.
z = len(proteins[0])
|
|
2982
|
|
elapsed = time.time() - starttime

# Run hmmsearch once per profile HMM against <genomename>/genome_proteins.fasta.
# For every model the "-o" output goes to <genomename>/hmmoutput/<model>_output.txt
# and the "--tblout" table to <genomename>/hmmoutput/<model>.txt, where <model>
# is the file name up to its first ".".
logfile.write("Performing HMM search on proteins for detection of signature genes...\n")
hmmslist = [
    "AMP-binding.hmm", "BLS.hmm", "CAS.hmm", "Chal_sti_synt_C.hmm",
    "Chal_sti_synt_N.hmm", "Condensation.hmm", "ene_KS.hmm", "hyb_KS.hmm",
    "itr_KS.hmm", "mod_KS.hmm", "tra_KS.hmm", "LANC_like.hmm", "ATd.hmm",
    "PKS_AT.hmm", "PKS_KS.hmm", "PP-binding.hmm", "t2clf.hmm", "t2ks.hmm",
    "t2ks2.hmm", "Terpene_synth.hmm", "Terpene_synth_C.hmm", "strH_like.hmm",
    "neoL_like.hmm", "DOIS.hmm", "valA_like.hmm", "spcFG_like.hmm",
    "spcDK_like_cou.hmm", "spcDK_like_glyc.hmm", "strK_like1.hmm",
    "strK_like2.hmm", "bt1fas.hmm", "ft1fas.hmm", "t2fas.hmm", "hglD.hmm",
    "hglE.hmm", "fabH.hmm", "AfsA.hmm", "IucA_IucC.hmm", "ectoine_synt.hmm",
    "phytoene_synt.hmm", "Lant_dehyd_N.hmm", "Lant_dehyd_C.hmm",
    "Antimicrobial18.hmm", "Gallidermin.hmm", "L_biotic_typeA.hmm",
    "LE-DUF.hmm", "LE-LAC481.hmm", "LE-LanBC.hmm", "LE-MER+2PEP.hmm",
    "MA-2PEPA.hmm", "MA-DUF.hmm", "MA-EPI.hmm", "MA-LAC481.hmm",
    "MA-NIS+EPI.hmm", "MA-NIS.hmm", "indsynth.hmm", "A-OX.hmm", "LmbU.hmm",
    "MoeO5.hmm", "LipM.hmm", "LipU.hmm", "LipV.hmm", "ToyB.hmm", "TunD.hmm",
    "melC.hmm", "strepbact.hmm", "goadsporin_like.hmm", "Antimicrobial14.hmm",
    "Bacteriocin_IId.hmm", "BacteriocIIc_cy.hmm", "Bacteriocin_II.hmm",
    "Lactococcin.hmm", "Antimicrobial17.hmm", "Lactococcin_972.hmm",
    "Bacteriocin_IIc.hmm", "LcnG-beta.hmm", "Bacteriocin_IIi.hmm",
    "Subtilosin_A.hmm", "Cloacin.hmm", "Neocarzinostat.hmm",
    "Linocin_M18.hmm", "TIGR03603.hmm", "TIGR03604.hmm", "TIGR03605.hmm",
    "TIGR03731.hmm", "TIGR03651.hmm", "TIGR03678.hmm", "TIGR03693.hmm",
    "TIGR03798.hmm", "TIGR03882.hmm", "TIGR03601.hmm", "TIGR03602.hmm",
    "tabtoxin.hmm", "cycdipepsynth.hmm", "cyanobactin_synth.hmm", "fom1.hmm",
    "bcpB.hmm", "frbD.hmm", "mitE.hmm", 'Lycopene_cycl.hmm',
    'terpene_cyclase.hmm', 'NapT7.hmm', 'fung_ggpps.hmm', 'fung_ggpps2.hmm',
    'dmat.hmm', 'trichodiene_synth.hmm', 'novK.hmm', 'novJ.hmm', 'novI.hmm',
    'novH.hmm', 'pur6.hmm', 'pur10.hmm', 'nikJ.hmm', 'nikO.hmm', 'mvnA.hmm',
    'thiostrepton.hmm', 'NAD_binding_4.hmm', 'vlmB.hmm', 'salQ.hmm',
    'prnB.hmm',
]
for i in hmmslist:
    modelname = i.split(".")[0]
    hmmsearch = "".join([
        hmmsearch_path, " ", "--cpu ", str(nrcpus),
        " -o ", genomename, "/hmmoutput/", modelname, "_output.txt",
        " --noali --tblout ", genomename, "/hmmoutput/", modelname, ".txt ",
        hmms_path, i, " ", genomename, "/genome_proteins.fasta",
    ])
    # The exit status is not inspected; a failed search only shows up later
    # when its output file is parsed.
    os.system(hmmsearch)
|
|
logfile.write("Parsing HMM outputs...\n")
# Maps a protein name to a list of [domain description, score] pairs, one
# pair per signature domain that contributed to a classification.
detecteddomainsdict = {}

# Type I PKS: a protein needs a PKS_KS hit and a PKS_AT hit, and its KS score
# must not be beaten by any of the FAS / HglD / HglE / FabH models.
# NOTE(review): an earlier comment gave the AT cut-off as 20, but 50 is what
# is passed to parsehmmoutput for PKS_AT -- confirm which is intended.
t1pksprots = []
transatpksprots = []
if any(clustertype in geneclustertypes for clustertype in (1, 2, 3, 4)):
    ks = parsehmmoutput(50, hmmoutputfolder + "PKS_KS.txt")
    at = parsehmmoutput(50, hmmoutputfolder + "PKS_AT.txt")
    ft1fasks = parsehmmoutput(50, hmmoutputfolder + "ft1fas.txt")
    bt1fasks = parsehmmoutput(50, hmmoutputfolder + "bt1fas.txt")
    hgleks = parsehmmoutput(50, hmmoutputfolder + "hglE.txt")
    hgldks = parsehmmoutput(50, hmmoutputfolder + "hglD.txt")
    fabhks = parsehmmoutput(50, hmmoutputfolder + "fabH.txt")
    # Element 0 of each result is the list of hit proteins, element 1 the
    # matching list of scores.
    pksksprots, pksksscores = ks[0], ks[1]
    pksatprots, pksatscores = at[0], at[1]
    bt1fasprots, bt1fasscores = bt1fasks[0], bt1fasks[1]
    ft1fasprots, ft1fasscores = ft1fasks[0], ft1fasks[1]
    hgleprots, hglescores = hgleks[0], hgleks[1]
    hgldprots, hgldscores = hgldks[0], hgldks[1]
    fabhprots, fabhscores = fabhks[0], fabhks[1]

    # Models whose higher score disqualifies a KS hit, in the order checked.
    rivalhits = (
        (bt1fasprots, bt1fasscores),
        (ft1fasprots, ft1fasscores),
        (hgldprots, hgldscores),
        (hgleprots, hglescores),
        (fabhprots, fabhscores),
    )
    for i in pksksprots:
        score = pksksscores[pksksprots.index(i)]
        exclude = "n"
        for rivalprots, rivalscores in rivalhits:
            if i in rivalprots:
                rivalscore = rivalscores[rivalprots.index(i)]
                if float(score) < float(rivalscore):
                    exclude = "y"
        if exclude == "n" and i in pksatprots:
            t1pksprots.append(i)
            detecteddomainsdict.setdefault(i, []).extend([
                ["PKS ketosynthase domain", score],
                ["PKS acyltransferase domain", pksatscores[pksatprots.index(i)]],
            ])

    # Trans-AT PKS: KS hits that also hit the ATd (cut-off 65) and tra_KS
    # models and were not already called as type I PKS.
    atd = parsehmmoutput(65, hmmoutputfolder + "ATd.txt")
    traks = parsehmmoutput(50, hmmoutputfolder + "tra_KS.txt")
    traksprots = traks[0]
    atdprots = atd[0]
    atdscores = atd[1]
    for i in pksksprots:
        if i not in atdprots or i not in traksprots or i in t1pksprots:
            continue
        transatpksprots.append(i)
        detecteddomainsdict.setdefault(i, []).extend([
            ["PKS ketosynthase domain", pksksscores[pksksprots.index(i)]],
            ["Trans-AT PKS AT-docking domain", atdscores[atdprots.index(i)]],
        ])
|
|
# Type II PKS: a t2ks, t2clf or t2ks2 hit counts only when none of the
# ene/hyb/mod/itr/tra KS models, the bt1fas/ft1fas/t2fas models, hglE or fabH
# scores higher for the same protein, and the protein is not already listed
# as a type I or type II PKS. t2ks2 is parsed with a cut-off of 450, every
# other model with 50.
t2pksprots = []
if any(clustertype in geneclustertypes for clustertype in (1, 2, 3, 4)):
    t2ks = parsehmmoutput(50, hmmoutputfolder + "t2ks.txt")
    t2ks2 = parsehmmoutput(450, hmmoutputfolder + "t2ks2.txt")
    t2clf = parsehmmoutput(50, hmmoutputfolder + "t2clf.txt")
    eneks = parsehmmoutput(50, hmmoutputfolder + "ene_KS.txt")
    hybks = parsehmmoutput(50, hmmoutputfolder + "hyb_KS.txt")
    modks = parsehmmoutput(50, hmmoutputfolder + "mod_KS.txt")
    itrks = parsehmmoutput(50, hmmoutputfolder + "itr_KS.txt")
    traks = parsehmmoutput(50, hmmoutputfolder + "tra_KS.txt")
    t2fasks = parsehmmoutput(50, hmmoutputfolder + "t2fas.txt")
    ft1fasks = parsehmmoutput(50, hmmoutputfolder + "ft1fas.txt")
    bt1fasks = parsehmmoutput(50, hmmoutputfolder + "bt1fas.txt")
    hgleks = parsehmmoutput(50, hmmoutputfolder + "hglE.txt")
    hgldks = parsehmmoutput(50, hmmoutputfolder + "hglD.txt")
    fabhks = parsehmmoutput(50, hmmoutputfolder + "fabH.txt")
    t2ksprots, t2ksscores = t2ks[0], t2ks[1]
    t2ks2prots, t2ks2scores = t2ks2[0], t2ks2[1]
    t2clfprots, t2clfscores = t2clf[0], t2clf[1]
    eneksprots, eneksscores = eneks[0], eneks[1]
    hybksprots, hybksscores = hybks[0], hybks[1]
    modksprots, modksscores = modks[0], modks[1]
    itrksprots, itrksscores = itrks[0], itrks[1]
    traksprots, traksscores = traks[0], traks[1]
    t2fasprots, t2fasscores = t2fasks[0], t2fasks[1]
    bt1fasprots, bt1fasscores = bt1fasks[0], bt1fasks[1]
    ft1fasprots, ft1fasscores = ft1fasks[0], ft1fasks[1]
    hgleprots, hglescores = hgleks[0], hgleks[1]
    hgldprots, hgldscores = hgldks[0], hgldks[1]
    fabhprots, fabhscores = fabhks[0], fabhks[1]

    # Competing models, in the order they are compared against a candidate.
    type2rivals = (
        (eneksprots, eneksscores),
        (hybksprots, hybksscores),
        (modksprots, modksscores),
        (itrksprots, itrksscores),
        (traksprots, traksscores),
        (bt1fasprots, bt1fasscores),
        (ft1fasprots, ft1fasscores),
        (t2fasprots, t2fasscores),
        (hgleprots, hglescores),
        (fabhprots, fabhscores),
    )
    # Candidate hit lists, processed in this order, with the label recorded
    # for an accepted protein.
    type2candidates = (
        (t2ksprots, t2ksscores, "Type II ketosynthase"),
        (t2clfprots, t2clfscores, "Chain length factor"),
        (t2ks2prots, t2ks2scores, "Type II ketosynthase, model 2"),
    )
    for candidateprots, candidatescores, domainlabel in type2candidates:
        for i in candidateprots:
            type2 = "y"
            score = candidatescores[candidateprots.index(i)]
            for rivalprots, rivalscores in type2rivals:
                if i in rivalprots:
                    rivalscore = rivalscores[rivalprots.index(i)]
                    if float(rivalscore) > float(score):
                        type2 = "n"
            # NOTE(review): the domain record is assumed to be written only
            # for accepted proteins -- confirm against the original nesting.
            if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
                t2pksprots.append(i)
                detecteddomainsdict.setdefault(i, []).append([domainlabel, score])
|
|
# Type III PKS: chalcone/stilbene synthase hits (C-terminal model parsed with
# cut-off 35, N-terminal model with 63). C-terminal hits are handled first;
# a protein is taken only if it is not yet listed as type I, II or III PKS.
t3pksprots = []
if any(clustertype in geneclustertypes for clustertype in (1, 2, 3, 4)):
    t3n = parsehmmoutput(63, hmmoutputfolder + "Chal_sti_synt_N.txt")
    t3c = parsehmmoutput(35, hmmoutputfolder + "Chal_sti_synt_C.txt")
    t3nprots, t3nscores = t3n[0], t3n[1]
    t3cprots, t3cscores = t3c[0], t3c[1]
    type3models = (
        (t3cprots, t3cscores, "Chalcone/stilbene synthase,C-terminus"),
        (t3nprots, t3nscores, "Chalcone/stilbene synthase,N-terminus"),
    )
    for t3hits, t3hitscores, t3label in type3models:
        for i in t3hits:
            if i in t3pksprots or i in t1pksprots or i in t2pksprots:
                continue
            t3pksprots.append(i)
            # NOTE(review): assumes the domain record belongs inside the
            # membership test above (so a protein accepted via its C-terminal
            # hit gets no N-terminal record) -- confirm against the original
            # indentation.
            detecteddomainsdict.setdefault(i, []).append(
                [t3label, t3hitscores[t3hits.index(i)]])
|
|
# 'Type IV' (HglE/HglD-like) PKS: an hglE or hglD hit is taken when no FAS or
# FabH model scores higher for the protein and the protein was not already
# called as a type I, II, III or trans-AT PKS. Relies on the hglE/hglD,
# bt1fas/ft1fas and fabH hit lists parsed by the earlier PKS sections.
t4pksprots = []
if any(clustertype in geneclustertypes for clustertype in (1, 2, 3, 4)):
    t2fasks = parsehmmoutput(50, hmmoutputfolder + "t2fas.txt")
    t2fasprots = t2fasks[0]
    t2fasscores = t2fasks[1]
    fasrivals = (
        (bt1fasprots, bt1fasscores),
        (ft1fasprots, ft1fasscores),
        (t2fasprots, t2fasscores),
        (fabhprots, fabhscores),
    )
    # The last field says whether proteins already in t4pksprots are skipped;
    # only the HglD pass performs that check.
    type4models = (
        (hgleprots, hglescores, "Atypical PKS domain, HglE-like", False),
        (hgldprots, hgldscores, "Atypical PKS domain, HglD-like", True),
    )
    for hglprots, hglscores, hgllabel, skiplisted in type4models:
        for i in hglprots:
            type4 = "y"
            score = hglscores[hglprots.index(i)]
            for rivalprots, rivalscores in fasrivals:
                if i in rivalprots:
                    rivalscore = rivalscores[rivalprots.index(i)]
                    if float(rivalscore) > float(score):
                        type4 = "n"
            if i in t1pksprots or i in t2pksprots or i in t3pksprots or i in transatpksprots:
                continue
            if type4 != "y":
                continue
            if skiplisted and i in t4pksprots:
                continue
            t4pksprots.append(i)
            # NOTE(review): assumes the domain record is written only for
            # accepted proteins -- confirm against the original nesting.
            detecteddomainsdict.setdefault(i, []).append([hgllabel, score])
|
|
# NRPS detection. Condensation, AMP-binding and PP-binding hits are parsed
# with a cut-off of 20, A-OX hits with 50. A protein is called NRPS when it
# has a condensation hit plus an adenylation hit (AMP-binding or A-OX), or
# when it is a type I PKS with an adenylation hit. Proteins carrying only an
# adenylation or only a condensation hit are added when their gene midpoint
# lies within 20000 bp of a complementary single-domain protein or of an
# already called NRPS protein.
#
# Changes relative to the previous implementation:
#  * A-OX hits used to be appended to the outer parsehmmoutput result
#    (`amp`) instead of the protein list, so they never took part in the
#    classification and the "integrated oxidase" label was unreachable.
#    They are now merged into a separate combined list; aprots/ascores keep
#    holding the AMP-binding hits only.
#  * A type I PKS protein already called NRPS via its condensation hit is no
#    longer appended (and annotated) a second time.
#  * Single-domain proteins without an entry in genelist are skipped instead
#    of raising KeyError on the position lookup.
# NOTE(review): assumes parsehmmoutput returns [proteins, scores] and that the
# domain bookkeeping is nested under the membership tests -- confirm.
nrpsprots = []
if 1 in geneclustertypes or 5 in geneclustertypes:
    cond = parsehmmoutput(20, hmmoutputfolder + "Condensation.txt")
    amp = parsehmmoutput(20, hmmoutputfolder + "AMP-binding.txt")
    ampox = parsehmmoutput(50, hmmoutputfolder + "A-OX.txt")
    cprots = cond[0]
    cscores = cond[1]
    aprots = amp[0]
    ascores = amp[1]
    ampoxprots = ampox[0]
    ampoxscores = ampox[1]

    # Every protein with any kind of adenylation hit, AMP-binding hits first.
    adenylationprots = list(aprots)
    for i in ampoxprots:
        if i not in adenylationprots:
            adenylationprots.append(i)

    def _adenylation_record(prot):
        """Return the [label, score] record for a protein in adenylationprots."""
        if prot in aprots:
            return ["Adenylation domain", ascores[aprots.index(prot)]]
        return ["Adenylation domain with integrated oxidase",
                ampoxscores[ampoxprots.index(prot)]]

    def _gene_midpoints(protnames):
        """Map each protein present in genelist to the midpoint of its gene."""
        midpoints = {}
        for prot in protnames:
            if prot in genelist:
                coord1 = int(genedict[prot][0])
                coord2 = int(genedict[prot][1])
                midpoints[prot] = int((coord1 + coord2) / 2)
        return midpoints

    def _is_close(position, otherpositions):
        """Tell whether any of otherpositions lies within 20000 bp of position."""
        return any(abs(position - other) < 20000 for other in otherpositions)

    # Proteins carrying both a condensation and an adenylation hit.
    for i in cprots:
        if i in adenylationprots:
            nrpsprots.append(i)
            detecteddomainsdict.setdefault(i, []).extend([
                ["Condensation domain", cscores[cprots.index(i)]],
                _adenylation_record(i),
            ])
    # Type I PKS proteins that also carry an adenylation hit (hybrids).
    for i in t1pksprots:
        if i in adenylationprots and i not in nrpsprots:
            nrpsprots.append(i)
            detecteddomainsdict.setdefault(i, []).append(_adenylation_record(i))

    # Hits on proteins that were not called NRPS above.
    pptprots = parsehmmoutput(20, hmmoutputfolder + "PP-binding.txt")[0]
    single_aprots = [prot for prot in adenylationprots if prot not in nrpsprots]
    single_cprots = [prot for prot in cprots if prot not in nrpsprots]
    single_pptprots = [prot for prot in pptprots if prot not in nrpsprots]

    genelist = proteins[2]
    genedict = proteins[3]
    single_aprots_positions = _gene_midpoints(single_aprots)
    single_cprots_positions = _gene_midpoints(single_cprots)
    single_pptprots_positions = _gene_midpoints(single_pptprots)
    nrpsprots_positions = _gene_midpoints(nrpsprots)

    # Snapshot of the NRPS proteins called before any single-domain protein is
    # added; nrpsprots_positions describes exactly these proteins.
    nrpsprots2 = list(nrpsprots)

    for j in single_aprots:
        if j not in single_aprots_positions:
            continue
        pos = single_aprots_positions[j]
        if (_is_close(pos, single_cprots_positions.values())
                or _is_close(pos, nrpsprots_positions.values())):
            nrpsprots.append(j)
            detecteddomainsdict.setdefault(j, []).append(_adenylation_record(j))

    for j in single_cprots:
        if j not in single_cprots_positions:
            continue
        pos = single_cprots_positions[j]
        if (_is_close(pos, single_aprots_positions.values())
                or _is_close(pos, nrpsprots_positions.values())):
            nrpsprots.append(j)
            detecteddomainsdict.setdefault(j, []).append(
                ["Condensation domain", cscores[cprots.index(j)]])
|
|
# Terpene synthases: every Terpene_synth_C hit (parsed with cut-off 23) is a
# terpene protein and gets a domain record. The sections that follow add hits
# from further models to the same terpeneprots list.
terpeneprots = []
if any(clustertype in geneclustertypes for clustertype in (1, 6)):
    terpene = parsehmmoutput(23, hmmoutputfolder + "Terpene_synth_C.txt")
    terpeneprots, terpenescores = terpene[0], terpene[1]
    for i in terpeneprots:
        hitscore = terpenescores[terpeneprots.index(i)]
        detecteddomainsdict.setdefault(i, []).append(
            ["Terpene synthase, C-terminus", hitscore])
|
|
# Phytoene/squalene synthase hits (parsed with cut-off 20) that are not yet
# in terpeneprots are added to it.
if any(clustertype in geneclustertypes for clustertype in (1, 6)):
    physqualdata = parsehmmoutput(20, hmmoutputfolder + "phytoene_synt.txt")
    physqualprots = physqualdata[0]
    physqualscores = physqualdata[1]
    for i in physqualprots:
        if i in terpeneprots:
            continue
        terpeneprots.append(i)
        # NOTE(review): assumes the domain record is written only for newly
        # added proteins -- confirm against the original nesting.
        detecteddomainsdict.setdefault(i, []).append(
            ["Phytoene/squalene synthase", physqualscores[physqualprots.index(i)]])
|
|
# Lycopene cyclase hits (parsed with cut-off 80) that are not yet in
# terpeneprots are added to it.
if any(clustertype in geneclustertypes for clustertype in (1, 6)):
    lycopenedata = parsehmmoutput(80, hmmoutputfolder + "Lycopene_cycl.txt")
    lycopeneprots = lycopenedata[0]
    lycopenescores = lycopenedata[1]
    for i in lycopeneprots:
        if i in terpeneprots:
            continue
        terpeneprots.append(i)
        # NOTE(review): assumes the domain record is written only for newly
        # added proteins -- confirm against the original nesting.
        detecteddomainsdict.setdefault(i, []).append(
            ["Lycopene cyclase", lycopenescores[lycopeneprots.index(i)]])
|
|
# Terpene cyclase hits (parsed with cut-off 50) that are not yet in
# terpeneprots are added to it.
if any(clustertype in geneclustertypes for clustertype in (1, 6)):
    terpene_cyclasesdata = parsehmmoutput(50, hmmoutputfolder + "terpene_cyclase.txt")
    terpene_cyclases = terpene_cyclasesdata[0]
    terpene_cyclases_scores = terpene_cyclasesdata[1]
    for i in terpene_cyclases:
        if i in terpeneprots:
            continue
        terpeneprots.append(i)
        # NOTE(review): assumes the domain record is written only for newly
        # added proteins -- confirm against the original nesting.
        detecteddomainsdict.setdefault(i, []).append(
            ["Terpene cyclase", terpene_cyclases_scores[terpene_cyclases.index(i)]])
|
|
# NapT7 hits (parsed with cut-off 250) that are not yet in terpeneprots are
# added to it.
if any(clustertype in geneclustertypes for clustertype in (1, 6)):
    NapT7 = parsehmmoutput(250, hmmoutputfolder + "NapT7.txt")
    NapT7prots = NapT7[0]
    NapT7scores = NapT7[1]
    for i in NapT7prots:
        if i in terpeneprots:
            continue
        terpeneprots.append(i)
        # NOTE(review): assumes the domain record is written only for newly
        # added proteins -- confirm against the original nesting.
        detecteddomainsdict.setdefault(i, []).append(
            ["NapT7", NapT7scores[NapT7prots.index(i)]])
|
|
# Fungal GGPP synthase hits, model 1 (parsed with cut-off 420), that are not
# yet in terpeneprots are added to it.
if any(clustertype in geneclustertypes for clustertype in (1, 6)):
    fung_ggpps = parsehmmoutput(420, hmmoutputfolder + "fung_ggpps.txt")
    fung_ggppsprots = fung_ggpps[0]
    fung_ggppsscores = fung_ggpps[1]
    for i in fung_ggppsprots:
        if i in terpeneprots:
            continue
        terpeneprots.append(i)
        # NOTE(review): assumes the domain record is written only for newly
        # added proteins -- confirm against the original nesting.
        detecteddomainsdict.setdefault(i, []).append(
            ["Fungal geranylgeranyl pyrophosphate synthase, model 1",
             fung_ggppsscores[fung_ggppsprots.index(i)]])
|
|
# Fungal geranylgeranyl pyrophosphate synthase, model 2; runs only when
# geneclustertypes contains 1 or 6.
# parsehmmoutput() is called on fung_ggpps2.txt with 312 as first argument (a
# score cut-off, going by the "cut-off" comments elsewhere in this file).
# fung_ggpps2[0] is iterated as the hit proteins, fung_ggpps2[1] is indexed
# in parallel as their scores.
# Hits not yet in terpeneprots are appended to it, and a [label, score] pair
# is appended to (or created as) the protein's detecteddomainsdict entry.
# NOTE(review): indentation is not preserved in this copy of the file, so
# whether the has_key/else annotation is nested under the
# "if i not in terpeneprots" test cannot be read off here - confirm against
# the original source. dict.has_key() exists only in Python 2.
3565 if 1 in geneclustertypes or 6 in geneclustertypes:
|
|
3566 fung_ggpps2 = parsehmmoutput(312,hmmoutputfolder + "fung_ggpps2.txt")
|
|
3567 fung_ggpps2prots = fung_ggpps2[0]
|
|
3568 fung_ggpps2scores = fung_ggpps2[1]
|
|
3569 for i in fung_ggpps2prots:
|
|
3570 if i not in terpeneprots:
|
|
3571 terpeneprots.append(i)
|
|
3572 if detecteddomainsdict.has_key(i):
|
|
3573 detdomlist = detecteddomainsdict[i]
|
|
3574 detdomlist.append(["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]])
|
|
3575 detecteddomainsdict[i] = detdomlist
|
|
3576 else:
|
|
3577 detecteddomainsdict[i] = [["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]]]
|
|
# Dimethylallyl tryptophan synthase model; runs only when geneclustertypes
# contains 1 or 6.
# parsehmmoutput() is called on dmat.txt with 200 as first argument (a score
# cut-off, going by the "cut-off" comments elsewhere in this file).
# dmat[0] is iterated as the hit proteins, dmat[1] is indexed in parallel as
# their scores.
# Hits not yet in terpeneprots are appended to it, and a [label, score] pair
# is appended to (or created as) the protein's detecteddomainsdict entry.
# NOTE(review): indentation is not preserved in this copy of the file, so
# whether the has_key/else annotation is nested under the
# "if i not in terpeneprots" test cannot be read off here - confirm against
# the original source. dict.has_key() exists only in Python 2.
3578 if 1 in geneclustertypes or 6 in geneclustertypes:
|
|
3579 dmat = parsehmmoutput(200,hmmoutputfolder + "dmat.txt")
|
|
3580 dmatprots = dmat[0]
|
|
3581 dmatscores = dmat[1]
|
|
3582 for i in dmatprots:
|
|
3583 if i not in terpeneprots:
|
|
3584 terpeneprots.append(i)
|
|
3585 if detecteddomainsdict.has_key(i):
|
|
3586 detdomlist = detecteddomainsdict[i]
|
|
3587 detdomlist.append(["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]])
|
|
3588 detecteddomainsdict[i] = detdomlist
|
|
3589 else:
|
|
3590 detecteddomainsdict[i] = [["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]]]
|
|
# Trichodiene synthase model; runs only when geneclustertypes contains 1 or 6.
# parsehmmoutput() is called on trichodiene_synth.txt with 150 as first
# argument (a score cut-off, going by the "cut-off" comments elsewhere in
# this file).
# trichodiene_synth[0] is iterated as the hit proteins, trichodiene_synth[1]
# is indexed in parallel as their scores.
# Hits not yet in terpeneprots are appended to it, and a [label, score] pair
# is appended to (or created as) the protein's detecteddomainsdict entry.
# NOTE(review): indentation is not preserved in this copy of the file, so
# whether the has_key/else annotation is nested under the
# "if i not in terpeneprots" test cannot be read off here - confirm against
# the original source. dict.has_key() exists only in Python 2.
3591 if 1 in geneclustertypes or 6 in geneclustertypes:
|
|
3592 trichodiene_synth = parsehmmoutput(150,hmmoutputfolder + "trichodiene_synth.txt")
|
|
3593 trichodiene_synthprots = trichodiene_synth[0]
|
|
3594 trichodiene_synthscores = trichodiene_synth[1]
|
|
3595 for i in trichodiene_synthprots:
|
|
3596 if i not in terpeneprots:
|
|
3597 terpeneprots.append(i)
|
|
3598 if detecteddomainsdict.has_key(i):
|
|
3599 detdomlist = detecteddomainsdict[i]
|
|
3600 detdomlist.append(["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]])
|
|
3601 detecteddomainsdict[i] = detdomlist
|
|
3602 else:
|
|
3603 detecteddomainsdict[i] = [["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]]]
|
|
# Lantibiotic section, part 1: read all lantibiotic-related HMM results.
# lantprots starts empty and is filled by the loops that follow this span.
# Everything after the "if" line belongs to the case where geneclustertypes
# contains 1 or 7 (indentation is not preserved in this copy of the file).
# For every parsehmmoutput() result, element [0] is kept as "<name>prots" and
# element [1] as "<name>scores".
3604 #Extract lantibiotic proteins, LanC cut-off: 80, Lant_dehN & Lant_dehC combination cut-off: 20 each
|
|
3605 lantprots = []
|
|
3606 if 1 in geneclustertypes or 7 in geneclustertypes:
|
|
3607 lantc = parsehmmoutput(80,hmmoutputfolder + "LANC_like.txt")
|
|
3608 lancprots = lantc[0]
|
|
3609 lancscores = lantc[1]
|
|
3610 landehn = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_N.txt")
|
|
3611 landehnprots = landehn[0]
|
|
3612 landehnscores = landehn[1]
|
|
3613 landehc = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_C.txt")
|
|
3614 landehcprots = landehc[0]
|
|
3615 landehcscores = landehc[1]
|
|
# lanti1 .. lanti14: fourteen further models, all called with 20 as first
# argument except TIGR03731.txt (lanti14), which uses 18.
3616 lanti1 = parsehmmoutput(20,hmmoutputfolder + "Antimicrobial18.txt")
|
|
3617 lanti1prots = lanti1[0]
|
|
3618 lanti1scores = lanti1[1]
|
|
3619 lanti2 = parsehmmoutput(20,hmmoutputfolder + "Gallidermin.txt")
|
|
3620 lanti2prots = lanti2[0]
|
|
3621 lanti2scores = lanti2[1]
|
|
3622 lanti3 = parsehmmoutput(20,hmmoutputfolder + "L_biotic_typeA.txt")
|
|
3623 lanti3prots = lanti3[0]
|
|
3624 lanti3scores = lanti3[1]
|
|
3625 lanti4 = parsehmmoutput(20,hmmoutputfolder + "LE-DUF.txt")
|
|
3626 lanti4prots = lanti4[0]
|
|
3627 lanti4scores = lanti4[1]
|
|
3628 lanti5 = parsehmmoutput(20,hmmoutputfolder + "LE-LAC481.txt")
|
|
3629 lanti5prots = lanti5[0]
|
|
3630 lanti5scores = lanti5[1]
|
|
3631 lanti6 = parsehmmoutput(20,hmmoutputfolder + "LE-LanBC.txt")
|
|
3632 lanti6prots = lanti6[0]
|
|
3633 lanti6scores = lanti6[1]
|
|
3634 lanti7 = parsehmmoutput(20,hmmoutputfolder + "LE-MER+2PEP.txt")
|
|
3635 lanti7prots = lanti7[0]
|
|
3636 lanti7scores = lanti7[1]
|
|
3637 lanti8 = parsehmmoutput(20,hmmoutputfolder + "MA-2PEPA.txt")
|
|
3638 lanti8prots = lanti8[0]
|
|
3639 lanti8scores = lanti8[1]
|
|
3640 lanti9 = parsehmmoutput(20,hmmoutputfolder + "MA-DUF.txt")
|
|
3641 lanti9prots = lanti9[0]
|
|
3642 lanti9scores = lanti9[1]
|
|
3643 lanti10 = parsehmmoutput(20,hmmoutputfolder + "MA-EPI.txt")
|
|
3644 lanti10prots = lanti10[0]
|
|
3645 lanti10scores = lanti10[1]
|
|
3646 lanti11 = parsehmmoutput(20,hmmoutputfolder + "MA-LAC481.txt")
|
|
3647 lanti11prots = lanti11[0]
|
|
3648 lanti11scores = lanti11[1]
|
|
3649 lanti12 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS+EPI.txt")
|
|
3650 lanti12prots = lanti12[0]
|
|
3651 lanti12scores = lanti12[1]
|
|
3652 lanti13 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS.txt")
|
|
3653 lanti13prots = lanti13[0]
|
|
3654 lanti13scores = lanti13[1]
|
|
3655 lanti14 = parsehmmoutput(18,hmmoutputfolder + "TIGR03731.txt")
|
|
3656 lanti14prots = lanti14[0]
|
|
3657 lanti14scores = lanti14[1]
|
|
# Concatenate the hit lists of all fourteen models, then reduce the result to
# unique entries in first-occurrence order.
3658 lantiprots = lanti1prots + lanti2prots + lanti3prots + lanti4prots + lanti5prots + lanti6prots + lanti7prots + lanti8prots + lanti9prots + lanti10prots + lanti11prots + lanti12prots + lanti13prots + lanti14prots
|
|
3659 lantiprots2 = []
|
|
3660 for i in lantiprots:
|
|
3661 if i not in lantiprots2:
|
|
3662 lantiprots2.append(i)
|
|
3663 lantiprots = lantiprots2
|
|
# Lantibiotic section, part 2: LanC hits.
# Every entry of lancprots is appended to lantprots without a membership
# test, and a ["LanC lanthionine synthase domain", score] pair is appended to
# (or created as) the protein's entry in detecteddomainsdict. The score is
# taken from lancscores at the position of the first occurrence of the
# protein in lancprots (list.index). dict.has_key() exists only in Python 2.
3664 for i in lancprots:
|
|
3665 lantprots.append(i)
|
|
3666 if detecteddomainsdict.has_key(i):
|
|
3667 detdomlist = detecteddomainsdict[i]
|
|
3668 detdomlist.append(["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]])
|
|
3669 detecteddomainsdict[i] = detdomlist
|
|
3670 else:
|
|
3671 detecteddomainsdict[i] = [["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]]]
|
|
# Lantibiotic section, part 3: lantibiotic dehydratase hits.
# A protein from landehnprots is added to lantprots only when it is also in
# landehcprots (both N- and C-terminal models hit) and not already in
# lantprots. Both domains are then recorded in detecteddomainsdict, N-terminus
# first.
# NOTE(review): indentation is not preserved in this copy of the file.
# landehcprots.index(i) raises ValueError for a protein missing from
# landehcprots, so the has_key/else annotation presumably sits inside the
# "if i in landehcprots ..." test - confirm against the original source.
3672 for i in landehnprots:
|
|
3673 if i in landehcprots and i not in lantprots:
|
|
3674 lantprots.append(i)
|
|
3675 if detecteddomainsdict.has_key(i):
|
|
3676 detdomlist = detecteddomainsdict[i]
|
|
3677 detdomlist.append(["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]])
|
|
3678 detdomlist.append(["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]])
|
|
3679 detecteddomainsdict[i] = detdomlist
|
|
3680 else:
|
|
3681 detecteddomainsdict[i] = [["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]],["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]]]
|
|
# Lantibiotic section, part 4: annotate hits of the models lanti1 .. lanti14.
# For each protein in lantiprots: add it to lantprots if not yet present, then
# repeat the same step once per model, in model order. If the protein is in
# that model's hit list, a [label, score] pair is appended to its existing
# detecteddomainsdict entry, or a new one-element entry is created when it
# has none yet. Scores are looked up with list.index(), i.e. at the first
# occurrence of the protein in the model's hit list.
# NOTE(review): indentation is not preserved in this copy of the file, so
# whether the per-model annotation is nested under "if i not in lantprots"
# cannot be read off here - confirm against the original source.
# dict.has_key() exists only in Python 2.
3682 for i in lantiprots:
|
|
3683 if i not in lantprots:
|
|
3684 lantprots.append(i)
|
|
3685 if detecteddomainsdict.has_key(i):
|
|
3686 detdomlist = detecteddomainsdict[i]
|
|
3687 if i in lanti1prots:
|
|
3688 detdomlist.append(["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]])
|
|
3689 detecteddomainsdict[i] = detdomlist
|
|
3690 else:
|
|
3691 if i in lanti1prots:
|
|
3692 detecteddomainsdict[i] = [["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]]]
|
|
3693 if detecteddomainsdict.has_key(i):
|
|
3694 detdomlist = detecteddomainsdict[i]
|
|
3695 if i in lanti2prots:
|
|
3696 detdomlist.append(["Gallidermin domain",lanti2scores[lanti2prots.index(i)]])
|
|
3697 detecteddomainsdict[i] = detdomlist
|
|
3698 else:
|
|
3699 if i in lanti2prots:
|
|
3700 detecteddomainsdict[i] = [["Gallidermin domain",lanti2scores[lanti2prots.index(i)]]]
|
|
3701 if detecteddomainsdict.has_key(i):
|
|
3702 detdomlist = detecteddomainsdict[i]
|
|
3703 if i in lanti3prots:
|
|
3704 detdomlist.append(["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]])
|
|
3705 detecteddomainsdict[i] = detdomlist
|
|
3706 else:
|
|
3707 if i in lanti3prots:
|
|
3708 detecteddomainsdict[i] = [["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]]]
|
|
3709 if detecteddomainsdict.has_key(i):
|
|
3710 detdomlist = detecteddomainsdict[i]
|
|
3711 if i in lanti4prots:
|
|
3712 detdomlist.append(["LE-DUF domain",lanti4scores[lanti4prots.index(i)]])
|
|
3713 detecteddomainsdict[i] = detdomlist
|
|
3714 else:
|
|
3715 if i in lanti4prots:
|
|
3716 detecteddomainsdict[i] = [["LE-DUF domain",lanti4scores[lanti4prots.index(i)]]]
|
|
3717 if detecteddomainsdict.has_key(i):
|
|
3718 detdomlist = detecteddomainsdict[i]
|
|
3719 if i in lanti5prots:
|
|
3720 detdomlist.append(["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]])
|
|
3721 detecteddomainsdict[i] = detdomlist
|
|
3722 else:
|
|
3723 if i in lanti5prots:
|
|
3724 detecteddomainsdict[i] = [["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]]]
|
|
3725 if detecteddomainsdict.has_key(i):
|
|
3726 detdomlist = detecteddomainsdict[i]
|
|
3727 if i in lanti6prots:
|
|
3728 detdomlist.append(["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]])
|
|
3729 detecteddomainsdict[i] = detdomlist
|
|
3730 else:
|
|
3731 if i in lanti6prots:
|
|
3732 detecteddomainsdict[i] = [["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]]]
|
|
3733 if detecteddomainsdict.has_key(i):
|
|
3734 detdomlist = detecteddomainsdict[i]
|
|
3735 if i in lanti7prots:
|
|
3736 detdomlist.append(["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]])
|
|
3737 detecteddomainsdict[i] = detdomlist
|
|
3738 else:
|
|
3739 if i in lanti7prots:
|
|
3740 detecteddomainsdict[i] = [["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]]]
|
|
3741 if detecteddomainsdict.has_key(i):
|
|
3742 detdomlist = detecteddomainsdict[i]
|
|
3743 if i in lanti8prots:
|
|
3744 detdomlist.append(["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]])
|
|
3745 detecteddomainsdict[i] = detdomlist
|
|
3746 else:
|
|
3747 if i in lanti8prots:
|
|
3748 detecteddomainsdict[i] = [["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]]]
|
|
3749 if detecteddomainsdict.has_key(i):
|
|
3750 detdomlist = detecteddomainsdict[i]
|
|
3751 if i in lanti9prots:
|
|
3752 detdomlist.append(["MA-DUF domain",lanti9scores[lanti9prots.index(i)]])
|
|
3753 detecteddomainsdict[i] = detdomlist
|
|
3754 else:
|
|
3755 if i in lanti9prots:
|
|
3756 detecteddomainsdict[i] = [["MA-DUF domain",lanti9scores[lanti9prots.index(i)]]]
|
|
3757 if detecteddomainsdict.has_key(i):
|
|
3758 detdomlist = detecteddomainsdict[i]
|
|
3759 if i in lanti10prots:
|
|
3760 detdomlist.append(["MA-EPI domain",lanti10scores[lanti10prots.index(i)]])
|
|
3761 detecteddomainsdict[i] = detdomlist
|
|
3762 else:
|
|
3763 if i in lanti10prots:
|
|
3764 detecteddomainsdict[i] = [["MA-EPI domain",lanti10scores[lanti10prots.index(i)]]]
|
|
3765 if detecteddomainsdict.has_key(i):
|
|
3766 detdomlist = detecteddomainsdict[i]
|
|
3767 if i in lanti11prots:
|
|
3768 detdomlist.append(["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]])
|
|
3769 detecteddomainsdict[i] = detdomlist
|
|
3770 else:
|
|
3771 if i in lanti11prots:
|
|
3772 detecteddomainsdict[i] = [["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]]]
|
|
3773 if detecteddomainsdict.has_key(i):
|
|
3774 detdomlist = detecteddomainsdict[i]
|
|
3775 if i in lanti12prots:
|
|
3776 detdomlist.append(["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]])
|
|
3777 detecteddomainsdict[i] = detdomlist
|
|
3778 else:
|
|
3779 if i in lanti12prots:
|
|
3780 detecteddomainsdict[i] = [["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]]]
|
|
3781 if detecteddomainsdict.has_key(i):
|
|
3782 detdomlist = detecteddomainsdict[i]
|
|
3783 if i in lanti13prots:
|
|
3784 detdomlist.append(["MA-NIS domain",lanti13scores[lanti13prots.index(i)]])
|
|
3785 detecteddomainsdict[i] = detdomlist
|
|
3786 else:
|
|
3787 if i in lanti13prots:
|
|
3788 detecteddomainsdict[i] = [["MA-NIS domain",lanti13scores[lanti13prots.index(i)]]]
|
|
3789 if detecteddomainsdict.has_key(i):
|
|
3790 detdomlist = detecteddomainsdict[i]
|
|
3791 if i in lanti14prots:
|
|
3792 detdomlist.append(["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]])
|
|
3793 detecteddomainsdict[i] = detdomlist
|
|
3794 else:
|
|
3795 if i in lanti14prots:
|
|
3796 detecteddomainsdict[i] = [["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]]]
|
|
#Bacteriocin proteins, various cut-offs
# Collects every protein hit by at least one of the 26 bacteriocin-related
# HMMs into bcinprots (unique, in first-hit order) and records one
# [label, score] pair per matching model in detecteddomainsdict.
# bcinprots stays empty unless geneclustertypes contains 1 or 8.
#
# Changes compared with the previous version of this section:
#  * every HMM output file is parsed once instead of twice;
#  * the hit list is de-duplicated BEFORE annotating, so a protein hit by
#    several models no longer gets each of its domain entries appended once
#    per model (same order of operations as the lantibiotic section).
bcinprots = []
if 1 in geneclustertypes or 8 in geneclustertypes:
    # (first argument passed to parsehmmoutput, HMM output file, domain label)
    # NOTE(review): "TGIR03604"/"TGIR03605" look like typos for "TIGR..."
    # (compare the file names); left unchanged because the labels are
    # runtime output that other code may depend on.
    bcinmodels = [
        (50, "strepbact.txt", "Putative Streptomyces bacteriocin"),
        (90, "Antimicrobial14.txt", "Antimicrobial14 domain"),
        (23, "Bacteriocin_IId.txt", "Bacteriocin_IId domain"),
        (92, "BacteriocIIc_cy.txt", "BacteriocIIc_cy domain"),
        (40, "Bacteriocin_II.txt", "Bacteriocin_II domain"),
        (24, "Lactococcin.txt", "Lactococcin"),
        (31, "Antimicrobial17.txt", "Antimicrobial17 domain"),
        (25, "Lactococcin_972.txt", "Lactococcin_972 domain"),
        (27, "Bacteriocin_IIc.txt", "Bacteriocin_IIc domain"),
        (78, "LcnG-beta.txt", "LcnG-beta domain"),
        (56, "Bacteriocin_IIi.txt", "Bacteriocin_IIi domain"),
        (98, "Subtilosin_A.txt", "Subtilosin_A domain"),
        (27, "Cloacin.txt", "Cloacin domain"),
        (25, "Linocin_M18.txt", "Linocin_M18 domain"),
        (150, "TIGR03603.txt", "TIGR03603: bacteriocin biosynthesis cyclodehydratase"),
        (440, "TIGR03604.txt", "TGIR03604: bacteriocin biosynthesis docking scaffold"),
        (200, "TIGR03605.txt", "TGIR03605: SagB-type dehydrogenase"),
        (18, "TIGR03651.txt", "TIGR03651: bacteriocin, circularin A/uberolysin family"),
        (35, "TIGR03678.txt", "TIGR03678: bacteriocin, microcyclamide/patellamide family"),
        (400, "TIGR03693.txt", "TIGR03693: thiazole-containing bacteriocin maturation protein"),
        (16, "TIGR03798.txt", "TIGR03798: bacteriocin propeptide"),
        (150, "TIGR03882.txt", "TIGR03882: bacteriocin biosynthesis cyclodehydratase"),
        (50, "TIGR03601.txt", "TIGR03601: bacteriocin, BA_2677 family"),
        (50, "TIGR03602.txt", "TIGR03602: bacteriocin protoxin, streptolysin S family"),
        (20, "mvnA.txt", "Bacteriocin, microviridin family"),
        (20, "thiostrepton.txt", "Thiopeptide, thiostrepton-like"),
    ]

    # Parse each file once; element [0] is the hit list, element [1] the
    # parallel score list (same convention as the other sections).
    bcinhits = []
    for bcincutoff, bcinfile, bcinlabel in bcinmodels:
        bcinparsed = parsehmmoutput(bcincutoff, hmmoutputfolder + bcinfile)
        bcinhits.append((bcinlabel, bcinparsed[0], bcinparsed[1]))

    # Keep the per-model names of the previous version defined, in case code
    # outside this section still reads them.
    bcinlabels, bcinprotlists, bcinscorelists = zip(*bcinhits)
    (bcin1prots, bcin2prots, bcin3prots, bcin4prots, bcin5prots,
     bcin6prots, bcin7prots, bcin8prots, bcin9prots, bcin10prots,
     bcin11prots, bcin12prots, bcin13prots, bcin14prots, bcin15prots,
     bcin16prots, bcin17prots, bcin18prots, bcin19prots, bcin20prots,
     bcin21prots, bcin22prots, bcin23prots, bcin24prots, bcin25prots,
     bcin26prots) = bcinprotlists
    (bcin1scores, bcin2scores, bcin3scores, bcin4scores, bcin5scores,
     bcin6scores, bcin7scores, bcin8scores, bcin9scores, bcin10scores,
     bcin11scores, bcin12scores, bcin13scores, bcin14scores, bcin15scores,
     bcin16scores, bcin17scores, bcin18scores, bcin19scores, bcin20scores,
     bcin21scores, bcin22scores, bcin23scores, bcin24scores, bcin25scores,
     bcin26scores) = bcinscorelists

    # Unique hit proteins in first-occurrence order (model 1 first). Proteins
    # are used as dictionary keys below, so they are hashable and a set can
    # back the membership test.
    bcinprots2 = []
    bcinseen = set()
    for bcinlabel, bcinhitprots, bcinhitscores in bcinhits:
        for i in bcinhitprots:
            if i not in bcinseen:
                bcinseen.add(i)
                bcinprots2.append(i)
    bcinprots = bcinprots2

    # Annotate every protein exactly once, models in their listed order. The
    # score is the one stored at the protein's first occurrence in the
    # model's hit list, as before.
    for i in bcinprots:
        for bcinlabel, bcinhitprots, bcinhitscores in bcinhits:
            if i in bcinhitprots:
                detecteddomainsdict.setdefault(i, []).append(
                    [bcinlabel, bcinhitscores[bcinhitprots.index(i)]])
|
|
# Beta-lactam section, part 1: beta-lactam synthase (BLS.txt) hits.
# lactamprots starts empty; the rest runs only when geneclustertypes contains
# 1 or 9 (indentation is not preserved in this copy of the file).
# Every entry of bls[0] is appended to lactamprots without a membership test,
# and a ["Beta-lactam synthase", score] pair is appended to (or created as)
# the protein's entry in detecteddomainsdict. bls[1] is indexed in parallel
# with bls[0] for the score. dict.has_key() exists only in Python 2.
4066 #Extract beta-lactam synthetase proteins, cut-off: 250
|
|
4067 lactamprots = []
|
|
4068 if 1 in geneclustertypes or 9 in geneclustertypes:
|
|
4069 bls = parsehmmoutput(250,hmmoutputfolder + "BLS.txt")
|
|
4070 blsprots = bls[0]
|
|
4071 blsscores = bls[1]
|
|
4072 for i in bls[0]:
|
|
4073 lactamprots.append(i)
|
|
4074 if detecteddomainsdict.has_key(i):
|
|
4075 detdomlist = detecteddomainsdict[i]
|
|
4076 detdomlist.append(["Beta-lactam synthase",blsscores[blsprots.index(i)]])
|
|
4077 detecteddomainsdict[i] = detdomlist
|
|
4078 else:
|
|
4079 detecteddomainsdict[i] = [["Beta-lactam synthase",blsscores[blsprots.index(i)]]]
|
|
# Beta-lactam section, part 2: clavulanic acid synthase-like (CAS.txt) hits,
# parsed with 250 as first argument to parsehmmoutput().
# cas[0] is iterated as the hit proteins; cas[1] is indexed in parallel as
# their scores. Hits not yet in lactamprots are appended to it, and a
# ["Clavulanic acid synthase-like", score] pair is appended to (or created
# as) the protein's entry in detecteddomainsdict.
# NOTE(review): indentation is not preserved in this copy of the file, so
# whether the has_key/else annotation is nested under the
# "if i not in lactamprots" test cannot be read off here - confirm against
# the original source. dict.has_key() exists only in Python 2.
4080 cas = parsehmmoutput(250,hmmoutputfolder + "CAS.txt")
|
|
4081 casprots = cas[0]
|
|
4082 casscores = cas[1]
|
|
4083 for i in cas[0]:
|
|
4084 if i not in lactamprots:
|
|
4085 lactamprots.append(i)
|
|
4086 if detecteddomainsdict.has_key(i):
|
|
4087 detdomlist = detecteddomainsdict[i]
|
|
4088 detdomlist.append(["Clavulanic acid synthase-like",casscores[casprots.index(i)]])
|
|
4089 detecteddomainsdict[i] = detdomlist
|
|
4090 else:
|
|
4091 detecteddomainsdict[i] = [["Clavulanic acid synthase-like",casscores[casprots.index(i)]]]
|
|
# Beta-lactam section, part 3: tabtoxin synthase-like (tabtoxin.txt) hits,
# parsed with 500 as first argument to parsehmmoutput().
# tabtoxin[0] is iterated as the hit proteins; tabtoxin[1] is indexed in
# parallel as their scores. Hits not yet in lactamprots are appended to it,
# and a ["Tabtoxin synthase-like", score] pair is appended to (or created as)
# the protein's entry in detecteddomainsdict.
# NOTE(review): indentation is not preserved in this copy of the file, so
# whether the has_key/else annotation is nested under the
# "if i not in lactamprots" test cannot be read off here - confirm against
# the original source. dict.has_key() exists only in Python 2.
4092 tabtoxin = parsehmmoutput(500,hmmoutputfolder + "tabtoxin.txt")
|
|
4093 tabtoxinprots = tabtoxin[0]
|
|
4094 tabtoxinscores = tabtoxin[1]
|
|
4095 for i in tabtoxin[0]:
|
|
4096 if i not in lactamprots:
|
|
4097 lactamprots.append(i)
|
|
4098 if detecteddomainsdict.has_key(i):
|
|
4099 detdomlist = detecteddomainsdict[i]
|
|
4100 detdomlist.append(["Tabtoxin synthase-like",tabtoxinscores[tabtoxinprots.index(i)]])
|
|
4101 detecteddomainsdict[i] = detdomlist
|
|
4102 else:
|
|
4103 detecteddomainsdict[i] = [["Tabtoxin synthase-like",tabtoxinscores[tabtoxinprots.index(i)]]]
|
|
4104 #Extract aminoglycoside / aminocyclitol biosynthesis clusters, clusters taken from Flatt & Mahmud et al. 2007
|
|
4105 amglyccyclprots = []
|
|
4106 if 1 in geneclustertypes or 10 in geneclustertypes:
|
|
4107 strH = parsehmmoutput(200,hmmoutputfolder + "strH_like.txt")
|
|
4108 strhprots = strH[0]
|
|
4109 strhscores = strH[1]
|
|
4110 for i in strH[0]:
|
|
4111 amglyccyclprots.append(i)
|
|
4112 if detecteddomainsdict.has_key(i):
|
|
4113 detdomlist = detecteddomainsdict[i]
|
|
4114 detdomlist.append(["StrH-like glycosyltransferase",strhscores[strhprots.index(i)]])
|
|
4115 detecteddomainsdict[i] = detdomlist
|
|
4116 else:
|
|
4117 detecteddomainsdict[i] = [["StrH-like glycosyltransferase",strhscores[strhprots.index(i)]]]
|
|
4118 strK1 = parsehmmoutput(800,hmmoutputfolder + "strK_like1.txt")
|
|
4119 strk1prots = strK1[0]
|
|
4120 strk1scores = strK1[1]
|
|
4121 for i in strK1[0]:
|
|
4122 amglyccyclprots.append(i)
|
|
4123 if detecteddomainsdict.has_key(i):
|
|
4124 detdomlist = detecteddomainsdict[i]
|
|
4125 detdomlist.append(["StrK-like phosphatase",strk1scores[strk1prots.index(i)]])
|
|
4126 detecteddomainsdict[i] = detdomlist
|
|
4127 else:
|
|
4128 detecteddomainsdict[i] = [["StrK-like phosphatase",strk1scores[strk1prots.index(i)]]]
|
|
4129 strK2 = parsehmmoutput(650,hmmoutputfolder + "strK_like2.txt")
|
|
4130 strk2prots = strK2[0]
|
|
4131 strk2scores = strK2[1]
|
|
4132 for i in strK2[0]:
|
|
4133 amglyccyclprots.append(i)
|
|
4134 if detecteddomainsdict.has_key(i):
|
|
4135 detdomlist = detecteddomainsdict[i]
|
|
4136 detdomlist.append(["StrK-like phosphatase, model 2",strk2scores[strk2prots.index(i)]])
|
|
4137 detecteddomainsdict[i] = detdomlist
|
|
4138 else:
|
|
4139 detecteddomainsdict[i] = [["StrK-like phosphatase, model 2",strk2scores[strk2prots.index(i)]]]
|
|
4140 neoL = parsehmmoutput(50,hmmoutputfolder + "neoL_like.txt")
|
|
4141 neolprots = neoL[0]
|
|
4142 neolscores = neoL[1]
|
|
4143 for i in neoL[0]:
|
|
4144 amglyccyclprots.append(i)
|
|
4145 if detecteddomainsdict.has_key(i):
|
|
4146 detdomlist = detecteddomainsdict[i]
|
|
4147 detdomlist.append(["NeoL-like deacetylase",neolscores[neolprots.index(i)]])
|
|
4148 detecteddomainsdict[i] = detdomlist
|
|
4149 else:
|
|
4150 detecteddomainsdict[i] = [["NeoL-like deacetylase",neolscores[neolprots.index(i)]]]
|
|
#2-deoxy-scyllo-inosose synthase: parse the DOIS.txt hmmsearch output.
#NOTE(review): the first parsehmmoutput argument is presumably a score cutoff - confirm.
DOIS = parsehmmoutput(500, hmmoutputfolder + "DOIS.txt")
doisprots, doisscores = DOIS[0], DOIS[1]
for hitprot in doisprots:
    #Every hit is added to amglyccyclprots (duplicates are not filtered)
    amglyccyclprots.append(hitprot)
    domainhit = ["2-deoxy-scyllo-inosose synthase", doisscores[doisprots.index(hitprot)]]
    #Append to the protein's existing domain list, or start a new one
    detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#ValA-like 2-epi-5-epi-valiolone synthase: parse the valA_like.txt hmmsearch output.
#NOTE(review): the first parsehmmoutput argument is presumably a score cutoff - confirm.
valA = parsehmmoutput(600, hmmoutputfolder + "valA_like.txt")
valaprots, valascores = valA[0], valA[1]
for hitprot in valaprots:
    #Every hit is added to amglyccyclprots (duplicates are not filtered)
    amglyccyclprots.append(hitprot)
    domainhit = ["2-epi-5-epi-valiolone synthase, ValA-like", valascores[valaprots.index(hitprot)]]
    #Append to the protein's existing domain list, or start a new one
    detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#SpcF/SpcG-like glycosyltransferase: parse the spcFG_like.txt hmmsearch output.
#NOTE(review): the first parsehmmoutput argument is presumably a score cutoff - confirm.
spcFG = parsehmmoutput(200, hmmoutputfolder + "spcFG_like.txt")
spcfgprots, spcfgscores = spcFG[0], spcFG[1]
for hitprot in spcfgprots:
    #Every hit is added to amglyccyclprots (duplicates are not filtered)
    amglyccyclprots.append(hitprot)
    domainhit = ["SpcF/SpcG-like glycosyltransferase", spcfgscores[spcfgprots.index(hitprot)]]
    #Append to the protein's existing domain list, or start a new one
    detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#SpcD/SpcK-like thymidylyltransferase: parse the spcDK_like_glyc.txt hmmsearch output.
#NOTE(review): the first parsehmmoutput argument is presumably a score cutoff - confirm.
spcDK_glyc = parsehmmoutput(600, hmmoutputfolder + "spcDK_like_glyc.txt")
spcdkglycprots, spcdkglycscores = spcDK_glyc[0], spcDK_glyc[1]
for hitprot in spcdkglycprots:
    #Every hit is added to amglyccyclprots (duplicates are not filtered)
    amglyccyclprots.append(hitprot)
    domainhit = ["SpcD/SpcK-like thymidylyltransferase", spcdkglycscores[spcdkglycprots.index(hitprot)]]
    #Append to the protein's existing domain list, or start a new one
    detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#SalQ-like 2-epi-5-epi-valiolone synthase: parse the salQ.txt hmmsearch output.
#NOTE(review): the first parsehmmoutput argument is presumably a score cutoff - confirm.
salQ = parsehmmoutput(480, hmmoutputfolder + "salQ.txt")
salqprots, salqscores = salQ[0], salQ[1]
for hitprot in salqprots:
    #Every hit is added to amglyccyclprots (duplicates are not filtered)
    amglyccyclprots.append(hitprot)
    domainhit = ["2-epi-5-epi-valiolone synthase, SalQ-like", salqscores[salqprots.index(hitprot)]]
    #Append to the protein's existing domain list, or start a new one
    detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#Extract aminocoumarin biosynthesis clusters
#aminocoumarinprots stays empty unless type 1 or type 11 was requested.
#NOTE(review): type code 1 appears to select every cluster type - confirm.
aminocoumarinprots = []
if 1 in geneclustertypes or 11 in geneclustertypes:
    #Each HMM output is parsed and its hits registered before the next one is
    #parsed; hits are appended to aminocoumarinprots without de-duplication.
    novK = parsehmmoutput(200, hmmoutputfolder + "novK.txt")
    novkprots, novkscores = novK[0], novK[1]
    for hitprot in novkprots:
        aminocoumarinprots.append(hitprot)
        domainhit = ["NovK-like reductase", novkscores[novkprots.index(hitprot)]]
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)

    novJ = parsehmmoutput(350, hmmoutputfolder + "novJ.txt")
    novjprots, novjscores = novJ[0], novJ[1]
    for hitprot in novjprots:
        aminocoumarinprots.append(hitprot)
        domainhit = ["NovJ-like reductase", novjscores[novjprots.index(hitprot)]]
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)

    novI = parsehmmoutput(600, hmmoutputfolder + "novI.txt")
    noviprots, noviscores = novI[0], novI[1]
    for hitprot in noviprots:
        aminocoumarinprots.append(hitprot)
        domainhit = ["NovI-like cytochrome P450", noviscores[noviprots.index(hitprot)]]
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)

    novH = parsehmmoutput(750, hmmoutputfolder + "novH.txt")
    novhprots, novhscores = novH[0], novH[1]
    for hitprot in novhprots:
        aminocoumarinprots.append(hitprot)
        domainhit = ["NovH-like protein", novhscores[novhprots.index(hitprot)]]
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)

    spcDK_like_cou = parsehmmoutput(600, hmmoutputfolder + "spcDK_like_cou.txt")
    spcDK_like_cou_prots, spcDK_like_cou_scores = spcDK_like_cou[0], spcDK_like_cou[1]
    for hitprot in spcDK_like_cou_prots:
        aminocoumarinprots.append(hitprot)
        domainhit = ["SpcD/SpcK-like thymidylyltransferase, aminocoumarins group", spcDK_like_cou_scores[spcDK_like_cou_prots.index(hitprot)]]
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#Extract siderophores biosynthesis proteins, IucA/C and AlcB
#Only the IucA_IucC.txt output is parsed in this section.
#siderophoreprots stays empty unless type 1 or type 12 was requested.
siderophoreprots = []
if 1 in geneclustertypes or 12 in geneclustertypes:
    siderophore = parsehmmoutput(30, hmmoutputfolder + "IucA_IucC.txt")
    #The hit list itself becomes siderophoreprots; scores are the parallel list
    siderophoreprots, siderophorescores = siderophore[0], siderophore[1]
    for hitprot in siderophoreprots:
        domainhit = ["IucA-IucC domain", siderophorescores[siderophoreprots.index(hitprot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#Extract ectoine biosynthesis proteins
#ectprots stays empty unless type 1 or type 13 was requested.
ectprots = []
if 1 in geneclustertypes or 13 in geneclustertypes:
    ect = parsehmmoutput(35, hmmoutputfolder + "ectoine_synt.txt")
    #The hit list itself becomes ectprots; scores are the parallel list
    ectprots, ectscores = ect[0], ect[1]
    for hitprot in ectprots:
        domainhit = ["Ectoine synthase", ectscores[ectprots.index(hitprot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#Extract butyrolactone biosynthesis proteins
#butyrprots stays empty unless type 1 or type 14 was requested.
butyrprots = []
if 1 in geneclustertypes or 14 in geneclustertypes:
    butyr = parsehmmoutput(25, hmmoutputfolder + "AfsA.txt")
    #The hit list itself becomes butyrprots; scores are the parallel list
    butyrprots, butyrscores = butyr[0], butyr[1]
    for hitprot in butyrprots:
        domainhit = ["AfsA butyrolactone synthesis domain", butyrscores[butyrprots.index(hitprot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#Extract indole biosynthesis proteins
#indoleprots stays empty unless type 1 or type 15 was requested.
indoleprots = []
if 1 in geneclustertypes or 15 in geneclustertypes:
    indole = parsehmmoutput(100, hmmoutputfolder + "indsynth.txt")
    #The hit list itself becomes indoleprots; scores are the parallel list
    indoleprots, indolescores = indole[0], indole[1]
    for hitprot in indoleprots:
        domainhit = ["StaD-like chromopyrrolic acid synthase domain", indolescores[indoleprots.index(hitprot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#Extract nucleoside antibiotic biosynthesis proteins
#nucleoprots stays empty unless type 1 or type 16 was requested.
nucleoprots = []
if 1 in geneclustertypes or 16 in geneclustertypes:
    nucleoprots = []
    #Parse all nine hmmsearch outputs first (same order as the registration below).
    #NOTE(review): the first parsehmmoutput argument is presumably a score cutoff - confirm.
    lipm = parsehmmoutput(50, hmmoutputfolder + "LipM.txt")
    lipmprots, lipmscores = lipm[0], lipm[1]
    lipu = parsehmmoutput(30, hmmoutputfolder + "LipU.txt")
    lipuprots, lipuscores = lipu[0], lipu[1]
    lipv = parsehmmoutput(375, hmmoutputfolder + "LipV.txt")
    lipvprots, lipvscores = lipv[0], lipv[1]
    toyb = parsehmmoutput(175, hmmoutputfolder + "ToyB.txt")
    toybprots, toybscores = toyb[0], toyb[1]
    tund = parsehmmoutput(200, hmmoutputfolder + "TunD.txt")
    tundprots, tundscores = tund[0], tund[1]
    pur6 = parsehmmoutput(200, hmmoutputfolder + "pur6.txt")
    pur6prots, pur6scores = pur6[0], pur6[1]
    pur10 = parsehmmoutput(600, hmmoutputfolder + "pur10.txt")
    pur10prots, pur10scores = pur10[0], pur10[1]
    nikj = parsehmmoutput(200, hmmoutputfolder + "nikJ.txt")
    nikjprots, nikjscores = nikj[0], nikj[1]
    niko = parsehmmoutput(400, hmmoutputfolder + "nikO.txt")
    nikoprots, nikoscores = niko[0], niko[1]
    #(domain name, hit proteins, parallel scores) for every parsed model
    nucleosidehits = (
        ("LipM-like nucleotidyltransferase", lipmprots, lipmscores),
        ("LipU-like protein", lipuprots, lipuscores),
        ("LipV-like dehydrogenase", lipvprots, lipvscores),
        ("ToyB-like synthase", toybprots, toybscores),
        ("TunD-like putative N-acetylglucosamine transferase", tundprots, tundscores),
        ("Pur6-like synthetase", pur6prots, pur6scores),
        ("Pur10-like oxidoreductase", pur10prots, pur10scores),
        ("NikJ-like protein", nikjprots, nikjscores),
        ("NikO-like enolpyruvyl transferase", nikoprots, nikoscores),
    )
    for domainname, hitprots, hitscores in nucleosidehits:
        for hitprot in hitprots:
            #nucleoprots holds each protein once, in order of first appearance
            if hitprot not in nucleoprots:
                nucleoprots.append(hitprot)
            #The domain hit itself is recorded for every occurrence
            domainhit = [domainname, hitscores[hitprots.index(hitprot)]]
            detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#Extract phosphoglycolipid biosynthesis proteins
#phosphoprots stays empty unless type 1 or type 17 was requested.
phosphoprots = []
if 1 in geneclustertypes or 17 in geneclustertypes:
    phosphogl = parsehmmoutput(65, hmmoutputfolder + "MoeO5.txt")
    #The hit list itself becomes phosphoprots; scores are the parallel list
    phosphoprots, phosphoscores = phosphogl[0], phosphogl[1]
    for hitprot in phosphoprots:
        domainhit = ["MoeO5-like prenyl-3-phosphoglycerate synthase", phosphoscores[phosphoprots.index(hitprot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#Extract melanin biosynthesis proteins
#melaninprots stays empty unless type 1 or type 18 was requested.
melaninprots = []
if 1 in geneclustertypes or 18 in geneclustertypes:
    melanin = parsehmmoutput(40, hmmoutputfolder + "melC.txt")
    #The hit list itself becomes melaninprots; scores are the parallel list
    melaninprots, melaninscores = melanin[0], melanin[1]
    for hitprot in melaninprots:
        domainhit = ["MelC-like melanin synthase", melaninscores[melaninprots.index(hitprot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(hitprot, []).append(domainhit)
|
|
#Extract other putative secondary metabolite biosynthesis proteins
#otherprots and amp_t_prots stay empty unless type 1 or type 19 was requested.
otherprots = []
amp_t_prots = []
if 1 in geneclustertypes or 19 in geneclustertypes:
    #NOTE(review): the first parsehmmoutput argument is presumably a score cutoff - confirm.
    pptb = parsehmmoutput(20, hmmoutputfolder + "PP-binding.txt")
    pptbprots = pptb[0]
    pptbscores = pptb[1]
    cond = parsehmmoutput(20, hmmoutputfolder + "Condensation.txt")
    amp = parsehmmoutput(20, hmmoutputfolder + "AMP-binding.txt")
    ampprots = amp[0]
    ampscores = amp[1]
    ampox = parsehmmoutput(50, hmmoutputfolder + "A-OX.txt")
    ampoxprots = ampox[0]
    ampoxscores = ampox[1]
    nad4 = parsehmmoutput(40, hmmoutputfolder + "NAD_binding_4.txt")
    nad4prots = nad4[0]
    nad4scores = nad4[1]
    cprots = cond[0]
    #aprots = all proteins with an AMP-binding or an A-OX hit. It must be a COPY
    #of the AMP-binding hit list: extending amp[0] in place would also extend
    #ampprots, so that a protein with only an A-OX hit would pass the
    #"in ampprots" test below and index past the end of ampscores.
    aprots = list(amp[0])
    for i in ampox[0]:
        if i not in aprots:
            aprots.append(i)
    #Proteins with both a condensation and an adenylation-type hit
    nrpsprots2 = []
    for i in cprots:
        if i in aprots:
            nrpsprots2.append(i)
    tprots = pptb[0]
    #PP-binding + adenylation proteins that are neither NRPS-like (no
    #condensation hit) nor aminocoumarin hits
    for i in tprots:
        if i in aprots and i not in nrpsprots2 and i not in aminocoumarinprots:
            otherprots.append(i)
            amp_t_prots.append(i)
            newdomains = [["PP-binding domain", pptbscores[pptbprots.index(i)]]]
            #A plain AMP-binding hit takes precedence over an A-OX hit
            if i in ampprots:
                newdomains.append(["Adenylation domain", ampscores[ampprots.index(i)]])
            elif i in ampoxprots:
                newdomains.append(["Adenylation domain with integrated oxidase", ampoxscores[ampoxprots.index(i)]])
            detecteddomainsdict.setdefault(i, []).extend(newdomains)
    #NAD_binding_4 + adenylation proteins that are not aminocoumarin hits
    for i in nad4prots:
        if i in aprots and i not in aminocoumarinprots:
            otherprots.append(i)
            amp_t_prots.append(i)
            newdomains = [["NAD-binding domain 4", nad4scores[nad4prots.index(i)]]]
            if i in ampprots:
                newdomains.append(["Adenylation domain", ampscores[ampprots.index(i)]])
            elif i in ampoxprots:
                newdomains.append(["Adenylation domain with integrated oxidase", ampoxscores[ampoxprots.index(i)]])
            detecteddomainsdict.setdefault(i, []).extend(newdomains)
    #Remaining single-model detections: each hit is added to otherprots once
    #and its domain hit is recorded in detecteddomainsdict.
    lmbu = parsehmmoutput(50, hmmoutputfolder + "LmbU.txt")
    lmbuprots = lmbu[0]
    lmbuscores = lmbu[1]
    for i in lmbuprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["LmbU-like protein", lmbuscores[lmbuprots.index(i)]])
    goadsporin = parsehmmoutput(500, hmmoutputfolder + "goadsporin_like.txt")
    goadsporinprots = goadsporin[0]
    goadsporinscores = goadsporin[1]
    for i in goadsporinprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["Goadsporin-like protein", goadsporinscores[goadsporinprots.index(i)]])
    neocarzinostat = parsehmmoutput(28, hmmoutputfolder + "Neocarzinostat.txt")
    neocarzinostatprots = neocarzinostat[0]
    neocarzinostatscores = neocarzinostat[1]
    for i in neocarzinostatprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["Neocarzinostatin-like protein", neocarzinostatscores[neocarzinostatprots.index(i)]])
    cyanobactin = parsehmmoutput(80, hmmoutputfolder + "cyanobactin_synth.txt")
    cyanobactinprots = cyanobactin[0]
    cyanobactinscores = cyanobactin[1]
    for i in cyanobactinprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["Cyanobactin protease", cyanobactinscores[cyanobactinprots.index(i)]])
    cycdipeptide = parsehmmoutput(110, hmmoutputfolder + "cycdipepsynth.txt")
    cycdipeptideprots = cycdipeptide[0]
    cycdipeptidescores = cycdipeptide[1]
    for i in cycdipeptideprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["Cyclodipeptide synthase", cycdipeptidescores[cycdipeptideprots.index(i)]])
    fom1 = parsehmmoutput(750, hmmoutputfolder + "fom1.txt")
    fom1prots = fom1[0]
    fom1scores = fom1[1]
    for i in fom1prots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["Fom1-like phosphomutase", fom1scores[fom1prots.index(i)]])
    bcpb = parsehmmoutput(400, hmmoutputfolder + "bcpB.txt")
    bcpbprots = bcpb[0]
    bcpbscores = bcpb[1]
    for i in bcpbprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["BcpB-like phosphomutase", bcpbscores[bcpbprots.index(i)]])
    frbd = parsehmmoutput(350, hmmoutputfolder + "frbD.txt")
    frbdprots = frbd[0]
    frbdscores = frbd[1]
    for i in frbdprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["FrbD-like phosphomutase", frbdscores[frbdprots.index(i)]])
    mite = parsehmmoutput(400, hmmoutputfolder + "mitE.txt")
    miteprots = mite[0]
    mitescores = mite[1]
    for i in miteprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["MitE-like CoA-ligase", mitescores[miteprots.index(i)]])
    vlmb = parsehmmoutput(250, hmmoutputfolder + "vlmB.txt")
    vlmbprots = vlmb[0]
    vlmbscores = vlmb[1]
    for i in vlmbprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["Valanimycin biosynthesis VlmB domain", vlmbscores[vlmbprots.index(i)]])
    prnb = parsehmmoutput(200, hmmoutputfolder + "prnB.txt")
    prnbprots = prnb[0]
    prnbscores = prnb[1]
    for i in prnbprots:
        if i not in otherprots:
            otherprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(["Pyrrolnitrin biosynthesis PrnB domain", prnbscores[prnbprots.index(i)]])
|
|
#Discard PKS/NRPS hit lists whose cluster type was not requested. Nothing is
#discarded when type 1 is among the requested types.
if 1 not in geneclustertypes:
    if 5 not in geneclustertypes:
        nrpsprots = []
    if 4 not in geneclustertypes:
        t3pksprots = []
    if 3 not in geneclustertypes:
        t2pksprots = []
    if 2 not in geneclustertypes:
        #Type 2 covers the type I, type IV and trans-AT PKS lists together
        t1pksprots = []
        t4pksprots = []
        transatpksprots = []
|
|
#Assemble all core sec met proteins
#Merge every per-type hit list, keeping each protein once, then sort the result.
allsecmetprots = []
secmetprotgroups = (
    t1pksprots, transatpksprots, t2pksprots, t3pksprots, t4pksprots,
    nrpsprots, terpeneprots, lantprots, bcinprots, lactamprots,
    amglyccyclprots, siderophoreprots, ectprots, butyrprots, indoleprots,
    nucleoprots, phosphoprots, melaninprots, aminocoumarinprots, otherprots,
)
for protgroup in secmetprotgroups:
    for prot in protgroup:
        if prot not in allsecmetprots:
            allsecmetprots.append(prot)
allsecmetprots.sort()
|
|
4721
|
|
#Stop with exit status 1 when no signature protein was detected at all:
#the message goes to the log file (which is then closed) and to stderr.
if not allsecmetprots:
    noclustersmessage = "No secondary metabolite biosynthesis gene clusters detected in this nucleotide file."
    logfile.write(noclustersmessage + "\n")
    logfile.close()
    print >> sys.stderr, noclustersmessage
    sys.exit(1)
|
|
4727
|
|
#Seconds elapsed since starttime (only referenced by disabled debug output here)
elapsed = time.time() - starttime

#Extract approximate gene clusters based on hmmsearch results, create list of core PKS / NRPS genes for further analysis (use less strict parameters for this then in gene cluster detection to include all PKS/NRPS domains)
#Create nucleotide fasta files with sec met gene clusters
logfile.write("Extracting gene clusters from gbk/embl file using detected signature genes...\n")
#Output files inside the clusterblast folder of this genome
clusterblastfolder = genomename + "/clusterblast/"
fastafile = open(clusterblastfolder + "geneclusterprots.fasta", "w")
txtfile = open(clusterblastfolder + "geneclusters.txt", "w")
#Workbook with a single sheet '0'; style1 carries a bold font for the header row
wb = Workbook()
font1 = Font()
font1.bold = True
style1 = XFStyle()
style1.font = font1
ws0 = wb.add_sheet('0')
headertitles = ["Input accession number", "Input name", "Gene cluster type", "Gene cluster genes"]
#The fifth column is only added when clusterblast is set to "y"
if clusterblast == "y":
    headertitles.append("Compound with gene cluster of highest homology")
for columnnr, headertitle in enumerate(headertitles):
    ws0.write(0, columnnr, headertitle, style1)
|
|
protcodes = allsecmetprots
nuccode = genomename
#Read the complete input file and turn carriage returns into newlines
gbkfile = open(infile,"r")
output = gbkfile.read()
output = output.replace("\r","\n")
#Extract description of nucleotide from gbk/embl file
#The file format is guessed from substrings of the input file name. nucname is
#left untouched when neither the GenBank nor the EMBL pattern matches.
if ".gbk" in infile or ".GBK" in infile or ".gb" in infile or ".GB" in infile or ".genbank" in infile or ".GENBANK" in infile:
    try:
        #Description = text between the DEFINITION tag and the ACCESSION tag
        nucname1 = output.split("ACCESSION ")[0]
        nucname2 = nucname1.split("DEFINITION ")[1]
        #Join continuation lines and collapse every whitespace run to one
        #space (also trims the ends). split/join always terminates, unlike a
        #replace-loop whose search and replacement text are both one space.
        nucname3 = " ".join(nucname2.replace("\n","").split())
        nucname = nucname3
    except (KeyError,IOError,IndexError):
        #Tag not found: fall back to a generic name
        nucname = "input_nucleotide"
elif ".embl" in infile or ".EMBL" in infile or ".emb" in infile or ".EMB" in infile:
    try:
        #Description = remainder of the first line containing the DE tag
        nucname1 = output.split("DE ")[1]
        nucname2 = nucname1.split("\n")[0]
        nucname3 = " ".join(nucname2.replace("\n","").split())
        nucname = nucname3
    except (KeyError,IOError,IndexError):
        #Tag not found: fall back to a generic name
        nucname = "input_nucleotide"
|
|
protstartlocations = []
protendlocations = []
#NOTE(review): proteins[2] is used as the list of gene names and proteins[3]
#as a dict from gene name to a record whose first two items are coordinates.
genelist = proteins[2]
genedict = proteins[3]
#Save all locations of query proteins on the nucleotide in a list
for protcode in protcodes:
    if protcode not in genelist:
        continue
    #Smaller coordinate is the start, larger one the end, whatever their order
    protstart_abs, protend_abs = sorted((int(genedict[protcode][0]), int(genedict[protcode][1])))
    protstartlocations.append(protstart_abs)
    protendlocations.append(protend_abs)
|
|
#Identify clusters of genes based on protein locations on the nucleotide
#Starts and ends are sorted independently and walked in parallel; a new
#cluster is opened when a start lies beyond the previous end plus 20000.
clusterstarts = []
clusterends = []
protstartlocations.sort()
protendlocations.sort()
nrlocations = len(protstartlocations)
a = 0
for i in protstartlocations:
    if a == 0:
        #The first location always opens the first cluster
        clusterstarts.append(str(i))
        if len(protendlocations) == 1:
            #Single location: it closes the cluster as well (stored as-is, not as str)
            clusterends.append(protendlocations[a])
    else:
        previousend = protendlocations[a - 1]
        if a == nrlocations - 1:
            #Last location: a gap of 20000 or more closes the running cluster
            #and opens a final one; the last end always closes what is open
            if i >= previousend + 20000:
                clusterends.append(str(previousend))
                clusterstarts.append(str(i))
            clusterends.append(str(protendlocations[a]))
        elif i > previousend + 20000:
            #Intermediate location: only a gap strictly above 20000 splits
            clusterends.append(str(previousend))
            clusterstarts.append(str(i))
    a += 1
    lastendlocation = i
|
|
#Extend clusters with 20kb on each side of the identified core genes
#Starts are clamped at 0; ends are extended without an upper bound here.
clusterstarts2 = [max(0, int(clusterstart) - 20000) for clusterstart in clusterstarts]
clusterstarts = clusterstarts2
clusterends2 = [int(clusterend) + 20000 for clusterend in clusterends]
clusterends = clusterends2
|
|
#For each genbank secondary metabolite gene cluster: extract all proteins and write to fasta,
# write one summary row to the txt and spreadsheet outputs, and record the
# cluster in clusterinfo.
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.

# (collection of core protein ids, type label), evaluated in this order for
# every gene of a cluster.
clustertyperules = [
    (t1pksprots, "t1pks"),
    (transatpksprots, "transatpks"),
    (t2pksprots, "t2pks"),
    (t3pksprots, "t3pks"),
    # NOTE(review): t4pks proteins are labelled "t1pks", exactly as in the
    # original code, so no cluster type ever contains "t4pks" -- confirm
    # that this is intended.
    (t4pksprots, "t1pks"),
    (nrpsprots, "nrps"),
    (terpeneprots, "terpene"),
    (lantprots, "lant"),
    (bcinprots, "bcin"),
    (lactamprots, "blactam"),
    (amglyccyclprots, "amglyccycl"),
    (siderophoreprots, "siderophore"),
    (ectprots, "ectoine"),
    (indoleprots, "indole"),
    (nucleoprots, "nucleoside"),
    (phosphoprots, "phosphoglycolipid"),
    (butyrprots, "butyrolactone"),
    (melaninprots, "melanin"),
    (aminocoumarinprots, "aminocoumarin"),
]
# (cluster types that are typically short, bp trimmed from each extended side)
clustertrims = [
    (("t3pks", "t2pks"), 5000),
    (("bcin", "siderophore", "lant", "terpene"), 10000),
    (("butyrolactone", "melanin", "ectoine"), 17000),
]

a = 0
clusterinfo = {}
geneclusters = []
geneclustergenes = []
allcoregenes = []
for i in clusterstarts:
    cstart = int(i)
    cend = int(clusterends[a])
    a += 1
    clusternr = a
    geneclusters.append(clusternr)
    coregenes = []
    clustergenes = []
    #For each gene in nucleotide, check if it is inside this cluster; if, so append info to list of clustergenes
    if a == 1:
        # Done once, on the first cluster: replace the last field of every
        # genedict entry with the gene's own key, so that entry[4] can be used
        # as the gene id below (assumes 5-field entries -- TODO confirm).
        for gene in genelist:
            geneinfo = genedict[gene][:-1]
            geneinfo.append(gene)
            genedict[gene] = geneinfo
    for gene in genelist:
        geneinfo = genedict[gene]
        genestart = int(geneinfo[0])
        geneend = int(geneinfo[1])
        # A gene belongs to the cluster when either of its ends lies strictly inside it.
        if cstart < genestart < cend or cstart < geneend < cend:
            clustergenes.append(geneinfo)
    #Determine type of cluster
    # The first matching rule sets the type; later matches are appended with
    # "-" unless the label already occurs (substring test) in the type.
    # The name "type" shadows the builtin; it is kept because code outside
    # this section may read it.
    type = "other"
    z = 0
    for k in clustergenes:
        geneid = k[4]
        for prots, label in clustertyperules:
            if geneid in prots:
                if z == 0:
                    type = label
                elif label not in type:
                    type = type + "-" + label
                z = 1
    if "other-" in type[:6]:
        type = type[6:]
    #Shorten gene cluster if type is among typically short gene cluster types
    if cend > dnaseqlength:
        cend = dnaseqlength
    for shorttypes, trim in clustertrims:
        if type in shorttypes:
            # Sides that were clamped to the sequence boundaries are not trimmed.
            if cstart != 0:
                cstart = cstart + trim
            if cend != dnaseqlength:
                cend = cend - trim
            clustergenes2 = []
            for gene in clustergenes:
                start = int(gene[0])
                end = int(gene[1])
                if cstart < start < cend or cstart < end < cend:
                    clustergenes2.append(gene)
            clustergenes = clustergenes2
    #For all clustergenes, write info to fasta
    for gene in clustergenes:
        start = str(gene[0])
        end = str(gene[1])
        strand = gene[2]
        seq = seqdict[gene[4]]
        ann = gene[3].replace(" ","_")
        accession = gene[4]
        name = nuccode + "|c" + str(a) + "|" + start + "-" + end + "|" + strand + "|" + accession + "|" + ann
        fastafile.write(">" + name + "\n" + seq + "\n")
        if accession not in geneclustergenes:
            geneclustergenes.append(accession)
    #Write gene cluster info to separate txt file
    txtfile.write(nuccode + "\t" + nucname + "\t" + "c" + str(a) + "\t" + type + "\t")
    # ws0 is presumably a spreadsheet worksheet with write(row, col, value) -- TODO confirm.
    ws0.write(a,0,genomic_accnr)
    try:
        ws0.write(a,1,nucname)
    except Exception:
        # Fall back to a placeholder when the cell cannot be written. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        ws0.write(a,1,"Name to long to be contained in Excel cell; see txt file in downloadable zip archive.")
    ws0.write(a,2,type)
    for gene in clustergenes:
        txtfile.write(gene[4] + ";")
    xlsgenesfield = ";".join([gene[4] for gene in clustergenes])
    txtfile.write("\t")
    for gene in clustergenes:
        txtfile.write(accessiondict[gene[4]] + ";")
    try:
        ws0.write(a,3,xlsgenesfield)
    except Exception:
        ws0.write(a,3,"Too many genes to be contained in Excel cell; see txt file in downloadable zip archive.")
    txtfile.write("\n")
    #Write gene cluster info to clusterinfo dictionary
    for gene in clustergenes:
        if gene[4] in allsecmetprots:
            coregenes.append(gene[4])
            allcoregenes.append(gene[4])
    clusterinfo[clusternr] = [type,cstart,cend,coregenes,clustergenes]
#Close fasta and txt files (the spreadsheet is not saved or closed here)
fastafile.close()
txtfile.close()
|
|
5057
|
|
#Analysis of core PKS/NRPS genes (separate py), detect subgroups and predict specificities and final products
#Make list of PKS / NRPS gene clusters to be analysed
#print "Analysing core PKS/NRPS genes..."
logfile.write("Analysing core PKS/NRPS genes...\n")
# Clusters whose type string mentions one of the PKS/NRPS labels.
pksnrpsgeneclusters = [
    clusternr for clusternr in geneclusters
    if any(label in clusterinfo[clusternr][0] for label in ("t1pks", "t4pks", "transatpks", "nrps"))
]
# Pool all PKS/NRPS-related core proteins, then order them by gene start.
pksnrpscoregenes = []
for protgroup in (t1pksprots, transatpksprots, t4pksprots, nrpsprots, amp_t_prots):
    pksnrpscoregenes.extend(protgroup)
pksnrpsgenestartdict = {}
for gene in pksnrpscoregenes:
    pksnrpsgenestartdict[gene] = int(genedict[gene][0])
# Keying the dictionary by gene also removes duplicates from the pooled list.
pksnrpscoregenes = sortdictkeysbyvalues(pksnrpsgenestartdict)
# Containers filled by the analysis steps that follow.
nrpsnames, nrpsseqs = [], []
pksnrpsnames, pksnrpsseqs = [], []
pksnames, pksseqs = [], []
calnames, calseqs = [], []
krnames, krseqs = [], []
nrpspkstypedict = {}
domaindict = {}
|
|
# Run hmmscan on all PKS/NRPS core proteins (motifs, domains, KS subtypes),
# classify every protein by its domain composition and write the hits to
# <genomename>/nrpspks/nrpspksdomains.txt.
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.
# NOTE(review): the hmmscan command lines are shell strings built from
# genomename and the configured paths; they are left unchanged here.
if len(pksnrpscoregenes) > 0:
    #Write PKS / NRPS core genes to FASTA file
    for i in pksnrpscoregenes:
        pksnrpsnames.append(i)
        pksnrpsseqs.append(seqdict[i])
    writefasta(pksnrpsnames,pksnrpsseqs,genomename + "/nrpspks_proteins.fasta")
    #Analyse for abMotifs
    hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 0.1 -o " + genomename + "/nrpspks/abmotifshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/abmotifshmm.txt "+ hmms_path +"abmotifs.hmm " + genomename + "/nrpspks_proteins.fasta"
    os.system(hmmsearch)
    mhmmlengthsdict = hmmlengths(hmms_path+"abmotifs.hmm")
    motifdict = hmmscanparse(genomename + "/nrpspks/abmotifshmm_output.txt",mhmmlengthsdict)
    #Analyse for C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains
    hmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/nrpspkshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/nrpspkshmm.txt "+ hmms_path +"nrpspksdomains.hmm " + genomename + "/nrpspks_proteins.fasta"
    os.system(hmmsearch)
    hmmlengthsdict = hmmlengths(hmms_path+"nrpspksdomains.hmm")
    domaindict = hmmscanparse(genomename + "/nrpspks/nrpspkshmm_output.txt",hmmlengthsdict)
    #Analyse KS domains & PKS/NRPS protein domain composition to detect NRPS/PKS types
    kshmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/kshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/kshmm.txt " + hmms_path + "ksdomains.hmm " + genomename + "/nrpspks_proteins.fasta"
    os.system(kshmmsearch)
    kshmmlengthsdict = hmmlengths(hmms_path+"ksdomains.hmm")
    ksdomaindict = hmmscanparse(genomename + "/nrpspks/kshmm_output.txt",kshmmlengthsdict)
    for k in pksnrpscoregenes:
        #structure of domaindict: domaindict[genename] = [[name,start,end,evalue,score],[name,start,end,evalue,score], etc.]
        domainlist = [hit[0] for hit in domaindict[k]]
        nrKSdomains = domainlist.count("PKS_KS")
        # Count the KS-subtype hits of this protein.
        ksnames = [hit[0] for hit in ksdomaindict[k]]
        modKSscore = ksnames.count("Modular-KS")
        traKSscore = ksnames.count("Trans-AT-KS")
        eneKSscore = ksnames.count("Enediyne-KS")
        iterKSscore = ksnames.count("Iterative-KS")
        # Domain presence flags shared by the rules below.
        has_cdomain = ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist
                       or "Condensation_Starter" in domainlist or "Cglyc" in domainlist
                       or "Condensation_Dual" in domainlist)
        has_amp = "AMP-binding" in domainlist
        has_ks = "PKS_KS" in domainlist
        has_at = "PKS_AT" in domainlist
        no_nrps = not has_cdomain and not has_amp
        # The type is computed once per protein. It no longer depends on the
        # protein having at least one domain hit; proteins without hits get
        # the generic fallback label instead of a stale value.
        if "Cglyc" in domainlist and "Epimerization" in domainlist and has_amp and not has_ks and not has_at:
            nrpspkstype = "Glycopeptide NRPS"
        elif has_cdomain and has_amp and not has_ks and not has_at:
            nrpspkstype = "NRPS"
        elif (has_cdomain or has_amp) and (has_ks or has_at):
            # Fixed precedence: the original "C or AMP and (KS or AT)" bound as
            # "C or (AMP and (KS or AT))", so proteins with only a condensation
            # domain were labelled hybrid although they carry no PKS domain.
            nrpspkstype = "Hybrid PKS-NRPS"
        elif no_nrps and has_ks and not has_at and "Trans-AT_docking" in domainlist and traKSscore > modKSscore and traKSscore > iterKSscore and traKSscore > eneKSscore:
            nrpspkstype = "Type I Trans-AT PKS"
        elif no_nrps and has_ks and has_at and iterKSscore > modKSscore and iterKSscore > traKSscore and iterKSscore > eneKSscore and nrKSdomains < 3:
            nrpspkstype = "Type I Iterative PKS"
        elif no_nrps and has_ks and has_at and eneKSscore > modKSscore and eneKSscore > traKSscore and eneKSscore > iterKSscore and nrKSdomains < 3:
            nrpspkstype = "Type I Enediyne PKS"
        elif no_nrps and has_ks and has_at and ((modKSscore > eneKSscore and modKSscore > traKSscore and modKSscore > iterKSscore) or nrKSdomains > 3):
            nrpspkstype = "Type I Modular PKS"
        elif no_nrps and has_ks and has_at:
            nrpspkstype = "PKS-like protein"
        elif (has_cdomain or has_amp) and not has_ks and not has_at:
            nrpspkstype = "NRPS-like protein"
        else:
            nrpspkstype = "PKS/NRPS-like protein"
        nrpspkstypedict[k] = nrpspkstype
    #Write data to output file
    nrpspksdomainsfile = open(genomename + "/nrpspks/nrpspksdomains.txt","w")
    try:
        for k in pksnrpscoregenes:
            nrpspksdomainsfile.write(">> " + k + "\n")
            nrpspksdomainsfile.write(">> " + nrpspkstypedict[k] + "\n")
            nrpspksdomainsfile.write("name\tstart\tend\te-value\tscore\n")
            for hit in domaindict[k]:
                nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (hit[0], hit[1], hit[2], hit[3], hit[4]) )
            nrpspksdomainsfile.write("** Motifs: **\n")
            for hit in motifdict[k]:
                nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (hit[0], hit[1], hit[2], hit[3], hit[4]) )
            nrpspksdomainsfile.write("\n\n")
    finally:
        # Close the report even when a lookup or write fails part-way.
        nrpspksdomainsfile.close()
|
|
5177
|
|
elapsed = (time.time() - starttime)
#print "5163Time since start: " + str(elapsed)

#Predict NRPS A domain specificities with NRPSPredictor and Minowa et al. method
#print "Predicting NRPS A domain substrate specificities by NRPSPredictor"
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.
logfile.write("Predicting NRPS A domain substrate specificities by NRPSPredictor\n")
#NRPSPredictor: extract each AMP-binding / A-OX domain plus 120 residues beyond its end,
#extract 8 Angstrom residues and insert this into NRPSPredictor
for k in pksnrpscoregenes:
    nr = 0
    for i in domaindict[k]:
        if i[0] == "AMP-binding" or i[0] == "A-OX":
            nr += 1
            start = int(i[1])
            end = int(i[2]) + 120
            nrpsnames.append(k + "_A" + str(nr))
            nrpsseqs.append(seqdict[k][start:end])
if len(nrpsnames) > 0:
    writefasta(nrpsnames,nrpsseqs,"NRPSPredictor2/nrpsseqs.fasta")
    #nrpspredcommand = "perl nrpsSpecPredictor.pl nrpsseqs.fasta ../" + nrpspredictoroutputfolder + " ." #OLD NRPSPREDICTOR1 command
    # sys.platform is 'linux2', 'linux3' or 'linux' depending on the Python
    # build; the prefix test covers all of them, whereas the original
    # equality test with 'linux2' left the command variables undefined on
    # the others.
    onwindows = sys.platform == 'win32'
    onlinux = sys.platform.startswith('linux')
    os.chdir("NRPSPredictor2/")
    try:
        #Get NRPSPredictor2 code predictions, output sig file for input for NRPSPredictor2 SVMs
        if onwindows:
            nrpspred2codecommand = 'nrpscodepred nrpsseqs.fasta input.sig nrpscodes.txt > nul'
        elif onlinux:
            nrpspred2codecommand = 'python nrpscodepred.py nrpsseqs.fasta input.sig nrpscodes.txt > /dev/null'
        os.system(nrpspred2codecommand)
        #Run NRPSPredictor2 SVM
        currentdir = os.getcwd()
        if onwindows:
            nrpspred2command = 'java -Ddatadir="' + currentdir + '\\data" -cp build/NRPSpredictor2.jar;lib/java-getopt-1.0.13.jar;lib/Utilities.jar;lib/libsvm.jar org.roettig.NRPSpredictor2.NRPSpredictor2 -i input.sig -r ..\\' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1'
        elif onlinux:
            nrpspred2command = './NRPSpredictor2.sh -i input.sig -r ../' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1'
        # NOTE(review): the pipe object returned by os.popen is discarded, so
        # the tool's output is never read; whether this call waits for the
        # tool to finish depends on when the interpreter closes that pipe.
        # Kept as in the original -- confirm before switching to os.system.
        os.popen(nrpspred2command)
        #Copy NRPSPredictor results
        if onwindows:
            copycommand = 'copy/y nrpscodes.txt ..\\' + nrpspredictoroutputfolder.replace("/","\\") + ' > nul'
        elif onlinux:
            copycommand = 'cp nrpscodes.txt ../' + nrpspredictoroutputfolder + " > /dev/null"
        os.system(copycommand)
    finally:
        # Always return to the previous working directory, also on failure.
        os.chdir("..")
elapsed = (time.time() - starttime)
#print "5206Time since start: " + str(elapsed)
|
|
# The following, up to the next timing checkpoint, takes about 500 s,
# probably mainly because of poor minowa_A code.
#Minowa method: extract AMP-binding domain, and run Minowa_A
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.
if len(nrpsnames) > 0:
    #print "Predicting NRPS A domain substrate specificities by Minowa et al. method\n"
    # The trailing newline was missing, which glued the next log entry onto this one.
    logfile.write("Predicting NRPS A domain substrate specificities by Minowa et al. method\n")
    nrpsnames2 = []
    nrpsseqs2 = []
    for k in pksnrpscoregenes:
        nr = 0
        for i in domaindict[k]:
            if i[0] in ["AMP-binding", "A-OX"]:
                nr += 1
                start = int(i[1])
                end = int(i[2])
                nrpsnames2.append(k + "_A" + str(nr))
                nrpsseqs2.append(seqdict[k][start:end])
    writefasta(nrpsnames2,nrpsseqs2,minowanrpsoutputfolder + "nrpsseqs.fasta")
    if sys.platform == 'win32':
        minowanrpscommand = "minowa_A ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt"
    elif sys.platform.startswith('linux'):
        # Prefix test: sys.platform may be 'linux2', 'linux3' or 'linux'.
        minowanrpscommand = "python minowa_A.py ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt"
    os.chdir("Minowa/")
    try:
        os.system(minowanrpscommand)
    finally:
        # Always return to the previous working directory, also on failure.
        os.chdir("..")
|
|
5250
|
|
elapsed = (time.time() - starttime)
#print "5235Time since start: " + str(elapsed)
#Predict PKS AT domain specificities with Minowa et al. method and PKS code (NP searcher / ClustScan / own?)
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost; in particular, both tool runs are
# assumed to be conditional on AT domains having been found -- confirm
# against the pristine file.
for k in pksnrpscoregenes:
    nr = 0
    for i in domaindict[k]:
        if i[0] == "PKS_AT":
            nr += 1
            start = int(i[1])
            end = int(i[2])
            pksnames.append(k + "_AT" + str(nr))
            pksseqs.append(seqdict[k][start:end])
if len(pksnames) > 0:
    writefasta(pksnames,pksseqs,pkssignatureoutputfolder + "pksseqs.fasta")
    writefasta(pksnames,pksseqs,minowapksoutputfolder + "pksseqs.fasta")
    # sys.platform may be 'linux2', 'linux3' or 'linux' depending on the
    # Python build; the original equality test with 'linux2' left the
    # command variables undefined on the others.
    onwindows = sys.platform == 'win32'
    onlinux = sys.platform.startswith('linux')
    #Run PKS signature analysis
    elapsed = (time.time() - starttime)
    #print "5254Time since start: " + str(elapsed)
    print("Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences")
    logfile.write("Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences\n")
    if onwindows:
        pkspredcommand = "PKS_analysis ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt"
    elif onlinux:
        pkspredcommand = "python PKS_analysis.py ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt"
    os.chdir("pkssignatures/")
    try:
        os.system(pkspredcommand)
    finally:
        # Always return to the previous working directory, also on failure.
        os.chdir("..")
    #Minowa method: run Minowa_AT
    elapsed = (time.time() - starttime)
    #print "5266Time since start: " + str(elapsed)
    print("Predicting PKS AT domain substrate specificities by Minowa et al. method")
    logfile.write("Predicting PKS AT domain substrate specificities by Minowa et al. method\n")
    if onwindows:
        minowapkscommand = "minowa_AT ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt"
    elif onlinux:
        minowapkscommand = "python minowa_AT.py ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt"
    os.chdir("Minowa/")
    try:
        os.system(minowapkscommand)
    finally:
        os.chdir("..")
|
|
5293
|
|
#Predict PKS CAL domain specificities with Minowa et al. method
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.
elapsed = (time.time() - starttime)
#print "5279Time since start: " + str(elapsed)
print("Predicting CAL domain substrate specificities by Minowa et al. method")
logfile.write("Predicting CAL domain substrate specificities by Minowa et al. method\n")
for k in pksnrpscoregenes:
    nr = 0
    for i in domaindict[k]:
        if i[0] == "CAL_domain":
            nr += 1
            start = int(i[1])
            end = int(i[2])
            calnames.append(k + "_CAL" + str(nr))
            calseqs.append(seqdict[k][start:end])
if len(calnames) > 0:
    writefasta(calnames,calseqs,minowacaloutputfolder + "calseqs.fasta")
    if sys.platform == 'win32':
        minowacalcommand = "minowa_CAL ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt"
    elif sys.platform.startswith('linux'):
        # Prefix test: sys.platform may be 'linux2', 'linux3' or 'linux'; the
        # original equality test with 'linux2' left the command undefined on
        # the others.
        minowacalcommand = "python minowa_CAL.py ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt"
    os.chdir("Minowa/")
    try:
        os.system(minowacalcommand)
    finally:
        # Always return to the previous working directory, also on failure.
        os.chdir("..")
|
|
5320
|
|
elapsed = (time.time() - starttime)
#print "5305Time since start: " + str(elapsed)
#Predict PKS KR domain stereochemistry using pattern as published in ClustScan
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.
print("Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al.")
logfile.write("Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al.\n")
for k in pksnrpscoregenes:
    nr = 0
    for i in domaindict[k]:
        if i[0] == "PKS_KR":
            nr += 1
            start = int(i[1])
            end = int(i[2])
            krnames.append(k + "_KR" + str(nr))
            krseqs.append(seqdict[k][start:end])
if len(krnames) > 0:
    writefasta(krnames,krseqs,kranalysisoutputfolder + "krseqs.fasta")
    if sys.platform == 'win32':
        kranalysiscommand = "kr_analysis ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt"
    elif sys.platform.startswith('linux'):
        # Prefix test: sys.platform may be 'linux2', 'linux3' or 'linux'; the
        # original equality test with 'linux2' left the command undefined on
        # the others.
        kranalysiscommand = "python kr_analysis.py ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt"
    os.chdir("kr_analysis/")
    try:
        os.system(kranalysiscommand)
    finally:
        # Always return to the previous working directory, also on failure.
        os.chdir("..")
|
|
5347
|
|
#Read and parse all substrate specificity prediction output files
# This part handles the NRPS A-domain outputs: Minowa HMM hits, NRPSPredictor2
# SVM predictions and NRPSPredictor2 Stachelhaus codes.
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.
minowa_nrps_preds = {}
minowa_nrps_preds_details = {}
nrps_svm_preds = {}
nrps_svm_preds_details = {}
nrps_code_preds = {}
nrps_code_preds_details = {}
# Substrate names that are replaced by another abbreviation before lower-casing.
substratetransdict2 = {'pipecolate':'pip','fOHOrn':'orn','beta-Lys':'blys','5NhOrn':'orn','OHOrn':'orn','Aad':'Aaa','bOHTyr':'bht'}
if len(nrpsnames) > 0:
    # Minowa output: records are separated by a line holding two backslashes.
    # Per record: line 0 is the domain name, lines 1-4 are reported verbatim
    # and the first tab field of line 2 is taken as the top hit.
    handle = open(minowanrpsoutputfolder + "nrpspredoutput.txt","r")
    try:
        minowa_a_file = handle.read()
    finally:
        handle.close()
    # Normalise CRLF before lone CR, so that Windows line endings do not turn
    # into blank lines that break the record and line indexing below.
    minowa_a_file = minowa_a_file.replace("\r\n","\n").replace("\r","\n")
    parts = minowa_a_file.split("\\\\\n")[1:]
    for i in parts:
        partlines = i.split("\n")
        acc = partlines[0]
        tophit = partlines[2].split("\t")[0]
        tophit = substratetransdict2.get(tophit, tophit)
        minowa_nrps_preds[acc] = tophit.lower()
        minowa_nrps_preds_details[acc] = "<b>Minowa HMM method A-domain<br>Substrate specificity prediction top hits:</b><br>\n" + partlines[1] + "<br>\n" + partlines[2] + "<br>\n" + partlines[3] + "<br>\n" + partlines[4] + "<br><br>\n\n"
    # NRPSPredictor2 SVM output: tab-separated, the first line and the final
    # (empty) split element are skipped.
    handle = open(nrpspredictoroutputfolder + "nrpspredictor2.out","r")
    try:
        nrpspredictorfile1 = handle.read()
    finally:
        handle.close()
    nrpspredictorfile1 = nrpspredictorfile1.replace("\r\n","\n").replace("\r","\n")
    lines = nrpspredictorfile1.split("\n")[1:-1]
    for k in lines:
        tabs = k.split("\t")
        nrps_svm_preds[tabs[0]] = tabs[6]
        nrps_svm_preds_details[tabs[0]] = "<b> NRPSPredictor2 SVM prediction details:</b><br>\n8 Angstrom 34 AA code:<br>\n" + tabs[1] + "<br>\nPredicted physicochemical class:<br>\n" + tabs[3] + "<br>\nLarge clusters prediction:<br>\n" + tabs[4] + "<br>\nSmall clusters prediction:<br>\n" + tabs[5] + "<br>\nSingle AA prediction:<br>\n" + tabs[6] + "<br><br>\n\n"
    # NRPSPredictor2 code output: tab-separated "name<TAB>prediction" lines.
    handle = open(nrpspredictoroutputfolder + "nrpscodes.txt","r")
    try:
        nrpspredictorfile2 = handle.read()
    finally:
        handle.close()
    nrpspredictorfile2 = nrpspredictorfile2.replace("\r\n","\n").replace("\r","\n")
    lines = nrpspredictorfile2.split("\n")[:-1]
    for k in lines:
        tabs = k.split("\t")
        nrps_code_preds[tabs[0]] = tabs[1]
        nrps_code_preds_details[tabs[0]] = "<b> NRPSPredictor2 Stachelhaus code prediction:</b><br>\n" + tabs[1] + "<br><br>\n\n"
|
|
# Parse the PKS AT-domain outputs: Minowa HMM hits and PKS active-site
# signature hits.
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.
minowa_pks_preds_details = {}
minowa_pks_preds = {}
pks_code_preds ={}
pks_code_preds_details ={}
# NOTE(review): 'Ethylmalonyl-CoA' occurs twice in this literal; the later
# entry wins, so the effective abbreviation is 'Ethyl_mal' and not 'emal'.
# Kept as in the original -- confirm which one is intended.
substratetransdict = {'Malonyl-CoA':'mal','Methylmalonyl-CoA':'mmal','Methoxymalonyl-CoA':'mxmal','Ethylmalonyl-CoA':'emal','Isobutyryl-CoA':'isobut','2-Methylbutyryl-CoA':'2metbut','trans-1,2-CPDA':'trans-1,2-CPDA','Acetyl-CoA':'Acetyl-CoA','Benzoyl-_CoA':'benz','Propionyl-CoA':'prop','3-Methylbutyryl-CoA':'3metbut','Ethylmalonyl-CoA':'Ethyl_mal','CE-Malonyl-CoA':'cemal','2-Rhyd-Malonyl-CoA':'2Rhydmal','CHC-CoA':'CHC-CoA','inactive':'inactive'}
if len(pksnames) > 0:
    # Minowa output: records are separated by a line holding two backslashes.
    handle = open(minowapksoutputfolder + "pkspredoutput.txt","r")
    try:
        minowa_at_file = handle.read()
    finally:
        handle.close()
    # Normalise CRLF before lone CR, so that Windows line endings do not turn
    # into blank lines that break the record and line indexing below.
    minowa_at_file = minowa_at_file.replace("\r\n","\n").replace("\r","\n")
    parts = minowa_at_file.split("\\\\\n")[1:]
    for i in parts:
        partlines = i.split("\n")
        acc = partlines[0]
        # Substrates without a known abbreviation become the generic "pk".
        tophit = substratetransdict.get(partlines[2].split("\t")[0], "pk")
        minowa_pks_preds[acc] = tophit
        minowa_pks_preds_details[acc] = "<b>Minowa HMM method AT-domain<br>Substrate specificity prediction top hits:</b><br>\n" + partlines[1] + "<br>\n" + partlines[2] + "<br>\n" + partlines[3] + "<br>\n" + partlines[4] + "<br><br>\n\n"
    # Signature output: records are separated by "//" lines; the first
    # non-empty line of a record is "name<TAB>code", followed by hit lines of
    # the form "code<TAB>protein<TAB>score".
    handle = open(pkssignatureoutputfolder + "pkspredoutput.txt","r")
    try:
        pkssignaturefile = handle.read()
    finally:
        handle.close()
    pkssignaturefile = pkssignaturefile.replace("\r\n","\n").replace("\r","\n")
    parts = pkssignaturefile.split("//\n")[1:]
    for i in parts:
        partlines = [line for line in i.split("\n") if line != ""]
        acc = partlines[0].split("\t")[0]
        if len(partlines) > 2:
            tophit = (partlines[1].split("\t")[0]).split("__")[1]
            pks_code_preds[acc] = tophit
            # Report at most the three best hits.
            codes = []
            prots = []
            scores = []
            for hit in partlines[1:4]:
                fields = hit.split("\t")
                codes.append(fields[0])
                prot = fields[1]
                prot = prot.replace("_AT"," (AT")
                prot = prot.replace("__","): ")
                prots.append(prot)
                scores.append(fields[2])
            if prots:
                hitlines = []
                for nr in range(len(prots)):
                    hitlines.append(codes[nr] + " - " + prots[nr] + " : (" + scores[nr] + "% identity)")
                pks_code_preds_details[acc] = "<b>PKS Active Site Signature method<br>AT-domain substrate specificity prediction top hits:</b><br>\nCode:" + partlines[0].split("\t")[1] + "<br>\n" + "<br>\n".join(hitlines) + "<br><br>\n\n"
        else:
            pks_code_preds[acc] = "N/A"
            pks_code_preds_details[acc] = "<b>PKS Active Site Signature method<br>No AT-domain substrate specificity prediction hits above 40% identity.<br>\n\n"
|
|
# Parse the Minowa CAL-domain output.
# Records are separated by a line holding two backslashes. Per record: line 0
# is the domain name, lines 1-4 are reported verbatim and the first tab field
# of line 2 is taken as the top hit.
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.
minowa_cal_preds = {}
minowa_cal_preds_details = {}
if len(calnames) > 0:
    handle = open(minowacaloutputfolder + "calpredoutput.txt","r")
    try:
        minowa_cal_file = handle.read()
    finally:
        handle.close()
    # Normalise CRLF before lone CR, so that Windows line endings do not turn
    # into blank lines that break the record and line indexing below.
    minowa_cal_file = minowa_cal_file.replace("\r\n","\n").replace("\r","\n")
    parts = minowa_cal_file.split("\\\\\n")[1:]
    for i in parts:
        partlines = i.split("\n")
        acc = partlines[0]
        tophit = partlines[2].split("\t")[0]
        minowa_cal_preds[acc] = tophit
        minowa_cal_preds_details[acc] = "<b>Minowa HMM method<br>CAL-domain substrate specificity prediction top hits:</b><br>\n" + partlines[1] + "<br>\n" + partlines[2] + "<br>\n" + partlines[3] + "<br>\n" + partlines[4] + "<br><br>\n\n"
|
|
# Parse the KR analysis output: one tab-separated line per KR domain with the
# domain name, the activity prediction and the stereochemistry prediction.
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation of this section was lost -- confirm against the pristine file.
kr_activity_preds = {}
kr_stereo_preds = {}
if len(krnames) > 0:
    handle = open(kranalysisoutputfolder + "krpredoutput.txt","r")
    try:
        krfile = handle.read()
    finally:
        handle.close()
    # Normalise CRLF before lone CR, so that Windows line endings do not turn
    # into blank lines that break the field indexing below.
    krfile = krfile.replace("\r\n","\n").replace("\r","\n")
    # The final split element (after the trailing newline) is dropped.
    krlines = krfile.split("\n")[:-1]
    for i in krlines:
        tabs = i.split("\t")
        kr_activity_preds[tabs[0]] = tabs[1]
        kr_stereo_preds[tabs[0]] = tabs[2]
|
|
5462
|
|
#Combine substrate specificity predictions into consensus prediction
#A prediction shared by at least two methods wins when it is a known SMILES building block;
#otherwise the generic placeholder ("pk" for AT/CAL domains, "nrp" for A domains) is stored.
consensuspreds = {}
available_smiles_parts = ['GLY','ALA','VAL','LEU','ILE','MET','PRO','PHE','TRP','SER','THR','ASN','GLN','TYR','CYS','LYS','ARG','HIS','ASP','GLU','MPRO','ORN','PGLY','DAB','BALA','AEO','DHA','PIP','BMT','gly','ala','val','leu','ile','met','pro','phe','trp','ser','thr','asn','gln','tyr','cys','lys','arg','his','asp','glu','aaa','mpro','dhb','2hiva','orn','pgly','dab','bala','aeo','4mha','pico','phg','dha','scy','pip','bmt','adds','aad','abu','hiv','dhpg','bht','3-me-glu','4pPro','ala-b','ala-d','dht','Sal','tcl','lys-b','hpg','hyv-d','iva','vol','mal','mmal','mxmal','emal','nrp','pk','Gly','Ala','Val','Leu','Ile','Met','Pro','Phe','Trp','Ser','Thr','Asn','Gln','Tyr','Cys','Lys','Arg','His','Asp','Glu','Mpro','23Dhb','34Dhb','2Hiva','Orn','Pgly','Dab','Bala','Aeo','4Mha','Pico','Aaa','Dha','Scy','Pip','Bmt','Adds','DHpg','DHB','nrp','pk']
for i in pksnrpscoregenes:
    #Per-gene running numbers of A, AT and CAL domains; they form the suffix of each domain key
    nra = nrat = nrcal = 0
    j = domaindict[i]
    for k in j:
        domaintype = k[0]
        if domaintype == "CAL_domain":
            #Only one method exists for CAL domains, so its prediction is used directly
            nrcal += 1
            domainkey = i + "_CAL" + str(nrcal)
            calpred = minowa_cal_preds[domainkey]
            if calpred in available_smiles_parts:
                consensuspreds[domainkey] = calpred
            else:
                consensuspreds[domainkey] = "pk"
            continue
        if domaintype == "PKS_AT":
            nrat += 1
            domainkey = i + "_AT" + str(nrat)
            preds = [minowa_pks_preds[domainkey], pks_code_preds[domainkey]]
            fallback = "pk"
        elif domaintype == "AMP-binding" or domaintype == "A-OX":
            nra += 1
            domainkey = i + "_A" + str(nra)
            preds = [minowa_nrps_preds[domainkey], nrps_svm_preds[domainkey], nrps_code_preds[domainkey]]
            fallback = "nrp"
        else:
            continue
        #Start from the placeholder and overwrite it only with a prediction that occurs more than once
        consensus = fallback
        for l in preds:
            if preds.count(l) > 1 and l in available_smiles_parts:
                consensus = l
        consensuspreds[domainkey] = consensus
#Write all prediction details to HTML files for each gene to be used as pop-up window
#Header and footer shared by every pop-up page
predictionpageheader = '<html>\n<head>\n<title>Prediction details</title>\n<STYLE type="text/css">\nbody{\n text-align:left;\n background-color:white;\n font-family: Tahoma, sans-serif;\n font-size: 0.8em;\n color: #810E15;\n}\n</STYLE>\n</head>\n<body>'
predictionpagefooter = '\n</body>\n</html>'
#Map every PKS/NRPS core gene to the ordered list of its domain names
domainnamesdict = {}
for i in pksnrpscoregenes:
    domainnamesdict[i] = [k[0] for k in domaindict[i]]
for i in pksnrpscoregenes:
    if "PKS_AT" in domainnamesdict[i] or "AMP-binding" in domainnamesdict[i] or "A-OX" in domainnamesdict[i] or "CAL_domain" in domainnamesdict[i]:
        j = domaindict[i]
        nrat = 0
        nra = 0
        nrcal = 0
        nrkr = 0
        for k in j:
            #Collect the page sections first, so that a missing prediction does not leave an empty file behind
            if k[0] == "PKS_AT":
                nrat += 1
                domainname = i + "_AT" + str(nrat)
                pagesections = [minowa_pks_preds_details[domainname],
                                pks_code_preds_details[domainname],
                                "<b><i>Consensus Predictions: " + consensuspreds[domainname] + "</i></b>"]
            elif k[0] == "AMP-binding" or k[0] == "A-OX":
                nra += 1
                domainname = i + "_A" + str(nra)
                pagesections = [nrps_svm_preds_details[domainname],
                                nrps_code_preds_details[domainname],
                                minowa_nrps_preds_details[domainname],
                                "<b><i>Consensus Prediction: '" + consensuspreds[domainname] + "'</i></b>"]
            elif k[0] == "CAL_domain":
                nrcal += 1
                domainname = i + "_CAL" + str(nrcal)
                pagesections = [minowa_cal_preds_details[domainname]]
            else:
                continue
            #The context manager closes the page even when a write fails; closing tags are emitted in
            #the reverse order of the opening tags so the consensus line is properly nested HTML
            with open(substrspecsfolder + domainname + ".html","w") as htmloutfile:
                htmloutfile.write(predictionpageheader)
                for pagesection in pagesections:
                    htmloutfile.write(pagesection)
                htmloutfile.write(predictionpagefooter)
elapsed = (time.time() - starttime)
#Predict biosynthetic gene order in gene cluster using starter domains, thioesterase domains, gene order and docking domains
#Results: compound_pred_dict maps a cluster to its space-separated substrate prediction, nrpspksclusters lists
#clusters with PKS/NRPS core genes, dockingdomainanalysis lists clusters for which a docking table was written.
compound_pred_dict = {}
dockingdomainanalysis = []
nrpspksclusters = []
a = 1
for i in geneclusters:
    genecluster = i
    clustercoregenes = clusterinfo[i][3]
    clusterpksnrpsgenes = [j for j in clustercoregenes if j in pksnrpscoregenes]
    if len(clusterpksnrpsgenes) > 0:
        nrpspksclusters.append(genecluster)
        #Classify every PKS/NRPS core gene of the cluster as PKS, NRPS or hybrid
        pksgenes = 0
        clusterpksgenes = []
        nrpsgenes = 0
        clusternrpsgenes = []
        hybridgenes = 0
        clusterhybridgenes = []
        for j in clusterpksnrpsgenes:
            k = nrpspkstypedict[j]
            if "PKS" in k and "NRPS" not in k:
                pksgenes += 1
                clusterpksgenes.append(j)
            elif "PKS" not in k and "NRPS" in k:
                nrpsgenes += 1
                clusternrpsgenes.append(j)
            elif "PKS/NRPS" in k:
                #Ambiguous type label: decide from the domains that are actually present
                if ("PKS_KS" in domainnamesdict[j] or "PKS_AT" in domainnamesdict[j]) and ("AMP-binding" not in domainnamesdict[j] and "A-OX" not in domainnamesdict[j] and "Condensation" not in domainnamesdict[j]):
                    pksgenes += 1
                    clusterpksgenes.append(j)
                elif ("PKS_KS" not in domainnamesdict[j] and "PKS_AT" not in domainnamesdict[j]) and ("AMP-binding" in domainnamesdict[j] or "A-OX" in domainnamesdict[j] or "Condensation" in domainnamesdict[j]):
                    nrpsgenes += 1
                    clusternrpsgenes.append(j)
            elif "PKS" in k and "NRPS" in k:
                hybridgenes += 1
                clusterhybridgenes.append(j)
        #If more than three PKS genes, use dock_dom_analysis if possible to identify order
        dock_dom_analysis = "failed"
        if pksgenes > 3 and nrpsgenes == 0 and hybridgenes == 0:
            logfile.write("Predicting PKS gene order by docking domain sequence analysis\n")
            #Find first and last genes based on starter module and TE / TD
            #A second candidate resets the choice to "" again, so only an unambiguous gene is kept
            startergene = ""
            endinggene = ""
            for k in clusterpksgenes:
                if "Thioesterase" in domainnamesdict[k] or "TD" in domainnamesdict[k]:
                    if endinggene == "":
                        endinggene = k
                    else:
                        endinggene = ""
                if len(domainnamesdict[k]) >=2 and "PKS_AT" == domainnamesdict[k][0] and "ACP" == domainnamesdict[k][1]:
                    if startergene == "":
                        startergene = k
                    else:
                        startergene = ""
            if startergene == "":
                #Fall back to a gene that starts with a KS-AT-ACP module
                for k in clusterpksgenes:
                    if len(domainnamesdict[k]) >=3 and "PKS_KS" == domainnamesdict[k][0] and "PKS_AT" == domainnamesdict[k][1] and "ACP" == domainnamesdict[k][2]:
                        if startergene == "":
                            startergene = k
                        else:
                            startergene = ""
                            break
            #Extract N-terminal 50 residues of each non-starting protein, align them to the reference profile with muscle and read the interacting residues
            ntermintresdict = {}
            ntermnames = [k for k in clusterpksgenes if k != startergene]
            ntermseqs = [seqdict[k][:50] for k in ntermnames]
            ntermfasta = "docking_analysis/input.fasta"
            for dockname, dockseq in zip(ntermnames, ntermseqs):
                writefasta([dockname],[dockseq],ntermfasta)
                os.chdir("docking_analysis")
                try:
                    os.system("muscle -profile -quiet -in1 nterm.fasta -in2 input.fasta -out muscle.fasta")
                    ntermintresdict[dockname] = extractpositions("nterm.fasta","muscle.fasta",[2,15],"EryAIII_5_6_ref",dockname)
                finally:
                    #Always return to the parent directory, also when the alignment step raises
                    os.chdir("..")
            #Extract C-terminal 100 residues of each non-ending protein, align them to the reference profile with muscle and read the interacting residues
            ctermintresdict = {}
            ctermnames = [k for k in clusterpksgenes if k != endinggene]
            ctermseqs = [seqdict[k][-100:] for k in ctermnames]
            ctermfasta = "docking_analysis/input.fasta"
            for dockname, dockseq in zip(ctermnames, ctermseqs):
                writefasta([dockname],[dockseq],ctermfasta)
                os.chdir("docking_analysis")
                try:
                    os.system("muscle -profile -quiet -in1 cterm.fasta -in2 input.fasta -out muscle.fasta")
                    ctermintresdict[dockname] = extractpositions("cterm.fasta","muscle.fasta",[55,64],"EryAII_ref",dockname)
                finally:
                    os.chdir("..")
            #Score every permutation of the genes that are neither starter nor ending gene using the interacting residues
            genes_to_order = [k for k in clusterpksgenes if k != startergene and k != endinggene]
            possible_orders = list(itertools.permutations(genes_to_order,len(genes_to_order)))
            hydrophobic = ["A","V","I","L","F","W","Y","M"]
            positivecharge = ["H","K","R"]
            negativecharge = ["D","E"]
            other = ["C","G","P","S","T","N","Q","X","U"]
            possible_orders_scoredict = {}
            for k in possible_orders:
                score = 0
                #Each pair of neighbouring genes is judged on both interacting residue positions
                for upstream, downstream in zip(k[:-1], k[1:]):
                    for respos in (0, 1):
                        cres = ctermintresdict[upstream][respos]
                        nres = ntermintresdict[downstream][respos]
                        #Matching hydrophobic residues or opposite charges count in favour of the pairing
                        if (cres in hydrophobic and nres in hydrophobic) or (cres in positivecharge and nres in negativecharge) or (cres in negativecharge and nres in positivecharge):
                            score += 1
                        #Equal charges count against it
                        if (cres in positivecharge and nres in positivecharge) or (cres in negativecharge and nres in negativecharge):
                            score = score - 1
                possible_orders_scoredict[k] = score
            #Highest score first, equally scored orders in ascending gene order; only the best 1000 are reported.
            #A single sort key treats every tie group alike and leaves the loop variable of the cluster loop untouched.
            ranked_orders = sorted(possible_orders_scoredict, key=lambda order: (-possible_orders_scoredict[order], order))[:1000]
            #Add the fixed starter / ending genes around every ranked order and prepare the table rows
            geneorders = []
            tablerows = []
            for order in ranked_orders:
                geneorder = []
                if startergene != "":
                    geneorder.append(startergene)
                geneorder.extend(order)
                if endinggene != "":
                    geneorder.append(endinggene)
                geneorders.append(geneorder)
                tablerows.append("<tr><td>" + ",".join(geneorder) + "</td><td>" + str(possible_orders_scoredict[order]) + "</td></tr>\n")
            #Write html outfile with docking domain analysis output
            if len(ranked_orders) == 1000:
                tabletitle = "Score for 1000 highest scoring gene orders"
            else:
                tabletitle = "Scores for all possible gene orders"
            with open(htmlfolder + "docking_analysis" + str(genecluster) + ".html","w") as dockhtmlfile:
                dockhtmlfile.write('<html>\n<head>\n<LINK href="style.css" rel="stylesheet" type="text/css">\n</head>\n<body>\nDocking domain analysis. ' + tabletitle + ':<br><br><table border=1>\n')
                dockhtmlfile.write('<tr><td><b>Gene order</b></td><td><b>Score</b></td></tr>\n')
                dockhtmlfile.writelines(tablerows)
                dockhtmlfile.write('\n</table></body></html>')
            logfile.write("Predicting PKS gene order by docking domain sequence analysis succeeded.\n")
            dockingdomainanalysis.append(genecluster)
        #If NRPS genes, mixed NRPS/PKS genes, PKS genes without detected docking domains, or clusters with a 1-3 PKS genes, assume colinearity
        #The substrate order below is always derived from this colinear order
        direction = 0
        for k in clusterpksnrpsgenes:
            if strandsdict[k] == "+":
                direction += 1
            elif strandsdict[k] == "-":
                direction = direction - 1
        if direction < 0:
            clusterpksnrpsgenes.reverse()
        #A gene carrying the terminating TE / TD domain should come last, not first
        if "Thioesterase" in domainnamesdict[clusterpksnrpsgenes[0]] or "TD" in domainnamesdict[clusterpksnrpsgenes[0]]:
            clusterpksnrpsgenes.reverse()
        geneorder = clusterpksnrpsgenes
        #Generate substrates order from predicted gene order and consensus predictions
        predictedsubstrates = []
        for k in geneorder:
            domains = domainnamesdict[k]
            nra = 0
            nrat = 0
            nrcal = 0
            for l in domains:
                if "PKS_AT" in l:
                    nrat += 1
                    predictedsubstrates.append(consensuspreds[k + "_AT" + str(nrat)])
                if "AMP-binding" in l or "A-OX" in l:
                    nra += 1
                    predictedsubstrates.append(consensuspreds[k + "_A" + str(nra)])
                if "CAL_domain" in l:
                    nrcal += 1
                    predictedsubstrates.append(consensuspreds[k + "_CAL" + str(nrcal)])
        prediction = " ".join(predictedsubstrates)
        compound_pred_dict[genecluster] = prediction
    a += 1
# Turn the residue prediction of each gene cluster into a SMILES file and a structure image.
# For a cluster found in compound_pred_dict with more than one residue, a `main` command line is
# assembled and run through os.popen; the text between "core peptide: " and "\ntermintype" in its
# output is written to genecluster<N>.smi and depict_smile is called for the cluster.
# Clusters for which depict_smile returns "failed" are collected in failedstructures.
# NOTE(review): this listing has lost the original indentation, so the nesting of the statements
# below cannot be confirmed from here.
5793 #Combine predictions into a prediction of the final chemical structure and generate images
|
|
5794 os.chdir("NRPeditor")
|
|
5795 failedstructures = []
|
|
5796 for i in geneclusters:
|
|
5797 genecluster = i
|
|
5798 if compound_pred_dict.has_key(genecluster):
|
|
5799 residues = compound_pred_dict[genecluster]
|
|
5800 nrresidues = len(residues.split(" "))
|
|
5801 if nrresidues > 1:
|
|
# NOTE(review): structcommand is only assigned for the sys.platform values 'win32' and 'linux2';
# no other platform value is handled in this block - confirm which platforms are supported.
5802 if sys.platform == ('win32'):
|
|
5803 structcommand = 'main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
|
|
5804 elif sys.platform == ('linux2'):
|
|
5805 structcommand = './main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
|
|
# The loop variable i is rebound here; genecluster still holds the cluster identifier.
5806 for i in residues.split(" "):
|
|
5807 structcommand = structcommand + i + " "
|
|
5808 structcommand = structcommand + 'TE"'
|
|
5809 smilesinfo = os.popen(structcommand)
|
|
5810 smilesinfo = smilesinfo.read()
|
|
5811 smiles_string = (smilesinfo.split("core peptide: ")[1]).split("\ntermintype")[0]
|
|
5812 if sys.platform == ('linux2'):
|
|
# NOTE(review): str.replace returns a new string and the result is not assigned,
# so this statement leaves smiles_string unchanged.
5813 smiles_string.replace("[X]","[*:X]")
|
|
# Copy smiles_string character by character, writing a running number (starting at 1) for every "X".
5814 smiles_string2 = ""
|
|
5815 a = 1
|
|
5816 for k in smiles_string:
|
|
5817 if k == "X":
|
|
5818 smiles_string2 = smiles_string2 + str(a)
|
|
5819 a += 1
|
|
5820 else:
|
|
5821 smiles_string2 = smiles_string2 + k
|
|
5822 smiles_string = smiles_string2
|
|
5823 smilesfile = open("genecluster" + str(genecluster) + ".smi","w")
|
|
5824 smilesfile.write(smiles_string)
|
|
5825 smilesfile.close()
|
|
5826 depictstatus = depict_smile(genecluster,structuresfolder)
|
|
5827 if depictstatus == "failed":
|
|
5828 failedstructures.append(genecluster)
|
|
# Clusters whose type (clusterinfo[genecluster][0]) is "ectoine" get a fixed SMILES string.
5829 elif clusterinfo[genecluster][0] == "ectoine":
|
|
5830 smiles_string = "CC1=NCCC(N1)C(=O)O"
|
|
5831 smilesfile = open("genecluster" + str(genecluster) + ".smi","w")
|
|
5832 smilesfile.write(smiles_string)
|
|
5833 smilesfile.close()
|
|
5834 depictstatus = depict_smile(genecluster,structuresfolder)
|
|
5835 if depictstatus == "failed":
|
|
5836 failedstructures.append(genecluster)
|
|
# When depiction did not report "failed", remove any earlier failure entry for this cluster.
5837 elif genecluster in failedstructures:
|
|
5838 del failedstructures[failedstructures.index(genecluster)]
|
|
5839 compound_pred_dict[genecluster] = "ectoine "
|
|
# Return to the directory that was current before the os.chdir("NRPeditor") above.
5840 os.chdir("..")
|
|
5841
|
|
5842 elapsed = (time.time() - starttime)
|
|
5843 #print "5826 Time since start: " + str(elapsed)
|
|
5844 #ClusterBlast
|
|
5845 if clusterblast == "y":
|
|
5846 #Load gene cluster database into memory
|
|
5847 #print "ClusterBlast: Loading gene clusters database into memory..."
|
|
5848 logfile.write("ClusterBlast: Loading gene clusters database into memory...\n")
|
|
5849
|
|
5850 os.chdir(genomename + "/clusterblast")
|
|
5851 #file = open( os.path.join(antismash_path, "clusterblast/geneclusters.txt") ,"r")
|
|
5852 #filetext = file.read()
|
|
5853 #lines = filetext.split("\n")
|
|
#Load the reference gene cluster table: clusters maps "<accession>_<cluster nr>" to
#[protein list, description, cluster type, locus tag list]. A pickled copy next to the text table is used as cache.
#NOTE(review): unpickling executes whatever the file contains; bin_path must stay inside a trusted installation directory
clusters = None
bin_path = os.path.join(antismash_path, "clusterblast/geneclusters.bin")
if os.path.exists( bin_path ):
    try:
        #The cache is written with a binary pickle protocol, so it has to be opened in binary mode
        with open(bin_path, "rb") as cachehandle:
            clusters = cPickle.load(cachehandle)
    except Exception:
        #Unreadable cache (e.g. truncated, or written in text mode by an older run): rebuild it from the text table
        logfile.write("ClusterBlast: could not read " + bin_path + ", rebuilding it from geneclusters.txt\n")
        clusters = None
if clusters is None:
    clusters = {}
    with open( os.path.join(antismash_path, "clusterblast/geneclusters.txt") ,"r") as clustertable:
        for line in clustertable:
            line = line.strip()
            if not line:
                #Blank lines carry no record
                continue
            tabs = line.split("\t")
            accession = tabs[0]
            clusterdescription = tabs[1]
            clusternr = tabs[2]
            clustertype = tabs[3]
            clustername = accession + "_" + clusternr
            clustertags = tabs[4].split(";")
            clusterprots = tabs[5].split(";")
            clusters[clustername] = [clusterprots,clusterdescription,clustertype,clustertags]
    with open(bin_path, "wb") as cachehandle:
        cPickle.dump(clusters, cachehandle, -1)
#Load gene cluster database proteins info into memory
#Five dictionaries keyed by protein name are filled from the FASTA headers (fields separated by "|"):
#cluster name, location, strand, annotation and locus tag. A pickled copy is used as cache.
#NOTE(review): unpickling executes whatever the file contains; bin_path must stay inside a trusted installation directory
#NOTE(review): the "h_" prefix depends on accessiondict of the current run but is stored in the shared cache - confirm this is intended
logfile.write("ClusterBlast: Loading gene cluster database proteins into memory...\n")
proteingeneclusters = {}
proteinlocations = {}
proteinstrands = {}
proteinannotations = {}
proteintags = {}
proteincacheloaded = False
bin_path = os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta.bin")
if os.path.exists( bin_path ):
    try:
        #The cache is written with a binary pickle protocol, so it has to be opened in binary mode
        with open(bin_path, "rb") as cachehandle:
            (proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags) = cPickle.load(cachehandle)
        proteincacheloaded = True
    except Exception:
        #Unreadable cache (e.g. truncated, or written in text mode by an older run): rebuild it from the FASTA file
        logfile.write("ClusterBlast: could not read " + bin_path + ", rebuilding it from geneclusterprots.fasta\n")
        proteingeneclusters = {}
        proteinlocations = {}
        proteinstrands = {}
        proteinannotations = {}
        proteintags = {}
if not proteincacheloaded:
    with open( os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta") ,"r") as proteinfasta:
        for line in proteinfasta:
            line = line.replace('\n', '')
            #Only header lines carry the protein information
            if line.startswith(">"):
                tabs = line.split("|")
                protein = tabs[6]
                locustag = tabs[4]
                #Locus tags that also occur in accessiondict get an "h_" prefix
                if locustag in accessiondict:
                    locustag = "h_" + locustag
                proteintags[protein] = locustag
                clustername = tabs[0] + "_" + tabs[1]
                proteingeneclusters[protein] = clustername
                location = tabs[2]
                proteinlocations[protein] = location
                strand = tabs[3]
                proteinstrands[protein] = strand
                annotation = tabs[5]
                proteinannotations[protein] = annotation
    with open(bin_path, "wb") as cachehandle:
        cPickle.dump([proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags], cachehandle, -1)
#Run BLAST on gene cluster proteins of each cluster on itself to find internal homologs, store groups of homologs - including singles - in a dictionary as a list of lists accordingly
logfile.write("Finding internal homologs in each gene cluster..\n")
internalhomologygroupsdict = {}
for i in geneclusters:
    clusternumber = i
    #Create input fasta files for BLAST search
    queryclusterprotslist = clusterinfo[i][4]
    queryclusterprots = [protinfo[4] for protinfo in queryclusterprotslist]
    queryclusterseqs = [seqdict[prot] for prot in queryclusterprots]
    queryclusternames = [fullnamedict[prot] for prot in queryclusterprots]
    writefasta(queryclusternames,queryclusterseqs,"internal_input.fasta")
    #Run and parse BLAST search
    makeblastdbcommand = "makeblastdb -in internal_input.fasta -out internal_input.fasta -dbtype prot"
    blastsearch = "blastp -db internal_input.fasta -query internal_input.fasta -outfmt 6 -max_target_seqs 1000 -evalue 1e-05 -out internal_input.out"
    if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
        os.popen(makeblastdbcommand)
        os.popen(blastsearch)
    else:
        os.system(makeblastdbcommand)
        os.system(blastsearch)
    #Read the tabular BLAST output through a context manager so the handle does not leak
    with open("internal_input.out","r") as blasthandle:
        blastoutput = blasthandle.read()
    minseqcoverage = 25
    minpercidentity = 30
    seqlengths = fastaseqlengths(proteins)
    iblastinfo = blastparse(blastoutput,minseqcoverage,minpercidentity,seqlengths,geneclustergenes)
    iblastdict = iblastinfo[0]
    iquerylist = iblastinfo[1]
    #find and store internal homologs
    groups = []
    for j in queryclusternames:
        jsplit = j.split("|")[4]
        if j in iblastdict:
            hits = iblastdict[j][0]
            group = []
            for k in hits:
                #Reduce every hit name to the bare gene identifier
                if k[:2] == "h_":
                    group.append(k[2:])
                elif k.count("|") > 4:
                    group.append(k.split("|")[4])
                else:
                    group.append(k)
            if jsplit not in group:
                group.append( jsplit )
            #Merge every existing group that shares a member with the new group. The list of groups is
            #rebuilt instead of deleting entries while iterating over it, which skipped the group that
            #followed a deleted one and could leave overlapping groups unmerged.
            remaininggroups = []
            for l in groups:
                overlapping = False
                for m in group:
                    if m in l:
                        overlapping = True
                        break
                if overlapping:
                    for n in l:
                        if n not in group:
                            group.append(n)
                else:
                    remaininggroups.append(l)
            groups = remaininggroups
            group.sort()
            groups.append(group)
        else:
            #Proteins without hits form a group of their own
            groups.append([ jsplit ])
    internalhomologygroupsdict[clusternumber] = groups
5977 #Run BLAST on gene cluster proteins of each cluster and parse output
|
|
5978 #print "5961 Running NCBI BLAST+ gene cluster searches.."
|
|
5979 logfile.write("Running NCBI BLAST+ gene cluster searches..\n")
|
|
5980 for i in geneclusters:
|
|
5981 clusternumber = i
|
|
5982 #print " Gene cluster " + str(clusternumber)
|
|
5983 #Create input fasta files for BLAST search
|
|
5984 queryclusterprotslist = clusterinfo[i][4]
|
|
5985 queryclusterprots = []
|
|
5986 for i in queryclusterprotslist:
|
|
5987 queryclusterprots.append(i[4])
|
|
5988 queryclusternames = []
|
|
5989 queryclusterseqs = []
|
|
5990 for i in queryclusterprots:
|
|
5991 seq = seqdict[i]
|
|
5992 name = fullnamedict[i]
|
|
5993 queryclusterseqs.append(seq)
|
|
5994 queryclusternames.append(name)
|
|
5995 equalpartsizes = int(len(queryclusternames)/nrcpus)
|
|
5996 for i in range(nrcpus):
|
|
5997 if i == 0:
|
|
5998 setnames = queryclusternames[:equalpartsizes]
|
|
5999 setseqs = queryclusterseqs[:equalpartsizes]
|
|
6000 elif i == (nrcpus - 1):
|
|
6001 setnames = queryclusternames[(i*equalpartsizes):]
|
|
6002 setseqs = queryclusterseqs[(i*equalpartsizes):]
|
|
6003 else:
|
|
6004 setnames = queryclusternames[(i*equalpartsizes):((i+1)*equalpartsizes)]
|
|
6005 setseqs = queryclusterseqs[(i*equalpartsizes):((i+1)*equalpartsizes)]
|
|
6006 writefasta(setnames,setseqs,"input" + str(i) + ".fasta")
|
|
6007 processes = []
|
|
6008 processnames = []
|
|
6009 for i in range(nrcpus):
|
|
6010 processes.append(Process(target=runblast, args=["input" + str(i) + ".fasta"]))
|
|
6011 [i.start() for i in processes]
|
|
6012 time.sleep(10)
|
|
6013 while True:
|
|
6014 processrunning = "n"
|
|
6015 for i in processes:
|
|
6016 if i.is_alive():
|
|
6017 processrunning = "y"
|
|
6018 if processrunning == "y":
|
|
6019 time.sleep(5)
|
|
6020 else:
|
|
6021 break
|
|
6022 [i.join() for i in processes]
|
|
6023 blastoutput = ""
|
|
6024 for i in range(nrcpus):
|
|
6025 output = open("input" + str(i) + ".out","r")
|
|
6026 output = output.read()
|
|
6027 blastoutput = blastoutput + output
|
|
6028 os.chdir("..")
|
|
6029 blastoutputfile = open("./clusterblastoutput.txt","w")
|
|
6030 blastoutputfile.write(blastoutput)
|
|
6031 blastoutputfile.close()
|
|
6032 os.chdir("clusterblast")
|
|
6033 #print " Blast search finished. Parsing results..."
|
|
6034 logfile.write(" Blast search finished. Parsing results...\n")
|
|
6035 minseqcoverage = 25
|
|
6036 minpercidentity = 30
|
|
6037 seqlengths = fastaseqlengths(proteins)
|
|
6038 blastinfo = blastparse(blastoutput,minseqcoverage,minpercidentity,seqlengths,geneclustergenes)
|
|
6039 blastdict = blastinfo[0]
|
|
6040 querylist = blastinfo[1]
|
|
6041 #Remove queries without hits
|
|
6042 querylist2 = []
|
|
6043 for i in querylist:
|
|
6044 if blastdict.has_key(i):
|
|
6045 querylist2.append(i)
|
|
6046 else:
|
|
6047 pass
|
|
6048 querylist = querylist2
|
|
6049 hitclusters = blastinfo[2]
|
|
6050 #Score BLAST output on all gene clusters
|
|
6051 #Rank gene cluster hits based on 1) number of protein hits covering >25% sequence length or at least 100aa alignment, with >30% identity and 2) cumulative blast score
|
|
6052 #Find number of protein hits and cumulative blast score for each gene cluster
|
|
6053 #print " Scoring Blast outputs on database of gene clusters..."
|
|
6054 logfile.write(" Scoring Blast outputs on database of gene clusters...\n")
|
|
6055 hitclusterdict = {}
|
|
6056 hitclusterdata = {}
|
|
6057 for i in hitclusters:
|
|
6058 hitclusterdatalist = []
|
|
6059 nrhits = float(0)
|
|
6060 nrcoregenehits = float(0)
|
|
6061 cumblastscore = float(0)
|
|
6062 hitpositions = []
|
|
6063 hitposcorelist = []
|
|
6064 for j in querylist:
|
|
6065 querynrhits = 0
|
|
6066 querycumblastscore = float(0)
|
|
6067 nrhitsplus = "n"
|
|
6068 for k in blastdict[j][0]:
|
|
6069 if i == blastdict[j][1][k][0]:
|
|
6070 if [querylist.index(j),clusters[i][0].index(blastdict[j][1][k][9])] not in hitpositions:
|
|
6071 nrhitsplus = "y"
|
|
6072 querynrhits += 1
|
|
6073 blastscore = float(blastdict[j][1][k][6]) / 1000000
|
|
6074 querycumblastscore = querycumblastscore + blastscore
|
|
6075 hitclusterdatalist.append([j,k,blastdict[j][1][k][5],blastdict[j][1][k][6],blastdict[j][1][k][7],blastdict[j][1][k][8]])
|
|
6076 hitclusterdata[i] = hitclusterdatalist
|
|
6077 hitpositions.append([querylist.index(j),clusters[i][0].index(blastdict[j][1][k][9])])
|
|
6078 if nrhitsplus == "y":
|
|
6079 nrhits += 1
|
|
6080 if j.split("|")[4] in allcoregenes:
|
|
6081 nrcoregenehits += 0.1
|
|
6082 for hit in range(querynrhits):
|
|
6083 hitposcorelist.append(1)
|
|
6084 else:
|
|
6085 for hit in range(querynrhits):
|
|
6086 hitposcorelist.append(0)
|
|
6087 cumblastscore = cumblastscore + float(querycumblastscore)
|
|
6088 query_givenscores_querydict = {}
|
|
6089 query_givenscores_hitdict = {}
|
|
6090 #Find groups of hits
|
|
6091 hitgroupsdict = {}
|
|
6092 for p in hitpositions:
|
|
6093 if not hitgroupsdict.has_key(p[0]):
|
|
6094 hitgroupsdict[p[0]] = [p[1]]
|
|
6095 else:
|
|
6096 hitgroupsdict[p[0]].append(p[1])
|
|
6097 #Calculate synteny score; give score only if more than one hits (otherwise no synteny possible), and only once for every query gene and every hit gene
|
|
6098 synteny_score = 0
|
|
6099 z = 1
|
|
6100 if nrhits > 1:
|
|
6101 for p in hitpositions[:-1]:
|
|
6102 tandem = "n"
|
|
6103 #Check if a gene homologous to this gene has already been scored for synteny in the previous entry
|
|
6104 if p[1] in hitgroupsdict[hitpositions[z][0]]:
|
|
6105 tandem = "y"
|
|
6106 #Score entry
|
|
6107 if ((not query_givenscores_querydict.has_key(p[0])) or query_givenscores_querydict[p[0]] == 0) and ((not query_givenscores_hitdict.has_key(p[1])) or query_givenscores_hitdict[p[1]] == 0) and tandem == "n":
|
|
6108 q = hitpositions[z]
|
|
6109 if (abs(p[0] - q[0]) < 2) and abs(p[0]-q[0]) == abs(p[1]-q[1]):
|
|
6110 synteny_score += 1
|
|
6111 if hitposcorelist[z - 1] == 1 or hitposcorelist[z] == 1:
|
|
6112 synteny_score += 1
|
|
6113 query_givenscores_querydict[p[0]] = 1
|
|
6114 query_givenscores_hitdict[p[1]] = 1
|
|
6115 else:
|
|
6116 query_givenscores_querydict[p[0]] = 0
|
|
6117 query_givenscores_hitdict[p[1]] = 0
|
|
6118 z += 1
|
|
6119 #Give bonus to gene clusters with >0 core gene hits
|
|
6120 if nrcoregenehits > 0:
|
|
6121 corebonus = 3
|
|
6122 else:
|
|
6123 corebonus = 0
|
|
6124 #sorting score is based on number of hits (discrete values) & cumulative blast score (behind comma values)
|
|
6125 sortingscore = nrhits + synteny_score + corebonus + nrcoregenehits + cumblastscore
|
|
6126 hitclusterdict[i] = sortingscore
|
|
6127 #Sort gene clusters
|
|
6128 rankedclusters = sortdictkeysbyvaluesrev(hitclusterdict)
|
|
6129 rankedclustervalues = sortdictkeysbyvaluesrevv(hitclusterdict)
|
|
6130 #Output for each hit: table of genes and locations of input cluster, table of genes and locations of hit cluster, table of hits between the clusters
|
|
6131 #print " Writing output file..."
|
|
6132 logfile.write(" Writing output file...\n")
|
|
6133 #os.chdir("..")
|
|
6134 #os.chdir(genomename)
|
|
6135 #os.chdir("clusterblast")
|
|
6136 out_file = open("cluster" + str(clusternumber) + ".txt","w")
|
|
6137 out_file.write("ClusterBlast scores for " + infile)
|
|
6138 out_file.write("\n\nTable of genes, locations, strands and annotations of query cluster:\n")
|
|
6139 #out_file.write("\n")
|
|
6140 #out_file.write("Table of genes, locations, strands and annotations of query cluster:")
|
|
6141 #out_file.write("\n")
|
|
6142 for i in queryclusterprots:
|
|
6143 out_file.write("%s\t%s\t%s\t%s\t%s\t\n" % (i, proteins[3][i][0], proteins[3][i][1], proteins[3][i][2], proteins[3][i][3]))
|
|
6144 """out_file.write(i)
|
|
6145 out_file.write("\t")
|
|
6146 out_file.write(proteins[3][i][0])
|
|
6147 out_file.write("\t")
|
|
6148 out_file.write(proteins[3][i][1])
|
|
6149 out_file.write("\t")
|
|
6150 out_file.write(proteins[3][i][2])
|
|
6151 out_file.write("\t")
|
|
6152 out_file.write(proteins[3][i][3])
|
|
6153 out_file.write("\t")
|
|
6154 out_file.write("\n")"""
|
|
6155 out_file.write("\n\nSignificant hits: \n")
|
|
6156 #out_file.write("\n")
|
|
6157 #out_file.write("Significant hits: ")
|
|
6158 #out_file.write("\n")
|
|
6159 z = 0
|
|
6160 for i in rankedclusters[:100]:
|
|
6161 #out_file.write(str(z+1) + ". " + i + "\t" + clusters[i][1])
|
|
6162 #out_file.write("\n")
|
|
6163 out_file.write("%s. %s\t%s\n" % ((z+1), i, clusters[i][1]) )
|
|
6164 z += 1
|
|
6165 out_file.write("\n\n")
|
|
6166 #out_file.write("\n")
|
|
6167 z = 0
|
|
6168 out_file.write("Details:")
|
|
6169 for i in rankedclusters[:100]:
|
|
6170 value = str(rankedclustervalues[z])
|
|
6171 nrhits = value.split(".",1)[0]
|
|
6172 if nrhits > 0:
|
|
6173 cumblastscore = str(int(float(value.split(".")[1])))
|
|
6174 out_file.write("\n\n>>\n\n%s. %s\nSource: %s\nType: %s\nNumber of proteins with BLAST hits to this cluster: %s\nCumulative BLAST score: %s\n\nTable of genes, locations, strands and annotations of subject cluster:\n" % (z+1, i, clusters[i][1], clusters[i][2], nrhits, cumblastscore))
|
|
6175 clusterproteins = clusters[i][0]
|
|
6176 #print 'clusterproteins\n\n', clusterproteins
|
|
6177 """out_file.write("\n\n")
|
|
6178 out_file.write(">>")
|
|
6179 out_file.write("\n")
|
|
6180 cumblastscore = str(int(float(value.split(".")[1])))
|
|
6181 out_file.write("\n")
|
|
6182 out_file.write(str(z+1) + ". " + i)
|
|
6183 out_file.write("\n")
|
|
6184 out_file.write("Source: " + clusters[i][1])
|
|
6185 out_file.write("\n")
|
|
6186 out_file.write("Type: " + clusters[i][2])
|
|
6187 out_file.write("\n")
|
|
6188 out_file.write("Number of proteins with BLAST hits to this cluster: " + nrhits)
|
|
6189 out_file.write("\n")
|
|
6190 out_file.write("Cumulative BLAST score: " + cumblastscore)
|
|
6191 out_file.write("\n")
|
|
6192 out_file.write("\n")
|
|
6193 out_file.write("Table of genes, locations, strands and annotations of subject cluster:")
|
|
6194 out_file.write("\n")
|
|
6195 clusterproteins = clusters[i][0]"""
|
|
6196
|
|
6197 for j in clusterproteins:
|
|
6198 #print '##########asdfasdf######', j, '---'+proteinlocations.keys()[0]+ '---', proteinannotations.has_key(j), proteinstrands.has_key(j), proteinlocations.has_key(j)
|
|
6199 if proteinlocations.has_key(j) and proteinannotations.has_key(j) and proteinstrands.has_key(j):
|
|
6200 if proteintags[j] == "no_locus_tag":
|
|
6201 out_file.write(j)
|
|
6202 else:
|
|
6203 out_file.write(proteintags[j])
|
|
6204 out_file.write( "\t%s\t%s\t%s\t%s\t%s\n" % (j, proteinlocations[j].split("-")[0], proteinlocations[j].split("-")[1], proteinstrands[j], proteinannotations[j]) )
|
|
6205 """out_file.write("\t")
|
|
6206 out_file.write(j)
|
|
6207 out_file.write("\t")
|
|
6208 out_file.write(proteinlocations[j].split("-")[0])
|
|
6209 out_file.write("\t")
|
|
6210 out_file.write(proteinlocations[j].split("-")[1])
|
|
6211 out_file.write("\t")
|
|
6212 out_file.write(proteinstrands[j])
|
|
6213 out_file.write("\t")
|
|
6214 out_file.write(proteinannotations[j])
|
|
6215 out_file.write("\n")
|
|
6216 """
|
|
6217
|
|
6218 out_file.write("\nTable of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):\n")
|
|
6219 if i in hitclusterdata.keys():
|
|
6220 tabledata = hitclusterdata[i]
|
|
6221 for x in tabledata:
|
|
6222 w = 0
|
|
6223 for y in x:
|
|
6224 if w == 0:
|
|
6225 out_file.write( "%s\t" % y.split("|")[4] )
|
|
6226 #out_file.write("\t")
|
|
6227 w += 1
|
|
6228 else:
|
|
6229 out_file.write("%s\t" % y)
|
|
6230 #out_file.write("\t")
|
|
6231 out_file.write("\n")
|
|
6232 else:
|
|
6233 "data not found"
|
|
6234 out_file.write("\n")
|
|
6235 out_file.write("\n")
|
|
6236 z += 1
|
|
6237 #os.chdir("..")
|
|
6238 #os.chdir("..")
|
|
6239 #os.chdir("clusterblast")
|
|
6240 os.chdir("..")
|
|
6241 out_file.close()
|
|
6242
|
|
6243 elapsed = (time.time() - starttime)
|
|
6244 #print "Time since start: " + str(elapsed)
|
|
#smCOG analysis
# smcogtreedict maps a tree tag (PNG file name without its extension) to the
# PNG file name; it stays empty when smCOG analysis is switched off.
smcogtreedict = {}
if smcogs == "y":
    logfile.write("Performing smCOG analysis\n")
    # Run hmmscan with the smCOG profile library on the gene cluster proteins.
    hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 1E-6 -o " + "./smcogs/smcogshmm_output.txt" + " --noali --tblout " + "./smcogs/smcogshmm.txt "+ hmms_path +"smcogs.hmm " + "./clusterblast/geneclusterprots.fasta"
    os.system(hmmsearch)
    smcoghmmlengthsdict = hmmlengths(hmms_path+"smcogs.hmm")
    smcogdict = hmmscanparse("./smcogs/smcogshmm_output.txt", smcoghmmlengthsdict)
    # Re-key the parsed hits by the fifth "|"-separated field of each key.
    smcogdict2 = {}
    for fullname in smcogdict.keys():
        smcogdict2[fullname.split("|")[4]] = smcogdict[fullname]
    smcogdict = smcogdict2
    #Write output
    os.chdir("smcogs")
    # The with-block guarantees smcogs.txt is closed even if a lookup fails.
    with open("smcogs.txt","w") as smcogfile:
        for gene in geneclustergenes:
            if gene in pksnrpscoregenes:
                continue
            # Look the gene up before writing anything, so a missing key
            # does not leave a half-written record behind.
            genehits = smcogdict[gene]
            smcogfile.write(">> " + gene + "\n")
            smcogfile.write("name\tstart\tend\te-value\tscore\n")
            smcogfile.write("** smCOG hits **\n")
            for hit in genehits:
                smcogfile.write("\t".join([str(hit[0]), str(hit[1]), str(hit[2]), str(hit[3]), str(hit[4])]) + "\n")
            smcogfile.write("\n\n")
    # Two levels up from smcogs/.
    # NOTE(review): presumably this lands in the directory holding smcogtree/ - confirm.
    os.chdir("..")
    os.chdir("..")
    #smCOG phylogenetic tree construction
    # Fixed: this log message used to be written without its trailing newline.
    logfile.write("Calculating and drawing phylogenetic trees of cluster genes with smCOG members\n")
    os.chdir("smcogtree")
    smcoganalysisgenes = [gene for gene in geneclustergenes if gene not in pksnrpscoregenes]
    # Split the genes into nrcpus consecutive slices of equal size; the last
    # slice also takes whatever remains after integer division.
    smcogsets = []
    equalpartsizes = int(len(smcoganalysisgenes)/nrcpus)
    for part in range(nrcpus):
        first = part * equalpartsizes
        if part == (nrcpus - 1):
            geneslist = smcoganalysisgenes[first:]
        else:
            geneslist = smcoganalysisgenes[first:(first + equalpartsizes)]
        smcogsets.append(geneslist)
    # One worker process per slice; z is the worker index handed to smcog_analysis.
    processes = []
    processnames = []
    for z, partgenes in enumerate(smcogsets):
        processes.append(Process(target=smcog_analysis, args=[partgenes,z,accessiondict,seqdict,smcogdict,smcogsoutputfolder]))
    for process in processes:
        process.start()
    time.sleep(1)
    # Poll every five seconds until every worker has finished, then reap them.
    while any(process.is_alive() for process in processes):
        time.sleep(5)
    for process in processes:
        process.join()
    os.chdir("..")
    currentpath = os.getcwd()
    # Collect the tree images found in the smCOG output folder.
    os.chdir(smcogsoutputfolder)
    dircontents = getdircontents()
    for filename in dircontents:
        #POTENTIAL PERFORMANCE gain: for k in glob.glob('*.png'):
        if ".png" in filename:
            tag = filename.split(".png")[0]
            smcogtreedict[tag] = tag + ".png"
    os.chdir(currentpath)
|
|
6326
|
|
6327
|
|
6328 ##Visualization
|
|
6329 #Read in ClusterBlast data
|
|
6330 #Read in PubMed / PubChem links of database gene clusters
|
|
6331 if clusterblast == "y":
|
|
6332 if genomename in os.getcwd():
|
|
6333 os.chdir('..')
|
|
6334 pubmed_dict = {}
|
|
6335 pubchem_dict = {}
|
|
6336 known_compound_dict = {}
|
|
6337 #pubfile = open(antismash_path + "pubmed_pubchem_links.txt","r")
|
|
6338 #pubfile = pubfile.read()
|
|
6339 #publines = pubfile.split("\n")
|
|
6340 #for i in publines:
|
|
# Fill pubmed_dict, pubchem_dict and known_compound_dict (keyed by the first
# column of pubmed_pubchem_links.txt) from the pickled cache when it exists;
# otherwise parse the tab-separated text file and write the cache.
bin_path = os.path.join(antismash_path, "pubmed_pubchem_links.bin")
if os.path.exists( bin_path ):
    # NOTE(review): unpickling runs arbitrary code if the cache file has been
    # tampered with; it is only safe while the antiSMASH directory is trusted.
    # The cache is written with a binary pickle protocol, so it has to be
    # opened in binary mode (text mode corrupts it on Windows).
    with open(bin_path, 'rb') as cachefile:
        (pubmed_dict, pubchem_dict, known_compound_dict) = cPickle.load(cachefile)
else:
    with open(antismash_path + "pubmed_pubchem_links.txt","r") as linksfile:
        for line in linksfile:
            # Strip the line ending (LF or CRLF) but keep tabs, which delimit
            # possibly empty columns.
            line = line.rstrip("\r\n")
            if not line:
                # A blank line (e.g. at the end of the file) carries no record.
                continue
            tabs = line.split("\t")
            acc = tabs[0]
            if tabs[1] != "":
                pubmed_dict[acc] = tabs[1]
            if tabs[2] != "":
                pubchem_dict[acc] = tabs[2]
            if tabs[3] != "":
                known_compound_dict[acc] = tabs[3]
    # Protocol -1 selects the highest (binary) pickle protocol, hence 'wb'.
    with open(bin_path, 'wb') as cachefile:
        cPickle.dump([pubmed_dict, pubchem_dict, known_compound_dict], cachefile, -1)
|
|
6356 #print "Writing visualization SVGs and XHTML"
|
|
6357 logfile.write("Writing visualization SVGs and XHTML\n")
|
|
6358 queryclusterdata = {}
|
|
6359 nrhitgeneclusters = {}
|
|
6360 cblastclusternr = 1
|
|
6361 #print os.getcwd()
|
|
6362 if clusterblast == "y":
|
|
6363 for x in geneclusters:
|
|
6364 clusterblastfile = open(clusterblastoutputfolder + "cluster" + str(x) + ".txt","r")
|
|
6365 #print clusterblastfile
|
|
6366 clusterblastfile = clusterblastfile.read()
|
|
6367 clusterblastfile = clusterblastfile.replace("\r","\n")
|
|
6368 toptenhitclusters = []
|
|
6369 #Identify top ten hits for visualization
|
|
6370 hitlines = ((clusterblastfile.split("Significant hits: \n")[1]).split("\nDetails:")[0]).split("\n")
|
|
6371 #print '\n\n#######hitlines\n', hitlines
|
|
6372 a = 0
|
|
6373 cb_accessiondict = {}
|
|
6374 b = 1
|
|
6375 for i in hitlines:
|
|
6376 if " " in i:
|
|
6377 cb_accessiondict[b] = (i.split("\t")[0]).split(" ")[1]
|
|
6378 if genomic_accnr == "" or genomic_accnr not in i:
|
|
6379 b += 1
|
|
6380 if a < 10:
|
|
6381 if len(i) < 80:
|
|
6382 toptenhitclusters.append(i)
|
|
6383 elif len(i) >= 80:
|
|
6384 j = i[0:77] + "..."
|
|
6385 toptenhitclusters.append(j)
|
|
6386 a += 1
|
|
6387 #print clusterblastfile
|
|
6388 details = (clusterblastfile.split("\nDetails:")[1]).split(">>")[1:]
|
|
6389 #print details
|
|
6390 nrhitclusters = len(toptenhitclusters)
|
|
6391 #Save query gene cluster data
|
|
6392 querylines = ((clusterblastfile.split("Table of genes, locations, strands and annotations of query cluster:\n")[1]).split("\n\n\nSignificant hits:")[0]).split("\n")
|
|
6393 queryclustergenes = []
|
|
6394 queryclustergenesdetails = {}
|
|
6395 for i in querylines:
|
|
6396 tabs = i.split("\t")
|
|
6397 queryclustergenes.append(tabs[0])
|
|
6398 queryclustergenesdetails[tabs[0]] = [tabs[1],tabs[2],tabs[3],tabs[4]]
|
|
6399 #For every gene cluster, store hit genes and details
|
|
6400 colorgroupsdict = {}
|
|
6401 hitclusterdata = {}
|
|
6402 hitclusternr = 1
|
|
6403 compound_found = "n"
|
|
6404 nrhitgeneclusters[x] = 0
|
|
6405 for i in details:
|
|
6406 hitclustergenes = []
|
|
6407 hitclustergenesdetails = {}
|
|
6408 #Only calculate for first ten hit gene clusters
|
|
6409 if genomic_accnr == "" or genomic_accnr not in i:
|
|
6410 if hitclusternr <= 10:
|
|
6411 nrhitgeneclusters[x] = hitclusternr
|
|
6412 accession = cb_accessiondict[hitclusternr]
|
|
6413 hitclustergeneslines = ((i.split("Table of genes, locations, strands and annotations of subject cluster:\n")[1]).split("\n\nTable of Blast hits ")[0]).split("\n")
|
|
6414 #print '***********\n', i, '\n'
|
|
6415 #print hitclustergeneslines
|
|
6416 for j in hitclustergeneslines:
|
|
6417 tabs = j.split("\t")
|
|
6418 hitclustergenes.append(tabs[0])
|
|
6419 hitclustergenesdetails[tabs[0]] = [tabs[2],tabs[3],tabs[4],tabs[5],tabs[1]]
|
|
6420
|
|
6421 blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n")
|
|
6422 querygeneswithhits = []
|
|
6423 coregeneswithhits = []
|
|
6424
|
|
6425
|
|
6426 blasthitdict = {}
|
|
6427 blastdetailsdict = {}
|
|
6428 querygenes = []
|
|
6429 revblasthitdict = {}
|
|
6430 hitgenes = []
|
|
6431
|
|
6432
|
|
6433 for k in blasthitslines:
|
|
6434 tabs = k.split("\t")
|
|
6435 if tabs[0] not in querygeneswithhits:
|
|
6436 querygeneswithhits.append(tabs[0])
|
|
6437 if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits:
|
|
6438 coregeneswithhits.append(tabs[0])
|
|
6439
|
|
6440
|
|
6441 if blasthitdict.has_key(tabs[0]):
|
|
6442 hits = blasthitdict[tabs[0]]
|
|
6443 hits.append(tabs[1])
|
|
6444 blasthitdict[tabs[0]] = hits
|
|
6445 if revblasthitdict.has_key(tabs[1]):
|
|
6446 revhits = revblasthitdict[tabs[1]]
|
|
6447 revhits.append(tabs[0])
|
|
6448 revblasthitdict[tabs[1]] = revhits
|
|
6449 else:
|
|
6450 revblasthitdict[tabs[1]] = [tabs[0]]
|
|
6451 blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
|
|
6452 if tabs[0] not in querygenes:
|
|
6453 querygenes.append(tabs[0])
|
|
6454 hitgenes.append(tabs[1])
|
|
6455 else:
|
|
6456 blasthitdict[tabs[0]] = [tabs[1]]
|
|
6457 if revblasthitdict.has_key(tabs[1]):
|
|
6458 revhits = revblasthitdict[tabs[1]]
|
|
6459 revhits.append(tabs[0])
|
|
6460 revblasthitdict[tabs[1]] = revhits
|
|
6461 else:
|
|
6462 revblasthitdict[tabs[1]] = [tabs[0]]
|
|
6463 blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
|
|
6464 if tabs[0] not in querygenes:
|
|
6465 querygenes.append(tabs[0])
|
|
6466 hitgenes.append(tabs[1])
|
|
6467
|
|
6468
|
|
6469
|
|
6470 for k in known_compound_dict.keys():
|
|
6471 if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0:
|
|
6472 ws0.write(x,4,known_compound_dict[k])
|
|
6473 compound_found = "y"
|
|
6474 """blasthitdict = {}
|
|
6475 blastdetailsdict = {}
|
|
6476 querygenes = []
|
|
6477 revblasthitdict = {}
|
|
6478 hitgenes = []
|
|
6479 for i in blasthitslines:
|
|
6480 tabs = i.split("\t")
|
|
6481 if blasthitdict.has_key(tabs[0]):
|
|
6482 hits = blasthitdict[tabs[0]]
|
|
6483 hits.append(tabs[1])
|
|
6484 blasthitdict[tabs[0]] = hits
|
|
6485 if revblasthitdict.has_key(tabs[1]):
|
|
6486 revhits = revblasthitdict[tabs[1]]
|
|
6487 revhits.append(tabs[0])
|
|
6488 revblasthitdict[tabs[1]] = revhits
|
|
6489 else:
|
|
6490 revblasthitdict[tabs[1]] = [tabs[0]]
|
|
6491 blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
|
|
6492 if tabs[0] not in querygenes:
|
|
6493 querygenes.append(tabs[0])
|
|
6494 hitgenes.append(tabs[1])
|
|
6495 else:
|
|
6496 blasthitdict[tabs[0]] = [tabs[1]]
|
|
6497 if revblasthitdict.has_key(tabs[1]):
|
|
6498 revhits = revblasthitdict[tabs[1]]
|
|
6499 revhits.append(tabs[0])
|
|
6500 revblasthitdict[tabs[1]] = revhits
|
|
6501 else:
|
|
6502 revblasthitdict[tabs[1]] = [tabs[0]]
|
|
6503 blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
|
|
6504 if tabs[0] not in querygenes:
|
|
6505 querygenes.append(tabs[0])
|
|
6506 hitgenes.append(tabs[1])
|
|
6507 """
|
|
6508 #Make groups of genes for coloring
|
|
6509 colorgroups = []
|
|
6510 internalgroups = internalhomologygroupsdict[x]
|
|
6511 for i in internalgroups:
|
|
6512 querygenes_and_hits = []
|
|
6513 for j in i:
|
|
6514 #Make list of query gene and its hits
|
|
6515 additionalhits = []
|
|
6516 #For each hit, check if it was also hit by another gene; if so, only add it to the group if this hit had the lowest blast score
|
|
6517 otherscores = []
|
|
6518 queryscore = 0
|
|
6519 if blasthitdict.has_key(j):
|
|
6520 for k in blasthitdict[j]:
|
|
6521 for l in blastdetailsdict.keys():
|
|
6522 if k in l and j in l:
|
|
6523 queryscore = blastdetailsdict[l][1]
|
|
6524 elif k in l and j not in l:
|
|
6525 otherscores.append(blastdetailsdict[l][1])
|
|
6526 allscores = otherscores + [queryscore]
|
|
6527 if queryscore == max(allscores):
|
|
6528 additionalhits.append(k)
|
|
6529 #Add additional hits to the querygenes_and_hits list that will form a colorgroup
|
|
6530 querygenes_and_hits = querygenes_and_hits + additionalhits
|
|
6531 if j not in querygenes_and_hits:
|
|
6532 querygenes_and_hits.append(j)
|
|
6533 if len(querygenes_and_hits) > 0:
|
|
6534 colorgroups.append(querygenes_and_hits)
|
|
6535 colorgroupsdict[hitclusternr] = colorgroups
|
|
6536 hitclusterdata[hitclusternr] = [colorgroupsdict,hitclustergenes,hitclustergenesdetails,queryclustergenes,queryclustergenesdetails,toptenhitclusters,accession]
|
|
6537 hitclusternr += 1
|
|
6538 elif hitclusternr > 10 and hitclusternr <= 50:
|
|
6539 blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n")
|
|
6540 querygeneswithhits = []
|
|
6541 coregeneswithhits = []
|
|
6542 for k in blasthitslines:
|
|
6543 tabs = k.split("\t")
|
|
6544 if tabs[0] not in querygeneswithhits:
|
|
6545 querygeneswithhits.append( tabs[0] )
|
|
6546 if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits:
|
|
6547 coregeneswithhits.append(tabs[0])
|
|
6548 for k in known_compound_dict.keys():
|
|
6549 if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0:
|
|
6550 ws0.write(x,4,known_compound_dict[k])
|
|
6551 compound_found = "y"
|
|
6552 hitclusternr += 1
|
|
6553 queryclusterdata[cblastclusternr] = [nrhitclusters,hitclusterdata]
|
|
6554 cblastclusternr += 1
|
|
6555 wb.save(genomename + "/" + genomename + ".geneclusters.xls")
|
|
#Gather and store data on each gene cluster
# smCOG IDs that mark a gene as a glycosyltransferase (gtrs) or a transporter.
gtrcoglist = ['SMCOG1045','SMCOG1062','SMCOG1102']
transportercoglist = ['SMCOG1000','SMCOG1005','SMCOG1011','SMCOG1020','SMCOG1029','SMCOG1033','SMCOG1035','SMCOG1044','SMCOG1065','SMCOG1067','SMCOG1069','SMCOG1074','SMCOG1085','SMCOG1096','SMCOG1106','SMCOG1118','SMCOG1131','SMCOG1166','SMCOG1169','SMCOG1184','SMCOG1202','SMCOG1205','SMCOG1214','SMCOG1234','SMCOG1243','SMCOG1245','SMCOG1252','SMCOG1254','SMCOG1288']
# qgeneclusterdata[cluster] is a 20-item list; the order is fixed by the
# assignment at the bottom of the per-cluster loop.
qgeneclusterdata = {}
if smcogs == "y":
    # Reduce every gene's hit list to the name of its first hit. A name of
    # the form "ID:description" is split: the ID is kept per gene and the
    # description is stored once per ID. Genes without hits are dropped.
    smcogdict2 = {}
    smcogdescriptions = {}
    for gene in smcogdict.keys():
        genehits = smcogdict[gene]
        if len(genehits) > 0 and len(genehits[0]) > 0 and ":" in genehits[0][0]:
            nameparts = (genehits[0][0]).split(":")
            smcogdict2[gene] = nameparts[0]
            smcogdescriptions[nameparts[0]] = nameparts[1]
        elif len(genehits) > 0:
            smcogdict2[gene] = genehits[0][0]
    smcogdict = smcogdict2
if clusterblast == "n":
    # Without ClusterBlast no cluster has hit clusters. This does not depend
    # on the current cluster, so it is set once instead of on every iteration.
    nrhitgeneclusters = {}
    for i in geneclusters:
        nrhitgeneclusters[i] = 0
for genecluster in geneclusters:
    # Item 4 of every cluster gene record is the gene identifier.
    clustergenes2 = [generecord[4] for generecord in clusterinfo[genecluster][4]]
    clustergenes = clustergenes2
    clusternr = 1
    clustertype = clusterinfo[genecluster][0]
    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []
    for j in clustergenes:
        # proteins[3][gene] holds [start, end, strand, annotation].
        annotations[j] = proteins[3][j][3]
        starts.append(int(proteins[3][j][0]))
        ends.append(int(proteins[3][j][1]))
        strands.append(proteins[3][j][2])
        if j in allcoregenes:
            colors.append("#810E15")
        else:
            colors.append("grey")
        if j in pksnrpscoregenes:
            pksnrpsprots.append(j)
        if smcogs == "y" and j in smcogdict and len(smcogdict[j]) > 0:
            # Fixed: smcogdict[j] is already the smCOG ID string here, so the
            # former smcogdict[j][0] compared only its first character with
            # the ID lists and never matched.
            if smcogdict[j] in gtrcoglist:
                gtrs.append(j)
            if smcogdict[j] in transportercoglist:
                transporters.append(j)
    clustersize = max(ends) - min(starts)
    # Hit clusters are numbered 1..n for this query cluster.
    hitgeneclusters = range(1,(nrhitgeneclusters[genecluster] + 1))
    hitgeneclusterdata = {}
    hitgeneclusterdata[genecluster] = [hitgeneclusters]
    pksnrpsprotsnames = nrpspkstypedict
    pksnrpsdomains = {}
    domlist = []
    domsdetails = {}
    substrspecnrpspredictordict = {}
    substrspecminowadict = {}
    substrspecpkssigdict = {}
    substrspecconsensusdict = {}
    krpredictionsdict = {}
    for i in pksnrpsprots:
        domlist = []
        domsdetails = {}
        doms = domaindict[i]
        for j in doms:
            # Number repeated domain types within one protein: the first free
            # suffix 1, 2, 3, ... is appended to the domain type name.
            nr = 1
            while j[0] + str(nr) in domlist:
                nr += 1
            domname = j[0] + str(nr)
            domlist.append(domname)
            domsdetails[domname] = [j[1],j[2]]
            # The prediction dictionaries are keyed "<protein>_<abbrev><nr>".
            if "AMP-binding" in domname or "A-OX" in domname:
                domname2 = i + "_A" + str(nr)
                substrspecminowadict[domname2] = minowa_nrps_preds[domname2]
                substrspecnrpspredictordict[domname2] = [nrps_code_preds[domname2],nrps_svm_preds[domname2]]
                substrspecconsensusdict[domname2] = consensuspreds[domname2]
            if "PKS_AT" in domname:
                domname2 = i + "_AT" + str(nr)
                substrspecminowadict[domname2] = minowa_pks_preds[domname2]
                substrspecpkssigdict[domname2] = pks_code_preds[domname2]
                substrspecconsensusdict[domname2] = consensuspreds[domname2]
            # This branch used to appear twice in a row with identical effect.
            if "CAL_domain" in domname:
                domname2 = i + "_CAL" + str(nr)
                substrspecminowadict[domname2] = minowa_cal_preds[domname2]
                substrspecconsensusdict[domname2] = consensuspreds[domname2]
            if "PKS_KR" in domname:
                domname2 = i + "_KR" + str(nr)
                krpredictionsdict[domname2] = [kr_activity_preds[domname2],kr_stereo_preds[domname2]]
        pksnrpsdomains[i] = [domlist,domsdetails]
    if genecluster in compound_pred_dict:
        structpred = compound_pred_dict[genecluster]
    else:
        structpred = "N/A"
    qgeneclusterdata[genecluster] = [clustertype,clustersize,clustergenes,annotations,starts,ends,strands,pksnrpsprots,pksnrpsprotsnames,pksnrpsdomains,substrspecnrpspredictordict,substrspecminowadict,substrspecpkssigdict,substrspecconsensusdict,gtrs,transporters,colors,hitgeneclusterdata,structpred,krpredictionsdict]
|
|
6659
|
|
#Create genecluster svg for each gene cluster
# geneposdict collects [start, end] for every gene of every cluster.
geneposdict = {}
for qclusternr in geneclusters:
    data = qgeneclusterdata[qclusternr]
    # Unpack the 20 stored fields in the order in which they were saved.
    (clustertype, clustersize, genes, annotations, starts, ends, strands,
     pksnrpsprots, pksnrpsprotsnames, pksnrpsdomains,
     substrspecnrpspredictordict, substrspecminowadict, substrspecpkssigdict,
     substrspecconsensusdict, gtrs, transporters, colors, hitgeneclusterdata,
     structpred, krpredictionsdict) = data[:20]
    relpositions = relativepositions(starts,ends,clustersize)
    rel_starts, rel_ends = relpositions[0], relpositions[1]
    for y, i in enumerate(genes):
        geneposdict[i] = [starts[y],ends[y]]
    s = geneclustersvg(genes,rel_starts,rel_ends,strands,geneposdict,pksnrpsprots,pksnrpsdomains,qclusternr)
    with open(svgfolder + "genecluster" + str(qclusternr) + ".svg","w") as outfile:
        outfile.write(s.getXML())
#Create ClusterBlast svg
if clusterblast == "y":
    # Keyed "<query>_<hit>" for pairwise drawings and "<query>_all" for the
    # combined drawing; the value is the second item returned by
    # clusterblastresults.
    clusterblastpositiondata = {}
    for i in geneclusters:
        # Hit clusters are numbered from 1 up to the stored hit count.
        hitclusters = range(1, queryclusterdata[i][0] + 1)
        colorschemedict,rgbcolorscheme = calculate_colorgroups(i,hitclusters,queryclusterdata,internalhomologygroupsdict)
        #Create svgs for pairwise gene cluster alignment
        for k in hitclusters:
            cresults = clusterblastresults(i,[k],queryclusterdata,colorschemedict,rgbcolorscheme)
            s = cresults[0]
            clusterblastpositiondata[str(i) + "_"+str(k)] = cresults[1]
            with open(svgfolder + "clusterblast" + str(i) + "_" + str(k) + ".svg","w") as outfile:
                outfile.write(s.getXML())
        #Create svgs for multiple gene cluster alignment
        cresults = clusterblastresults(i,hitclusters,queryclusterdata,colorschemedict,rgbcolorscheme)
        s = cresults[0]
        clusterblastpositiondata[str(i) + "_all"] = cresults[1]
        with open(svgfolder + "clusterblast" + str(i) + "_all.svg","w") as outfile:
            outfile.write(s.getXML())
|
|
6718
|
|
#Create folder for SEARCHGTR HTML files, load search form template
# The template is split around its FASTASEQUENCE placeholder. Both template
# files are now closed as soon as they have been read.
with open("search_form.html","r") as templatefile:
    formtemplate = templatefile.read()
formtemplate = formtemplate.replace("\r","\n")
formtemplateparts = formtemplate.split("FASTASEQUENCE")
#Create HTML file with gene cluster info in hidden div tags
with open("empty.xhtml","r") as htmlfile:
    html = htmlfile.read()
html = html.replace("\r","\n")
# The page skeleton is cut at its "<SPLIT HERE>" markers; the generated
# content is written between the parts.
htmlparts = html.split("<SPLIT HERE>")
# htmloutfile stays open: the code that follows keeps writing to it.
htmloutfile = open(genomename + "/display.xhtml","w")
htmloutfile.write(htmlparts[0])
#Add lines to load all svgs up front
for qclusternr in geneclusters:
    htmloutfile.write(' loadsvg(' + str(qclusternr) + ');\n')
if clusterblast == "y":
    cblastclusters = [1,2,3,4,5,6,7,8,9,10]
    for qclusternr in geneclusters:
        nrhitclusters = queryclusterdata[qclusternr][0]
        for hitnr in range(1, nrhitclusters + 1):
            htmloutfile.write(' loadcblastsvg(' + str(qclusternr) + ',' + str(hitnr) + ');\n')
#For each gene cluster, add hidden div tags for gene names, add hidden div tags for NRPS/PKS domains, add hidden div tags for ClusterBLAST depictions
htmloutfile.write(htmlparts[1])
for qclusternr in geneclusters:
    data = qgeneclusterdata[qclusternr]
    pksnrpsprots = data[7]
    pksnrpsprotsnames = data[8]
    pksnrpsdomains = data[9]
    # a numbers the domains consecutively across all PKS/NRPS proteins of the
    # cluster; one hide() line is written per domain.
    a = 0
    for prot in pksnrpsprots:
        for dom in pksnrpsdomains[prot][0]:
            htmloutfile.write(' $("#b' + str(qclusternr) + '_00' + str(a) + '_div").hide();\n')
            a += 1
htmloutfile.write(htmlparts[2])
#Add top menu
# Cluster type -> number passed to the over2()/out2() mouse handlers.
gifdict = {"t1pks":"16","t2pks":"17","t3pks":"18","t4pks":"20","nrps":"10","amglyccycl":"1","bcin":"2","blactam":"3","butyrolactone":"4","ectoine":"5","terpene":"19","indole":"7","lant":"8","melanin":"9","nucleoside":"12","other":"13","phosphoglycolipid":"14","siderophore":"15"}
htmloutfile.write('<img border="0" align="top" src="images/empty.png" name="img0_" />\n')
menubutton_nr = 1
nrclustercolumns = 1
for clusternumber_ in geneclusters:
    menutype = qgeneclusterdata[clusternumber_][0]
    if menutype in gifdict:
        typenr = gifdict[menutype]
    elif "-" in menutype:
        # Types containing "-" that are not listed above get number 6.
        typenr = "6"
    else:
        typenr = "13"
    nrtext = str(clusternumber_)
    htmloutfile.write('<a href="javascript:displaycluster(' + nrtext + ')"><img align="top" border="0" src="images/img' + nrtext + '_1.png" name="img' + nrtext + '_" onmouseover="over(' + nrtext + '),over2(0,' + typenr + ')" onmouseout="out(' + nrtext + '),out2(0,' + typenr + ')"/></a>\n')
    # Break the button row after buttons 22 and 49; nrclustercolumns counts
    # the resulting rows.
    if menubutton_nr == 22 or menubutton_nr == 49:
        htmloutfile.write('<br/>')
        nrclustercolumns += 1
    menubutton_nr += 1
|
|
6770
|
|
6771 #Add gene cluster description
|
|
6772 htmloutfile.write(htmlparts[3])
|
|
6773 extrapixelsdict = {}
|
|
6774 for qclusternr in geneclusters:
|
|
6775 data = qgeneclusterdata[qclusternr]
|
|
6776 clustertype = data[0]
|
|
6777 clustersize = data[1]
|
|
6778 genes = data[2]
|
|
6779 annotations = data[3]
|
|
6780 starts = data[4]
|
|
6781 ends = data[5]
|
|
6782 strands = data[6]
|
|
6783 pksnrpsprots = data[7]
|
|
6784 pksnrpsprotsnames = data[8]
|
|
6785 pksnrpsdomains = data[9]
|
|
6786 substrspecnrpspredictordict = data[10]
|
|
6787 substrspecminowadict = data[11]
|
|
6788 substrspecpkssigdict = data[12]
|
|
6789 substrspecconsensusdict = data[13]
|
|
6790 gtrs = data[14]
|
|
6791 transporters = data[15]
|
|
6792 colors = data[16]
|
|
6793 hitgeneclusterdata = data[17]
|
|
6794 structpred = data[18]
|
|
6795 krpredictionsdict = data[19]
|
|
6796 relpositions = relativepositions(starts,ends,clustersize)
|
|
6797 rel_starts = relpositions[0]
|
|
6798 rel_ends = relpositions[1]
|
|
6799 #Create genes overview pop-up HTMLs
|
|
6800 genepopupoutfile = open(htmlfolder + "geneclustergenes" + str(qclusternr) + '.html',"w")
|
|
6801 genepopupoutfile.write('<html>\n<head>\n<LINK href="style.css" rel="stylesheet" type="text/css">\n</head>\n<body>\nOverview of gene cluster genes:<br><br><table border=1>\n')
|
|
6802 genepopupoutfile.write('<tr><td><b>Gene</b></td><td><b>Annotation</b></td><td><b>Start position</b></td><td><b>End position</b></td><td><b>Strand</b></td></tr>\n')
|
|
6803 for i in genes:
|
|
6804 genepopupoutfile.write('<tr><td>' + i + '</td><td>' + annotations[i].replace("_"," ") + '</td><td>' + str(starts[genes.index(i)]) + '</td><td>' + str(ends[genes.index(i)]) + '</td><td>' + strands[genes.index(i)] + '</td></tr>\n')
|
|
6805 genepopupoutfile.write('\n</table><br><br><br>Biosynthetic gene cluster signature gene domains detected: <br><br>\n')
|
|
6806 genepopupoutfile.write('<table border=1><tr><td><b>Gene</b></td><td><b>Detected domains</b></td><td><b>Bit scores</b></td>\n')
|
|
6807 for i in genes:
|
|
6808 if i in allcoregenes:
|
|
6809 detected_doms = detecteddomainsdict[i]
|
|
6810 for j in detected_doms:
|
|
6811 genepopupoutfile.write('<tr><td>' + i + '</td><td>' + str(j[0]) + '</td><td>' + str(j[1]) + '</td>\n')
|
|
6812 genepopupoutfile.write('\n</table><br><br><br>')
|
|
6813 genepopupoutfile.write('\n</body>\n</html>\n')
|
|
6814 genepopupoutfile.close()
|
|
#Add gene cluster description on top
# Only cluster 1 is visible when the page loads; every other cluster <div>
# starts with display:none.
if qclusternr != 1:
    htmloutfile.write('\n\n<div id="genecluster'+ str(qclusternr) + '" style="display:none">')
else:
    htmloutfile.write('<div id="genecluster'+ str(qclusternr) + '">')
#Add menu bars 1 & 2
# All vertical offsets shift by 28 px per cluster column; the main panel is
# 75% of the screen width.
clustercolshift = nrclustercolumns * 28
mainpanelwidth = int(0.75 * screenwidth)
htmloutfile.write('<div id="bartext1" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(113 + clustercolshift) + 'px; left:30px;"><b>Gene cluster description</b></div>')
htmloutfile.write('<div id="bartext2" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(263 + clustercolshift) + 'px; left:30px;"><b>PKS/NRPS domain annotation</b></div>')
htmloutfile.write('<div id="descrbar1" style="position:absolute; z-index:1; top:' + str(110 + clustercolshift) + 'px;"><img src="images/bar.png" height="25" width="' + str(mainpanelwidth) + '"/></div>\n')
htmloutfile.write('<div class="help" id="help1" style="position:absolute; z-index:1; top:' + str(112 + clustercolshift) + 'px; left:' + str(mainpanelwidth - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#panel1" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
htmloutfile.write('<div id="descrbar2" style="position:absolute; z-index:1; top:' + str(260 + clustercolshift) + 'px;"><img src="images/bar.png" height="25" width="' + str(mainpanelwidth) + '"/></div>\n')
htmloutfile.write('<div class="help" id="help2" style="position:absolute; z-index:1; top:' + str(262 + clustercolshift) + 'px; left:' + str(mainpanelwidth - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#panel2" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
# Narrow screens get a smaller font and a slightly lower description box.
if screenwidth < 1280:
    descrfontsize, descrtop = "0.7em", 125
else:
    descrfontsize, descrtop = "0.8em", 120
htmloutfile.write('<div class="clusterdescr" style="font-size:' + descrfontsize + '; position:absolute; top:' + str(descrtop + clustercolshift) + 'px; left:' + str(12) + 'px;">\n')
htmloutfile.write("<br/>Gene Cluster " + str(qclusternr) + ". Type = " + clustertype + ". Location: "+ str(starts[0]) + " - " + str(ends[-1]) + " nt. Click on genes for more information.")
# The GenBank link is only written when the accession string is longer than
# four characters.
if len(genomic_accnr) > 4:
    htmloutfile.write(' <a href="http://www.ncbi.nlm.nih.gov/nuccore/' + genomic_accnr + '" target="_blank">GBK</a>')
#Genes overview pop-up.
# A long cluster type pushes the overview link onto its own line.
if len(clustertype) > 20:
    htmloutfile.write('<br/>')
genesoverviewpage = 'html/geneclustergenes' + str(qclusternr) + '.html'
htmloutfile.write(' <a href="' + genesoverviewpage + '" onclick=\'window.open("' + genesoverviewpage + '","popup","width=800,height=800,scrollbars=yes,resizable=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Genes and detection info overview</a>')
htmloutfile.write("</div>\n\n")
# The display <div> gets one spacer image per cluster column beyond the first.
htmloutfile.write('<div id="display' + str(qclusternr) + '">\n')
if nrclustercolumns > 1:
    spacers = nrclustercolumns - 1
    for spacernr in range(spacers):
        htmloutfile.write('<img src="images/spacer.png"/>\n')
htmloutfile.write('</div>\n')
|
|
#Add gene pop-ups
# For every gene, write a hidden pop-up <div> (annotation, smCOG information,
# location, BLAST and genome-browser links) followed by a hidden gene-name
# label.  The gene's position in genes is part of both element ids and indexes
# the parallel starts/ends/rel_starts/rel_ends lists.
for genenr, genename in enumerate(genes):
    htmloutfile.write('<div id="a' + str(qclusternr) + '_00' + str(genenr) + '_div" class="hidden popup" style="position:absolute; z-index:2; top:' + str(185 + nrclustercolumns * 28) + 'px; left:' + str(int(((rel_starts[genenr] + rel_ends[genenr])/2)*0.875)) + 'px;">\n')
    # Escape ampersands for HTML; the former replace("&","&") did nothing.
    htmloutfile.write(annotations[genename].replace("_"," ").replace("&","&amp;") + "\n")
    if smcogs == "y":
        if genename in smcogdict:
            smcog = smcogdict[genename]
            htmloutfile.write("<br/>smCOG: " + smcog + " (" + smcogdescriptions[smcog].replace("_"," ").replace("&","&amp;") + ")\n")
            if smcog in gtrcoglist:
                # Build a per-gene form page: first template half (with the
                # "GlycTr" placeholder replaced), gene name and sequence,
                # second template half.
                formfile = open(searchgtrfolder + genename + ".html","w")
                try:
                    formfile.write(formtemplateparts[0].replace("GlycTr",genename))
                    formfile.write(genename + "\n" + seqdict[genename])
                    formfile.write(formtemplateparts[1])
                finally:
                    # Close the file even when a write fails.
                    formfile.close()
                htmloutfile.write("<br/><a href=\"searchgtr/" + genename + ".html\" target=\"_blank\"> Run SEARCHGTr on this gene </a>\n")
            if smcog in transportercoglist:
                link = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;program=blastp;sequence=sequence%0A" + seqdict[genename]
                htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> TransportDB BLAST on this gene </a>\n")
        else:
            htmloutfile.write("<br/>smCOG: -\n")
    link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + seqdict[genename] + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch"
    htmloutfile.write("<br/>Location: " + str(starts[genenr]) + "-" + str(ends[genenr]) + "\n")
    htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this gene </a><br/>\n")
    # Genome-browser window: 10 kb either side of the gene, clamped to the
    # sequence boundaries.
    browse_start = max(starts[genenr] - 10000, 0)
    browse_end = min(ends[genenr] + 10000, dnaseqlength)
    if genomic_accnr not in ("none", ""):
        htmloutfile.write('<a href="http://www.ncbi.nlm.nih.gov/projects/sviewer/?Db=gene&DbFrom=protein&Cmd=Link&noslider=1&id=' + genomic_accnr + '&from=' + str(browse_start) + '&to=' + str(browse_end) + '" target="_blank">View genomic context</a><br/>\n')
    if smcogs == "y":
        # The tree may be registered under the name without its last
        # ".suffix" or under the full gene name; the former wins.
        genebasename = genename.rpartition(".")[0]
        smcogtreefile = None
        if genebasename in smcogtreedict:
            smcogtreefile = smcogtreedict[genebasename]
        elif genename in smcogtreedict:
            smcogtreefile = smcogtreedict[genename]
        if smcogtreefile is not None:
            htmloutfile.write('<a href="smcogs/' + smcogtreefile + '" onclick=\'window.open("smcogs/' + smcogtreefile + '","popup","width=1280,height=1500,resizable=yes,scrollbars=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>View smCOG seed phylogenetic tree with this gene</a>\n')
    htmloutfile.write("</div>\n\n")
    htmloutfile.write('<div id="a' + str(qclusternr) + '_00' + str(genenr) + '_divtext" class="hidden genenames" style="position:absolute; top:' + str(162 + nrclustercolumns * 28) + 'px; left:' + str(float((rel_starts[genenr]+rel_ends[genenr])/2)*0.9375) + 'px;">\n')
    htmloutfile.write(genename)
    htmloutfile.write("</div>\n\n")
|
|
#Early calculation of nr of domains to be able to fit structure prediction information of large NRPSs/PKSs
# Count the A, KR and AT/CAL domains over all NRPS/PKS proteins of this
# cluster; the totals feed the pixel estimate stored in extrapixelsdict.
pksnrpsdomainnr = 0
krdomainnr = 0
adomainnr = 0
for i in pksnrpsprots:
    doms = pksnrpsdomains[i][0]
    first = "no"
    nra = nrat = nrkr = nrcal = 0
    for j in doms:
        # The tests run in this fixed order and at most one can match: once j
        # is replaced by its short code, no later substring test applies.
        if "AMP-binding" in j or "A-OX" in j:
            nra += 1
            adomainnr += 1
            j, z = "A", nra
        elif "KR" in j:
            nrkr += 1
            krdomainnr += 1
            j, z = "KR", nrkr
        elif "AT" in j and "docking" not in j:
            nrat += 1
            pksnrpsdomainnr += 1
            j, z = "AT", nrat
        elif "CAL" in j:
            nrcal += 1
            pksnrpsdomainnr += 1
            j, z = "CAL", nrcal
pixels = adomainnr * 50 + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16) + 375
# Only the excess over the default panel height counts; never negative.
extrapixels = max(pixels - (676 + len(pksnrpsprots) * 99), 0)
extrapixelsdict[qclusternr] = extrapixels
|
|
#Add picture of predicted chemical structure
# Frame lines of the page, then the "Predicted core structure" side panel with
# either the structure picture or a placeholder icon.
sidecolshift = nrclustercolumns * 28
stackheight = len(pksnrpsprots) * 99 + extrapixels
splitleft = int(screenwidth * 0.75)
framewidth = str(screenwidth * 0.98)
htmloutfile.write('<div id="verticalbar1" style="position:absolute; left:' + str(splitleft + 12) + 'px; top:' + str(106 + sidecolshift) + 'px;"><img src="images/linefill.png" height="' + str(1126 + stackheight) + '" width="2"/></div>\n')
htmloutfile.write('<div id="verticalbar2" style="position:absolute; left:' + str(int(screenwidth * 0.98)) + 'px; top:0px;"><img src="images/linefill.png" height="' + str(1288 + stackheight + sidecolshift) + '" width="2"/></div>\n')
htmloutfile.write('<div id="horizbar1" style="position:absolute; left:0px; top:' + str(92 + sidecolshift) + 'px;"><img src="images/linefill.png" height="2" width="' + framewidth + '"/></div>\n')
htmloutfile.write('<div id="horizbar2" style="position:absolute; left:0px; top:82px;"><img src="images/linefill.png" height="2" width="' + framewidth + '"/></div>\n')
htmloutfile.write('<div id="horizbar3" style="position:absolute; left:0px; top:' + str(1223 + stackheight + sidecolshift) + 'px;"><img src="images/linefill.png" height="2" width="' + framewidth + '"/></div>\n')
# Narrow screens use a smaller title font, placed one pixel lower.
if screenwidth < 1280:
    titlefontsize, titletop = "0.8em", 114
else:
    titlefontsize, titletop = "1em", 113
htmloutfile.write('<div id="bartext4" style="color:#FFFFFF; font-size:' + titlefontsize + '; position:absolute; z-index:2; top:' + str(titletop + sidecolshift) + 'px; left:' + str(splitleft + 30) + 'px;"><b>Predicted core structure</b></div>\n')
htmloutfile.write('<div class="title" style="position:absolute; top:' + str(110 + sidecolshift) + 'px; left:' + str(screenwidth * 0.75 + 20) + 'px;">\n')
htmloutfile.write('<div id="descrbar4" style="right:25px; position:absolute; z-index:1; top:0px; left:0px;"><img src="images/bar.png" height="25" width="' + str(int(0.21 * screenwidth)) + '"/></div>\n')
htmloutfile.write('<div class="help" id="help4" style="position:absolute; z-index:1; top:2px; left:' + str(int(screenwidth * 0.2) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#sidepanel1" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
# The structure picture is linked only when drawing did not fail for this
# cluster and the monomer prediction contains a space; otherwise the
# placeholder icon is shown.
structiconwidth = str(int(screenwidth * 0.19))
if qclusternr not in failedstructures and " " in structpred:
    structbase = 'structures/genecluster' + str(qclusternr)
    htmloutfile.write('<br/><br/><a href="' + structbase + '.png" onclick=\'window.open("' + structbase + '.png","popup","width=600,height=300,scrollbars=yes,resizable=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'><img src="' + structbase + '_icon.png" border="1" width="' + structiconwidth + '" height="200" /></a>\n')
else:
    htmloutfile.write('<br/><br/><img src="images/nostructure_icon.png" border="1" width="' + structiconwidth + '" height="200" />\n')
htmloutfile.write('<div class="clusterdescr" style="font-size:0.8em;">\n')
htmloutfile.write("Monomers prediction: " + structpred + "<br/>\n")
if qclusternr in dockingdomainanalysis:
    dockingpage = 'html/docking_analysis' + str(qclusternr) + '.html'
    htmloutfile.write('<a href="' + dockingpage + '" onclick=\'window.open("' + dockingpage + '","popup","width=600,height=1200,scrollbars=yes,resizable=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Docking domain analysis results.</a><br/>\n')
|
|
# List the substrate-specificity and KR predictions of every NRPS/PKS domain
# in the side panel, grouped under one "<protein>:" heading per protein.
# The domain totals are recounted here because the panel written after this
# block is positioned with them.
nrpsfound = "no"
pksnrpsdomainnr = 0
adomainnr = 0
krdomainnr = 0
for i in pksnrpsprots:
    doms = pksnrpsdomains[i][0]
    first = "no"
    nra = nrat = nrkr = nrcal = 0
    for j in doms:
        # Reduce the domain name to its short type code and number it per
        # protein; the tests are mutually exclusive once j is replaced.
        if "AMP-binding" in j or "A-OX" in j:
            nra += 1
            adomainnr += 1
            j, z = "A", nra
        elif "KR" in j:
            nrkr += 1
            krdomainnr += 1
            j, z = "KR", nrkr
        elif "AT" in j and "docking" not in j:
            nrat += 1
            pksnrpsdomainnr += 1
            j, z = "AT", nrat
        elif "CAL" in j:
            nrcal += 1
            pksnrpsdomainnr += 1
            j, z = "CAL", nrcal
        # NOTE(review): for a domain matching none of the types above, z keeps
        # whatever value it had before (from an earlier domain or from code
        # preceding this block) - confirm that such names never occur as keys.
        prediction = "no"
        domname = str(i) + "_" + str(j) + str(z)
        domlabel = str(j) + str(z)
        # Lines for this domain are collected first so the protein heading is
        # written exactly once, just before the first line of the protein.
        domlines = []
        if domname in substrspecnrpspredictordict:
            nrpsfound = "yes"
            prediction = "yes"
            nrpspreds = substrspecnrpspredictordict[domname]
            # A generic "nrp" result is displayed as "?".
            codepred = nrpspreds[0]
            if codepred == "nrp":
                codepred = "?"
            domlines.append('<font size="1"> NRPSPredictor code prediction, ' + domlabel + ': ' + codepred + '</font><br/>\n')
            svmpred = nrpspreds[1]
            if svmpred == "nrp":
                svmpred = "?"
            domlines.append('<font size="1"> NRPSPredictor SVM prediction, ' + domlabel + ': ' + svmpred + '</font><br/>\n')
        if domname in substrspecminowadict:
            prediction = "yes"
            minowapred = substrspecminowadict[domname]
            if minowapred in ("nrp", "pk"):
                minowapred = "?"
            domlines.append('<font size="1"> Minowa prediction, ' + domlabel + ': ' + minowapred + '</font><br/>\n')
        if domname in substrspecpkssigdict:
            prediction = "yes"
            pkssigpred = substrspecpkssigdict[domname]
            if pkssigpred == "pk":
                pkssigpred = "?"
            domlines.append('<font size="1"> PKS code prediction, ' + domlabel + ': ' + pkssigpred + '</font><br/>\n')
        if domname in krpredictionsdict:
            krpreds = krpredictionsdict[domname]
            domlines.append('<font size="1"> KR activity, ' + domlabel + ': ' + krpreds[0] + "</font><br/>\n")
            domlines.append('<font size="1"> KR stereochemistry, ' + domlabel + ': ' + krpreds[1] + "</font><br/>\n")
        #Add link to prediction details pop-up
        if prediction == "yes":
            domlines.append('<font size="1"> <a href="substrspecs/' + domname + '.html" onclick=\'window.open("substrspecs/' + domname + '.html","popup","width=500,height=400,scrollbars=yes,resizable=no,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Prediction details</a></font><br/>\n')
        if domlines:
            if first == "no":
                first = "yes"
                htmloutfile.write(i + ':<br/>')
            for domline in domlines:
                htmloutfile.write(domline)
# Offer the Norine search once if any NRPSPredictor result was listed.
if nrpsfound == "yes":
    htmloutfile.write('<br/><a href="http://bioinfo.lifl.fr/norine/form2.jsp" target="_blank">Perform Norine peptide search</a>')
htmloutfile.write('</div>')
|
|
# "Downloadable output files" panel.  Its vertical position grows with the
# number of domains and proteins listed in the prediction panel above it.
outputpaneltop = adomainnr * 50 + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16)
# Narrow screens get a shorter title in a smaller font, one pixel lower.
if screenwidth < 1280:
    outputtitlestyle, outputtitletop, outputtitle = "0.8em", 624, "File outputs"
else:
    outputtitlestyle, outputtitletop, outputtitle = "1em", 623, "Downloadable output files"
htmloutfile.write('<div id="bartext5" style="color:#FFFFFF; font-size:' + outputtitlestyle + '; position:absolute; z-index:2; top:' + str(outputtitletop + outputpaneltop) + 'px; left:10px;"><b>' + outputtitle + '</b></div>\n')
htmloutfile.write('<div id="descrbar5" style="right:25px; position:absolute; z-index:1; top:' + str(620 + outputpaneltop) + 'px; left:0px;"><img src="images/bar.png" height="25" width="' + str(int(0.21 * screenwidth)) + '"/></div>\n')
htmloutfile.write('<div class="help" id="help5" style="position:absolute; z-index:1; top:' + str(622 + outputpaneltop) + 'px; left:' + str(int(screenwidth * 0.2) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#sidepanel2" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
htmloutfile.write('<div class="text" id="outputinfo" style="font-size:0.8em; right:25px; position:absolute; z-index:1; top:' + str(655 + outputpaneltop) + 'px; left:0px;">')
# The EMBL summary link is written when either full-genome option is "y".
if "y" in (fullhmm, fullblast):
    htmloutfile.write('<a href="' + oldgenomename + '.final.embl" target="_blank">Open EMBL summary file</a><br/><br/>')
#htmloutfile.write('<a href="' + genomename + '.final.csv" target="_blank">Download CSV summary file</a><br/><br/>')
if fullhmm == "y":
    clusterpredictionpng = oldgenomename + '.cluster_prediction.png'
    htmloutfile.write('<a href="' + clusterpredictionpng + '" onclick=\'window.open("' + clusterpredictionpng + '","popup","width=1024,height=1400,scrollbars=0,resizable=0,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Sec. met. enriched genome regions</a><br/><br/>')
htmloutfile.write('<a href="' + genomename + '.geneclusters.xls" target="_blank">Open XLS overview table</a><br/><br/>')
# Close the output-info <div>, then the enclosing side-panel <div>.
htmloutfile.write('</div>' + "</div>\n\n")
|
|
#Add descriptions of NRPS/PKS genes
# An empty title <div>, then one text label per NRPS/PKS protein; labels are
# stacked 84 px apart, starting from row z = 1.
htmloutfile.write('<div class="title" style="position:absolute; top:' + str(180) + 'px; left:' + str(12) + 'px;">\n' + "</div>\n\n")
z = 1
for protname in pksnrpsprots:
    labeltop = 228 + 84 * z + nrclustercolumns * 28
    htmloutfile.write('<div class="text" style="position:absolute; top:' + str(labeltop) + 'px; left:' + str(12) + 'px;">\n' + protname + " (" + pksnrpsprotsnames[protname].lower() + ")" + "</div>\n\n")
    z += 1
|
|
#Add NRPS/PKS domain pop-ups
# Protein length is the gene span divided by 3; the longest protein sets the
# amino-acid-to-pixel scale for the domain pop-up positions.
# NOTE: "/" is kept as written; with integer coordinates under Python 2 it
# truncates.
longestprot = 0
protlengthdict = {}
for i in pksnrpsprots:
    protlength = (geneposdict[i][1] - geneposdict[i][0]) / 3
    protlengthdict[i] = protlength
    if protlength > longestprot:
        longestprot = protlength
try:
    aa2pixelratio = longestprot * 0.75 / screenwidth
except ZeroDivisionError:
    # Only a zero screen width can make this division fail; a bare except
    # here would also have hidden unrelated errors.
    aa2pixelratio = 0.1
# a numbers the domains across all proteins (part of the element id);
# z is the 1-based protein row used for the vertical position.
a = 0
z = 1
for i in pksnrpsprots:
    domainsdict = pksnrpsdomains[i][1]
    nra = 0
    nrat = 0
    nrkr = 0
    nrcal = 0
    for j in pksnrpsdomains[i][0]:
        startpos = domainsdict[j][0]
        endpos = domainsdict[j][1]
        htmloutfile.write('<div id="b' + str(qclusternr) + '_00' + str(a) + '_div" class="hidden popup" style="position:absolute; z-index:2; top:' + str(277 + 84 * z + nrclustercolumns * 28) + 'px; left:' + str( ( ( (endpos+startpos) / 2) / aa2pixelratio) * 0.9375 ) + 'px;">\n')
        htmloutfile.write("Domain " + j + " (" + i + ")")
        link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + seqdict[i][startpos:endpos] + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch"
        htmloutfile.write("<br/>Location: " + str(startpos) + "-" + str(endpos) + " AA\n")
        # Map the domain to the short type code and per-protein number used
        # as key suffix in the prediction dictionaries.
        domtype = None
        if "AMP-binding" in j or "A-OX" in j:
            nra += 1
            domtype, y = "A", nra
        elif "PKS_KR" in j:
            nrkr += 1
            domtype, y = "KR", nrkr
        elif "PKS_AT" in j:
            nrat += 1
            domtype, y = "AT", nrat
        elif "CAL_domain" in j:
            nrcal += 1
            domtype, y = "CAL", nrcal
        # Only recognised domain types have a well-defined number, so only
        # they are looked up.  Other domains used to build their id from a
        # stale (or not yet assigned) counter.
        if domtype is not None:
            domid = str(i) + "_" + domtype + str(y)
            if domid in substrspecnrpspredictordict or domid in substrspecminowadict or domid in substrspecpkssigdict:
                htmloutfile.write("<br/>Predicted substrate: " + substrspecconsensusdict[domid] + "\n")
                if domid in substrspecnrpspredictordict:
                    htmloutfile.write("<br/>-NRPSPredictor code: " + substrspecnrpspredictordict[domid][0] + "\n")
                    htmloutfile.write("<br/>-NRPSPredictor SVM: " + substrspecnrpspredictordict[domid][1] + "\n")
                if domid in substrspecminowadict:
                    htmloutfile.write("<br/>-Minowa HMM: " + substrspecminowadict[domid] + "\n")
                if domid in substrspecpkssigdict:
                    htmloutfile.write("<br/>-PKS code: " + substrspecpkssigdict[domid] + "\n")
            if domid in krpredictionsdict:
                htmloutfile.write("<br/>KR activity: " + krpredictionsdict[domid][0] + "\n")
                htmloutfile.write("<br/>KR stereochemistry: " + krpredictionsdict[domid][1] + "\n")
        htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this domain </a>\n")
        htmloutfile.write("</div>\n\n")
        a += 1
    z += 1
htmloutfile.write('</div>\n')
|
|
7129
|
|
if clusterblast == "y":
    #Write ClusterBlast divs with pictures and description pop-up tags

    def _write_cblast_gene_popup(divid, popuptop, labeltop, relstart, relend, popupstyle, genename, genedetails, blastquery):
        """Write the hidden pop-up <div> and gene-name label <div> of one gene.

        divid       -- id prefix; "_div" and "_divtext" are appended to it
        popuptop    -- CSS top (px) of the pop-up
        labeltop    -- CSS top (px) of the gene-name label
        relstart    -- relative start position; scaled for the pop-up's left
        relend      -- relative end position; the label sits at the midpoint
        popupstyle  -- extra CSS appended to the pop-up style ("" for none)
        genename    -- text shown in the label
        genedetails -- sequence read as [0]-[1] for the location and [3] for
                       the description text
        blastquery  -- value of the QUERY parameter in the BlastP link
        """
        htmloutfile.write('<div id="' + divid + '_div" class="hidden popup" style="position:absolute; top:' + str(popuptop) + 'px; left:' + str(int(float(relstart)*0.875)) + 'px;' + popupstyle + '">\n')
        # Escape ampersands for HTML; the former replace("&","&") did nothing.
        htmloutfile.write(genedetails[3].replace("_"," ").replace("&","&amp;") + "\n")
        link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + blastquery + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch"
        htmloutfile.write("<br/>Location: " + str(genedetails[0]) + "-" + str(genedetails[1]) + "\n")
        htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this gene </a>\n")
        htmloutfile.write("</div>\n\n")
        htmloutfile.write('<div id="' + divid + '_divtext" class="hidden genenames" style="position:absolute; top:' + str(labeltop) + 'px; left:' + str(int(float((float(relstart)+float(relend))/2)*0.9375)) + 'px;">\n')
        htmloutfile.write(genename)
        htmloutfile.write("</div>\n\n")

    htmloutfile.write('<div id="clusterblastview" class="clusterdescr">\n\n')
    #Add menu bar 3
    htmloutfile.write('<div id="bartext3" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:3px; left:20px;"><b>Homologous gene clusters</b></div>')
    htmloutfile.write('<div id="descrbar3" style="position:absolute; z-index:1; top:0px;"><img src="images/bar.png" height="25" width="' + str(int(0.75*screenwidth)) + '"/></div>')
    htmloutfile.write('<div class="help" id="help3" style="position:absolute; z-index:1; top:2px; left:' + str(int(screenwidth * 0.75) - 30) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#panel3" target="_blank"><img border="0" src="images/help.png"/></a></div>')
    for qclusternr in geneclusters:
        nrhitclusters = queryclusterdata[qclusternr][0]
        hitclusterdata = queryclusterdata[qclusternr][1]
        # Query description shown above every alignment, cut to 80 characters.
        if len(nucname) < 80:
            qdescription = nucname
        else:
            qdescription = nucname[0:77] + "..."
        # Only the first query cluster is visible when the page loads.
        if qclusternr == 1:
            hiddenstyle = ''
        else:
            hiddenstyle = ' style="display:none"'
        htmloutfile.write('<div id="qcluster' + str(qclusternr) + '"' + hiddenstyle + '>\n<br/><br/>\n<div align="left">\n<form name="clusterform' + str(qclusternr) + '">\n<select name="selection' + str(qclusternr) + '" onchange="javascript:navigate(this);">\n')
        htmloutfile.write('<option value="">Select gene cluster alignment</option>\n')
        for i in range(nrhitclusters):
            htmloutfile.write('<option value="javascript:displaycblastresults(' + str(qclusternr) + ',' + str(i+1) + ')">' + hitclusterdata[i+1][5][i].replace("&","&amp;") + '</option>\n')
        htmloutfile.write('</select>\n</form>\n\n</div>')
        htmloutfile.write('<div style="position:absolute; top:33px; left:' + str(screenwidth*0.625) + 'px;"><img src="images/button.gif" name="button' + str(qclusternr) + '" onclick="javascript:displaybutton(' + str(qclusternr) + ');"/></div>')
        clustersizes = []
        # One <div> per hit cluster: pairwise view of query versus that hit.
        for i in range(nrhitclusters):
            hitclusternumber = i + 1
            queryclustergenes = hitclusterdata[1][3]
            queryclustergenesdetails = hitclusterdata[1][4]
            cluster_acc = hitclusterdata[hitclusternumber][6]
            hitclustergenes = hitclusterdata[hitclusternumber][1]
            hitclustergenesdetails = hitclusterdata[hitclusternumber][2]
            relpositiondata = clusterblastpositiondata[str(qclusternr) + "_" + str(hitclusternumber)]
            qrel_starts = relpositiondata[0][0]
            qrel_ends = relpositiondata[0][1]
            hrel_starts = relpositiondata[1][hitclusternumber][0]
            hrel_ends = relpositiondata[1][hitclusternumber][1]
            strandsbalance = relpositiondata[2][hitclusternumber]
            if strandsbalance < 0:
                # NOTE: reverses the list stored in hitclusterdata in place,
                # so the combined view below iterates the reversed order too.
                hitclustergenes.reverse()
            if qclusternr == 1 and hitclusternumber == 1:
                htmloutfile.write('<div id="hitcluster' + str(qclusternr) + '_' + str(hitclusternumber) + '">\n')
            else:
                htmloutfile.write('<div id="hitcluster' + str(qclusternr) + '_' + str(hitclusternumber) + '" style="display:none">\n')
            #Insert gene cluster descriptions
            # Drop the first two space-separated fields and any trailing
            # ", whole..." / ", complete..." part of the hit description.
            cdescription = hitclusterdata[hitclusternumber][5][i].replace("&","&amp;").replace("\t"," ").partition(" ")[2].partition(" ")[2].split(", whole")[0].split(", complete")[0]
            htmloutfile.write('<div id="descriptionquery" style="text-align:right; position:absolute; top:70px; right:50px; font-size:10px; font-style:italic">' + qdescription + '</div>\n')
            htmloutfile.write('<div id="description' + str(qclusternr) + '" style="text-align:right; position:absolute; top:137px; right:50px; font-size:10px; font-style:italic">' + cdescription + '</div>\n')
            #Insert pubmed/pubchem links
            htmloutfile.write('<div id="pub_pics" style="position:absolute; top:60px; left:' + str(int(screenwidth * 0.0)) + 'px; font-size:10px"> Hit cluster cross-links: \n')
            htmloutfile.write(' <a href="http://www.ncbi.nlm.nih.gov/nuccore/' + cluster_acc.split(".")[0] + '" target="_blank"><img align="bottom" border="0" src="images/genbank.gif"/></a>\n')
            # A link is written for every dictionary key that occurs as a
            # substring of the hit accession.
            for j in pubmed_dict:
                if j in cluster_acc:
                    pubmedstring = pubmed_dict[j]
                    htmloutfile.write(' <a href="http://www.ncbi.nlm.nih.gov/pubmed/' + pubmedstring + '" target="_blank"><img align="bottom" border="0" src="images/pubmed.gif"/></a>\n')
            for j in pubchem_dict:
                if j in cluster_acc:
                    pubchemstring = pubchem_dict[j]
                    # A comma means several compound ids: link to a search
                    # instead of a single compound summary page.
                    if "," in pubchemstring:
                        htmloutfile.write(' <a href="http://www.ncbi.nlm.nih.gov/sites/entrez?db=pccompound&term=' + pubchemstring + '" target="_blank"><img align="bottom" border="0" src="images/struct.gif"/></a>\n')
                    else:
                        htmloutfile.write(' <a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=' + pubchemstring + '" target="_blank"><img align="bottom" border="0" src="images/struct.gif"/></a>\n')
            htmloutfile.write('</div>\n\n')
            #Create gene pop-ups
            for a, j in enumerate(queryclustergenes):
                _write_cblast_gene_popup('q' + str(qclusternr) + "_" + str(hitclusternumber) + "_" + str(a), 113, 83, qrel_starts[a], qrel_ends[a], '', j, queryclustergenesdetails[j], accessiondict[j])
            for a, j in enumerate(hitclustergenes):
                _write_cblast_gene_popup('h' + str(qclusternr) + "_" + str(hitclusternumber) + "_" + str(a), 183, 153, hrel_starts[a], hrel_ends[a], '', j, hitclustergenesdetails[j], hitclustergenesdetails[j][4])
            htmloutfile.write('</div>\n')
        #Find new relative positions for display of all gene clusters in one picture
        relpositiondata = clusterblastpositiondata[str(qclusternr) + "_all"]
        qrel_starts = relpositiondata[0][0]
        qrel_ends = relpositiondata[0][1]
        htmloutfile.write('<div id="hitcluster' + str(qclusternr) + '_all" style="display:none">\n')
        htmloutfile.write('<div id="descriptionquery" style="text-align:right; position:absolute; top:60px; right:50px; font-size:10px; font-style:italic">' + qdescription + '</div>\n')
        for i in range(nrhitclusters):
            hitclusternumber = i + 1
            queryclustergenes = hitclusterdata[1][3]
            queryclustergenesdetails = hitclusterdata[1][4]
            hrel_starts = relpositiondata[1][hitclusternumber][0]
            hrel_ends = relpositiondata[1][hitclusternumber][1]
            hitclustergenes = hitclusterdata[hitclusternumber][1]
            hitclustergenesdetails = hitclusterdata[hitclusternumber][2]
            cdescription = hitclusterdata[hitclusternumber][5][i].replace("&","&amp;").replace("\t"," ").partition(" ")[2].partition(" ")[2].split(", whole")[0].split(", complete")[0]
            htmloutfile.write('<div id="description' + str(qclusternr) + '" style="text-align:right; position:absolute; top:' + str(60 + (57 * hitclusternumber)) + 'px; right:50px; font-size:10px; font-style:italic">' + cdescription + '</div>\n')
            # NOTE(review): the combined view passes the gene name itself as
            # the BlastP query, whereas the pairwise view above passes the
            # accession - confirm which is intended.
            if hitclusternumber == 1:
                # The query cluster row is written once, as row 0.
                for a, j in enumerate(queryclustergenes):
                    _write_cblast_gene_popup('all_' + str(qclusternr) + "_0_" + str(a), 100, 75, qrel_starts[a], qrel_ends[a], ' z-index:2;', j, queryclustergenesdetails[j], j)
            for a, j in enumerate(hitclustergenes):
                _write_cblast_gene_popup('all_' + str(qclusternr) + "_" + str(hitclusternumber) + "_" + str(a), 100 + 57 * hitclusternumber, 75 + 56.75 * hitclusternumber, hrel_starts[a], hrel_ends[a], ' z-index:2;', j, hitclustergenesdetails[j], j)
        htmloutfile.write('</div>\n')
        htmloutfile.write('</div>\n\n')
|
|
7279 if clusterblast == "y":
|
|
7280 htmloutfile.write('</div>\n')
|
|
# Static credits markup (logos and citation); it is identical for every gene
# cluster and also closes the banner div opened in the loop below.
creditscontent = (
    '<div style="float:center; font-size:0.9em;">\n'
    '<div style="position:absolute; top:0px; left:30px;">\n'
    '<img src="images/ruglogo.gif" border="0"/> \n'
    '<img src="images/gbblogo.gif" border="0"/> \n'
    '<img src="images/tueblogo.gif" border="0"/> \n'
    '<img src="images/ucsflogo.gif" border="0"/> \n'
    '</div>\n'
    '<div style="position:absolute; top:0px; left:600px;">\n'
    'antiSMASH: Rapid identification, annotation and analysis of secondary metabolite biosynthesis gene clusters.\n'
    '<br/>Marnix H. Medema, Kai Blin, Peter Cimermancic, Victor de Jager, Piotr Zakrzewski, Michael A. Fischbach, Tilmann Weber, Rainer Breitling &amp; Eriko Takano\n'
    '<br/><i>Nucleic Acids Research</i> (2011), proposal submitted.\n'
    '</div>\n'
    '</div>\n'
    '</div>'
)
# One credits banner per gene cluster; every banner except cluster 1's is
# written with display:none.
for i in geneclusters:
    data = qgeneclusterdata[i]
    extrapixels = extrapixelsdict[i]
    pksnrpsprots = data[7]
    if i == 1:
        hiddenstyle = ""
    else:
        hiddenstyle = "display:none; "
    # Vertical offset grows with the number of entries in data[7], the number
    # of cluster columns and the per-cluster extra pixels.
    bannertop = 1242 + int(len(pksnrpsprots) * 99) + nrclustercolumns * 28 + extrapixels
    htmloutfile.write('<div id="creditsbar' + str(i) + '" class="banner" style="' + hiddenstyle + 'position:absolute; width:' + str(int(0.98 * screenwidth)) + 'px; align:\'left\'; height:75; top:' + str(bannertop) + 'px; left:0px; color:#810E15; z-index:-1;">')
    htmloutfile.write(creditscontent)
#Add final part of HTML file
htmloutfile.write(htmlparts[-1])
|
|
#Copy accessory files for HTML viewing
# Windows needs one "copy" per directory level; every other platform is treated
# as POSIX, where a single recursive "cp" covers vis/, vis/html and vis/images.
# Testing only for win32 avoids a NameError on platform strings other than
# "linux2" (for example "linux3", "linux" or "darwin").
if sys.platform == 'win32':
    copycommands = [
        "copy/y vis\\* " + genomename + " > nul",
        "copy/y vis\\html\\* " + genomename + "\\html > nul",
        "copy/y vis\\images\\* " + genomename + "\\images > nul",
    ]
else:
    copycommands = ["cp -r vis/* " + genomename + " > /dev/null"]
# Exit statuses are ignored, as before: a failed copy does not abort the run.
for copycommand in copycommands:
    os.system(copycommand)
|
|
7304
|
|
#Generate EMBL output
def _write_numbered_predictions(outfile, gene, domainlabel, description, predictions):
    """Write one numbered, tab-terminated field per prediction matching gene.

    A prediction matches when ``gene in key`` is true for its key in
    predictions. Numbering restarts at 1 on every call and follows the
    iteration order of predictions.keys().
    """
    count = 0
    for key in predictions.keys():
        if gene in key:
            count += 1
            outfile.write(domainlabel + " " + str(count) + " " + description + ": " + predictions[key] + "\t")


def _write_embl_features(outfile, strand, startpos, endpos, hits, kind, colour):
    """Write one misc_feature line per hit, located on the gene's strand.

    Each hit is indexed as (name, start, end, E-value, bit score). The start
    and end are multiplied by 3 and offset from startpos on the "+" strand,
    or subtracted from endpos inside complement(...) on the "-" strand.
    Nothing is written for any other strand value. Both strands use the same
    tab-separated layout.
    """
    for hit in hits:
        if strand == "+":
            location = str(startpos + hit[1] * 3) + ".." + str(startpos + hit[2] * 3)
        elif strand == "-":
            location = "complement(" + str(endpos - hit[2] * 3) + ".." + str(endpos - hit[1] * 3) + ")"
        else:
            continue
        outfile.write("misc_feature\t" + location + "\t" + str(hit[0]) + " " + kind + ";\tE-value: " + str(hit[3]) + "\tBit score: " + str(hit[4]) + "\t/colour=" + colour + "\n")


emblfile = open(genomename + "/embl_lines.txt","w")
# try/finally so the file is closed even if one of the lookups below raises.
try:
    # Section 1: one tab-separated annotation line per gene-cluster gene.
    for i in geneclustergenes:
        emblfile.write(i + "\t")
        # smcogdict is only consulted when smCOG analysis was switched on.
        if smcogs == "y" and i in smcogdict:
            emblfile.write("smCOG: " + smcogdict[i] + ":" + smcogdescriptions[smcogdict[i]] + "\t")
        if i in nrpspkstypedict:
            emblfile.write("NRPS/PKS type: " + nrpspkstypedict[i] + "\t")
        if i in domaindict:
            for j in domaindict[i]:
                emblfile.write(j[0] + " (" + str(j[1]) + "-" + str(j[2]) + "); E-value:" + str(j[3]) + "; Bit score: " + str(j[4]) + "\t")
            # NOTE(review): the flattened source does not show whether the prediction
            # fields sat inside this domain check; they are kept inside it on the
            # assumption that predictions only exist for genes with domains - confirm.
            _write_numbered_predictions(emblfile, i, "AT-domain", "Minowa substrate specificity prediction", minowa_pks_preds)
            _write_numbered_predictions(emblfile, i, "AT-domain", "PKS code substrate specificity prediction", pks_code_preds)
            _write_numbered_predictions(emblfile, i, "CAL-domain", "Minowa substrate specificity prediction", minowa_cal_preds)
            _write_numbered_predictions(emblfile, i, "A-domain", "Minowa substrate specificity prediction", minowa_nrps_preds)
            _write_numbered_predictions(emblfile, i, "A-domain", "Stachelhaus code substrate specificity prediction", nrps_code_preds)
            _write_numbered_predictions(emblfile, i, "A-domain", "NRPSPredictor2 SVM substrate specificity prediction", nrps_svm_preds)
            # KR domains get two fields per match, both numbered with the KR counter
            # (the AT-domain counter was used here before, which mislabelled them).
            nrkr = 0
            for k in kr_activity_preds.keys():
                if i in k:
                    nrkr += 1
                    emblfile.write("KR-domain " + str(nrkr) + " activity prediction: " + kr_activity_preds[k] + "\t")
                    emblfile.write("KR-domain " + str(nrkr) + " predicted stereochemistry group: " + kr_stereo_preds[k] + "\t")
        if i in motifdict:
            for m in motifdict[i]:
                emblfile.write("Motif " + str(m[0]) + " (" + str(m[1]) + "-" + str(m[2]) + "). E-value: " + str(m[3]) + "; Bit score: " + str(m[4]) + "\t")
        emblfile.write("\n")
    emblfile.write("\n\n>>\n\n")
    #enter separate domain entries
    # Section 2: one misc_feature line per domain (colour 2) and motif (colour 6).
    for i in geneclustergenes:
        strand = strandsdict[i]
        startpos = geneposdict[i][0]
        endpos = geneposdict[i][1]
        if i in domaindict:
            _write_embl_features(emblfile, strand, startpos, endpos, domaindict[i], "domain", "2")
        if i in motifdict:
            _write_embl_features(emblfile, strand, startpos, endpos, motifdict[i], "motif", "6")
    emblfile.write("\n\n>>\n\n")
    # Section 3: one misc_feature line per gene cluster; a start of 0 is written as 1.
    for i in geneclusters:
        cstart = clusterinfo[i][1]
        if cstart == 0:
            cstart = 1
        cend = clusterinfo[i][2]
        emblfile.write("misc_feature\t" + str(cstart) + ".." + str(cend) + "\t" + clusterinfo[i][0] + " gene cluster\t/colour=13\n")
finally:
    emblfile.close()
|
|
7387
|
|
#Close open html file
htmloutfile.close()

#Run whole-genome BLAST / HMM CLUSEAN modules & ClusterFinder
# Only win32 is special-cased; every other platform takes the POSIX branch, so
# the copy and the pipeline run are no longer skipped (or left undefined) when
# sys.platform is something other than "linux2".
runs_on_windows = sys.platform == 'win32'
if runs_on_windows:
    copycommand = "copy " + infile + " " + genomename + ' > nul'
else:
    copycommand = "cp " + infile + " " + genomename
os.system(copycommand)
# Remember where we started so we return there even if genomename is a nested
# path or a symlink, where a plain chdir('..') would end up somewhere else.
startdir = os.getcwd()
os.chdir(genomename)
# Build the runPipeline.py argument string from the y/n switches.
args = "--cpus %s " % nrcpus
if fullblast == "n":
    args += "--without-blast "
if fullhmm == "n":
    args += "--without-hmmer "
if fullhmm == "y":
    args += '--pfamdbpath %s ' % pfamdbpath
if fullblast == "y":
    args += '--blastdbpath %s ' % blastdbpath
logfile.write("Running CLUSEAN pipeline modules.\n")
if runs_on_windows:
    os.system("python ..\\clusean\\scripts\\runPipeline.py %s" % args)
else:
    os.system(antismash_path + "clusean/scripts/runPipeline.py %s" % args)

os.chdir(startdir)
|
|
7415
|
|
# Record the total run time in the log, then release the log file.
finishmessage = "".join(["antiSMASH successfully finished in ", str(elapsed), " seconds.\n"])
logfile.write(finishmessage)
logfile.close()
|