comparison antismash.py @ 0:6a37d0a4510a default tip

initial uploaded
author bjoern-gruening
date Thu, 15 Mar 2012 05:23:03 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6a37d0a4510a
1 #!/usr/bin/env python
2 ## Copyright (c) 2010 Marnix H. Medema
3 ## University of Groningen
4 ## Department of Microbial Physiology / Groningen Bioinformatics Centre
5 ## License: GNU General Public License v3 or later
6 ## A copy of GNU GPL v3 should have been included in this software package in LICENSE.txt.
7
8 ##Functions necessary for this script
9
10 import linecache, cPickle
11
12 DEBUG = True
13
14
def invalidoptions(argument):
    """Report an invalid command-line option and terminate the program.

    A non-empty ``argument`` is echoed to stderr. A usage hint is then
    printed to stdout, the problem is recorded in the module-level
    ``logfile`` (which is closed afterwards) and the process exits with
    status 1.
    """
    if len(argument) != 0:
        sys.stderr.write("Invalid options input:" + "\n")
        sys.stderr.write(str(argument) + "\n")
    sys.stdout.write("From the command line, input antismash --help for more information." + "\n")
    log_entry = "Invalid options input: " + argument + "\n"
    logfile.write(log_entry)
    logfile.close()
    sys.exit(1)
23
def sortdictkeysbyvalues(dict):
    """Return the keys of ``dict`` ordered by ascending value.

    Entries with equal values are ordered by comparing their keys.
    """
    ordered_pairs = sorted((value, key) for key, value in dict.items())
    return [pair[1] for pair in ordered_pairs]
28
def sortdictkeysbyvaluesrev(dict):
    """Return the keys of ``dict`` ordered by descending value.

    Entries with equal values are ordered by descending key.
    """
    ordered_pairs = sorted(
        ((value, key) for key, value in dict.items()), reverse=True)
    return [pair[1] for pair in ordered_pairs]
34
def sortdictkeysbyvaluesrevv(dict):
    """Return the values of ``dict`` in descending order.

    Sorting is done on (value, key) pairs, so equal values are ordered by
    descending key before the keys are dropped.
    """
    ordered_pairs = sorted(
        ((value, key) for key, value in dict.items()), reverse=True)
    return [pair[0] for pair in ordered_pairs]
40
def get_sequence(fasta):
    """Return the sequence of a single-record FASTA file as one string.

    Leading blank lines are skipped. The first remaining line is taken as
    the header; every following line is right-stripped and concatenated.

    If the header contains no '>' or the sequence contains one (or the file
    holds no usable lines at all), an error is written to stderr and to the
    module-level ``logfile``, and the process exits with status 1.
    """
    # Read everything up front so the handle is closed deterministically.
    with open(fasta, 'r') as handle:
        content = handle.readlines()
    # Drop leading empty lines; the emptiness check also covers a file
    # that is empty or consists of blank lines only.
    while content and content[0] in ("", "\n"):
        content = content[1:]
    header = content[0] if content else ""
    seq = "".join(line.rstrip() for line in content[1:])
    if ">" not in header or ">" in seq:
        sys.stderr.write("FASTA file not properly formatted; should be single sequence starting with '>' and sequence name.\n")
        logfile.write("FASTA file not properly formatted; should started with '>' and sequence name on first line.\n")
        logfile.close()
        sys.exit(1)
    return seq
65
def complement(seq):
    """Return the base-wise complement of ``seq`` as a list of characters.

    a/c/g/t/n are complemented with their case preserved; any other
    element is replaced by a lowercase 'n'. The order is not reversed.
    """
    pairing = dict(zip("acgtnACGTN", "tgcanTGCAN"))
    return [pairing.get(base, 'n') for base in seq]
77
def reverse_complement(seq):
    """Return the reverse complement of ``seq`` as a string.

    The sequence is reversed first and then passed through complement().
    """
    reversed_bases = list(seq)[::-1]
    return "".join(str(base) for base in complement(reversed_bases))
86
def fastaseqlengths(proteins):
    """Map each protein name to the length of its sequence.

    ``proteins[0]`` holds the names and ``proteins[1]`` the sequences, in
    matching order.
    """
    names, seqs = proteins[0], proteins[1]
    seqlengths = {}
    for index, name in enumerate(names):
        seqlengths[name] = len(seqs[index])
    return seqlengths
99
100 # Function that reads the fasta file into a dictionary
def fastadict(fasta):
    """Read a FASTA file into a dictionary of name -> sequence.

    Carriage returns are treated as newlines and every space is replaced
    by "_" before the text is split on whitespace. A token whose first
    character is ">" starts a new record named after the next 67
    characters of that token; every other token is appended to the
    sequence of the current record.

    Raises ValueError when sequence data appears before any header line.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(fasta, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r", "\n").strip()
    #Replaces all spaces with "_" to avoid problems
    filetext = filetext.replace(' ', '_')
    pieces = {}
    name = None
    current = []
    for token in filetext.split():
        if ">" in token[0]:
            name = token[1:68]
            current = []
        else:
            if name is None:
                raise ValueError("FASTA file %s contains sequence data before the first '>' header" % fasta)
            current.append(token)
            # A record is registered once its first sequence line is seen.
            pieces[name] = current
    # Join once per record instead of growing a string line by line.
    return dict((key, "".join(parts)) for key, parts in pieces.items())
119
120 # Function that extracts all sequence names from the fasta dictionary
def lnames(fastadict):
    """Return the sequence names of a FASTA dictionary, sorted by name."""
    return [name for name, _seq in sorted(fastadict.items())]
125
126 # Function that extracts all sequences from the fasta dictionary
def lseqs(fastadict):
    """Return the sequences of a FASTA dictionary, ordered by their names.

    The order matches the one produced by lnames() for the same dictionary.
    """
    return [seq for _name, seq in sorted(fastadict.items())]
131
def extractpositions(refmusclefile,newmusclefile,positions,refsequencename,querysequencename):
    """Return the query residues aligned to selected reference positions.

    ``newmusclefile`` is an aligned FASTA file containing both the
    reference and the query sequence. ``positions`` holds 0-based residue
    indices counted along the reference sequence with gap characters ("-")
    ignored. For each such residue the alignment column is looked up and
    the character found in that column of the query sequence is returned.

    ``refmusclefile`` is kept in the signature for backward compatibility;
    its contents were parsed but never used, so it is no longer read.

    Raises ValueError if either sequence name is absent from the alignment.
    """
    muscle_dict = fastadict(newmusclefile)
    muscle_seqs = lseqs(muscle_dict)
    muscle_names = lnames(muscle_dict)
    refseq = muscle_seqs[muscle_names.index(refsequencename)]
    #Count residues in ref sequence and put positions in list
    poslist = []
    residue_nr = 0
    for column, char in enumerate(refseq):
        if char == "-":
            # Gap columns do not advance the reference residue counter.
            continue
        if residue_nr in positions:
            poslist.append(column)
        residue_nr += 1
    #Extract positions from query sequence
    query_seq = muscle_seqs[muscle_names.index(querysequencename)]
    return [query_seq[column] for column in poslist]
162
def parsegenes(genes):
    """Parse CDS feature text blocks into gene coordinates and annotations.

    ``genes`` is an iterable of strings, one per CDS feature. The first
    line of each string is treated as the location; the remaining lines
    are scanned for qualifiers (codon_start, protein_id, gene, locus_tag,
    translation, product).

    Returns a list ``[genelist, genedict, joinlist, joindict,
    accessiondict]`` where
      * genelist is the list of gene names in input order,
      * genedict maps gene name -> [start, end, strand, annotation, sequence]
        (start and end are strings),
      * joinlist names the genes whose location used join(...),
      * joindict maps those genes to their list of "a..b" parts,
      * accessiondict maps gene name -> protein_id value or
        "no_accession_number_found".

    Entries whose location cannot be interpreted are skipped and collected;
    if any were found, the locations are reported on stderr, a message is
    written to the module-level ``logfile`` and the process exits with
    status 1 after all entries have been examined.
    """
    genedict = {}
    genelist = []
    joinlist = []
    joindict = {}
    accessiondict = {}
    error = "n"
    errorlocations = []
    genenr = 0
    for i in genes:
        # Cut off any following gene feature that ended up in this chunk.
        if " gene " in i:
            i = i.split(" gene ")[0]
        elif "FT gene " in i:
            i = i.split("FT gene ")[0]
        join = "no"
        genenr += 1
        #Find gene location info for each gene
        if "complement" in i.split("\n")[0].lower() and i.split("\n")[0][-1] == ")":
            location = i.split("\n")[0]
        elif "complement" in i.split("\n")[0].lower() and i.split("\n")[0][-1] != ")":
            # Location continues on following lines: take the text before the
            # first qualifier and trim lines until it ends with ")".
            location = i.split(" /")[0]
            while ")" not in location.replace(" ","")[-3:]:
                location = location.rpartition("\n")[0]
            location = location.replace("\n","")
            location = location.replace(" ","")
        elif "join" in i.split("\n")[0].lower() and i.split("\n")[0][-1] == ")":
            location = i.split("\n")[0]
        elif "join" in i.split("\n")[0].lower() and i.split("\n")[0][-1] != ")":
            # Multi-line join(...): same trimming as above.
            location = i.split("/")[0]
            while ")" not in location.replace(" ","")[-3:]:
                location = location.rpartition("\n")[0]
            location = location.replace("\n","")
            location = location.replace(" ","")
        else:
            location = i.split("\n")[0]
        original_location = location
        #location info found in gbk/embl file, now extract start and end positions
        if location.count("(") != location.count(")"):
            # Unbalanced parentheses: record and move on to the next gene.
            error = "y"
            errorlocations.append(original_location)
            continue
        if "join(complement" in location.lower():
            # join(complement(a..b,c..d)) form.
            location = location.lower()
            join = "yes"
            location2 = location.partition("join(")[2][:-1].replace("<","").replace(">","")
            if ("complement(" in location2[0:12] and location2[-1] != ")") or ")," in location2:
                error = "y"
                errorlocations.append(original_location)
                continue
            elif ("complement(" in location2[0:12] and location2[-1] == ")" and location2[12:-2].count(")") == 0 and location2[12:-2].count("(") == 0):
                location2 = location2.partition("complement(")[2][:-1]
                start = location2.split(",")[0]
                start = start.split("..")[0]
                start = start.replace("<","")
                end = location2.split(",")[-1]
                if ".." in end:
                    end = end.split("..")[1]
                end = end.replace(">","")
                joinedparts = location2.split(",")
                joinedparts2 = []
                for j in joinedparts:
                    newjoinedpart = j.replace("<","")
                    newjoinedpart = newjoinedpart.replace(">","")
                    joinedparts2.append(newjoinedpart)
                strand = "-"
            else:
                error = "y"
                errorlocations.append(original_location)
                continue
        elif "complement" in location.lower():
            # complement(...) form, optionally wrapping a join(...).
            location = location.lower()
            location = location.partition("complement(")[2][:-1]
            if "join(" in location.lower():
                join = "yes"
                location = location.lower()
                location2 = location.partition("join(")[2][:-1]
                start = location2.split(",")[0]
                start = start.split("..")[0]
                start = start.replace("<","")
                end = location2.split(",")[-1]
                if ".." in end:
                    end = end.split("..")[1]
                end = end.replace(">","")
                joinedparts = location2.split(",")
                joinedparts2 = []
                for j in joinedparts:
                    newjoinedpart = j.replace("<","")
                    newjoinedpart = newjoinedpart.replace(">","")
                    joinedparts2.append(newjoinedpart)
            else:
                start = location.split("..")[0]
                start = start.replace("<","")
                if ".." in location:
                    end = location.split("..")[1]
                else:
                    end = location
                end = end.replace(">","")
            strand = "-"
        else:
            # Forward strand, with or without join(...).
            if "join(" in location.lower():
                join = "yes"
                location = location.lower()
                location2 = location.partition("join(")[2][:-1]
                start = location2.split(",")[0]
                start = start.split("..")[0]
                start = start.replace("<","")
                end = location2.split(",")[-1]
                if ".." in end:
                    end = end.split("..")[1]
                end = end.replace(">","")
                joinedparts = location2.split(",")
                joinedparts2 = []
                for j in joinedparts:
                    newjoinedpart = j.replace("<","")
                    newjoinedpart = newjoinedpart.replace(">","")
                    joinedparts2.append(newjoinedpart)
            else:
                start = location.split("..")[0]
                start = start.replace("<","")
                if ".." in location:
                    end = location.split("..")[1]
                else:
                    end = location
                end = end.replace(">","")
            strand = "+"
        # Make sure start <= end; non-numeric coordinates count as an error.
        try:
            if int(start) > int(end):
                start2 = end
                end2 = start
                start = start2
                end = end2
        except ValueError:
            error = "y"
            errorlocations.append(original_location)
            continue
        #Correct for alternative codon start positions
        if "codon_start=" in i.lower():
            temp = i.lower().split("codon_start=")[1].split()[0]
            if '"' in temp:
                # temp is "1" or "2" (quoted); this comes from Biopython
                temp = temp[1]
            else:
                # without quotation marks ... 1 or 2
                temp = temp[0]
            codonstart = temp
            if strand == "+":
                start = str(int(start) + (int(codonstart) - 1))
            elif strand == "-":
                end = str(int(end) - (int(codonstart) - 1))
        #Find gene name for each gene, preferably locus_tag, then gene, then protein_ID
        # Pass 1: protein_id (also used as the accession number).
        a = 0
        b = 0
        genename = ""
        nrlines = len(i.split("\n"))
        while b == 0:
            line = i.split("\n")[a]
            if "protein_id=" in line:
                genename = (line.split("protein_id=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif "protein_id=" in line.lower():
                genename = (line.lower().split("protein_id=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif a == (nrlines - 1):
                genename = ""
                b += 1
            else:
                a += 1
        if len(genename) > 1:
            accnr = genename
        else:
            accnr = "no_accession_number_found"
        # Pass 2: a gene= qualifier overrides the protein_id as gene name.
        a = 0
        b = 0
        nrlines = len(i.split("\n"))
        while b == 0:
            line = i.split("\n")[a]
            if "gene=" in line:
                genename = (line.split("gene=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif "gene=" in line.lower():
                genename = (line.lower().split("gene=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif a == (nrlines - 1):
                b += 1
            else:
                a += 1
        # Pass 3: a locus_tag= qualifier overrides both; if no name was found
        # at all, fall back to a generated "prot_ID_<n>" name.
        a = 0
        b = 0
        nrlines = len(i.split("\n"))
        while b == 0:
            line = i.split("\n")[a]
            if "locus_tag=" in line:
                genename = (line.split("locus_tag=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif "locus_tag=" in line.lower():
                genename = (line.lower().split("locus_tag=")[1][1:-1]).replace(" ","_")
                genename = genename.replace("\\","_")
                genename = genename.replace("/","_")
                b += 1
            elif a == (nrlines - 1):
                if genename == "":
                    genename = "prot_ID_" + str(genenr)
                b += 1
            else:
                a += 1
        #Find sequence for each gene
        a = 0 ###Not all gbks contain protein sequences as translations, therefore sequences from gene clusters are now extracted from the database at a later stage if sequence is not in gbk
        b = 0
        sequence = ""
        # b is the parser state: 0 = translation not seen yet, 1 = inside a
        # multi-line translation, 2 = done.
        while b < 2:
            line = i.split("\n")[a]
            if "translation=" in line:
                sequence = line.split("translation=")[1][1:]
                b += 1
                a += 1
                if line.count('"') > 1:
                    # Opening and closing quote on the same line.
                    sequence = line.split("translation=")[1][1:-1]
                    b = 2
            elif "translation=" in line.lower():
                sequence = line.lower().split("translation=")[1][1:]
                b += 1
                a += 1
                if line.count('"') > 1:
                    sequence = line.lower().split("translation=")[1][1:-1]
                    b = 2
            elif a == (nrlines - 2) or a == (nrlines - 1):
                # Reached the end of the entry without a complete translation.
                sequence = ""
                b = 2
            elif b == 1:
                if '"' in line:
                    seqline = line.replace(" ","")
                    seqline = seqline.split('"')[0]
                    sequence = sequence + seqline
                    b += 1
                else:
                    seqline = line.replace(" ","")
                    sequence = sequence + seqline
                    a += 1
            else:
                a += 1
        sequence = sequence.upper()
        #Quality-check sequence
        # Any of these characters invalidates the translation entirely.
        forbiddencharacters = ["'",'"','=',';',':','[',']','>','<','|','\\',"/",'*','-','_','.',',','?',')','(','^','#','!','`','~','+','{','}','@','$','%','&']
        for z in forbiddencharacters:
            if z in sequence:
                sequence = ""
        #Find annotation for each gene
        a = 0
        b = 0
        while b == 0:
            line = i.split("\n")[a]
            if "product=" in line:
                annotation = line.split("product=")[1][1:]
                annotation = annotation.replace(" ","_")
                if annotation[-1] == '"':
                    annotation = annotation[:-1]
                b += 1
            elif "product=" in line.lower():
                annotation = line.lower().split("product=")[1][1:]
                annotation = annotation.replace(" ","_")
                if annotation[-1] == '"':
                    annotation = annotation[:-1]
                b += 1
            elif a == (nrlines - 1):
                annotation = "not_annotated"
                b += 1
            else:
                a += 1
        accessiondict[genename] = accnr
        if join == "yes":
            joinlist.append(genename)
            joindict[genename] = joinedparts2
        #Save data to dictionary
        if len(genename) > 1:
            genedict[genename] = [start,end,strand,annotation,sequence]
            genelist.append(genename)
    # Report all malformed locations at once, after every entry was examined.
    if error == "y":
        errorinfo = "\n".join(errorlocations)
        print >> sys.stderr, "Exit: locations in GBK/EMBL file not properly formatted:\n" + errorinfo
        logfile.write("Exit: GBK file not properly formatted, no sequence found or no CDS annotation found.\n")
        logfile.close()
        sys.exit(1)
    return [genelist, genedict, joinlist, joindict, accessiondict]
456
def cleandnaseq(dnaseq):
    """Normalise a raw nucleotide string taken from a sequence file.

    Spaces, tabs, newlines, digits and "/" are removed, u/U become t/T, and
    the ambiguity codes r, y, w, s, m, k, h, b, v, d (either case) are
    replaced by a lowercase "n".
    """
    for unwanted in " \t\n0123456789/":
        dnaseq = dnaseq.replace(unwanted, "")
    dnaseq = dnaseq.replace("u", "t").replace("U", "T")
    for ambiguous in "rRyYwWsSmMkKhHbBvVdD":
        dnaseq = dnaseq.replace(ambiguous, "n")
    return dnaseq
495
def extractprotfasta(genelist,genedict,dnaseq,rc_dnaseq,joinlist,joindict,accessiondict):
    """Build FASTA-style names and protein sequences for all genes.

    For every gene in ``genelist`` the translation stored in
    ``genedict[gene][4]`` is used when it is longer than 5 characters.
    Otherwise the coding DNA is cut from ``dnaseq`` ("+" strand) or
    ``rc_dnaseq`` ("-" strand), joining the parts listed in ``joindict``
    for genes named in ``joinlist``, and passed through translate().

    Returns ``[names, seqs, genelist, genedict, accessiondict]`` with names
    of the form ``input|c1|<start>-<end>|<strand>|<gene>|<annotation>``.

    ``joindict`` is not modified. Raises ValueError if a gene without a
    usable translation has a strand other than "+" or "-".
    """
    names = []
    seqs = []
    rc_length = len(rc_dnaseq)
    for genename in genelist:
        gene = genedict[genename]
        #If suitable translation found in gbk, use that
        if len(gene[4]) > 5:
            protseq = gene[4]
        #If no suitable translation found in gbk, extract from DNA sequence
        else:
            start = int(gene[0])
            end = int(gene[1])
            if gene[2] == "+":
                if genename in joinlist:
                    geneseq = ""
                    for part in joindict[genename]:
                        partstart, partend = _joined_part_bounds(part)
                        geneseq = geneseq + dnaseq[(partstart - 1):partend]
                else:
                    geneseq = dnaseq[(start - 1):end]
            elif gene[2] == "-":
                if genename in joinlist:
                    geneseq = ""
                    # Walk the parts back to front on a reversed *view* so
                    # that the caller's joindict entry keeps its order.
                    for part in reversed(joindict[genename]):
                        partstart, partend = _joined_part_bounds(part)
                        geneseq = geneseq + rc_dnaseq[(rc_length - partend):(rc_length - partstart + 1)]
                else:
                    geneseq = rc_dnaseq[(rc_length - end):(rc_length - start + 1)]
            else:
                raise ValueError("Unknown strand %r for gene %s" % (gene[2], genename))
            protseq = translate(geneseq)
        name = "|".join(["input", "c1", gene[0] + "-" + gene[1], gene[2], genename, gene[3]])
        seqs.append(protseq)
        names.append(name)
    return [names, seqs, genelist, genedict, accessiondict]


def _joined_part_bounds(part):
    """Return (start, end) as ints for an "a..b" part or a single position."""
    partstart = int(part.split("..")[0])
    if ".." in part:
        partend = int(part.split("..")[1])
    else:
        partend = int(part)
    return partstart, partend
545
def gbk2proteins(gbkfile):
    """Extract proteins, accession and DNA length from a GenBank file.

    The file must contain the " CDS " feature marker and a "\\nORIGIN"
    section; otherwise an error is reported on stderr and in the
    module-level ``logfile`` and the process exits with status 1. The
    process also exits if fewer than half of the cleaned sequence
    characters are A/C/G/T, which is taken to mean a protein file.

    Returns ``[proteins, accession, dnaseqlength]`` where ``proteins`` is
    the result of extractprotfasta() and ``accession`` is "" unless the
    LOCUS name has at least 4 characters, 3 digits and 1 letter.

    A ValueError raised by parsegenes() is reported on stderr and then
    re-raised.
    """
    # Context manager so the handle is closed once the text has been read.
    with open(gbkfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r","\n")
    if " CDS " not in filetext or "\nORIGIN" not in filetext:
        sys.stderr.write("Exit: GBK file not properly formatted, no sequence found or no CDS annotation found.\n")
        logfile.write("Exit: GBK file not properly formatted, no sequence found or no CDS annotation found.\n")
        logfile.close()
        sys.exit(1)
    cdspart = filetext.split("\nORIGIN")[0]
    #Extract DNA sequence and calculate reverse complement of it
    dnaseq = cleandnaseq(filetext.split("\nORIGIN")[1])
    nucleotide_count = sum(dnaseq.count(base) for base in "AaCcGgTt")
    if nucleotide_count < (0.5 * len(dnaseq)):
        sys.stderr.write("Protein GBK/EMBL file provided. Please provide nucleotide GBK/EMBL file.\n")
        sys.exit(1)
    dnaseqlength = len(dnaseq)
    rc_dnaseq = reverse_complement(dnaseq)
    #Extract genes
    genes = cdspart.split(" CDS ")[1:]
    try:
        genesdetails = parsegenes(genes)
    except ValueError as e:
        # Emit the diagnostics before re-raising; previously the statements
        # following the bare ``raise`` could never run.
        sys.stderr.write("Could not parse genes from GBK/EMBL file. Please check if your GBK/EMBL file is valid.\n")
        sys.stderr.write("Error was: %s\n" % e)
        raise
    genelist = genesdetails[0]
    genedict = genesdetails[1]
    joinlist = genesdetails[2]
    joindict = genesdetails[3]
    accessiondict = genesdetails[4]
    #Locate all genes on DNA sequence and translate to protein sequence
    proteins = extractprotfasta(genelist,genedict,dnaseq,rc_dnaseq,joinlist,joindict,accessiondict)
    # Only the first record (text before the first "\n//") is searched.
    accession = ""
    for line in filetext.split("\n//")[0].split("\n"):
        if accession == "" and "LOCUS " in line:
            accession = line.split("LOCUS ")[1].split(" ")[0]
            if len(accession) < 4:
                accession = ""
    #Test if accession number is probably real GenBank/RefSeq acc nr
    nrnumbers = sum(1 for char in accession if char in "0123456789")
    nrletters = sum(1 for char in accession if char in ascii_letters)
    if nrnumbers < 3 or nrletters < 1:
        accession = ""
    return [proteins,accession,dnaseqlength]
612
def embl2proteins(emblfile,sequence):
    """Extract proteins, accession and DNA length from an EMBL file.

    The file must contain the "FT CDS " feature marker and either an SQ
    section or a non-empty caller-supplied ``sequence``; otherwise an error
    is reported in the module-level ``logfile`` and on stderr and the
    process exits with status 1. The process also exits if fewer than half
    of the cleaned sequence characters are A/C/G/T, or if parsegenes()
    raises ValueError.

    Returns ``[proteins, accession, dnaseqlength]`` where ``proteins`` is
    the result of extractprotfasta() and ``accession`` is "" unless the AC
    entry has at least 4 characters, 3 digits and 1 letter.
    """
    with open(emblfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r","\n")
    if "FT CDS " not in filetext or ("\nSQ" not in filetext and len(sequence) < 1):
        logfile.write("Exit: EMBL file not properly formatted, no sequence found or no CDS annotation found.\n")
        sys.stderr.write("Exit: EMBL file not properly formatted, no sequence found or no CDS annotation found.\n" + "\n")
        logfile.close()
        sys.exit(1)
    sections = filetext.split("\nSQ ")
    cdspart = sections[0]
    #Extract DNA sequence and calculate reverse complement of it
    if len(sections) > 1:
        # Skip the remainder of the SQ header line, keep the sequence lines.
        dnaseq = "".join(sections[1].split("\n")[1:])
    else:
        # No SQ section in the file: the guard above already accepts a
        # caller-supplied sequence in that case, so use it instead of
        # failing with an IndexError.
        dnaseq = sequence
    dnaseq = cleandnaseq(dnaseq)
    nucleotide_count = sum(dnaseq.count(base) for base in "AaCcGgTt")
    if nucleotide_count < (0.5 * len(dnaseq)):
        sys.stderr.write("Protein GBK/EMBL file provided. Please provide nucleotide GBK/EMBL file.\n")
        sys.exit(1)
    dnaseqlength = len(dnaseq)
    rc_dnaseq = reverse_complement(dnaseq)
    #Extract genes
    genes = cdspart.split("FT CDS ")[1:]
    try:
        genesdetails = parsegenes(genes)
    except ValueError as e:
        sys.stderr.write("Could not parse genes from GBK/EMBL file. Please check if your GBK/EMBL file is valid.\n")
        sys.stderr.write("Error was: %s\n" % e)
        sys.exit(1)
    genelist = genesdetails[0]
    genedict = genesdetails[1]
    joinlist = genesdetails[2]
    joindict = genesdetails[3]
    accessiondict = genesdetails[4]
    #Locate all genes on DNA sequence and translate to protein sequence
    proteins = extractprotfasta(genelist,genedict,dnaseq,rc_dnaseq,joinlist,joindict,accessiondict)
    accession = ""
    for line in filetext.split("SQ ")[0].split("\n"):
        if accession == "" and "AC " in line:
            accession = line.split("AC ")[1].replace(" ","").split(";")[0]
            if len(accession) < 4:
                accession = ""
    #Test if accession number is probably real GenBank/RefSeq acc nr
    nrnumbers = sum(1 for char in accession if char in "0123456789")
    nrletters = sum(1 for char in accession if char in ascii_letters)
    if nrnumbers < 3 or nrletters < 1:
        accession = ""
    return [proteins,accession,dnaseqlength]
683
def translate(sequence):
    """Translate a nucleotide string into a protein string.

    The sequence is read in consecutive triplets; an incomplete trailing
    triplet is ignored. The first triplet is always written as "M". Every
    other triplet is looked up in the standard genetic code, which accepts
    all-uppercase or all-lowercase codons; anything else becomes "X". A
    single trailing "*" is removed from the result.
    """
    #Translation table standard genetic code; according to http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
    bases = "TCAG"
    aminoacids = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"
    codons = [b1 + b2 + b3 for b1 in bases for b2 in bases for b3 in bases]
    transldict = {}
    for codon, aminoacid in zip(codons, aminoacids):
        transldict[codon] = aminoacid
        transldict[codon.lower()] = aminoacid
    usable = len(sequence) - (len(sequence) % 3)
    triplets = [sequence[pos:pos + 3] for pos in range(0, usable, 3)]
    if not triplets:
        return ""
    protseq = "M" + "".join(transldict.get(triplet, "X") for triplet in triplets[1:])
    if protseq[-1] == "*":
        protseq = protseq[:-1]
    return protseq
744
def writefasta(names,seqs,file):
    """Write names and sequences to ``file`` in FASTA format.

    Each record is written as ">name" on one line followed by the sequence
    on the next. ``names`` and ``seqs`` are matched by index.

    IOError, OSError and NotImplementedError are not propagated: a message
    is written to stderr and to the module-level ``logfile`` instead.
    """
    try:
        # The context manager closes the handle even if a write fails.
        with open(file, "w") as out_file:
            for index in range(len(names)):
                out_file.write(">%s\n%s\n" % (names[index], seqs[index]))
    except (IOError, OSError, NotImplementedError):
        sys.stderr.write("FASTA file not created.\n")
        logfile.write("FASTA file not created.\n")
762
def parsehmmoutput(cutoff,file):
    """Collect accessions and scores of hits scoring above ``cutoff``.

    Lines of ``file`` that start with "#" or hold at most one character are
    ignored; the rest are stripped. The third line of the file is used as a
    ruler: the width ``y`` of the first column is the position of the first
    non-dash character that follows at least two dashes. For each hit line
    the first ``y`` characters are split on "|" and field 4 is taken as the
    accession; the score is read from columns ``y+76`` to ``y+82``.

    Returns ``[proteins, scores]``, two parallel lists holding the
    accessions (longer than one character) and their float scores.
    """
    with open(file, "r") as handle:
        protlines = [line.strip() for line in handle
                     if len(line) > 1 and not line.startswith('#')]
    proteins = []
    scores = []
    measuringline = linecache.getline(file, 3)
    dashes = 0
    y = 0
    for char in measuringline:
        y += 1
        if "-" in char:
            dashes += 1
        elif dashes > 1:
            # First non-dash after the dashed ruler of the first column.
            break
    for hit in protlines:
        accession = hit[0:y].split("|")[4]
        score = float(hit[(y+76):(y+82)].replace(" ",""))
        if score > cutoff and len(accession) > 1:
            proteins.append(accession)
            scores.append(score)
    return [proteins,scores]
798
def sortonsecondvalueoflist(first,second):
    """Comparison function ordering two sequences by their second element.

    Both second elements are converted with int() before comparing, so
    numeric strings and integers are handled alike. Returns 1, -1 or 0 when
    the first value is greater than, less than or equal to the second.
    """
    f = int(first[1])
    # The second operand used to be compared unconverted, which gave
    # inconsistent results when it was a numeric string.
    s = int(second[1])
    return (f > s) - (f < s)
809
def hmmlengths(hmmfile):
    """Return a dictionary mapping HMM profile names to their lengths.

    The file is split into profiles on "//" (the text after the last "//"
    is discarded). For each profile the name is the rest of the line after
    "NAME " and the length is the integer following "LENG ".
    """
    hmmlengthsdict = {}
    with open(hmmfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r","\n")
    for profile in filetext.split("//")[:-1]:
        name = profile.split("NAME ")[1].split("\n", 1)[0]
        length = profile.split("LENG ")[1].split("\n", 1)[0]
        hmmlengthsdict[name] = int(length)
    return hmmlengthsdict
830
def hmmscanparse(hmmscanoutputfile,hmmlengthsdict):
    """Parse hmmscan text output into a dictionary of domain hits.

    Returns a dictionary mapping each query name (first word after
    "Query: ") to a list of ``[domainname, start, end, evalue, score]``
    entries ordered by start position. Start and end come from fields 12
    and 13 of each domain row, the e-value from field 5 (kept as a string)
    and the score from field 2.

    Three filters are applied to lists with more than one entry:
      1. of two hits overlapping by more than 20 positions only the
         higher-scoring one is kept;
      2. consecutive hits of the same domain are merged when at least one
         covers less than 75% of the HMM length and together they cover
         less than 150% of it (e-value and score of merged entries are
         stored as strings);
      3. hits covering 60% or less of the HMM length are dropped.

    ``hmmlengthsdict`` maps domain names to HMM lengths; a domain missing
    from it raises KeyError during filtering.
    """
    domaindict = {}
    with open(hmmscanoutputfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r","\n")
    maxoverlap = 20
    for output in filetext.split("Query: ")[1:]:
        protname = output.split("\n", 1)[0].split(" ", 1)[0]
        domainresults = output.split("Domain annotation for each model:\n")[1]
        domainresults = domainresults.split("\n\nInternal pipeline statistics summary:")[0]
        domainlist = []
        #Find all domains
        for domain in domainresults.split(">> "):
            tokens = domain.split('\n')
            domainname = tokens[0].split(" ", 1)[0]
            # Skip the name line and two table header lines, and the two
            # trailing lines of each domain section.
            for row in tokens[3:-2]:
                tabs = [tab for tab in row.split(" ") if tab != '']
                domainlist.append([domainname, int(tabs[12]), int(tabs[13]), tabs[5], float(tabs[2])])
        # Stable sort on the start position.
        domainlist.sort(key=lambda entry: entry[1])
        #Purify domain list to remove overlapping domains, only keeping those with the highest scores
        if len(domainlist) > 1:
            kept = [domainlist[0]]
            for entry in domainlist[1:]:
                if entry[1] < (kept[-1][2] - maxoverlap):
                    # Overlap: replace the previous hit only if this one
                    # scores strictly higher, otherwise drop this one.
                    if entry[4] > kept[-1][4]:
                        kept[-1] = entry
                else:
                    kept.append(entry)
            domainlist = kept
        #Merge domain fragments which are really one domain
        if len(domainlist) > 1:
            merged = [domainlist[0]]
            for entry in domainlist[1:]:
                previous = merged[-1]
                alilength1 = int(previous[2]) - int(previous[1])
                alilength2 = int(entry[2]) - int(entry[1])
                domainlength = hmmlengthsdict[entry[0]]
                if entry[0] == previous[0] and (alilength1 < (0.75 * domainlength) or alilength2 < (0.75 * domainlength)) and (alilength1 + alilength2) < (1.5 * domainlength):
                    evalue = str(float(previous[3]) * float(entry[3]))
                    score = str(float(previous[4]) + float(entry[4]))
                    merged[-1] = [entry[0], previous[1], entry[2], evalue, score]
                else:
                    merged.append(entry)
            domainlist = merged
        #Remove incomplete domains (covering less than 60% of total domain hmm length)
        if len(domainlist) > 1:
            domainlist = [entry for entry in domainlist
                          if (int(entry[2]) - int(entry[1])) > (0.6 * hmmlengthsdict[entry[0]])]
        #Save domainlist to domaindict
        domaindict[protname] = domainlist
    return domaindict
908
909 def blastparse(blasttext,minseqcoverage,minpercidentity,seqlengths,geneclustergenes):
910 blastdict = {}
911 querylist = []
912 hitclusters = []
913 blastlines = blasttext.split("\n")[:-1]
914 #Filter for best blast hits (of one query on each subject)
915 query_subject_combinations = []
916 blastlines2 = []
917 for i in blastlines:
918 tabs = i.split("\t")
919 query = tabs[0]
920 subject = tabs[1]
921 query_subject_combination = query + "_" + subject
922 if query_subject_combination in query_subject_combinations:
923 pass
924 else:
925 query_subject_combinations.append(query_subject_combination)
926 blastlines2.append(i)
927 blastlines = blastlines2
928 #Filters blastlines to get rid of hits that do not meet criteria
929 blastlines2 = []
930 for i in blastlines:
931 tabs = i.split("\t")
932 perc_ident = int(tabs[2].split(".",1)[0])
933 alignmentlength = float(tabs[3])
934 evalue = str(tabs[10])
935 blastscore = int(tabs[11].split(".",1)[0])
936 if seqlengths.has_key(query):
937 perc_coverage = (float(tabs[3]) / seqlengths[query]) * 100
938 if perc_ident > minpercidentity and (perc_coverage > minseqcoverage or alignmentlength > 40):
939 blastlines2.append(i)
940 blastlines = blastlines2
941 #Goes through the blastlines. For each query, creates a querydict and hitlist, and adds these to the blastdict when finding the next query
942 firstquery = "y"
943 for i in blastlines:
944 tabs = i.split("\t")
945 query = tabs[0]
946
947 second_column_split = tabs[1].split("|")
948
949 subject = second_column_split[4]
950 if subject == "no_locus_tag":
951 subject = second_column_split[6]
952 if subject in geneclustergenes:
953 subject = "h_" + subject
954 if len(second_column_split) > 6:
955 locustag = second_column_split[6]
956 else:
957 locustag = ""
958 subject_genecluster = second_column_split[0] + "_" + second_column_split[1]
959 subject_start = (second_column_split[2]).split("-")[0]
960 subject_end = (second_column_split[2]).split("-")[1]
961 subject_strand = second_column_split[3]
962 subject_annotation = second_column_split[5]
963 perc_ident = int(tabs[2].split(".")[0])
964 alignmentlength = float(tabs[3])
965 evalue = str(tabs[10])
966 blastscore = int(tabs[11].split(".", 1)[0])
967 if seqlengths.has_key(query):
968 perc_coverage = (float(tabs[3]) / seqlengths[query]) * 100
969 else:
970 seqlength = len(seqdict[query.split("|")[4]])
971 perc_coverage = (float(tabs[3]) / seqlength) * 100
972 if firstquery == "y": #Only until the first blastline with good hit
973 firstquery = "n"
974 querylist.append(query)
975 subjectlist = []
976 querydict = {}
977 subjectlist.append(subject)
978 querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
979 if subject_genecluster not in hitclusters:
980 hitclusters.append(subject_genecluster)
981 last_query = query
982 elif i == blastlines[-1]: #Only for the last blastline
983 if query not in querylist:
984 subjectlist.append(subject)
985 querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
986 blastdict[query] = [subjectlist,querydict]
987 querylist.append(query)
988 if subject_genecluster not in hitclusters:
989 hitclusters.append(subject_genecluster)
990 else:
991 subjectlist.append(subject)
992 querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
993 blastdict[query] = [subjectlist,querydict]
994 else: #For all but the first and last blastlines
995 if query not in querylist:
996 blastdict[last_query] = [subjectlist,querydict]
997 querylist.append(query)
998 subjectlist = []
999 querydict = {}
1000 subjectlist.append(subject)
1001 querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
1002 if subject_genecluster not in hitclusters:
1003 hitclusters.append(subject_genecluster)
1004 last_query = query
1005 else:
1006 subjectlist.append(subject)
1007 querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
1008 if subject_genecluster not in hitclusters:
1009 hitclusters.append(subject_genecluster)
1010 return [blastdict,querylist,hitclusters]
1011
def getdircontents():
    """Return the names of the entries in the current working directory.

    This is a thin wrapper around os.listdir('.'); the result is a list of
    names in the order os.listdir reports them.
    """
    #The former platform-specific "dir/w" / "ls" implementation lived on as
    #an unreachable string literal after the return statement; it is removed.
    return os.listdir('.')
1029
def _gene_arrow(start,end,strand,color,base,height):
    """Build the SVG polygon that depicts one gene as an arrow.

    start, end -- horizontal coordinates of the gene; swapped when start > end
    strand     -- "+" (arrow head at the end side) or "-" (arrow head at the
                  start side)
    color      -- fill colour handed to the polygon
    base       -- y coordinate of the lower edge of the arrow
    height     -- height of the arrow; the upper edge is at base - height

    Returns the polygon created by ShapeBuilder.createPolygon.
    Raises ValueError for any other strand value (the outline would otherwise
    be undefined).
    """
    if strand not in ("+", "-"):
        raise ValueError("strand must be '+' or '-', got %r" % (strand,))
    halfheight = height/2
    if start > end:
        start, end = end, start
    top = base - height
    middle = base - halfheight
    if (end - start) < halfheight:
        #Gene is shorter than an arrow head: draw the head (a triangle) only
        if strand == "+":
            pointsAsTuples = [(start,base),
                              (end,middle),
                              (start,top),
                              (start,base)]
        else:
            pointsAsTuples = [(start,middle),
                              (end,top),
                              (end,base),
                              (start,middle)]
    elif strand == "+":
        #Rectangular body followed by a head that ends in (end, middle)
        arrowstart = end - halfheight
        pointsAsTuples = [(start,base),
                          (arrowstart,base),
                          (end,middle),
                          (arrowstart,top),
                          (start,top),
                          (start,base)]
    else:
        #Head that starts in (start, middle) followed by a rectangular body
        arrowstart = start + halfheight
        pointsAsTuples = [(start,middle),
                          (arrowstart,top),
                          (end,top),
                          (end,base),
                          (arrowstart,base),
                          (start,middle)]
    oh = ShapeBuilder()
    return oh.createPolygon(points=oh.convertTupleArrayToPoints(pointsAsTuples),strokewidth=1, stroke='black', fill=color)
1073
def _gene_label(start,end,name,y,screenwidth):
    """Build the text element that labels a gene with its name.

    The label is placed horizontally at the midpoint of start and end and
    styled as italic 10px Verdana in colour #600000.

    NOTE: the y and screenwidth arguments are accepted but not used; the
    vertical position is always derived from the fixed base (35) and
    height (10) below.
    """
    label_style = StyleBuilder()
    label_style.setFontFamily(fontfamily="Verdana")
    label_style.setFontStyle(fontstyle='italic')
    label_style.setFontSize('10px')
    label_style.setFilling('#600000')
    base = 35
    height = 10
    label = text(name, ((start + end)/2), base + height/2)
    label.set_style(label_style.getStyle())
    return label
1091
def relativepositions(starts,ends,largestclustersize):
    """Convert start and end positions into integer drawing coordinates.

    Each position is taken relative to the first start site, divided by
    largestclustersize, multiplied by the module-level screenwidth and by
    0.75, and finally truncated with int().

    Returns [rel_starts, rel_ends], two lists parallel to starts and ends.
    """
    leftboundary = int(starts[0])

    def scaled(position):
        #Fraction of the largest cluster that lies left of this position
        fraction = float(float(int(position) - int(leftboundary)) / largestclustersize)
        return int(fraction * screenwidth * 0.75)

    rel_starts = [scaled(position) for position in starts]
    rel_ends = [scaled(position) for position in ends]
    return [rel_starts,rel_ends]
1107
def startendsitescheck(starts,ends):
    """Make sure every start site is not larger than its end site.

    starts and ends are parallel sequences; values are compared after
    conversion with int() but are returned unconverted. Whenever a start is
    numerically larger than the matching end, the two values are exchanged.

    Returns [starts, ends] as two new lists.
    """
    ordered_starts = []
    ordered_ends = []
    for position, first in enumerate(starts):
        if int(first) > int(ends[position]):
            lower, upper = ends[position], first
        else:
            lower, upper = first, ends[position]
        ordered_starts.append(lower)
        ordered_ends.append(upper)
    return [ordered_starts,ordered_ends]
1124
def RadialGradient(startcolor,stopcolor,gradientname):
    """Return a defs element holding a two-stop radial gradient.

    The gradient gets the id gradientname and runs from startcolor at
    offset 0% to stopcolor at offset 100%, both with stop opacity 1.
    """
    container = defs()
    gradient = radialGradient()
    gradient.set_id(gradientname)
    for offset, stopcolour in (("0%", startcolor), ("100%", stopcolor)):
        marker = stop(offset=offset)
        marker.set_stop_color(stopcolour)
        marker.set_stop_opacity(1)
        gradient.addElement(marker)
    container.addElement(gradient)
    return container
1139
def LinearGradient(startcolor,stopcolor,gradientname):
    """Return a defs element holding a two-stop linear gradient.

    The gradient gets the id gradientname and runs from startcolor at
    offset 0% to stopcolor at offset 100%, both with stop opacity 1.
    """
    container = defs()
    gradient = linearGradient()
    gradient.set_id(gradientname)
    for offset, stopcolour in (("0%", startcolor), ("100%", stopcolor)):
        marker = stop(offset=offset)
        marker.set_stop_color(stopcolour)
        marker.set_stop_opacity(1)
        gradient.addElement(marker)
    container.addElement(gradient)
    return container
1154
def generate_rgbscheme(nr):
    """Return a list of colour strings for nr groups.

    The smallest scheme size from (1, 2, 4, 8, 12, 18, 24, 32, 48, 64) that
    is >= nr is used:
      * sizes 1, 2 and 4 return fixed colour names;
      * sizes 8 to 32 return "rgb(r,g,b)" strings from a regular RGB grid in
        a fixed hand-picked order;
      * sizes 48 and 64 return the grid colours shuffled with random.shuffle;
      * for nr > 64 the 64-colour grid is repeated 20 times (1280 entries)
        and shuffled.

    Fixes relative to the previous implementation:
      * the size-18 ordering listed only 17 indices (17 was missing), so one
        colour too few was returned;
      * the size-32 ordering listed index 20 twice and never index 0, so two
        groups shared a colour; the second 20 is now 0;
      * nr > 10000 no longer fails on an unbound variable;
      * channel values use floor division so they stay integers regardless
        of the interpreter's "/" semantics.
    """
    #Number of grid points along the three colour axes per scheme size
    lengthsdict = {1:[1,1,1],2:[1,1,2],4:[1,2,2],8:[2,2,2],12:[2,2,3],18:[2,3,3],24:[3,3,3],32:[3,3,4],48:[3,4,4],64:[4,4,4]}
    namedcolors = {1:["red"],
                   2:["red","green"],
                   4:["red","green","blue","yellow"]}
    #Fixed orderings of the grid colours for the mid-sized schemes
    neworders = {8:[4,1,2,5,6,7,3,0],
                 12:[6,3,5,9,7,2,11,4,8,1,10,0],
                 18:[9,6,2,14,15,8,12,10,3,5,7,11,4,1,16,13,0,17],
                 24:[15,12,9,6,5,0,21,1,16,14,8,17,2,23,22,3,13,7,10,4,18,20,19,11],
                 32:[21,19,27,6,8,1,14,7,20,13,9,30,4,23,18,12,5,29,24,17,11,31,2,28,22,15,26,3,0,16,10,25]}
    #Smallest supported scheme size that can hold nr colours
    closestnr = None
    for candidate in sorted(lengthsdict):
        if candidate >= nr:
            closestnr = candidate
            break
    toohigh = closestnr is None
    if toohigh:
        closestnr = 64
    if closestnr in namedcolors:
        return namedcolors[closestnr]
    x, y, z = lengthsdict[closestnr]
    #All three axes start at the same offset, which is derived from z
    firstpoint = (255 // z) // 2
    xpoints = [firstpoint + step * (255 // x) for step in range(x)]
    ypoints = [firstpoint + step * (255 // y) for step in range(y)]
    zpoints = [firstpoint + step * (255 // z) for step in range(z)]
    colorlist = ["rgb(%s,%s,%s)" % (i, j, k) for i in xpoints for j in ypoints for k in zpoints]
    if toohigh:
        colorlist = colorlist * 20
    #The 3x3x3 and 3x3x4 grids hold more colours than needed; drop three
    if closestnr == 24:
        colorlist = colorlist[:15] + colorlist[18:]
    if closestnr == 32:
        colorlist = colorlist[:21] + colorlist[24:]
    if closestnr in neworders:
        return [colorlist[index] for index in neworders[closestnr]]
    random.shuffle(colorlist)
    return colorlist
1233
#Drawing parameters per domain kind, used by geneclustersvg:
#kind: (id prefix, dark colour, light colour, label,
#       x offset and fill of the 12px label,
#       x offset, y offset and fill of the 8px label)
_DOMAIN_STYLES = {
    "KS":    ("KS",    "#08B208", "#81F781", "KS",  -4, '#0A2A0A',  0, 3, '#3B0B0B'),
    "AT":    ("AT",    "#DC0404", "#F78181", "AT",  -4, '#2A1B0A',  0, 3, '#2A1B0A'),
    "DH":    ("DH",    "#B45F04", "#F7BE81", "DH",  -4, '#3B0B0B',  0, 3, '#3B0B0B'),
    "KR":    ("KR",    "#089E4B", "#81F781", "KR",  -4, '#0A2A1B',  0, 3, '#0A2A1B'),
    "ER":    ("ER",    "#089E85", "#81F7F3", "ER",  -4, '#0A2A29',  0, 3, '#0A2A29'),
    "ACP":   ("ACP",   "#084BC6", "#81BEF7", "ACP",  0, '#0A1B2A', -2, 3, '#0A1B2A'),
    "C":     ("C",     "#393989", "#8181F7", "C",    0, '#0A0A2A',  0, 3, '#0A0A2A'),
    "E":     ("E",     "#393989", "#8181F7", "E",    0, '#0A0A2A',  0, 3, '#0A0A2A'),
    "A":     ("A",     "#56157F", "#BE81F7", "A",    0, '#1B0A2A',  0, 3, '#1B0A2A'),
    "PCP":   ("PCP",   "#084BC6", "#81BEF7", "PCP",  0, '#0A1B2A', -2, 3, '#0A1B2A'),
    "TE":    ("TE",    "#750072", "#F5A9F2", "TE",   0, '#2A0A29',  0, 3, '#2A0A29'),
    #TD domains share the TE gradient ids and counter; their 8px label sits at +4
    "TD":    ("TE",    "#750072", "#F5A9F2", "TD",   0, '#2A0A29',  0, 4, '#2A0A29'),
    "other": ("other", "#929292", "#DBDBDB", None,   0, '#0B0B0B',  0, 3, '#0B0B0B'),
}


def _classify_domain(domainname):
    """Map a domain name onto a key of _DOMAIN_STYLES (first matching rule wins)."""
    if "PKS_KS" in domainname:
        return "KS"
    if "PKS_AT" in domainname:
        return "AT"
    if "PKS_DH" in domainname:
        return "DH"
    if "PKS_KR" in domainname:
        return "KR"
    if "PKS_ER" in domainname:
        return "ER"
    if "ACP" in domainname:
        return "ACP"
    if ("C" in domainname or "Heterocyclization" in domainname) and "ACP" not in domainname and "PCP" not in domainname and "NRPS-COM" not in domainname and "CAL" not in domainname:
        return "C"
    if "Epimerization" in domainname and "ER" not in domainname and "TE" not in domainname:
        return "E"
    if "AMP" in domainname or "A-OX" in domainname:
        return "A"
    if "PCP" in domainname:
        return "PCP"
    if "Thioesterase" in domainname:
        return "TE"
    if "TD" in domainname:
        return "TD"
    return "other"


def _domain_label_layout(kind, domainname, width):
    """Return (label, font size, x offset, y offset, fill) for a domain box of
    the given pixel width, or None when the box gets no label."""
    style = _DOMAIN_STYLES[kind]
    if kind != "other":
        if width < 20:
            return None
        if width < 100:
            return (style[3], '8px', style[6], style[7], style[8])
        return (style[3], '12px', style[4], 4, style[5])
    #Unrecognised domains are labelled with their own name minus all digits
    domname = "".join([char for char in domainname if char not in "0123456789"])
    fill = style[5]
    if width < 60:
        return None
    if len(domname) > 4 and width < 100:
        return (domname, '6px', -16, 3, fill)
    if len(domname) > 3 or width < 100:
        return (domname, '8px', -16, 3, fill)
    if not domname:
        #Nothing left to print (previously a stale label object was reused here)
        return None
    return (domname, '12px', {1: 0, 2: -4, 3: -12}[len(domname)], 4, fill)


def geneclustersvg(genes,rel_starts,rel_ends,strands,geneposdict,pksnrpsprots,pksnrpsdomains,qclusternr):
    """Build the SVG drawing of one gene cluster and its PKS/NRPS domains.

    genes          -- genes of the cluster (only the number of genes is used)
    rel_starts,
    rel_ends,
    strands        -- per-gene drawing coordinates and strands, indexed like genes
    geneposdict    -- geneposdict[prot][0] and [1] are the gene positions from
                      which the protein length ((end - start) / 3) is derived
    pksnrpsprots   -- proteins for which a domain track is drawn
    pksnrpsdomains -- pksnrpsdomains[prot][0] is the list of domain names and
                      pksnrpsdomains[prot][1] maps each name to a sequence
                      whose first two items are its start and end
    qclusternr     -- cluster number, used in the element and gradient ids

    Reads the module-level names screenwidth and colors. Returns the svg
    element.

    Gene arrows get ids "a<qclusternr>_00<n>", domain boxes "b<qclusternr>_00<n>".
    Each protein backbone line is placed in a group of its own; previously it
    was appended to whatever group had been created last and that group was
    added to the drawing a second time.
    """
    s = svg(x = 0, y = 0, width = (screenwidth * 0.75), height = (259 + 99 * len(pksnrpsprots)))
    s.set_viewBox("0 -30 " + str(screenwidth * 0.8) + " " + str(185 + 70 * len(pksnrpsprots)))
    s.set_preserveAspectRatio("none")
    oh = ShapeBuilder()

    #Add line behind gene arrows
    group = g()
    group.addElement(oh.createLine(10,60,10 + (screenwidth * 0.75),60, strokewidth = 2, stroke = "grey"))
    s.addElement(group)

    #Add gene arrows
    for x in range(len(genes)):
        group = g()
        group.addElement(_gene_arrow(10 + rel_starts[x],10 + rel_ends[x],strands[x],colors[x],65,10))
        group.set_id("a" + str(qclusternr) + "_00%s"%x)
        s.addElement(group)

    #Determine longest protein to decide on scaling
    longestprot = 0
    protlengthdict = {}
    for i in pksnrpsprots:
        protlength = (geneposdict[i][1] - geneposdict[i][0]) / 3
        protlengthdict[i] = protlength
        if protlength > longestprot:
            longestprot = protlength
    try:
        aa2pixelratio = longestprot * 0.75 / screenwidth
    except ZeroDivisionError:
        aa2pixelratio = 0.1

    myStyle = StyleBuilder()
    myStyle.setFontFamily(fontfamily="MS Reference Sans Serif")
    myStyle.setFontWeight(fontweight='bold')
    myStyle.setFontSize('12px')

    #Add domain depictions
    counters = {}   #next gradient number per id prefix
    w = 0           #running number of the domain box, used in its id
    for z, i in enumerate(pksnrpsprots, 1):
        domains = pksnrpsdomains[i][0]
        domainsdict = pksnrpsdomains[i][1]
        ymid = 125 + z * 60
        backbone = g()
        backbone.addElement(oh.createLine(10,ymid,10 + ((float(protlengthdict[i]) / float(longestprot)) * (screenwidth * 0.75)),ymid, strokewidth = 1, stroke = "grey"))
        s.addElement(backbone)
        for j in domains:
            startpos = domainsdict[j][0]
            endpos = domainsdict[j][1]
            kind = _classify_domain(j)
            prefix, dark, light = _DOMAIN_STYLES[kind][:3]
            number = counters.get(prefix, 1)
            counters[prefix] = number + 1
            idsuffix = str(qclusternr) + "_" + str(number)
            left = startpos / aa2pixelratio
            width = (endpos-startpos) / aa2pixelratio
            group = g()
            group.addElement(LinearGradient(dark,light,prefix + "_domain" + idsuffix))
            group.addElement(LinearGradient(light,dark,prefix + "_line" + idsuffix))
            group.addElement(oh.createRect(str(10 + left),str(ymid - 8),str(width),15,8,strokewidth=1,stroke='url(#' + prefix + '_line' + idsuffix + ")",fill="url(#" + prefix + "_domain" + idsuffix + ")"))
            layout = _domain_label_layout(kind, j, width)
            if layout is not None:
                labeltext, fontsize, dx, dy, fill = layout
                myStyle.setFontSize(fontsize)
                f = text(labeltext,((dx + left) + 0.5 * width), (ymid + dy),fill=fill)
                f.set_style(myStyle.getStyle())
                group.addElement(f)
            group.set_id("b" + str(qclusternr) + "_00%s"%w)
            s.addElement(group)
            w += 1
    return s
1591
def calculate_colorgroups(queryclusternumber,hitclusternumbers,queryclusterdata,internalhomologygroupsdict):
    """Assign a colour index to every gene that belongs to a homology group.

    queryclusternumber         -- key of the query cluster in queryclusterdata
                                  and internalhomologygroupsdict
    hitclusternumbers          -- hit clusters to take colour groups from
    queryclusterdata           -- queryclusterdata[queryclusternumber][1] is
                                  the hit cluster data; its entry [1][3] holds
                                  the query cluster genes and entry [n][0][n]
                                  the colour groups of hit cluster n
    internalhomologygroupsdict -- per query cluster, the groups of genes that
                                  are homologous within that cluster

    Returns (colorschemedict, rgbcolorscheme): a dict mapping gene names to a
    colour index, and the colour list from generate_rgbscheme with "#FFFFFF"
    appended.
    """
    hitclusterdata = queryclusterdata[queryclusternumber][1]
    queryclustergenes = hitclusterdata[1][3]
    colorgroupslist = [hitclusterdata[hitclusternumber][0][hitclusternumber] for hitclusternumber in hitclusternumbers]
    #Merge all colour groups that contain a gene of the same internal homology group
    metacolorgroups = []
    for internalgroup in internalhomologygroupsdict[queryclusternumber]:
        metagroup = []
        for gene in internalgroup:
            for colorgroups in colorgroupslist:
                for colorgroup in colorgroups:
                    if gene in colorgroup:
                        for member in colorgroup:
                            if member not in metagroup:
                                metagroup.append(member)
        if len(metagroup) > 1 and metagroup not in metacolorgroups:
            metacolorgroups.append(metagroup)
    #Generate RGB scheme
    rgbcolorscheme = generate_rgbscheme(len(metacolorgroups))
    rgbcolorscheme.append("#FFFFFF")
    #Create colorschemedict in which all genes that are hits of the same query gene get the same color
    colorschemedict = {}
    z = 0
    for gene in queryclustergenes:
        for metagroup in metacolorgroups:
            if gene in metagroup:
                assigned = False
                for member in metagroup:
                    if member not in colorschemedict:
                        colorschemedict[member] = z
                        assigned = True
                #Move on to the next colour only once the current one is in use
                if assigned:
                    z += 1
    return colorschemedict,rgbcolorscheme
1638
1639 def clusterblastresults(queryclusternumber,hitclusternumbers,queryclusterdata,colorschemedict,rgbcolorscheme):
# Draw an SVG comparison of one query gene cluster against the requested hit gene clusters.
#
# Parameters (described by how they are indexed below):
#   queryclusternumber -- key into queryclusterdata; also embedded in the SVG group ids.
#   hitclusternumbers  -- hit cluster keys to draw; .index() and len() are called on it, so it must be list-like.
#   queryclusterdata   -- queryclusterdata[queryclusternumber][0] is read into nrhitclusters (unused afterwards),
#                         [1] is the per-cluster data referred to as hitclusterdata.
#   colorschemedict    -- gene -> colour index; genes that are not a key are marked "white".
#   rgbcolorscheme     -- indexed by colour index; entry [-1] is used for "white" genes.
#
# Returns [s, [qdata, hdata, strandsbalancedict]]: s is the svg object, qdata is
# [rel_starts, rel_ends, strands, colors] for the query, hdata maps each hit cluster to the same
# four-item layout and strandsbalancedict maps each hit cluster to its strand balance.
#
# Uses names defined outside this function: screenwidth, startendsitescheck, relativepositions,
# _gene_arrow, svg, g and ShapeBuilder (the last three presumably come from the pysvg star-imports).
# NOTE(review): this listing has lost its indentation; the nesting implied by the comments below is
# inferred from data flow and should be confirmed against the original file.
1640 #print "Generating svg for cluster",queryclusternumber
1641 #Extract data and generate color scheme
1642 nrhitclusters = queryclusterdata[queryclusternumber][0]
1643 hitclusterdata = queryclusterdata[queryclusternumber][1]
# Query genes and their details are taken from entry 1 of hitclusterdata (slots 3 and 4).
1644 queryclustergenes = hitclusterdata[1][3]
1645 queryclustergenesdetails = hitclusterdata[1][4]
# Collect the colour groups of every requested hit cluster. Only colorgroupsdict is read later;
# colorgroupslengthlist and colorgroupslist are filled but never used again in this function.
1646 colorgroupsdict = {}
1647 colorgroupslengthlist = []
1648 colorgroupslist = []
1649 for hitclusternumber in hitclusternumbers:
1650 colorgroups = hitclusterdata[hitclusternumber][0][hitclusternumber]
1651 colorgroupsdict[hitclusternumber] = colorgroups
1652 colorgroupslengthlist.append(len(colorgroups))
1653 colorgroupslist.append(colorgroups)
1654 #Find out whether hit gene cluster needs to be inverted compared to query gene cluster
# Strand balance per hit cluster: for each query gene and each colour group containing it, every
# group member that belongs to the hit cluster counts +1 when its strand equals the query gene's
# strand and -1 when it differs. A negative balance triggers the inversion further down.
1655 strandsbalancedict = {}
1656 for m in hitclusternumbers:
1657 hitclustergenesdetails = hitclusterdata[m][2]
1658 strandsbalance = 0
1659 for i in queryclustergenes:
1660 refstrand = queryclustergenesdetails[i][2]
1661 for j in colorgroupsdict[m]:
1662 if i in j:
1663 for k in j:
1664 if k in hitclusterdata[m][1] and hitclustergenesdetails[k][2] == refstrand:
1665 strandsbalance += 1
1666 elif k in hitclusterdata[m][1] and hitclusterdata[m][2][k][2] != refstrand:
1667 strandsbalance = strandsbalance - 1
1668 strandsbalancedict[m] = strandsbalance
1669 #Generate coordinates for SVG figure
# Gene details are indexed [0] and [1] for the two coordinates and [2] for the strand.
# Whichever coordinate compares greater is appended to qstarts and the other one to qends;
# startendsitescheck (defined elsewhere) then post-processes both lists.
1670 qnrgenes = len(queryclustergenes)
1671 qstarts =[]
1672 qends = []
1673 qstrands =[]
1674 qcolors = []
1675 for i in queryclustergenes:
1676 qgenedata = queryclustergenesdetails[i]
1677 if qgenedata[0] > qgenedata[1]:
1678 qstarts.append(qgenedata[0])
1679 qends.append(qgenedata[1])
1680 else:
1681 qstarts.append(qgenedata[1])
1682 qends.append(qgenedata[0])
1683 qstrands.append(qgenedata[2])
1684 if colorschemedict.has_key(i):
1685 qcolors.append(colorschemedict[i])
1686 else:
1687 qcolors.append("white")
1688 qstarts_ends = startendsitescheck(qstarts,qends)
1689 qstarts = qstarts_ends[0]
1690 qends = qstarts_ends[1]
# Same extraction for every hit cluster; the result is stored as hdata[m] = [starts, ends, strands, colors].
# hnrgenes is assigned here but only the value assigned again in the drawing section is used.
1691 hdata = {}
1692 for m in hitclusternumbers:
1693 hitclustergenes = hitclusterdata[m][1]
1694 hitclustergenesdetails = hitclusterdata[m][2]
1695 hnrgenes = len(hitclustergenes)
1696 hstarts =[]
1697 hends = []
1698 hstrands =[]
1699 hcolors = []
1700 for i in hitclustergenes:
1701 hgenedata = hitclustergenesdetails[i]
1702 if hgenedata[0] > hgenedata[1]:
1703 hstarts.append(hgenedata[0])
1704 hends.append(hgenedata[1])
1705 else:
1706 hstarts.append(hgenedata[1])
1707 hends.append(hgenedata[0])
1708 hstrands.append(hgenedata[2])
1709 if colorschemedict.has_key(i):
1710 hcolors.append(colorschemedict[i])
1711 else:
1712 hcolors.append("white")
1713 #Invert gene cluster if needed
# Inversion mirrors each coordinate as str(100000000 - int(x)), reverses the lists, swaps "+" and
# "-" and reverses the colours so that all four lists stay aligned.
# NOTE(review): a strand value other than "+" or "-" is not copied into hstrands2, which would
# leave the strand list shorter than the other lists -- confirm strands are always "+" or "-".
1714 if strandsbalancedict[m] < 0:
1715 hstarts2 = []
1716 hends2 = []
1717 hstrands2 = []
1718 for i in hstarts:
1719 hstarts2.append(str(100000000 - int(i)))
1720 hstarts = hstarts2
1721 hstarts.reverse()
1722 for i in hends:
1723 hends2.append(str(100000000 - int(i)))
1724 hends = hends2
1725 hends.reverse()
1726 for i in hstrands:
1727 if i == "+":
1728 hstrands2.append("-")
1729 elif i == "-":
1730 hstrands2.append("+")
1731 hstrands = hstrands2
1732 hstrands.reverse()
1733 hcolors.reverse()
1734 hstarts_ends = startendsitescheck(hstarts,hends)
1735 hstarts = hstarts_ends[0]
1736 hends = hstarts_ends[1]
1737 hdata[m] = [hstarts,hends,hstrands,hcolors]
1738 #Find cluster size of largest cluster of query & all hit clusters assessed
# A cluster's size is int(last end) - int(first start). smallestclustersize is computed but not used.
1739 clustersizes = []
1740 for m in hitclusternumbers:
1741 hclustersize = int(hdata[m][1][-1]) - int(hdata[m][0][0])
1742 clustersizes.append(hclustersize)
1743 qclustersize = int(qends[-1]) - int(qstarts[0])
1744 clustersizes.append(qclustersize)
1745 largestclustersize = max(clustersizes)
1746 smallestclustersize = min(clustersizes)
1747 #Find relative positions
# relativepositions (defined elsewhere) returns a pair: [0] relative starts, [1] relative ends.
1748 qrelpositions = relativepositions(qstarts,qends,largestclustersize)
1749 qrel_starts = qrelpositions[0]
1750 qrel_ends = qrelpositions[1]
1751 qdata = [qrel_starts,qrel_ends,qstrands,qcolors]
1752 hdata2 = {}
1753 qdata2 = []
1754 for m in hitclusternumbers:
1755 hclustersize = int(hdata[m][1][-1]) - int(hdata[m][0][0])
1756 hrelpositions = relativepositions(hdata[m][0],hdata[m][1],largestclustersize)
1757 hrel_starts = hrelpositions[0]
1758 hrel_ends = hrelpositions[1]
1759 #Center-align smallest gene cluster
# The centring offset is int(float(float((largest - size) / 2) / largest) * screenwidth * 0.75).
# Both sizes are ints, so under Python 2 the inner "/ 2" is integer division.
# NOTE(review): the query positions are shifted whenever a hit cluster's size equals the largest
# size, so several such hit clusters would shift the query more than once -- confirm.
1760 if largestclustersize == hclustersize:
1761 qrel_ends2 = []
1762 qrel_starts2 = []
1763 for i in qrel_starts:
1764 qrel_starts2.append(int(i) + int(float(float((largestclustersize - qclustersize) / 2) / largestclustersize) * screenwidth * 0.75))
1765 for i in qrel_ends:
1766 qrel_ends2.append(int(i) + int(float(float((largestclustersize - qclustersize) / 2) / largestclustersize) * screenwidth * 0.75))
1767 qrel_ends = qrel_ends2
1768 qrel_starts = qrel_starts2
1769 else:
1770 hrel_ends2 = []
1771 hrel_starts2 = []
1772 for i in hrel_starts:
1773 hrel_starts2.append(int(i) + int(float(float((largestclustersize - hclustersize) / 2) / largestclustersize) * screenwidth * 0.75))
1774 for i in hrel_ends:
1775 hrel_ends2.append(int(i) + int(float(float((largestclustersize - hclustersize) / 2) / largestclustersize) * screenwidth * 0.75))
1776 hrel_ends = hrel_ends2
1777 hrel_starts = hrel_starts2
1778 hdata2[m] = [hrel_starts,hrel_ends,hdata[m][2],hdata[m][3]]
1779 qdata2 = [qrel_starts,qrel_ends,qdata[2],qdata[3]]
1780 hdata = hdata2
1781 qdata = qdata2
# Canvas: width is 0.75 * screenwidth and height is 270 + 50 per hit cluster; the viewBox is
# 0.8 * screenwidth wide and 180 + 50 per hit cluster high, with aspect-ratio preservation off.
1782 s = svg(x = 0, y = 0, width = (screenwidth * 0.75), height = (270 + len(hitclusternumbers) * 50))
1783 viewbox = "0 0 " + str(screenwidth * 0.8) + " " + str(180 + len(hitclusternumbers) * 50)
1784 s.set_viewBox(viewbox)
1785 s.set_preserveAspectRatio("none")
1786 #Add line behind query gene cluster gene arrows
1787 oh = ShapeBuilder()
1788 group = g()
1789 group.addElement(oh.createLine(10,35,10 + (screenwidth * 0.75),35, strokewidth = 1, stroke = "grey"))
1790 s.addElement(group)
1791 #Add query gene cluster gene arrows
# One g() group per query gene holding the shape returned by _gene_arrow. "a" indexes the
# per-gene lists; "y" toggles between 0 and 1 but is only referenced by the commented-out label call.
# Group ids are "q<query>_<hit>_<n>" when a single hit cluster is drawn, else "all_<query>_0_<n>".
1792 a = 0
1793 y = 0
1794 for x in range(qnrgenes):
1795 group = g()
1796 #group.addElement(_gene_label(rel_starts[a],rel_ends[a],genes[a],y,screenwidth))
1797 if qcolors[a] == "white":
1798 group.addElement(_gene_arrow(10 + qrel_starts[a],10 + qrel_ends[a],qstrands[a],rgbcolorscheme[-1],40,10))
1799 else:
1800 group.addElement(_gene_arrow(10 + qrel_starts[a],10 + qrel_ends[a],qstrands[a],rgbcolorscheme[qcolors[a]],40,10))
1801 #Can be used for domains
1802 #group.addElement(oh.createRect(rel_starts[a],45,(rel_ends[a]-rel_starts[a]),10, strokewidth = 2, stroke = "black", fill="#237845"))
1803 if len(hitclusternumbers) == 1:
1804 group.set_id("q" + str(queryclusternumber) + "_" + str(hitclusternumbers[0]) + "_" + "%s"%x)
1805 else:
1806 group.set_id("all_" + str(queryclusternumber) + "_0_" + "%s"%x)
1807 s.addElement(group)
1808 if y == 0:
1809 y = 1
1810 elif y == 1:
1811 y = 0
1812 a += 1
# Each hit cluster is drawn 50 units lower than the previous row (row = position in hitclusternumbers + 1).
# NOTE(review): no new g() is created for the background line, so "group" still refers to the
# most recently created group; the line is appended to it and that group is added to s again --
# confirm this is intended.
1813 for m in hitclusternumbers:
1814 #Add line behind hit gene cluster gene arrows
1815 group.addElement(oh.createLine(10,35 + 50 * (hitclusternumbers.index(m) + 1),10 + (screenwidth * 0.75),35 + 50 * (hitclusternumbers.index(m) + 1), strokewidth = 1, stroke = "grey"))
1816 s.addElement(group)
1817 #Add hit gene cluster gene arrows
1818 hitclustergenes = hitclusterdata[m][1]
1819 hnrgenes = len(hitclustergenes)
1820 hrel_starts = hdata[m][0]
1821 hrel_ends = hdata[m][1]
1822 hstrands = hdata[m][2]
1823 hcolors = hdata[m][3]
1824 a = 0
1825 y = 0
# Group ids are "h<query>_<hit>_<n>" when a single hit cluster is drawn, else "all_<query>_<hit>_<n>".
1826 for x in range(hnrgenes):
1827 group = g()
1828 #group.addElement(_gene_label(rel_starts[a],rel_ends[a],genes[a],y,screenwidth))
1829 if hcolors[a] == "white":
1830 group.addElement(_gene_arrow(10 + hrel_starts[a],10 + hrel_ends[a],hstrands[a],rgbcolorscheme[-1],40 + 50 * (hitclusternumbers.index(m) + 1),10))
1831 else:
1832 group.addElement(_gene_arrow(10 + hrel_starts[a],10 + hrel_ends[a],hstrands[a],rgbcolorscheme[hcolors[a]],40 + 50 * (hitclusternumbers.index(m) + 1),10))
1833 #Can be used for domains
1834 # group.addElement(oh.createRect(rel_starts[a],45,(rel_ends[a]-rel_starts[a]),10, strokewidth = 2, stroke = "black", fill="#237845"))
1835 if len(hitclusternumbers) == 1:
1836 group.set_id("h" + str(queryclusternumber) + "_" + str(m) + "_" + "%s"%x)
1837 else:
1838 group.set_id("all_" + str(queryclusternumber) + "_" + str(m) + "_" + "%s"%x)
1839 s.addElement(group)
1840 if y == 0:
1841 y = 1
1842 elif y == 1:
1843 y = 0
1844 a += 1
# qdata and hdata returned here hold the relative (centred) positions computed above.
1845 return [s,[qdata,hdata,strandsbalancedict]]
1846
def runblast(query):
    """Run blastp for the FASTA file *query* against the ClusterBlast protein set.

    The database is ``clusterblast/geneclusterprots.fasta`` below the
    module-level ``antismash_path``.  Tabular output (``-outfmt 6``) is
    written to a file named after everything in *query* before its first
    ".", with ".out" appended.  The command is handed to the shell as one
    string; nothing is returned.
    """
    outputprefix = query.split(".")[0]
    commandparts = [
        "blastp -db ",
        antismash_path,
        "clusterblast/geneclusterprots.fasta -query ",
        query,
        " -outfmt 6 -max_target_seqs 1000 -evalue 1e-05 -out ",
        outputprefix,
        ".out",
    ]
    os.system("".join(commandparts))
1850
1851 def smcog_analysis(inputgenes,inputnr,accessiondict,seqdict,smcogdict,smcogsoutputfolder):
# For every gene in inputgenes that has at least one entry in smcogdict: add the gene to the
# matching smCOG alignment with muscle, trim the alignment, build a tree with FastTree, render
# the tree to PNG with TreeGraph and copy the PNG into smcogsoutputfolder.
#
# Parameters:
#   inputgenes         -- gene identifiers; each is used as dict key, FASTA tag and PNG base name.
#   inputnr            -- value appended to every temporary file name (input<nr>.fasta, muscle<nr>.fasta, ...).
#   accessiondict      -- gene -> value stored in "gene"; that value is not used afterwards.
#   seqdict            -- gene -> sequence passed to writefasta for the query file.
#   smcogdict          -- gene -> list; smcogdict[k][0][0] up to its first ":" names the smCOG.
#   smcogsoutputfolder -- target folder for the PNG, addressed relative to the parent directory ("../").
# Returns nothing.
#
# Uses names defined outside this function: writefasta, sortdictkeysbyvaluesrev, os, sys, time,
# signal and subprocess. External programs are started through the shell.
# NOTE(review): this listing has lost its indentation; the nesting implied by the comments below is
# inferred from data flow and should be confirmed against the original file.
1852 #create input.fasta file with single query sequence to be used as input for MSA
1853 for k in inputgenes:
1854 gene = accessiondict[k]
1855 tag = k
1856 seq = seqdict[k]
1857 writefasta([tag],[seq],"input" + str(inputnr) + ".fasta")
1858 if len(smcogdict[k]) > 0:
1859 smcog = (smcogdict[k][0][0]).split(":")[0]
1860 #Align to multiple sequence alignment, output as fasta file
# muscle -profile combines the existing alignment <smcog, lower-cased>_muscle.fasta (-in1) with the
# single-sequence query file (-in2). fastafile is assigned but not used afterwards.
1861 fastafile = "input" + str(inputnr) + ".fasta"
1862 musclecommand = "muscle -quiet -profile -in1 " + str(smcog).lower() + "_muscle.fasta -in2 input" + str(inputnr) + ".fasta -out muscle" + str(inputnr) + ".fasta"
1863 os.system(musclecommand)
1864 #Trim alignment
1865 #edit muscle fasta file: remove all positions before the first and after the last position shared by >33% of all sequences
# NOTE(review): "file" shadows the builtin and the handle is not closed anywhere in this function.
1866 file = open("muscle" + str(inputnr) + ".fasta","r")
1867 filetext = file.read()
1868 filetext = filetext.replace("\r","\n")
1869 lines = filetext.split("\n")
1870 ##Combine all sequence lines into single lines
# Header lines (starting with ">") flush the sequence text gathered so far; other lines are
# concatenated. nrlines is taken before the last element is dropped, so "a == nrlines - 2"
# identifies the final remaining line. The first element of lines2 is discarded afterwards.
# NOTE(review): i[0] raises IndexError on an empty line, and "\r" -> "\n" turns "\r\n" line ends
# into blank lines -- confirm the muscle output never contains them.
1871 lines2 = []
1872 seq = ""
1873 nrlines = len(lines)
1874 a = 0
1875 lines = lines[:-1]
1876 for i in lines:
1877 if a == (nrlines - 2):
1878 seq = seq + i
1879 lines2.append(seq)
1880 if i[0] == ">":
1881 lines2.append(seq)
1882 seq = ""
1883 lines2.append(i)
1884 else:
1885 seq = seq + i
1886 a += 1
1887 lines = lines2[1:]
1888 #Retrieve names and seqs from muscle fasta lines
1889 seqs = []
1890 names = []
1891 for i in lines:
1892 if len(i) > 0 and i[0] == ">":
1893 name = i[1:]
1894 names.append(name)
1895 else:
1896 seq = i
1897 seqs.append(seq)
1898 #Find first and last amino acids shared conserved >33%
1899 #Create list system to store conservation of residues
# conservationlist holds one counter dict per alignment column (length taken from the first
# sequence). Its keys are the upper-case letters except "O", plus "-"; any other character in a
# sequence raises KeyError at the "+= 1" below.
1900 conservationlist = []
1901 lenseqs = len(seqs[0])
1902 nrseqs = len(seqs)
1903 for i in range(lenseqs):
1904 conservationlist.append({"A":0,"B":0,"C":0,"D":0,"E":0,"F":0,"G":0,"H":0,"I":0,"J":0,"K":0,"L":0,"M":0,"N":0,"P":0,"Q":0,"R":0,"S":0,"T":0,"U":0,"V":0,"W":0,"X":0,"Y":0,"Z":0,"-":0})
1905 a = 0
1906 for i in seqs:
1907 aa = list(i)
1908 for i in aa:
1909 conservationlist[a][i] += 1
1910 a += 1
1911 a = 0
1912 firstsharedaa = 0
1913 lastsharedaa = lenseqs
1914 #Find first amino acid shared
# sortdictkeysbyvaluesrev returns the keys ordered by descending count, so aa[0] is the most
# frequent symbol of the column. Only the first matching column is recorded (first == "yes").
# Under Python 2, nrseqs / 3 is integer division.
# NOTE(review): the threshold is tested on i[aa[1]] (the second most frequent symbol) rather than
# on i[aa[0]] -- confirm this is intended.
1915 first = "yes"
1916 nr = 0
1917 for i in conservationlist:
1918 aa = sortdictkeysbyvaluesrev(i)
1919 if aa[0] != "-" and i[aa[1]] > (nrseqs / 3) and first == "yes":
1920 firstsharedaa = nr
1921 first = "no"
1922 nr += 1
1923 #Find last amino acid shared
# Same scan on the reversed column list; the hit is converted back with lenseqs - nr.
1924 conservationlist.reverse()
1925 first = "yes"
1926 nr = 0
1927 for i in conservationlist:
1928 aa = sortdictkeysbyvaluesrev(i)
1929 if aa[0] != "-" and i[aa[1]] > (nrseqs / 3) and first == "yes":
1930 lastsharedaa = lenseqs - nr
1931 first = "no"
1932 nr += 1
1933 #Shorten sequences to detected conserved regions
1934 seqs2 = []
1935 for i in seqs:
1936 seq = i[firstsharedaa:lastsharedaa]
1937 seqs2.append(seq)
1938 seqs = seqs2
1939 seedfastaname = "trimmed_alignment" + str(inputnr) + ".fasta"
1940 writefasta(names,seqs,seedfastaname)
1941 #Draw phylogenetic tree with fasttree 2.1.1
# fasttreecommand is only assigned when sys.platform is 'win32' or 'linux2'; with any other value
# the os.system call below refers to a name that was never assigned.
1942 nwkfile = "tree" + str(inputnr) + ".nwk"
1943 if sys.platform == ('win32'):
1944 fasttreecommand = "fasttree -quiet -fastest -noml trimmed_alignment" + str(inputnr) + ".fasta > " + nwkfile
1945 elif sys.platform == ('linux2'):
1946 fasttreecommand = "./FastTree -quiet -fastest -noml trimmed_alignment" + str(inputnr) + ".fasta > " + nwkfile
1947 os.system(fasttreecommand)
1948 #Convert tree to XTG and draw PNG image using TreeGraph
# stdout and stderr of the java process are captured together through one pipe.
1949 p = subprocess.Popen("java -Djava.awt.headless=true -jar TreeGraph.jar -convert tree" + str(inputnr) + ".nwk -xtg tree" + str(inputnr) + ".xtg", shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
# The process is polled every 2 seconds and killed once 300 seconds have passed (SIGKILL on
# 'linux2', taskkill on 'win32').
# NOTE(review): the loop ends early only when poll() returns 0, so a process that exits with a
# non-zero status appears to be waited on until the 300 second limit -- confirm.
1950 processes_starttime = time.time()
1951 while True:
1952 if (time.time() - processes_starttime) > 300:
1953 if sys.platform == ('linux2'):
1954 os.kill(p.pid,signal.SIGKILL)
1955 break
1956 if sys.platform == ('win32'):
1957 subprocess.Popen("taskkill /F /T /PID %i"%p.pid , shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
1958 break
1959 if p.poll() == 0:
1960 break
1961 time.sleep(2)
1962 out, err = p.communicate()
1963 output = out
# The PNG is only rendered when the conversion output does not mention an exception.
# The image is named after the part of the gene tag before its first ".".
1964 if "exception" not in output and "Exception" not in output:
1965 p = subprocess.Popen("java -Djava.awt.headless=true -jar TreeGraph.jar -image tree" + str(inputnr) + ".xtg " + tag.split(".")[0] + ".png", shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
1966 processes_starttime = time.time()
1967 while True:
1968 if (time.time() - processes_starttime) > 300:
1969 if sys.platform == ('linux2'):
1970 os.kill(p.pid,signal.SIGKILL)
1971 break
1972 if sys.platform == ('win32'):
1973 subprocess.Popen("taskkill /F /T /PID %i"%p.pid , shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
1974 break
1975 if p.poll() == 0:
1976 break
1977 time.sleep(2)
1978 out, err = p.communicate()
1979 output = out
# On success the PNG is copied into ../<smcogsoutputfolder>, then the PNG, the XTG file and the
# trimmed alignment are deleted. The input, muscle and .nwk files are not removed here.
1980 if "exception" not in output and "Exception" not in output:
1981 if sys.platform == ('win32'):
1982 copycommand = 'copy/y ' + tag.split(".")[0] + '.png "..\\' + smcogsoutputfolder + '" > nul'
1983 elif sys.platform == ('linux2'):
1984 copycommand = 'cp ' + tag.split(".")[0] + '.png "../' + smcogsoutputfolder + '" > /dev/null'
1985 os.system(copycommand)
1986 if sys.platform == ('win32'):
1987 os.system("del " + tag.split(".")[0] + ".png")
1988 os.system("del tree" + str(inputnr) + ".xtg")
1989 os.system("del trimmed_alignment" + str(inputnr) + ".fasta")
1990 elif sys.platform == ('linux2'):
1991 os.system("rm " + tag.split(".")[0] + ".png")
1992 os.system("rm tree" + str(inputnr) + ".xtg")
1993 os.system("rm trimmed_alignment" + str(inputnr) + ".fasta")
1994
def depict_smile(genecluster,structuresfolder):
    """Render ``genecluster<N>.smi`` to PNG with indigo-depict and file the images.

    Two images are produced in the current directory: a 200x150 icon
    (``genecluster<N>_icon.png``) and a default-size picture
    (``genecluster<N>.png``).  When the picture exists afterwards, both
    images are copied into ``../<structuresfolder>`` and the local copies
    are removed.

    genecluster      -- gene cluster number used in all file names.
    structuresfolder -- destination folder, relative to the parent directory.

    Returns "success" when the picture was created, otherwise "failed".

    The platform test used to compare ``sys.platform`` with the exact string
    'linux2'.  Python < 2.7.3 reports 'linux3' on 3.x kernels (and other
    POSIX systems report other names), in which case no command was built
    and the function died on an unassigned variable.  Everything that is not
    win32 now takes the POSIX branch.
    """
    base = "genecluster" + str(genecluster)
    onwindows = sys.platform == "win32"
    # On Windows the executable is looked up by name; elsewhere the binary in
    # the current working directory is started explicitly.
    if onwindows:
        depict = "indigo-depict"
    else:
        depict = "./indigo-depict"
    os.system(depict + " " + base + ".smi " + base + "_icon.png -query -w 200 -h 150")
    os.system(depict + " " + base + ".smi " + base + ".png -query")
    # The presence of the full-size picture decides between success and failure.
    if base + ".png" not in getdircontents():
        return "failed"
    if onwindows:
        structuresfolder = structuresfolder.replace("/","\\")
        os.system("copy/y " + base + ".png ..\\" + structuresfolder + ' > nul')
        os.system("copy/y " + base + "_icon.png ..\\" + structuresfolder + ' > nul')
        os.system("del " + base + ".png")
        os.system("del " + base + "_icon.png")
        os.system("del " + base + ".smi")
    else:
        os.system("cp " + base + ".png ../" + structuresfolder)
        os.system("cp " + base + "_icon.png ../" + structuresfolder)
        os.system("rm " + base + ".png")
        os.system("rm " + base + "_icon.png")
        # NOTE(review): the original prepared an "rm" command for the .smi file
        # in this branch but never ran it; the .smi file is therefore still
        # left in place here -- confirm whether later code needs it.
    return "success"
2032
2033 ##Core script
2034 import os
2035 from os import system
2036 import sys
2037 import multiprocessing
2038 import time
2039 from multiprocessing import Process, freeze_support
2040 import random
2041 import string
2042 import itertools
2043 from pysvg.filter import *
2044 from pysvg.gradient import *
2045 from pysvg.linking import *
2046 from pysvg.script import *
2047 from pysvg.shape import *
2048 from pysvg.structure import *
2049 from pysvg.style import *
2050 from pysvg.text import *
2051 from pysvg.builders import *
2052 from string import ascii_letters
2053 from pyExcelerator import *
2054 from pyExcelerator.Workbook import *
2055 import signal
2056 import subprocess
# Wall-clock start of the run.
starttime = time.time()

# NRPS2BASEDIR is pointed at the NRPSPredictor2 folder below the current working directory.
os.environ['NRPS2BASEDIR'] = os.path.join(os.getcwd(), 'NRPSPredictor2')

#Fix sys.argv input
def fixquotedargs(argv):
    """Return a copy of *argv* in which multi-quote arguments are split up.

    An argument containing more than one double quote is split at single
    blanks.  A piece that starts with a double quote gets one appended, a
    piece that only ends with one gets one prepended, and any other piece is
    kept as is.  Arguments with fewer than two double quotes are passed
    through unchanged.

    Empty pieces (produced by consecutive, leading or trailing blanks) are
    skipped; indexing them with [0] used to raise IndexError.
    """
    fixed = []
    for argument in argv:
        if argument.count('"') > 1:
            for piece in argument.split(' '):
                if not piece:
                    continue
                if piece[0] == '"':
                    piece = piece + '"'
                elif piece[-1] == '"':
                    piece = '"' + piece
                fixed.append(piece)
        else:
            fixed.append(argument)
    return fixed

options = fixquotedargs(sys.argv)
sys.argv = options
#Redirect stdout and stderr if GUI-executed
# "--gui" has to be followed by a value. When it is the last argument the
# problem is reported on stderr/stdout and in antismash.log, and the run
# stops with exit status 1. With the value "y" both standard streams are
# sent to stdout.txt for the rest of the run.
if "--gui" in sys.argv:
    guivalueindex = sys.argv.index("--gui") + 1
    if guivalueindex >= len(sys.argv):
        sys.stderr.write("Invalid options input: --gui without n or y" + "\n")
        sys.stdout.write("From the command line, input antismash --help for more information." + "\n")
        logfile = open("antismash.log","w")
        logfile.write("Invalid options input: --gui without n or y\n")
        logfile.close()
        sys.exit(1)
    if sys.argv[guivalueindex] == "y":
        stdoutfile = open("stdout.txt","w")
        sys.stdout = stdoutfile
        sys.stderr = stdoutfile
2087
2088 if __name__ == '__main__':
2089 import shutil
2090 hmmsearch_path = 'hmmsearch'
2091 hmmscan_path = 'hmmscan'
2092 antismash_path = '/home/galaxy/bin/antismash-1.1.0/'
2093 hmms_path = antismash_path + '/hmms/'
2094 shutil.copytree(antismash_path + '/NRPSPredictor2/', './NRPSPredictor2/')
2095 shutil.copytree(antismash_path + '/Minowa/', './Minowa/')
2096 shutil.copytree(antismash_path + '/pkssignatures/', './pkssignatures/')
2097 shutil.copytree(antismash_path + '/kr_analysis/', './kr_analysis/')
2098 shutil.copytree(antismash_path + '/docking_analysis/', './docking_analysis/')
2099 shutil.copytree(antismash_path + '/NRPeditor/', './NRPeditor/')
2100 shutil.copy(antismash_path + '/search_form.html', './')
2101 shutil.copy(antismash_path + '/empty.xhtml', './')
2102 shutil.copytree(antismash_path + '/vis/', './vis/')
2103 shutil.copytree(antismash_path + '/smcogtree/', './smcogtree/')
2104
2105 # add freeze support
2106 freeze_support()
2107
2108 #Open logfile
2109 logfile = open("antismash.log","w")
2110
2111 #Identify screen width
2112 if sys.platform == ('win32'):
2113 import ctypes
2114 user32 = ctypes.windll.user32
2115 screenwidth = user32.GetSystemMetrics(0)
2116 if sys.platform == ('linux2'):
2117 screenwidth = 1024
2118 # res = os.popen("xrandr | grep \* | cut -d' ' -f4") ###FOR SERVER USE###
2119 # res = res.read() ###FOR SERVER USE###
2120 # screenwidth = int(res.split("x")[0]) ###FOR SERVER USE###
2121 if screenwidth < 1024:
2122 screenwidth = 1024
2123 #temporary for testing
2124 screenwidth = 1024
2125
2126
2127 #Reads input
2128 inputinstructions = "antiSMASH 1.1.0 arguments:\n\nUsage: antismash <query fasta/embl/gbk file> [options]\n\nOptions (x is an integer number, list x,y,z is a list of integer numbers separated by commas):\n\n--gtransl <x> : GenBank translation table used for Glimmer (only for FASTA inputs, default: 1)\n1. The Standard Code\n2. The Vertebrate Mitochondrial Code\n3. The Yeast Mitochondrial Code\n4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code\n5. The Invertebrate Mitochondrial Code\n6. The Ciliate, Dasycladacean and Hexamita Nuclear Code\n9. The Echinoderm and Flatworm Mitochondrial Code\n10. The Euplotid Nuclear Code\n11. The Bacterial, Archaeal and Plant Plastid Code\n12. The Alternative Yeast Nuclear Code\n13. The Ascidian Mitochondrial Code\n14. The Alternative Flatworm Mitochondrial Code\n15. Blepharisma Nuclear Code\n16. Chlorophycean Mitochondrial Code\n21. Trematode Mitochondrial Code\n22. Scenedesmus Obliquus Mitochondrial Code\n23. 
Thraustochytrium Mitochondrial Code\n--genomeconf <l/c> : Genome configuration used for Glimmer: linear / circular (only for FASTA inputs, default: l)\n--minglength <x> : Glimmer minimal gene length (range 30-120, only for FASTA inputs, default: 90)\n--taxon <p/e> : Taxonomy: prokaryotic / eukaryotic (default: p)\n--cores <x> : Number of parallel CPUs to use for threading (default: all)\n--clusterblast <y/n> : Include ClusterBlast gene cluster comparison analysis (default:y)\n--smcogs <y/n> : Include smCOG analysis for functional prediction of genes (default:y)\n--fullblast <y/n> : Include genome-wide BLAST analysis (default:n)\n--fullhmm <y/n> : Include genome-wide PFAM HMM analysis (default:n)\n--blastdbpath <path> : Specify folder containing CLUSEAN blast database (default:clusean/db)\n--pfamdbpath <path> : Specify folder containing PFAM database (default:clusean/db)\n--geneclustertypes <x,y,z> : Gene cluster types to scan for (default:1):\n1 = all\n2 = type I polyketide synthases\n3 = type II polyketide synthases\n4 = type III polyketide synthases\n5 = nonribosomal peptide synthetases\n6 = terpene synthases\n7 = lantibiotics\n8 = bacteriocins\n9 = beta-lactams\n10 = aminoglycosides / aminocyclitols\n11 = aminocoumarins\n12 = siderophores\n13 = ectoines\n14 = butyrolactones\n15 = indoles\n16 = nucleosides\n17 = phosphoglycolipids\n18 = melanins\n19 = others\n--help : this help screen\n"
2129 #Check input file format
2130 if len(sys.argv) < 2 or len(sys.argv[1]) < 1:
2131 print >> sys.stderr, "Please supply valid name for input file."
2132 print "Usage: antismash <query fasta/embl/gbk file> [options]"
2133 print "From the command line, input antismash --help for more information."
2134 logfile.write("Input format error. Please supply valid name for infile.\n")
2135 logfile.write("Usage: antismash <query fasta/embl/gbk file> [options]\n")
2136 logfile.write("From the command line, input antismash --help for more information.\n")
2137 logfile.close()
2138 sys.exit(1)
2139 if sys.argv[1] != "--help":
2140 if len(sys.argv[1].split(".")) < 2 or (sys.argv[1].split(".")[-1] != "embl" and sys.argv[1].split(".")[-1] != "EMBL" and sys.argv[1].split(".")[-1] != "emb" and sys.argv[1].split(".")[-1] != "EMB" and sys.argv[1].split(".")[-1] != "genbank" and sys.argv[1].split(".")[-1] != "GENBANK" and sys.argv[1].split(".")[-1] != "gbk" and sys.argv[1].split(".")[-1] != "GBK" and sys.argv[1].split(".")[-1] != "gb" and sys.argv[1].split(".")[-1] != "GB" and sys.argv[1].split(".")[-1] != "fasta" and sys.argv[1].split(".")[-1] != "FASTA" and sys.argv[1].split(".")[-1] != "fas" and sys.argv[1].split(".")[-1] != "FAS" and sys.argv[1].split(".")[-1] != "fa" and sys.argv[1].split(".")[-1] != "FA"):
2141 print >> sys.stderr, "No EMBL/GBK/FASTA file submitted as input. Please supply a valid file with .embl / .gbk / .fasta extension. "
2142 print "Usage: antismash <query fasta/embl/gbk file> [options]"
2143 print "From the command line, input antismash --help for more information."
2144 logfile.write("Input format error. Please supply a valid file with .embl / .gbk / .fasta extension.\n")
2145 logfile.write("Usage: antismash <query fasta/embl/gbk file> [options]\n")
2146 logfile.write("From the command line, input antismash --help for more information.\n")
2147 logfile.close()
2148 sys.exit(1)
2149 #Define input filename and make fixes if necessary
2150 infile = sys.argv[1]
2151 try:
2152 testfile = open(infile,"r").read()
2153 except(IOError):
2154 print >> sys.stderr, "Please supply valid name for input file."
2155 print "Usage: antismash <query fasta/embl/gbk file> [options]"
2156 print "From the command line, input antismash --help for more information."
2157 logfile = open("antismash.log","w")
2158 logfile.write("Input format error. Please supply valid name for infile.\n")
2159 logfile.write("Usage: antismash <query fasta/embl/gbk file> [options]\n")
2160 logfile.write("From the command line, input antismash --help for more information.\n")
2161 logfile.close()
2162 sys.exit(1)
2163 #Parse absolute paths if found
2164 absolutepath = "n"
2165 if "/" in infile or "\\" in infile:
2166 absolutepath = "y"
2167 lastpos1 = infile.rfind("\\")
2168 lastpos2 = infile.rfind("/")
2169 lastpos = max([lastpos1,lastpos2])
2170 originpath = infile[:(lastpos + 1)]
2171 infile = infile[(lastpos + 1):]
2172 if sys.platform == ('win32'):
2173 copycommand = 'copy/y "' + originpath + infile + '" ' + infile + ' > nul'
2174 os.system(copycommand)
2175 if sys.platform == ('linux2'):
2176 copycommand = 'cp ' + originpath + infile + " . > /dev/null"
2177 os.system(copycommand)
2178 #genomename = ".".join(infile.split(".")[:-1])
2179 #for i in genomename:
2180 # if i in '!"#$%&()*+,./:;=>?@[]^`{|}' or i in "'":
2181 # genomename = genomename.replace(i,"")
2182 # if "/" in genomename:
2183 # genomename = genomename.rpartition("/")[2]
2184 # if "\\" in genomename:
2185 # genomename = genomename.rpartition("\\")[2]
2186 genomename = os.path.splitext(os.path.basename(infile))[0]
2187 if sys.platform == ('linux2'):
2188 if genomename != infile.split(".")[-2]:
2189 oldinfile = infile.replace("(","\\(").replace(")","\\)").replace("*","\\*").replace("&","\\&").replace("!","\\!").replace("$","\\$").replace("{","\\{").replace("}","\\}").replace("|","\\|").replace("`","\\`").replace("'","\\'").replace('"','\\"').replace('?','\\?')
2190 infile = genomename + "." + infile.split(".")[-1]
2191 if "/" in genomename:
2192 genomename = genomename.rpartition("/")[2]
2193 if "\\" in genomename:
2194 genomename = genomename.rpartition("\\")[2]
2195 os.system("cp " + oldinfile + " " + infile)
2196 #Define outputfolder
2197 if absolutepath == "y":
2198 if sys.platform == ('win32'):
2199 dir1 = os.popen("dir/w/ad " + originpath)
2200 dir2 = os.popen("dir/w/ad")
2201 dir1 = dir1.read()
2202 dir2 = dir2.read()
2203 if sys.platform == ('linux2'):
2204 dir1 = os.popen("ls")
2205 dir2 = os.popen("ls " + originpath)
2206 dir1 = dir1.read()
2207 dir2 = dir2.read()
2208 parts = dir1.split(" ") + dir2.split(" ")
2209 else:
2210 if sys.platform == ('win32'):
2211 dir = os.popen("dir/w/ad")
2212 dir = dir.read()
2213 if sys.platform == ('linux2'):
2214 dir = os.popen("ls")
2215 dir = dir.read()
2216 parts = dir.split(" ")
2217 parts2 = []
2218 for i in parts:
2219 partparts = i.split("\n")
2220 for i in partparts:
2221 i = i.replace("[","")
2222 i = i.replace("]","")
2223 parts2.append(i)
2224 parts = parts2
2225 oldgenomename = genomename
2226 if genomename in parts:
2227 genomename = genomename + "_" + str(0)
2228 while genomename in parts:
2229 finalpart = genomename.split("_")[-1]
2230 allnumbers = "y"
2231 for i in finalpart:
2232 if i not in ["0","1","2","3","4","5","6","7","8","9"]:
2233 allnumbers = "n"
2234 if allnumbers == "y" and int(finalpart) in range(0,1000):
2235 newgenomename = ""
2236 for i in genomename.split("_")[:-1]:
2237 newgenomename = newgenomename + "_" + i
2238 newgenomename = newgenomename + "_" + str(int(finalpart) + 1)
2239 genomename = newgenomename[1:]
2240 genomename = genomename.replace("__","_")
2241 #Output results folder name for output checking by GUI
2242 resultslocfile = open("resultsfolder.txt","w")
2243 resultslocfile.write(os.getcwd() + os.sep + genomename)
2244 resultslocfile.close()
2245 #Implement defaults
2246 glimmertransl_table = str(1)
2247 genomeconf = "l"
2248 minglength = str(90)
2249 cores = "all"
2250 taxon = "p"
2251 clusterblast = "y"
2252 smcogs = "y"
2253 fullblast = "n"
2254 fullhmm = "n"
2255 if sys.platform == ('win32'):
2256 blastdbpath = '"' + os.getcwd() + "/clusean/db" + '"'
2257 if sys.platform == ('linux2'):
2258 blastdbpath = os.getcwd() + "/clusean/db"
2259 if sys.platform == ('win32'):
2260 pfamdbpath = '"' + os.getcwd() + "/clusean/db/" + '"'
2261 if sys.platform == ('linux2'):
2262 pfamdbpath = os.getcwd() + "/clusean/db/"
2263 geneclustertypes = [1]
2264 #Read user-specified options which may override defaults
2265 if len(sys.argv) > 2 or sys.argv[1] == "--help":
2266 options = sys.argv
2267 if "--" in options[-1] and sys.argv[1] != "--help":
2268 invalidoptions(options[-1])
2269 #identify option identifiers
2270 identifiers = []
2271 for i in options:
2272 if "--" in i:
2273 if i not in identifiers:
2274 identifiers.append(i)
2275 else:
2276 invalidoptions("No '--' in given options or option given twice.")
2277 for i in identifiers:
2278 if i != "--help":
2279 value = options[options.index(i) + 1].strip()
2280 if i == "--gtransl":
2281 for k in value:
2282 if k not in ["0","1","2","3","4","5","6","7","8","9"]:
2283 invalidoptions(i + "input is no number")
2284 if int(value) in range(1,24) and int(value) != 7 and int(value) != 8 and int(value) != 17 and int(value) != 18 and int(value) != 19 and int(value) != 20:
2285 glimmertransl_table = value
2286 else:
2287 invalidoptions(i)
2288 elif i == "--genomeconf":
2289 if value == "l" or value == "c":
2290 genomeconf = value
2291 else:
2292 invalidoptions(i)
2293 elif i == "--minglength":
2294 for k in value:
2295 if k not in ["0","1","2","3","4","5","6","7","8","9"]:
2296 invalidoptions(i)
2297 if int(value) in range(30,91):
2298 minglength = value
2299 else:
2300 print >> sys.stderr, "Invalid options input: minimal gene length should be a number between 30-90."
2301 logfile = open("antismash.log","w")
2302 logfile.write("Invalid options input: minimal gene length should be a number between 30-90.\n")
2303 logfile.close()
2304 sys.exit(1)
2305 elif i == "--cores":
2306 for k in value:
2307 if k not in ["0","1","2","3","4","5","6","7","8","9"]:
2308 invalidoptions(i)
2309 if int(value) in range(1,1000):
2310 cores = int(value)
2311 else:
2312 invalidoptions(i)
2313 elif i == "--taxon":
2314 if value == "p" or value == "e":
2315 taxon = value
2316 else:
2317 invalidoptions(i)
2318 elif i == "--clusterblast":
2319 if value == "y" or value == "n":
2320 clusterblast = value
2321 else:
2322 invalidoptions(i)
2323 elif i == "--smcogs":
2324 if value == "y" or value == "n":
2325 smcogs = value
2326 else:
2327 invalidoptions(i)
2328 elif i == "--fullblast":
2329 if value == "y" or value == "n":
2330 fullblast = value
2331 else:
2332 invalidoptions(i)
2333 elif i == "--fullhmm":
2334 if value == "y" or value == "n":
2335 fullhmm = value
2336 else:
2337 invalidoptions(i)
2338 elif i == "--glimmer_prediction":
2339 glimmer_prediction_path = value
2340 elif i == "--blastdbpath":
2341 if sys.platform == ('win32'):
2342 if options[options.index(i) + 1][0] != '"':
2343 value = '"' + options[options.index(i) + 1] + '"'
2344 else:
2345 value = options[options.index(i) + 1]
2346 if ":\\" in value:
2347 blastdbpath = value
2348 elif "\\" in value or "/" in value:
2349 if value[0] == "\\" or value[0] == "/":
2350 blastdbpath = os.getcwd() + value
2351 else:
2352 blastdbpath = os.getcwd() + "\\" + value
2353 else:
2354 blastdbpath = os.getcwd() + "\\" + value
2355 if sys.platform == ('linux2'):
2356 value = options[options.index(i) + 1]
2357 if "\\" in value or "/" in value:
2358 value = value.replace("\\","/")
2359 if value[0] == "/":
2360 blastdbpath = value
2361 else:
2362 blastdbpath = os.getcwd() + "/" + value
2363 else:
2364 blastdbpath = os.getcwd() + "/" + value
2365 elif i == "--pfamdbpath":
2366 if sys.platform == ('win32'):
2367 if options[options.index(i) + 1][0] != '"':
2368 value = '"' + options[options.index(i) + 1] + '"'
2369 else:
2370 value = options[options.index(i) + 1]
2371 if ":\\" in value:
2372 pfamdbpath = value
2373 elif "\\" in value or "/" in value:
2374 if value[0] == "\\" or value[0] == "/":
2375 pfamdbpath = os.getcwd() + value
2376 else:
2377 pfamdbpath = os.getcwd() + "\\" + value
2378 else:
2379 pfamdbpath = os.getcwd() + "\\" + value
2380 if sys.platform == ('linux2'):
2381 value = options[options.index(i) + 1]
2382 if "\\" in value or "/" in value:
2383 value = value.replace("\\","/")
2384 if value[0] == "/":
2385 pfamdbpath = value
2386 else:
2387 pfamdbpath = os.getcwd() + "/" + value
2388 else:
2389 pfamdbpath = os.getcwd() + "/" + value
2390 elif i == "--geneclustertypes":
2391 if "," not in value and value not in ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19"]:
2392 invalidoptions(i)
2393 else:
2394 types = value.split(",")
2395 types2 = []
2396 if "1" in types:
2397 types2 = [1]
2398 for j in types:
2399 if int(j) not in range(1,20):
2400 invalidoptions(i)
2401 else:
2402 types2.append(int(j))
2403 geneclustertypes = types2
2404 elif i == "--help":
2405 print inputinstructions
2406 sys.exit()
2407 elif i == "--gui":
2408 pass
2409 else:
2410 invalidoptions(i)
2411
#Determine number of CPUs used
# Start from the number of CPUs the machine reports (1 if that cannot be
# determined) and lower it when the user requested fewer cores than that.
try:
    nrcpus = multiprocessing.cpu_count()
except (IOError, OSError, NotImplementedError):
    nrcpus = 1
if cores != "all" and cores < nrcpus:
    nrcpus = cores
2425
#Create directory structure needed for file storage
# Every output folder lives below <genomename>/.  The paths are bound to
# script-level names first (hmmoutputfolder, for instance, is read again when
# the HMM results are parsed) and the folders are then created in one pass.
hmmoutputfolder = genomename + "/hmmoutput/"
nrpspksoutputfolder = genomename + "/nrpspks/"
nrpspredictoroutputfolder = nrpspksoutputfolder + "nrpspredictor/"
minowanrpsoutputfolder = nrpspksoutputfolder + "minowanrpspred/"
minowapksoutputfolder = nrpspksoutputfolder + "minowapkspred/"
minowacaloutputfolder = nrpspksoutputfolder + "minowacalpred/"
pkssignatureoutputfolder = nrpspksoutputfolder + "pkssignatures/"
kranalysisoutputfolder = nrpspksoutputfolder + "kr_analysis/"
clusterblastoutputfolder = genomename + "/clusterblast/"
smcogsoutputfolder = genomename + "/smcogs/"
substrspecsfolder = genomename + "/substrspecs/"
structuresfolder = genomename + "/structures/"
svgfolder = genomename + "/svg/"
searchgtrfolder = genomename + "/searchgtr/"
htmlfolder = genomename + "/html/"
imagesfolder = genomename + "/images/"
# Parents are listed before their children so each mkdir finds its parent.
for _outputfolder in (
        genomename,
        hmmoutputfolder,
        nrpspksoutputfolder,
        nrpspredictoroutputfolder,
        minowanrpsoutputfolder,
        minowapksoutputfolder,
        minowacaloutputfolder,
        pkssignatureoutputfolder,
        kranalysisoutputfolder,
        clusterblastoutputfolder,
        smcogsoutputfolder,
        substrspecsfolder,
        structuresfolder,
        svgfolder,
        searchgtrfolder,
        htmlfolder,
        imagesfolder):
    try:
        os.mkdir(_outputfolder)
    except (IOError, OSError):
        # Best effort: a folder that cannot be created (e.g. because it
        # already exists) is silently skipped.
        pass
2511
#If input is unannotated GBK/EMBL file, convert to FASTA and use that as input
# The input counts as unannotated when neither CDS marker string occurs in
# it.  For a file with an EMBL or GenBank extension the nucleotide sequence is
# then extracted, written to <basename>.fasta, and `infile` is pointed at
# that FASTA file so the FASTA code path picks it up.
# The file is read once and the handle is closed again.
with open(infile, "r") as _inputhandle:
    _inputtext = _inputhandle.read()
if " CDS " not in _inputtext and "FT CDS " not in _inputtext:
    _extension = infile.split(".")[-1]
    # (format label used in messages, text preceding the sequence section,
    #  suffix appended to the FASTA header)
    if _extension in ("embl", "EMBL", "emb", "EMB"):
        _conversion = ("EMBL", "\nSQ", "|")
    elif _extension in ("gbk", "GBK", "gb", "GB", "genbank", "GENBANK"):
        _conversion = ("GBK", "\nORIGIN", "")
    else:
        # Any other extension is left untouched, as before.
        _conversion = None
    if _conversion is not None:
        _label, _seqmarker, _headersuffix = _conversion
        filetext = _inputtext
        if _seqmarker not in filetext:
            print >> sys.stderr, "Exit: " + _label + " file not properly formatted, no sequence found."
            logfile = open("antismash.log","w")
            logfile.write("Exit: " + _label + " file not properly formatted, no sequence found.\n")
            logfile.close()
            sys.exit(1)
        dnaseq = cleandnaseq(filetext.split(_seqmarker)[1])
        sequence = dnaseq
        # Fewer than half A/C/G/T characters (either case): treat as protein.
        if sum(sequence.count(_base) for _base in "AaCcGgTt") < (0.5 * len(sequence)):
            print >> sys.stderr, "Protein " + _label + " file provided. Please provide nucleotide " + _label + " file."
            sys.exit(1)
        _fastaname = infile.rpartition(".")[0] + ".fasta"
        with open(_fastaname, "w") as fastafile:
            fastafile.write(">" + infile.rpartition(".")[0] + _headersuffix + "\n")
            fastafile.write(sequence)
        # Both formats continue with the *path* of the new FASTA file.  The
        # EMBL branch used to rebind `infile` to the file object instead,
        # which broke the following infile.split(".") extension checks.
        infile = _fastaname
2552 #If input is unannotated fasta file, predict genes with Glimmer and create EMBL file. If input is EMBL or GBK file, read input embl/gbk and create input fasta file, read input protein info into memory
# `annotated` stays "y" unless the input file name ends in one of the FASTA
# extensions tested below.
2553 annotated = "y"
2554 if infile.split(".")[-1] == "fasta" or infile.split(".")[-1] == "FASTA" or infile.split(".")[-1] == "FAS" or infile.split(".")[-1] == "fas" or infile.split(".")[-1] == "FA" or infile.split(".")[-1] == "fa":
2555 annotated = "n"
2556 #Check input file formatting
2557 sequence = get_sequence(infile)
# Abort when fewer than half of the characters are A/C/G/T (either case);
# such input is reported as a protein file.
2558 if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)):
2559 print >> sys.stderr, "Protein FASTA file provided. Please provide nucleotide FASTA file."
2560 sys.exit(1)
2561 nucleotides = ["A","a","C","c","G","g","T","t","N","n"]
2562 badsequence = "n"
# Header text: everything between the first ">" in the file and the next
# newline.
2563 sequence_name = open(infile,"r").read().partition(">")[2].partition("\n")[0]
# Flag the sequence if any character is not listed in `nucleotides`.
2564 for i in sequence:
2565 if i not in nucleotides:
2566 badsequence = "y"
# A flagged sequence is passed through cleandnaseq() and checked again.  If it
# is clean afterwards it is written to <basename>_f.fasta, which becomes the
# new input file; otherwise the run is aborted with a formatting error.
2567 if badsequence == "y":
2568 cleaned_sequence = cleandnaseq(sequence)
2569 badsequence = "n"
2570 for i in cleaned_sequence:
2571 if i not in nucleotides:
2572 badsequence = "y"
2573 if badsequence == "n":
2574 writefasta([sequence_name],[cleaned_sequence],infile.rpartition(".")[0] + "_f.fasta")
2575 infile = infile.rpartition(".")[0] + "_f.fasta"
2576 else:
2577 print >>sys.stderr, "Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file."
2578 logfile = open("antismash.log","w")
2579 logfile.write("Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file.\n")
2580 logfile.close()
2581 sys.exit(1)
# NOTE(review): `sequence` is not replaced by `cleaned_sequence` above, so
# revseq and seqlength are derived from the uncleaned text even when the
# cleaned copy was written out -- confirm this is intended.
2582 revseq = reverse_complement(sequence)
2583 seqlength = len(sequence)
2584
2585 #Print Glimmer notification
2586 #if taxon == "p":
2587 # print "Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome..."
2588 #elif taxon == "e":
2589 # print "Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome..."
# Start a fresh antismash.log (mode "w" truncates any existing file) and
# record which gene predictor applies to this taxon.
2590 logfile = open("antismash.log","w")
2591 if taxon == "p":
2592 logfile.write("Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome...\n")
2593 elif taxon == "e":
2594 logfile.write("Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome...\n")
2595 #logfile.close()
# NOTE(review): logfile is neither flushed nor closed before the log is read
# back through a second handle, so loginfo may not contain the line written
# just above -- confirm.
2596 loginfo = open("antismash.log","r").read()
2597 #logfile.close()
2598 #Copying file and changing to folder to prepare for Glimmer3 prediction
# NOTE(review): this mkdir is not wrapped in try/except, so an already
# existing geneprediction folder raises OSError here.
2599 os.mkdir( os.path.join(os.getcwd(), genomename, "geneprediction"))
# NOTE(review): the copy target "geneprediction" is relative to the current
# directory, whereas the folder above was created below genomename; nothing is
# copied on platforms other than win32/linux2 -- verify.
2600 if sys.platform == ('win32'):
2601 os.system("copy/y " + infile + " geneprediction > nul")
2602 if sys.platform == ('linux2'):
2603 os.system("cp " + infile + " geneprediction > /dev/null")
2604
# From here on the working directory is <cwd>/<genomename>/geneprediction and
# the input is addressed two levels up.
# Assumes infile is a relative path and genomename a single path component
# -- TODO confirm.
2605 os.chdir( os.path.join(os.getcwd(), genomename, "geneprediction"))
2606 fastafile = '../../'+infile
2607
2608 #Find DNA sequence length
2609 seq = get_sequence(fastafile)
2610 dnaseqlength = len(seq)
2611 #Run Glimmer for prokaryotic sequences, GlimmerHMM for eukaryotic sequences
# The triple-quoted string that follows this `if` holds the disabled in-script
# Glimmer invocation; predictions are read from glimmer_prediction_path.
2612 if taxon == "p":
2613 """
2614 GlimmerPrediction, not needed since we can predict it in galaxy on our own
2615 if genomeconf == "l":
2616 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
2617 os.popen("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
2618 else:
2619 os.system("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
2620 else:
2621 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
2622 os.popen("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
2623 else:
2624 os.system("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
2625 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
2626 os.popen("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train")
2627 else:
2628 os.system("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train")
2629 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
2630 os.popen("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train")
2631 else:
2632 os.system("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train")
2633 if genomeconf == "l":
2634 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
2635 os.popen("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
2636 else:
2637 os.system("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
2638 else:
2639 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
2640 os.popen("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
2641 else:
2642 os.system("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
2643 #Convert glimmer predictions into EMBL with sequence
2644 glfile = fastafile.rpartition(".")[0] + ".predict"
2645
2646 Ende der Glimmer-Prediction
2647 """
# Read the Glimmer prediction file named by glimmer_prediction_path and
# collect, per predicted ORF, its name plus EMBL-style start/end strings
# (reverse-strand ORFs are wrapped in "complement(" ... ")").
2648 glfile = glimmer_prediction_path
2649 emblfile = fastafile.rpartition(".")[0] + ".embl"
# NOTE(review): bare except -- any failure to open or read the file is
# reported as "Error 11"; the file handle is never closed explicitly.
2650 try:
2651 file = open(glfile,"r")
2652 filetext = file.read()
2653 except:
2654 print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11."
2655 logfile = open("antismash.log","w")
2656 logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11.\n")
2657 logfile.close()
2658 sys.exit(1)
2659 if "orf" not in filetext:
2660 print >> sys.stderr, "Glimmer gene prediction failed: no genes found."
2661 logfile = open("antismash.log","w")
2662 logfile.write("Glimmer gene prediction failed: no genes found.\n")
2663 logfile.close()
2664 sys.exit(1)
2665 filetext = filetext.replace("\r","\n")
2666 lines = filetext.split("\n")
# Drop the first line and the last element of the split (presumably the
# header line and the empty string after the final newline).
2667 lines = lines[1:-1]
2668 orfnames = []
2669 starts = []
2670 ends = []
2671 strands = []
# NOTE(review): starts2 and ends2 are not used anywhere in the visible code.
2672 starts2 = []
2673 ends2 = []
# "y" until the first ORF line with a +/- frame has been handled.
2674 firstline = "y"
2675 for i in lines:
# Split on single spaces and discard the empty strings produced by runs of
# spaces.  The inner loop reuses the name `i`; the outer value is not needed
# again within this iteration.
2676 columns = i.split(" ")
2677 columns2 = []
2678 for i in columns:
2679 if i != "":
2680 columns2.append(i)
2681 columns = columns2
# Columns as used below: [0] ORF name, [1] start, [2] end, and the first
# character of [3] as the strand sign.
2682 if len(columns) > 3:
2683 frame = columns[3][0]
2684 strands.append(frame)
2685 else:
2686 frame = ""
2687 if firstline == "y" and frame == "+" and len(columns) > 3:
2688 orfname = str(columns[0])
2689 orfnames.append(orfname)
# First ORF of a circular genome (genomeconf == "c") whose start is larger
# than its end and lies in the second half of the sequence: it is recorded as
# two ranges, gstart..end and start..dnaseqlength.
# NOTE(review): this appends two start/end entries but only one orfname, so
# the positional index used when the features are written would be shifted by
# one for every later ORF -- confirm.
2690 if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)):
2691 gstart = (int(columns[2]) % 3) + 1
2692 if gstart == 3:
2693 gstart = 0
2694 starts.append(str(gstart))
2695 ends.append(columns[2])
2696 starts.append(columns[1])
2697 ends.append(str(dnaseqlength))
2698 else:
2699 starts.append(columns[1])
2700 ends.append(columns[2])
2701 firstline = "n"
# Same handling for a first ORF on the reverse strand, with each range wrapped
# in complement(...).
2702 elif firstline == "y" and frame == "-" and len(columns) > 3:
2703 orfname = str(columns[0])
2704 orfnames.append(orfname)
2705 if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)):
2706 gstart = (int(columns[2]) % 3) + 1
2707 if gstart == 3:
2708 gstart = 0
2709 starts.append("complement(" + str(gstart))
2710 ends.append(columns[2] + ")")
2711 starts.append("complement(" + columns[1])
2712 ends.append(str(dnaseqlength) + ")")
2713 else:
2714 complstart = "complement(" + str(columns[1])
2715 starts.append(complstart)
2716 complend = str(columns[2]) + ")"
2717 ends.append(str(complend))
2718 firstline = "n"
# All later ORFs: one start/end pair each.
2719 elif frame == "+" and len(columns) > 3:
2720 orfname = str(columns[0])
2721 orfnames.append(orfname)
2722 starts.append(columns[1])
2723 ends.append(columns[2])
2724 elif frame == "-" and len(columns) > 3:
2725 orfname = str(columns[0])
2726 orfnames.append(orfname)
2727 complstart = "complement(" + str(columns[1])
2728 starts.append(complstart)
2729 complend = str(columns[2]) + ")"
2730 ends.append(str(complend))
# No usable ORF line at all: report "Error 10" and stop.
2731 if len(orfnames) == 0:
2732 print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10."
2733 logfile = open("antismash.log","w")
2734 logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10.\n")
2735 logfile.close()
2736 sys.exit(1)
# Write the EMBL header and one gene + CDS feature per predicted ORF to
# `emblfile`.  The sequence section is appended by later code.
2737 out_file = open(emblfile,"w")
# `a` indexes starts/ends in step with the orfnames loop below.
2738 a = 0
2739 #print "Writing EMBL file with Glimmer-predicted genes..."
# Reopening the log with mode "w" truncates it; the previously read content
# (loginfo) is written back first, followed by the new status line.
2740 logfile = open("antismash.log","w")
2741 logfile.write(loginfo)
2742 logfile.write("Writing EMBL file with Glimmer-predicted genes...\n")
2743 #logfile.close()
# NOTE(review): read back without flushing/closing logfile, so loginfo may
# miss the lines written just above -- confirm.
2744 loginfo = open("antismash.log","r").read()
2745 #logfile.close()
# NOTE(review): this section appears to sit inside the taxon == "p" branch; if
# so, the `else` arms below (FUN / Fungi) can never run -- confirm.
2746 if taxon == "p":
2747 out_file.write("ID A01; SV 1; linear; DNA; STD; PRO; " + str(dnaseqlength) + " BP.\nXX\n")
2748 else:
2749 out_file.write("ID A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n")
2750 out_file.write("AC A01;\nXX\n")
2751 out_file.write("DE " + genomename + ";\nXX\n")
2752 out_file.write("KW none;\nXX\n")
2753 out_file.write("OS unknown;\n")
2754 if taxon == "p":
2755 out_file.write("OC Eubacteria;\nXX\n")
2756 else:
2757 out_file.write("OC Fungi;\nXX\n")
2758 out_file.write("RN [1]\n")
2759 out_file.write("RT ;\n")
2760 out_file.write("RL Unknown.\nXX\n")
2761 out_file.write("FH Key Location/Qualifiers\nFH\n")
2762 out_file.write("FT source 1.." + str(dnaseqlength) + "\n")
# Each ORF is emitted twice with the same location: once as a gene feature and
# once as a CDS feature, both qualified with /gene="<orf name>".
2763 for i in orfnames:
2764 out_file.write("FT gene ")
2765 out_file.write(starts[a])
2766 out_file.write("..")
2767 out_file.write(ends[a])
2768 out_file.write("\n")
2769 out_file.write('FT /gene="' + i + '"\n')
2770 out_file.write("FT CDS ")
2771 out_file.write(starts[a])
2772 out_file.write("..")
2773 out_file.write(ends[a])
2774 out_file.write("\n")
2775 out_file.write('FT /gene="' + i + '"\n')
2776 a += 1
2777 elif taxon == "e":
2778 """
2779 GlimmerHMM is executed extern ... in galaxy and will be provided through glimmer_prediction_path
2780
2781 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
2782 os.popen("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g")
2783 else:
2784 os.system("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g")
2785 """
2786 #Convert glimmerhmm predictions into EMBL with sequence
2787 #glfile = fastafile.rpartition(".")[0] + ".predict"
# Read the GlimmerHMM prediction file named by glimmer_prediction_path and
# build one EMBL location string per gene: exon ranges are joined with
# join(...) and reverse-strand genes are wrapped in complement(...).
2788 glfile = glimmer_prediction_path
2789 emblfile = fastafile.rpartition(".")[0] + ".embl"
# NOTE(review): bare except -- any failure to open or read the file is
# reported as "Error 9"; the file handle is never closed explicitly.
2790 try:
2791 file = open(glfile,"r")
2792 filetext = file.read().replace("\r","")
2793 except:
2794 print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9."
2795 logfile = open("antismash.log","w")
2796 logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9.\n")
2797 logfile.close()
2798 sys.exit(1)
2799 if "CDS" not in filetext:
2800 print >> sys.stderr, "GlimmerHMM gene prediction failed: no genes found."
2801 logfile = open("antismash.log","w")
2802 logfile.write("GlimmerHMM gene prediction failed: no genes found.\n")
2803 logfile.close()
2804 sys.exit(1)
# All "\r" characters were already removed when the file was read, so this
# replace has nothing left to change.
2805 filetext = filetext.replace("\r","\n")
2806 lines = filetext.split("\n")
# Drop the first two lines and the last element of the split.
2807 lines = lines[2:-1]
2808 orfnames = []
2809 positions = []
2810 firstline = "y"
# x is used as the index of the current line for the look-ahead lines[x + 1]
# (assumes the increment at the bottom runs once per line -- TODO confirm the
# original nesting).  orfnr numbers the genes.
2811 x = 0
2812 orfnr = 0
# Exon start/end strings of the gene currently being assembled.
2813 starts = []
2814 ends = []
2815 for i in lines:
# Tab-separated columns as used below: [3] start, [4] end, [6] strand.
2816 columns = i.split("\t")
2817 if len(columns) > 1:
# Very first line: remember the strand; its range is only recorded when it is
# not an mRNA line.
2818 if x == 0:
2819 strand = columns[6]
2820 if "mRNA" not in i:
2821 starts.append(columns[3])
2822 ends.append(columns[4])
# Last line of the file, or the next line starts a new mRNA: this line closes
# the current gene.
2823 elif x == (len(lines) - 1) or "mRNA" in lines[x + 1]:
2824 strand = columns[6]
2825 starts.append(columns[3])
2826 ends.append(columns[4])
2827 orfnr += 1
# Name the gene "orf" + number zero-padded to five digits.
# NOTE(review): there is no branch for six or more digits; orfname would then
# keep its previous value.
2828 if len(str(orfnr)) == 1:
2829 orfname = "orf0000" + str(orfnr)
2830 elif len(str(orfnr)) == 2:
2831 orfname = "orf000" + str(orfnr)
2832 elif len(str(orfnr)) == 3:
2833 orfname = "orf00" + str(orfnr)
2834 elif len(str(orfnr)) == 4:
2835 orfname = "orf0" + str(orfnr)
2836 elif len(str(orfnr)) == 5:
2837 orfname = "orf" + str(orfnr)
2838 orfnames.append(orfname)
# NOTE(review): a strand value other than "+" or "-" adds an orfname without a
# matching entry in positions -- confirm this cannot occur.
2839 if strand == "+":
2840 if len(starts) == 1:
2841 pos = starts[0] + ".." + ends[0]
2842 positions.append(pos)
2843 else:
2844 pos = "join("
2845 y = 0
# NOTE(review): the comma is suppressed by comparing values (i != starts[-1]),
# not positions, so an earlier exon with the same start string as the last one
# would also lose its comma.
2846 for i in starts:
2847 pos = pos + i + ".." + ends[y]
2848 if i != starts[-1]:
2849 pos = pos + ","
2850 y += 1
2851 pos = pos + ")"
2852 positions.append(pos)
2853 elif strand == "-":
2854 if len(starts) == 1:
2855 pos = "complement(" + starts[0] + ".." + ends[0] + ")"
2856 positions.append(pos)
2857 else:
2858 pos = "complement(join("
2859 y = 0
2860 for i in starts:
2861 pos = pos + i + ".." + ends[y]
2862 if i != starts[-1]:
2863 pos = pos + ","
2864 y += 1
2865 pos = pos + "))"
2866 positions.append(pos)
# Reset the exon buffers for the next gene.
2867 starts = []
2868 ends = []
# Any other non-mRNA line contributes one more exon range to the current gene.
2869 elif "mRNA" not in i:
2870 starts.append(columns[3])
2871 ends.append(columns[4])
2872 x += 1
# No gene could be assembled: report "Error 12" and stop.
2873 if len(orfnames) == 0:
2874 print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error: 12"
2875 logfile = open("antismash.log","w")
2876 logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 12\n")
2877 logfile.close()
2878 sys.exit(1)
# Write the EMBL header and one gene + CDS feature per GlimmerHMM gene to
# `emblfile`.  The sequence section is appended by later code.
2879 out_file = open(emblfile,"w")
# `a` indexes `positions` in step with the orfnames loop below.
2880 a = 0
2881 #print "Writing EMBL file with GlimmerHMM-predicted genes..."
# Reopening the log with mode "w" truncates it; the previously read content
# (loginfo) is written back first, followed by the new status line.
2882 logfile = open("antismash.log","w")
2883 logfile.write(loginfo)
2884 logfile.write("Writing EMBL file with GlimmerHMM-predicted genes...\n")
2885 #logfile.close()
# NOTE(review): read back without flushing/closing logfile, so loginfo may
# miss the lines written just above -- confirm.
2886 loginfo = open("antismash.log","r").read()
2887 #logfile.close()
2888 out_file.write("ID A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n")
2889 out_file.write("AC A01;\nXX\n")
2890 out_file.write("DE " + genomename + ";\nXX\n")
2891 out_file.write("KW none;\nXX\n")
2892 out_file.write("OS unknown;\n")
2893 out_file.write("OC Fungi;\nXX\n")
2894 out_file.write("RN [1]\n")
2895 out_file.write("RT ;\n")
2896 out_file.write("RL Unknown.\nXX\n")
2897 out_file.write("FH Key Location/Qualifiers\nFH\n")
2898 out_file.write("FT source 1.." + str(dnaseqlength) + "\n")
# Each gene is emitted twice with the same location string: once as a gene
# feature and once as a CDS feature, both qualified with /gene="<orf name>".
2899 for i in orfnames:
2900 out_file.write("FT gene ")
2901 out_file.write(positions[a])
2902 out_file.write("\n")
2903 out_file.write('FT /gene="' + i + '"\n')
2904 out_file.write("FT CDS ")
2905 out_file.write(positions[a])
2906 out_file.write("\n")
2907 out_file.write('FT /gene="' + i + '"\n')
2908 a += 1
# Append the EMBL sequence section to out_file and close it.
# SQ header line: total length, the A/C/G/T counts (upper and lower case
# combined) and the number of remaining characters as "other".
2909 out_file.write("XX\nSQ Sequence " + str(dnaseqlength) + " BP; " + str(seq.count("a") + seq.count("A")) + " A; " + str(seq.count("c") + seq.count("C")) + " C; " + str(seq.count("g") + seq.count("G")) + " G; " + str(seq.count("t") + seq.count("T")) + " T; " + str(dnaseqlength - (seq.count("a") + seq.count("A") + seq.count("c") + seq.count("C") + seq.count("g") + seq.count("G") + seq.count("t") + seq.count("T"))) + " other;\n")
# NOTE(review): seq2 is not used anywhere in the visible code.
2910 seq2 = seq
2911 out_file.write(" ")
2912 grouplen=10
2913 textlen = len(seq)
# `end` is the largest multiple of grouplen that fits into the sequence.
2914 end = textlen - (textlen % grouplen)
# The list holds the *same* iterator object grouplen times, so every imap()
# step pulls grouplen consecutive characters: `parts` becomes the sequence cut
# into full 10-character groups.  (itertools.imap exists in Python 2 only.)
2915 repeated_iterator = [iter(itertools.islice(seq, 0, end))] * grouplen
2916 parts = list(itertools.imap(lambda *chars: ''.join(chars),*repeated_iterator))
# A remainder shorter than grouplen is added as a final short group.
2917 if dnaseqlength%grouplen != 0:
2918 parts.append(seq[-1 * (dnaseqlength%grouplen):])
# w is the 1-based number of the group being written.
2919 w = 1
2920 for l in parts:
2921 out_file.write(l + " ")
# Last group: pad the line (the amount depends on how full the final line and
# the final group are), write the total length and the terminator "//".
2922 if w == len(parts):
2923 if w%6 == 0 and dnaseqlength%60 != 0:
2924 out_file.write((" " * (10 - dnaseqlength%grouplen) + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
2925 elif dnaseqlength%60 == 0:
2926 out_file.write((" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
2927 elif w%6 == 5 and dnaseqlength%grouplen == 0:
2928 out_file.write((" " + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
2929 elif dnaseqlength%grouplen != 0:
2930 out_file.write(" " * (10 - dnaseqlength%grouplen) + " " * (6 - len(parts)%6) + " " * (6 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
2931 else:
2932 out_file.write(" " * (6 - len(parts)%6) + " " * (5 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
# Every sixth group ends a line: append the running base count (w * 10),
# left-padded to ten characters, and start the next line.
2933 elif w%6 == 0:
2934 out_file.write((" " * (10 - len(str(w * 10)))) + str(w * 10) + "\n ")
2935 w += 1
2936 out_file.close()
# Leave the geneprediction folder again (two levels up) and drop the six
# characters of the '../../' prefix so that infile/emblfile are once more
# relative to the directory the script started in.
2937 os.chdir("../../")
2938 infile = emblfile[6:]
2939 emblfile = emblfile[6:]
# NOTE(review): glimmeroutputfolder stays unbound when taxon is neither "p"
# nor "e"; the mkdir below would then raise NameError, which the except clause
# does not catch.
2940 if taxon == "p":
2941 glimmeroutputfolder = genomename + "/glimmer/"
2942 elif taxon == "e":
2943 glimmeroutputfolder = genomename + "/glimmerhmm/"
2944 try:
2945 os.mkdir(glimmeroutputfolder)
2946 except(IOError,OSError):
2947 pass
# The result of embl2proteins()/gbk2proteins() is unpacked by position:
# [1] -> genomic_accnr, [2] -> dnaseqlength, [0] -> the new value of
# `proteins`, whose items [0] and [1] are handed to writefasta()
# (presumably protein names and sequences -- verify against the helpers).
2948 proteins = embl2proteins(infile,sequence)
2949 genomic_accnr = proteins[1]
2950 dnaseqlength = proteins[2]
2951 proteins = proteins[0]
2952 writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
# Annotated input (file name without a FASTA extension): read the proteins
# directly from the EMBL or GenBank file.
# NOTE(review): for any other extension `proteins` is never assigned in this
# branch -- confirm such input is rejected earlier.
2953 else:
2954 #print "Reading embl/gbk file and creating input FASTA file for gene cluster detection..."
2955 logfile.write("Reading embl/gbk file and creating input FASTA file for gene cluster detection...\n")
2956 if infile.split(".")[-1] == "embl" or infile.split(".")[-1] == "EMBL" or infile.split(".")[-1] == "emb" or infile.split(".")[-1] == "EMB":
# An empty string is passed as the sequence argument in this case.
2957 sequence = ""
2958 proteins = embl2proteins(infile,sequence)
2959 genomic_accnr = proteins[1]
2960 dnaseqlength = proteins[2]
2961 proteins = proteins[0]
2962 writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
2963 elif infile.split(".")[-1] == "gbk" or infile.split(".")[-1] == "GBK" or infile.split(".")[-1] == "gb" or infile.split(".")[-1] == "GB" or infile.split(".")[-1] == "genbank" or infile.split(".")[-1] == "GENBANK":
2964 proteins = gbk2proteins(infile)
2965 genomic_accnr = proteins[1]
2966 dnaseqlength = proteins[2]
2967 proteins = proteins[0]
2968 writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
# Build per-gene lookup tables from the parsed proteins.  Every entry of
# proteins[0] is a "|"-separated header: field 4 is used as the gene name
# (the dictionary key) and field 3 is stored as its strand; proteins[1]
# holds the matching sequence at the same position.
accessiondict = proteins[4]
seqdict, fullnamedict, strandsdict = {}, {}, {}
# z counts the entries handled so far (1-based inside the loop) and ends up
# as the total number of entries; it stays 0 for an empty list.
z = 0
for z, i in enumerate(proteins[0], 1):
    _headerfields = i.split("|")
    name = _headerfields[4]
    seq = proteins[1][z - 1]
    strand = _headerfields[3]
    seqdict[name] = seq
    strandsdict[name] = strand
    fullnamedict[name] = i
2982
# Seconds elapsed since `starttime`; in the visible code the value is only
# referenced by the disabled debug print below.
2983 elapsed = (time.time() - starttime)
2984 #print "2968Time since start: " + str(elapsed)
2985
2986 #Run hmmsearch on proteins from input file and parse output
2987 #print "Performing HMM search on proteins for detection of signature genes..."
# Progress message for the log file (the console print above is disabled).
2988 logfile.write("Performing HMM search on proteins for detection of signature genes...\n")
2989 hmmslist = ["AMP-binding.hmm","BLS.hmm","CAS.hmm","Chal_sti_synt_C.hmm","Chal_sti_synt_N.hmm","Condensation.hmm","ene_KS.hmm","hyb_KS.hmm","itr_KS.hmm","mod_KS.hmm","tra_KS.hmm","LANC_like.hmm","ATd.hmm","PKS_AT.hmm","PKS_KS.hmm","PP-binding.hmm","t2clf.hmm","t2ks.hmm","t2ks2.hmm","Terpene_synth.hmm","Terpene_synth_C.hmm","strH_like.hmm","neoL_like.hmm","DOIS.hmm","valA_like.hmm","spcFG_like.hmm","spcDK_like_cou.hmm","spcDK_like_glyc.hmm","strK_like1.hmm","strK_like2.hmm","bt1fas.hmm","ft1fas.hmm","t2fas.hmm","hglD.hmm","hglE.hmm","fabH.hmm","AfsA.hmm","IucA_IucC.hmm","ectoine_synt.hmm","phytoene_synt.hmm","Lant_dehyd_N.hmm","Lant_dehyd_C.hmm","Antimicrobial18.hmm","Gallidermin.hmm","L_biotic_typeA.hmm","LE-DUF.hmm","LE-LAC481.hmm","LE-LanBC.hmm","LE-MER+2PEP.hmm","MA-2PEPA.hmm","MA-DUF.hmm","MA-EPI.hmm","MA-LAC481.hmm","MA-NIS+EPI.hmm","MA-NIS.hmm","indsynth.hmm","A-OX.hmm","LmbU.hmm","MoeO5.hmm","LipM.hmm","LipU.hmm","LipV.hmm","ToyB.hmm","TunD.hmm","melC.hmm","strepbact.hmm","goadsporin_like.hmm","Antimicrobial14.hmm","Bacteriocin_IId.hmm","BacteriocIIc_cy.hmm","Bacteriocin_II.hmm","Lactococcin.hmm","Antimicrobial17.hmm","Lactococcin_972.hmm","Bacteriocin_IIc.hmm","LcnG-beta.hmm","Bacteriocin_IIi.hmm","Subtilosin_A.hmm","Cloacin.hmm","Neocarzinostat.hmm","Linocin_M18.hmm","TIGR03603.hmm","TIGR03604.hmm","TIGR03605.hmm","TIGR03731.hmm","TIGR03651.hmm","TIGR03678.hmm","TIGR03693.hmm","TIGR03798.hmm","TIGR03882.hmm","TIGR03601.hmm","TIGR03602.hmm","tabtoxin.hmm","cycdipepsynth.hmm","cyanobactin_synth.hmm","fom1.hmm","bcpB.hmm","frbD.hmm","mitE.hmm",'Lycopene_cycl.hmm','terpene_cyclase.hmm','NapT7.hmm','fung_ggpps.hmm','fung_ggpps2.hmm','dmat.hmm','trichodiene_synth.hmm','novK.hmm','novJ.hmm','novI.hmm','novH.hmm','pur6.hmm','pur10.hmm','nikJ.hmm','nikO.hmm','mvnA.hmm','thiostrepton.hmm','NAD_binding_4.hmm','vlmB.hmm','salQ.hmm','prnB.hmm']
# Run hmmsearch once per profile in hmmslist against the predicted proteins.
# For a profile <name>.hmm the full report is written to <name>_output.txt and
# the tabular hit list (--tblout) to <name>.txt, both below
# <genomename>/hmmoutput/.
for i in hmmslist:
    _hmmoutputbase = genomename + "/hmmoutput/" + i.split(".")[0]
    hmmsearch = " ".join([
        hmmsearch_path,
        "--cpu", str(nrcpus),
        "-o", _hmmoutputbase + "_output.txt",
        "--noali",
        "--tblout", _hmmoutputbase + ".txt",
        hmms_path + i,
        genomename + "/genome_proteins.fasta",
    ])
    os.system(hmmsearch)
# Parse the hmmsearch tables and record which proteins carry type I PKS or
# trans-AT PKS signature domains.
logfile.write("Parsing HMM outputs...\n")
# protein -> list of [domain description, score] pairs, filled in below.
detecteddomainsdict = {}


def _first_hit_scores(prots, scores):
    """Return a dict mapping each protein to the score of its first hit.

    Gives the same value as ``scores[prots.index(prot)]`` for every ``prot``
    in *prots*, but is built in a single pass so that the loops below do
    constant-time lookups instead of scanning the hit lists for every
    protein.  Assumes *prots* and *scores* are parallel sequences of equal
    length.
    """
    hitscores = {}
    for prot, hitscore in zip(prots, scores):
        # setdefault keeps the first occurrence, like list.index() does.
        hitscores.setdefault(prot, hitscore)
    return hitscores


#Extract type I PKS proteins, KS cut-off: 50; exclude those sequences that score higher on type I FAS HMMs, type IV hglE-like KS domains
# NOTE(review): this comment used to state "AT cut-off: 20", but the value
# passed for PKS_AT below is 50 -- confirm which one is intended.
t1pksprots = []
transatpksprots = []
if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
    # Each parsehmmoutput() result is used as [proteins, scores]; the first
    # argument is presumably the score cut-off -- verify against the helper.
    ks = parsehmmoutput(50,hmmoutputfolder + "PKS_KS.txt")
    at = parsehmmoutput(50,hmmoutputfolder + "PKS_AT.txt")
    ft1fasks = parsehmmoutput(50,hmmoutputfolder + "ft1fas.txt")
    bt1fasks = parsehmmoutput(50,hmmoutputfolder + "bt1fas.txt")
    hgleks = parsehmmoutput(50,hmmoutputfolder + "hglE.txt")
    hgldks = parsehmmoutput(50,hmmoutputfolder + "hglD.txt")
    fabhks = parsehmmoutput(50,hmmoutputfolder + "fabH.txt")
    pksksprots, pksksscores = ks[0], ks[1]
    pksatprots, pksatscores = at[0], at[1]
    bt1fasprots, bt1fasscores = bt1fasks[0], bt1fasks[1]
    ft1fasprots, ft1fasscores = ft1fasks[0], ft1fasks[1]
    hgleprots, hglescores = hgleks[0], hgleks[1]
    hgldprots, hgldscores = hgldks[0], hgldks[1]
    fabhprots, fabhscores = fabhks[0], fabhks[1]

    _ksscores = _first_hit_scores(pksksprots, pksksscores)
    _atscores = _first_hit_scores(pksatprots, pksatscores)
    # Profiles that compete with the PKS KS model: a protein whose KS score
    # is lower than its score on any of these is not called a type I PKS.
    _competingscores = [
        _first_hit_scores(bt1fasprots, bt1fasscores),
        _first_hit_scores(ft1fasprots, ft1fasscores),
        _first_hit_scores(hgldprots, hgldscores),
        _first_hit_scores(hgleprots, hglescores),
        _first_hit_scores(fabhprots, fabhscores),
    ]
    for i in pksksprots:
        exclude = "n"
        score = _ksscores[i]
        for _otherscores in _competingscores:
            if i in _otherscores and float(score) < float(_otherscores[i]):
                exclude = "y"
        # Type I PKS: KS hit that survived the exclusion and also has an AT hit.
        if i in _atscores and exclude == "n":
            t1pksprots.append(i)
            detecteddomainsdict.setdefault(i, []).extend([
                ["PKS ketosynthase domain", score],
                ["PKS acyltransferase domain", _atscores[i]],
            ])
    #Extract trans-AT PKSs: proteins with KS hits but without AT hits, and with trans-AT specific ATd-hits
    atd = parsehmmoutput(65,hmmoutputfolder + "ATd.txt")
    traks = parsehmmoutput(50,hmmoutputfolder + "tra_KS.txt")
    traksprots = traks[0]
    atdprots = atd[0]
    atdscores = atd[1]
    _atdscores = _first_hit_scores(atdprots, atdscores)
    _trakshits = set(traksprots)
    # t1pksprots is complete at this point, so a snapshot set is sufficient.
    _t1pkshits = set(t1pksprots)
    for i in pksksprots:
        if i in _atdscores and i in _trakshits and i not in _t1pkshits:
            transatpksprots.append(i)
            detecteddomainsdict.setdefault(i, []).extend([
                ["PKS ketosynthase domain", _ksscores[i]],
                ["Trans-AT PKS AT-docking domain", _atdscores[i]],
            ])
3069 #Extract type II PKS & CLF proteins, KS-cut-off: 50, t2KS/clf score > modKS,eneKS,itrKS,traKS,t1fas,t2fas,hgle scores
3070 t2pksprots = []
3071 if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
3072 t2ks = parsehmmoutput(50,hmmoutputfolder + "t2ks.txt")
3073 t2ks2 = parsehmmoutput(450,hmmoutputfolder + "t2ks2.txt")
3074 t2clf = parsehmmoutput(50,hmmoutputfolder + "t2clf.txt")
3075 eneks = parsehmmoutput(50,hmmoutputfolder + "ene_KS.txt")
3076 hybks = parsehmmoutput(50,hmmoutputfolder + "hyb_KS.txt")
3077 modks = parsehmmoutput(50,hmmoutputfolder + "mod_KS.txt")
3078 itrks = parsehmmoutput(50,hmmoutputfolder + "itr_KS.txt")
3079 traks = parsehmmoutput(50,hmmoutputfolder + "tra_KS.txt")
3080 t2fasks = parsehmmoutput(50,hmmoutputfolder + "t2fas.txt")
3081 ft1fasks = parsehmmoutput(50,hmmoutputfolder + "ft1fas.txt")
3082 bt1fasks = parsehmmoutput(50,hmmoutputfolder + "bt1fas.txt")
3083 hgleks = parsehmmoutput(50,hmmoutputfolder + "hglE.txt")
3084 hgldks = parsehmmoutput(50,hmmoutputfolder + "hglD.txt")
3085 fabhks = parsehmmoutput(50,hmmoutputfolder + "fabH.txt")
3086 t2ksprots = t2ks[0]
3087 t2ks2prots = t2ks2[0]
3088 t2clfprots = t2clf[0]
3089 eneksprots = eneks[0]
3090 hybksprots = hybks[0]
3091 modksprots = modks[0]
3092 itrksprots = itrks[0]
3093 traksprots = traks[0]
3094 t2fasprots = t2fasks[0]
3095 t2ksscores = t2ks[1]
3096 t2ks2scores = t2ks2[1]
3097 t2clfscores = t2clf[1]
3098 eneksscores = eneks[1]
3099 hybksscores = hybks[1]
3100 modksscores = modks[1]
3101 itrksscores = itrks[1]
3102 traksscores = traks[1]
3103 t2fasscores = t2fasks[1]
3104 bt1fasprots = bt1fasks[0]
3105 bt1fasscores = bt1fasks[1]
3106 ft1fasprots = ft1fasks[0]
3107 ft1fasscores = ft1fasks[1]
3108 hgleprots = hgleks[0]
3109 hglescores = hgleks[1]
3110 hgldprots = hgldks[0]
3111 hgldscores = hgldks[1]
3112 fabhprots = fabhks[0]
3113 fabhscores = fabhks[1]
3114 for i in t2ksprots:
3115 type2 = "y"
3116 score = t2ksscores[t2ksprots.index(i)]
3117 if i in eneksprots:
3118 enescore = eneksscores[eneksprots.index(i)]
3119 if float(enescore) > float(score):
3120 type2 = "n"
3121 if i in hybksprots:
3122 hybscore = hybksscores[hybksprots.index(i)]
3123 if float(hybscore) > float(score):
3124 type2 = "n"
3125 if i in modksprots:
3126 modscore = modksscores[modksprots.index(i)]
3127 if float(modscore) > float(score):
3128 type2 = "n"
3129 if i in itrksprots:
3130 itrscore = itrksscores[itrksprots.index(i)]
3131 if float(itrscore) > float(score):
3132 type2 = "n"
3133 if i in traksprots:
3134 trascore = traksscores[traksprots.index(i)]
3135 if float(trascore) > float(score):
3136 type2 = "n"
3137 if i in bt1fasprots:
3138 bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
3139 if float(bt1fasscore) > float(score):
3140 type2 = "n"
3141 if i in ft1fasprots:
3142 ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
3143 if float(ft1fasscore) > float(score):
3144 type2 = "n"
3145 if i in t2fasprots:
3146 t2fasscore = t2fasscores[t2fasprots.index(i)]
3147 if float(t2fasscore) > float(score):
3148 type2 = "n"
3149 if i in hgleprots:
3150 hglescore = hglescores[hgleprots.index(i)]
3151 if float(hglescore) > float(score):
3152 type2 = "n"
3153 if i in fabhprots:
3154 fabhscore = fabhscores[fabhprots.index(i)]
3155 if float(fabhscore) > float(score):
3156 type2 = "n"
3157 if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
3158 t2pksprots.append(i)
3159 if detecteddomainsdict.has_key(i):
3160 detdomlist = detecteddomainsdict[i]
3161 detdomlist.append(["Type II ketosynthase",t2ksscores[t2ksprots.index(i)]])
3162 detecteddomainsdict[i] = detdomlist
3163 else:
3164 detecteddomainsdict[i] = [["Type II ketosynthase",t2ksscores[t2ksprots.index(i)]]]
3165 for i in t2clfprots:
3166 type2 = "y"
3167 score = t2clfscores[t2clfprots.index(i)]
3168 if i in eneksprots:
3169 enescore = eneksscores[eneksprots.index(i)]
3170 if float(enescore) > float(score):
3171 type2 = "n"
3172 if i in hybksprots:
3173 hybscore = hybksscores[hybksprots.index(i)]
3174 if float(hybscore) > float(score):
3175 type2 = "n"
3176 if i in modksprots:
3177 modscore = modksscores[modksprots.index(i)]
3178 if float(modscore) > float(score):
3179 type2 = "n"
3180 if i in itrksprots:
3181 itrscore = itrksscores[itrksprots.index(i)]
3182 if float(itrscore) > float(score):
3183 type2 = "n"
3184 if i in traksprots:
3185 trascore = traksscores[traksprots.index(i)]
3186 if float(trascore) > float(score):
3187 type2 = "n"
3188 if i in bt1fasprots:
3189 bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
3190 if float(bt1fasscore) > float(score):
3191 type2 = "n"
3192 if i in ft1fasprots:
3193 ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
3194 if float(ft1fasscore) > float(score):
3195 type2 = "n"
3196 if i in t2fasprots:
3197 t2fasscore = t2fasscores[t2fasprots.index(i)]
3198 if float(t2fasscore) > float(score):
3199 type2 = "n"
3200 if i in hgleprots:
3201 hglescore = hglescores[hgleprots.index(i)]
3202 if float(hglescore) > float(score):
3203 type2 = "n"
3204 if i in fabhprots:
3205 fabhscore = fabhscores[fabhprots.index(i)]
3206 if float(fabhscore) > float(score):
3207 type2 = "n"
3208 if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
3209 t2pksprots.append(i)
3210 if detecteddomainsdict.has_key(i):
3211 detdomlist = detecteddomainsdict[i]
3212 detdomlist.append(["Chain length factor",t2clfscores[t2clfprots.index(i)]])
3213 detecteddomainsdict[i] = detdomlist
3214 else:
3215 detecteddomainsdict[i] = [["Chain length factor",t2clfscores[t2clfprots.index(i)]]]
3216 for i in t2ks2prots:
3217 type2 = "y"
3218 score = t2ks2scores[t2ks2prots.index(i)]
3219 if i in eneksprots:
3220 enescore = eneksscores[eneksprots.index(i)]
3221 if float(enescore) > float(score):
3222 type2 = "n"
3223 if i in hybksprots:
3224 hybscore = hybksscores[hybksprots.index(i)]
3225 if float(hybscore) > float(score):
3226 type2 = "n"
3227 if i in modksprots:
3228 modscore = modksscores[modksprots.index(i)]
3229 if float(modscore) > float(score):
3230 type2 = "n"
3231 if i in itrksprots:
3232 itrscore = itrksscores[itrksprots.index(i)]
3233 if float(itrscore) > float(score):
3234 type2 = "n"
3235 if i in traksprots:
3236 trascore = traksscores[traksprots.index(i)]
3237 if float(trascore) > float(score):
3238 type2 = "n"
3239 if i in bt1fasprots:
3240 bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
3241 if float(bt1fasscore) > float(score):
3242 type2 = "n"
3243 if i in ft1fasprots:
3244 ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
3245 if float(ft1fasscore) > float(score):
3246 type2 = "n"
3247 if i in t2fasprots:
3248 t2fasscore = t2fasscores[t2fasprots.index(i)]
3249 if float(t2fasscore) > float(score):
3250 type2 = "n"
3251 if i in hgleprots:
3252 hglescore = hglescores[hgleprots.index(i)]
3253 if float(hglescore) > float(score):
3254 type2 = "n"
3255 if i in fabhprots:
3256 fabhscore = fabhscores[fabhprots.index(i)]
3257 if float(fabhscore) > float(score):
3258 type2 = "n"
3259 if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
3260 t2pksprots.append(i)
3261 if detecteddomainsdict.has_key(i):
3262 detdomlist = detecteddomainsdict[i]
3263 detdomlist.append(["Type II ketosynthase, model 2",t2ks2scores[t2ks2prots.index(i)]])
3264 detecteddomainsdict[i] = detdomlist
3265 else:
3266 detecteddomainsdict[i] = [["Type II ketosynthase, model 2",t2ks2scores[t2ks2prots.index(i)]]]
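3267 #Extract type III PKS proteins; cut-offs: Chal_sti_synt_N 63, Chal_sti_synt_C 35; proteins already in t1pksprots or t2pksprots are not added to t3pksprots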
3268 t3pksprots = []
3269 if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
3270 t3n = parsehmmoutput(63,hmmoutputfolder + "Chal_sti_synt_N.txt")
3271 t3c = parsehmmoutput(35,hmmoutputfolder + "Chal_sti_synt_C.txt")
3272 t3nprots = t3n[0]
3273 t3nscores = t3n[1]
3274 t3cprots = t3c[0]
3275 t3cscores = t3c[1]
3276 for i in t3cprots:
3277 if i not in t3pksprots and i not in t1pksprots and i not in t2pksprots:
3278 t3pksprots.append(i)
3279 if detecteddomainsdict.has_key(i):
3280 detdomlist = detecteddomainsdict[i]
3281 detdomlist.append(["Chalcone/stilbene synthase,C-terminus",t3cscores[t3cprots.index(i)]])
3282 detecteddomainsdict[i] = detdomlist
3283 else:
3284 detecteddomainsdict[i] = [["Chalcone/stilbene synthase,C-terminus",t3cscores[t3cprots.index(i)]]]
3285 for i in t3nprots:
3286 if i not in t3pksprots and i not in t1pksprots and i not in t2pksprots:
3287 t3pksprots.append(i)
3288 if detecteddomainsdict.has_key(i):
3289 detdomlist = detecteddomainsdict[i]
3290 detdomlist.append(["Chalcone/stilbene synthase,N-terminus",t3nscores[t3nprots.index(i)]])
3291 detecteddomainsdict[i] = detdomlist
3292 else:
3293 detecteddomainsdict[i] = [["Chalcone/stilbene synthase,N-terminus",t3nscores[t3nprots.index(i)]]]
3294 #Extract 'type IV' hglE/hglD-like PKS proteins, cut-off: 50; only if not already in t1pksprots, t2pksprots, t3pksprots or transatpksprots, and not if the bt1fas, ft1fas, t2fas or fabH model has a higher score
3295 t4pksprots = []
3296 if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
3297 t2fasks = parsehmmoutput(50,hmmoutputfolder + "t2fas.txt")
3298 t2fasprots = t2fasks[0]
3299 t2fasscores = t2fasks[1]
3300 for i in hgleprots:
3301 type4 = "y"
3302 score = hglescores[hgleprots.index(i)]
3303 if i in bt1fasprots:
3304 bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
3305 if float(bt1fasscore) > float(score):
3306 type4 = "n"
3307 if i in ft1fasprots:
3308 ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
3309 if float(ft1fasscore) > float(score):
3310 type4 = "n"
3311 if i in t2fasprots:
3312 t2fasscore = t2fasscores[t2fasprots.index(i)]
3313 if float(t2fasscore) > float(score):
3314 type4 = "n"
3315 if i in fabhprots:
3316 fabhscore = fabhscores[fabhprots.index(i)]
3317 if float(fabhscore) > float(score):
3318 type4 = "n"
3319 if i not in t1pksprots and i not in t2pksprots and i not in t3pksprots and i not in transatpksprots and type4 == "y":
3320 t4pksprots.append(i)
3321 if detecteddomainsdict.has_key(i):
3322 detdomlist = detecteddomainsdict[i]
3323 detdomlist.append(["Atypical PKS domain, HglE-like",hglescores[hgleprots.index(i)]])
3324 detecteddomainsdict[i] = detdomlist
3325 else:
3326 detecteddomainsdict[i] = [["Atypical PKS domain, HglE-like",hglescores[hgleprots.index(i)]]]
3327 for i in hgldprots:
3328 type4 = "y"
3329 score = hgldscores[hgldprots.index(i)]
3330 if i in bt1fasprots:
3331 bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
3332 if float(bt1fasscore) > float(score):
3333 type4 = "n"
3334 if i in ft1fasprots:
3335 ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
3336 if float(ft1fasscore) > float(score):
3337 type4 = "n"
3338 if i in t2fasprots:
3339 t2fasscore = t2fasscores[t2fasprots.index(i)]
3340 if float(t2fasscore) > float(score):
3341 type4 = "n"
3342 if i in fabhprots:
3343 fabhscore = fabhscores[fabhprots.index(i)]
3344 if float(fabhscore) > float(score):
3345 type4 = "n"
3346 if i not in t1pksprots and i not in t2pksprots and i not in t3pksprots and i not in transatpksprots and type4 == "y" and i not in t4pksprots:
3347 t4pksprots.append(i)
3348 if detecteddomainsdict.has_key(i):
3349 detdomlist = detecteddomainsdict[i]
3350 detdomlist.append(["Atypical PKS domain, HglD-like",hgldscores[hgldprots.index(i)]])
3351 detecteddomainsdict[i] = detdomlist
3352 else:
3353 detecteddomainsdict[i] = [["Atypical PKS domain, HglD-like",hgldscores[hgldprots.index(i)]]]
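3354 #Extract NRPS proteins, C cut-off: 20; A cut-off: 20. A protein must have both a C and an A domain (or be in t1pksprots and have an A domain); a single-domain C or A protein is also added if its midpoint lies within 20 kb of a single-domain protein of the other kind or of a full NRPS. Single-domain T (PP-binding, cut-off: 20) proteins are only collected here, not added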
3355 nrpsprots = []
3356 if 1 in geneclustertypes or 5 in geneclustertypes:
3357 cond = parsehmmoutput(20,hmmoutputfolder + "Condensation.txt")
3358 amp = parsehmmoutput(20,hmmoutputfolder + "AMP-binding.txt")
3359 ampox = parsehmmoutput(50,hmmoutputfolder + "A-OX.txt")
3360 ampoxprots = ampox[0]
3361 ampoxscores = ampox[1]
3362 for i in ampox[0]:
3363 if i not in amp:
3364 amp.append(i)
3365 cprots = cond[0]
3366 cscores = cond[1]
3367 aprots = amp[0]
3368 ascores = amp[1]
3369 nrpsprots = []
3370 for i in cprots:
3371 if i in aprots:
3372 nrpsprots.append(i)
3373 if detecteddomainsdict.has_key(i):
3374 detdomlist = detecteddomainsdict[i]
3375 detdomlist.append(["Condensation domain",cscores[cprots.index(i)]])
3376 if i in aprots:
3377 detdomlist.append(["Adenylation domain",ascores[aprots.index(i)]])
3378 elif i in ampoxprots:
3379 detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]])
3380 detecteddomainsdict[i] = detdomlist
3381 else:
3382 if i in aprots:
3383 detecteddomainsdict[i] = [["Condensation domain",cscores[cprots.index(i)]],["Adenylation domain",ascores[aprots.index(i)]]]
3384 elif i in ampoxprots:
3385 detecteddomainsdict[i] = [["Condensation domain",cscores[cprots.index(i)]],["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]]
3386 for i in t1pksprots:
3387 if i in aprots:
3388 nrpsprots.append(i)
3389 if detecteddomainsdict.has_key(i):
3390 detdomlist = detecteddomainsdict[i]
3391 if i in aprots:
3392 detdomlist.append(["Adenylation domain",ascores[aprots.index(i)]])
3393 elif i in ampoxprots:
3394 detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]])
3395 detecteddomainsdict[i] = detdomlist
3396 else:
3397 if i in aprots:
3398 detecteddomainsdict[i] = [["Adenylation domain",ascores[aprots.index(i)]]]
3399 elif i in ampoxprots:
3400 detecteddomainsdict[i] = [["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]]
3401 single_aprots = []
3402 single_cprots = []
3403 single_pptprots = []
3404 pptprots = parsehmmoutput(20,hmmoutputfolder + "PP-binding.txt")[0]
3405 for i in aprots:
3406 if i not in nrpsprots:
3407 single_aprots.append(i)
3408 for i in cprots:
3409 if i not in nrpsprots:
3410 single_cprots.append(i)
3411 for i in pptprots:
3412 if i not in nrpsprots:
3413 single_pptprots.append(i)
3414 genelist = proteins[2]
3415 genedict = proteins[3]
3416 single_aprots_positions = {}
3417 single_cprots_positions = {}
3418 single_pptprots_positions = {}
3419 nrpsprots_positions = {}
3420 for j in single_aprots:
3421 if j in genelist:
3422 protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
3423 protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
3424 single_aprots_positions[j] = int((protend_abs + protstart_abs) / 2)
3425 for j in single_cprots:
3426 if j in genelist:
3427 protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
3428 protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
3429 single_cprots_positions[j] = int((protend_abs + protstart_abs) / 2)
3430 for j in single_pptprots:
3431 if j in genelist:
3432 protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
3433 protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
3434 single_pptprots_positions[j] = int((protend_abs + protstart_abs) / 2)
3435 for j in nrpsprots:
3436 if j in genelist:
3437 protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
3438 protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
3439 nrpsprots_positions[j] = int((protend_abs + protstart_abs) / 2)
3440 nrpsprots2 = []
3441 for i in nrpsprots:
3442 nrpsprots2.append(i)
3443 for j in single_aprots:
3444 include = "n"
3445 pos = single_aprots_positions[j]
3446 for i in single_cprots:
3447 pos2 = single_cprots_positions[i]
3448 if abs(pos - pos2) < 20000:
3449 include = "y"
3450 for i in nrpsprots2:
3451 pos2 = nrpsprots_positions[i]
3452 if abs(pos - pos2) < 20000:
3453 include = "y"
3454 if include == "y":
3455 nrpsprots.append(j)
3456 if detecteddomainsdict.has_key(j):
3457 detdomlist = detecteddomainsdict[j]
3458 if j in aprots:
3459 detdomlist.append(["Adenylation domain",ascores[aprots.index(j)]])
3460 elif j in ampoxprots:
3461 detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(j)]])
3462 detecteddomainsdict[j] = detdomlist
3463 else:
3464 if j in aprots:
3465 detecteddomainsdict[j] = [["Adenylation domain",ascores[aprots.index(j)]]]
3466 elif j in ampoxprots:
3467 detecteddomainsdict[j] = [["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(j)]]]
3468 for j in single_cprots:
3469 include = "n"
3470 pos = single_cprots_positions[j]
3471 for i in single_aprots:
3472 pos2 = single_aprots_positions[i]
3473 if abs(pos - pos2) < 20000:
3474 include = "y"
3475 for i in nrpsprots2:
3476 pos2 = nrpsprots_positions[i]
3477 if abs(pos - pos2) < 20000:
3478 include = "y"
3479 if include == "y":
3480 nrpsprots.append(j)
3481 if detecteddomainsdict.has_key(j):
3482 detdomlist = detecteddomainsdict[j]
3483 detdomlist.append(["Condensation domain",cscores[cprots.index(j)]])
3484 detecteddomainsdict[j] = detdomlist
3485 else:
3486 detecteddomainsdict[j] = [["Condensation domain",cscores[cprots.index(j)]]]
3487 #Extract Terpene synthase proteins, various cut-offs
3488 terpeneprots = []
3489 if 1 in geneclustertypes or 6 in geneclustertypes:
3490 terpene = parsehmmoutput(23,hmmoutputfolder + "Terpene_synth_C.txt")
3491 terpeneprots = terpene[0]
3492 terpenescores = terpene[1]
3493 for i in terpeneprots:
3494 if detecteddomainsdict.has_key(i):
3495 detdomlist = detecteddomainsdict[i]
3496 detdomlist.append(["Terpene synthase, C-terminus",terpenescores[terpeneprots.index(i)]])
3497 detecteddomainsdict[i] = detdomlist
3498 else:
3499 detecteddomainsdict[i] = [["Terpene synthase, C-terminus",terpenescores[terpeneprots.index(i)]]]
3500 if 1 in geneclustertypes or 6 in geneclustertypes:
3501 physqualdata = parsehmmoutput(20,hmmoutputfolder + "phytoene_synt.txt")
3502 physqualprots = physqualdata[0]
3503 physqualscores = physqualdata[1]
3504 for i in physqualprots:
3505 if i not in terpeneprots:
3506 terpeneprots.append(i)
3507 if detecteddomainsdict.has_key(i):
3508 detdomlist = detecteddomainsdict[i]
3509 detdomlist.append(["Phytoene/squalene synthase",physqualscores[physqualprots.index(i)]])
3510 detecteddomainsdict[i] = detdomlist
3511 else:
3512 detecteddomainsdict[i] = [["Phytoene/squalene synthase",physqualscores[physqualprots.index(i)]]]
3513 if 1 in geneclustertypes or 6 in geneclustertypes:
3514 lycopenedata = parsehmmoutput(80,hmmoutputfolder + "Lycopene_cycl.txt")
3515 lycopeneprots = lycopenedata[0]
3516 lycopenescores = lycopenedata[1]
3517 for i in lycopeneprots:
3518 if i not in terpeneprots:
3519 terpeneprots.append(i)
3520 if detecteddomainsdict.has_key(i):
3521 detdomlist = detecteddomainsdict[i]
3522 detdomlist.append(["Lycopene cyclase",lycopenescores[lycopeneprots.index(i)]])
3523 detecteddomainsdict[i] = detdomlist
3524 else:
3525 detecteddomainsdict[i] = [["Lycopene cyclase",lycopenescores[lycopeneprots.index(i)]]]
3526 if 1 in geneclustertypes or 6 in geneclustertypes:
3527 terpene_cyclasesdata = parsehmmoutput(50,hmmoutputfolder + "terpene_cyclase.txt")
3528 terpene_cyclases = terpene_cyclasesdata[0]
3529 terpene_cyclases_scores = terpene_cyclasesdata[1]
3530 for i in terpene_cyclases:
3531 if i not in terpeneprots:
3532 terpeneprots.append(i)
3533 if detecteddomainsdict.has_key(i):
3534 detdomlist = detecteddomainsdict[i]
3535 detdomlist.append(["Terpene cyclase",terpene_cyclases_scores[terpene_cyclases.index(i)]])
3536 detecteddomainsdict[i] = detdomlist
3537 else:
3538 detecteddomainsdict[i] = [["Terpene cyclase",terpene_cyclases_scores[terpene_cyclases.index(i)]]]
3539 if 1 in geneclustertypes or 6 in geneclustertypes:
3540 NapT7 = parsehmmoutput(250,hmmoutputfolder + "NapT7.txt")
3541 NapT7prots = NapT7[0]
3542 NapT7scores = NapT7[1]
3543 for i in NapT7prots:
3544 if i not in terpeneprots:
3545 terpeneprots.append(i)
3546 if detecteddomainsdict.has_key(i):
3547 detdomlist = detecteddomainsdict[i]
3548 detdomlist.append(["NapT7",NapT7scores[NapT7prots.index(i)]])
3549 detecteddomainsdict[i] = detdomlist
3550 else:
3551 detecteddomainsdict[i] = [["NapT7",NapT7scores[NapT7prots.index(i)]]]
3552 if 1 in geneclustertypes or 6 in geneclustertypes:
3553 fung_ggpps = parsehmmoutput(420,hmmoutputfolder + "fung_ggpps.txt")
3554 fung_ggppsprots = fung_ggpps[0]
3555 fung_ggppsscores = fung_ggpps[1]
3556 for i in fung_ggppsprots:
3557 if i not in terpeneprots:
3558 terpeneprots.append(i)
3559 if detecteddomainsdict.has_key(i):
3560 detdomlist = detecteddomainsdict[i]
3561 detdomlist.append(["Fungal geranylgeranyl pyrophosphate synthase, model 1",fung_ggppsscores[fung_ggppsprots.index(i)]])
3562 detecteddomainsdict[i] = detdomlist
3563 else:
3564 detecteddomainsdict[i] = [["Fungal geranylgeranyl pyrophosphate synthase, model 1",fung_ggppsscores[fung_ggppsprots.index(i)]]]
3565 if 1 in geneclustertypes or 6 in geneclustertypes:
3566 fung_ggpps2 = parsehmmoutput(312,hmmoutputfolder + "fung_ggpps2.txt")
3567 fung_ggpps2prots = fung_ggpps2[0]
3568 fung_ggpps2scores = fung_ggpps2[1]
3569 for i in fung_ggpps2prots:
3570 if i not in terpeneprots:
3571 terpeneprots.append(i)
3572 if detecteddomainsdict.has_key(i):
3573 detdomlist = detecteddomainsdict[i]
3574 detdomlist.append(["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]])
3575 detecteddomainsdict[i] = detdomlist
3576 else:
3577 detecteddomainsdict[i] = [["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]]]
3578 if 1 in geneclustertypes or 6 in geneclustertypes:
3579 dmat = parsehmmoutput(200,hmmoutputfolder + "dmat.txt")
3580 dmatprots = dmat[0]
3581 dmatscores = dmat[1]
3582 for i in dmatprots:
3583 if i not in terpeneprots:
3584 terpeneprots.append(i)
3585 if detecteddomainsdict.has_key(i):
3586 detdomlist = detecteddomainsdict[i]
3587 detdomlist.append(["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]])
3588 detecteddomainsdict[i] = detdomlist
3589 else:
3590 detecteddomainsdict[i] = [["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]]]
3591 if 1 in geneclustertypes or 6 in geneclustertypes:
3592 trichodiene_synth = parsehmmoutput(150,hmmoutputfolder + "trichodiene_synth.txt")
3593 trichodiene_synthprots = trichodiene_synth[0]
3594 trichodiene_synthscores = trichodiene_synth[1]
3595 for i in trichodiene_synthprots:
3596 if i not in terpeneprots:
3597 terpeneprots.append(i)
3598 if detecteddomainsdict.has_key(i):
3599 detdomlist = detecteddomainsdict[i]
3600 detdomlist.append(["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]])
3601 detecteddomainsdict[i] = detdomlist
3602 else:
3603 detecteddomainsdict[i] = [["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]]]
3604 #Extract lantibiotic proteins, LanC cut-off: 80, Lant_dehN & Lant_dehC combination cut-off: 20 each, other lantibiotic models (Antimicrobial18 through MA-NIS) cut-off: 20, TIGR03731 cut-off: 18
3605 lantprots = []
3606 if 1 in geneclustertypes or 7 in geneclustertypes:
3607 lantc = parsehmmoutput(80,hmmoutputfolder + "LANC_like.txt")
3608 lancprots = lantc[0]
3609 lancscores = lantc[1]
3610 landehn = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_N.txt")
3611 landehnprots = landehn[0]
3612 landehnscores = landehn[1]
3613 landehc = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_C.txt")
3614 landehcprots = landehc[0]
3615 landehcscores = landehc[1]
3616 lanti1 = parsehmmoutput(20,hmmoutputfolder + "Antimicrobial18.txt")
3617 lanti1prots = lanti1[0]
3618 lanti1scores = lanti1[1]
3619 lanti2 = parsehmmoutput(20,hmmoutputfolder + "Gallidermin.txt")
3620 lanti2prots = lanti2[0]
3621 lanti2scores = lanti2[1]
3622 lanti3 = parsehmmoutput(20,hmmoutputfolder + "L_biotic_typeA.txt")
3623 lanti3prots = lanti3[0]
3624 lanti3scores = lanti3[1]
3625 lanti4 = parsehmmoutput(20,hmmoutputfolder + "LE-DUF.txt")
3626 lanti4prots = lanti4[0]
3627 lanti4scores = lanti4[1]
3628 lanti5 = parsehmmoutput(20,hmmoutputfolder + "LE-LAC481.txt")
3629 lanti5prots = lanti5[0]
3630 lanti5scores = lanti5[1]
3631 lanti6 = parsehmmoutput(20,hmmoutputfolder + "LE-LanBC.txt")
3632 lanti6prots = lanti6[0]
3633 lanti6scores = lanti6[1]
3634 lanti7 = parsehmmoutput(20,hmmoutputfolder + "LE-MER+2PEP.txt")
3635 lanti7prots = lanti7[0]
3636 lanti7scores = lanti7[1]
3637 lanti8 = parsehmmoutput(20,hmmoutputfolder + "MA-2PEPA.txt")
3638 lanti8prots = lanti8[0]
3639 lanti8scores = lanti8[1]
3640 lanti9 = parsehmmoutput(20,hmmoutputfolder + "MA-DUF.txt")
3641 lanti9prots = lanti9[0]
3642 lanti9scores = lanti9[1]
3643 lanti10 = parsehmmoutput(20,hmmoutputfolder + "MA-EPI.txt")
3644 lanti10prots = lanti10[0]
3645 lanti10scores = lanti10[1]
3646 lanti11 = parsehmmoutput(20,hmmoutputfolder + "MA-LAC481.txt")
3647 lanti11prots = lanti11[0]
3648 lanti11scores = lanti11[1]
3649 lanti12 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS+EPI.txt")
3650 lanti12prots = lanti12[0]
3651 lanti12scores = lanti12[1]
3652 lanti13 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS.txt")
3653 lanti13prots = lanti13[0]
3654 lanti13scores = lanti13[1]
3655 lanti14 = parsehmmoutput(18,hmmoutputfolder + "TIGR03731.txt")
3656 lanti14prots = lanti14[0]
3657 lanti14scores = lanti14[1]
3658 lantiprots = lanti1prots + lanti2prots + lanti3prots + lanti4prots + lanti5prots + lanti6prots + lanti7prots + lanti8prots + lanti9prots + lanti10prots + lanti11prots + lanti12prots + lanti13prots + lanti14prots
3659 lantiprots2 = []
3660 for i in lantiprots:
3661 if i not in lantiprots2:
3662 lantiprots2.append(i)
3663 lantiprots = lantiprots2
3664 for i in lancprots:
3665 lantprots.append(i)
3666 if detecteddomainsdict.has_key(i):
3667 detdomlist = detecteddomainsdict[i]
3668 detdomlist.append(["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]])
3669 detecteddomainsdict[i] = detdomlist
3670 else:
3671 detecteddomainsdict[i] = [["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]]]
3672 for i in landehnprots:
3673 if i in landehcprots and i not in lantprots:
3674 lantprots.append(i)
3675 if detecteddomainsdict.has_key(i):
3676 detdomlist = detecteddomainsdict[i]
3677 detdomlist.append(["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]])
3678 detdomlist.append(["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]])
3679 detecteddomainsdict[i] = detdomlist
3680 else:
3681 detecteddomainsdict[i] = [["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]],["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]]]
3682 for i in lantiprots:
3683 if i not in lantprots:
3684 lantprots.append(i)
3685 if detecteddomainsdict.has_key(i):
3686 detdomlist = detecteddomainsdict[i]
3687 if i in lanti1prots:
3688 detdomlist.append(["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]])
3689 detecteddomainsdict[i] = detdomlist
3690 else:
3691 if i in lanti1prots:
3692 detecteddomainsdict[i] = [["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]]]
3693 if detecteddomainsdict.has_key(i):
3694 detdomlist = detecteddomainsdict[i]
3695 if i in lanti2prots:
3696 detdomlist.append(["Gallidermin domain",lanti2scores[lanti2prots.index(i)]])
3697 detecteddomainsdict[i] = detdomlist
3698 else:
3699 if i in lanti2prots:
3700 detecteddomainsdict[i] = [["Gallidermin domain",lanti2scores[lanti2prots.index(i)]]]
3701 if detecteddomainsdict.has_key(i):
3702 detdomlist = detecteddomainsdict[i]
3703 if i in lanti3prots:
3704 detdomlist.append(["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]])
3705 detecteddomainsdict[i] = detdomlist
3706 else:
3707 if i in lanti3prots:
3708 detecteddomainsdict[i] = [["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]]]
3709 if detecteddomainsdict.has_key(i):
3710 detdomlist = detecteddomainsdict[i]
3711 if i in lanti4prots:
3712 detdomlist.append(["LE-DUF domain",lanti4scores[lanti4prots.index(i)]])
3713 detecteddomainsdict[i] = detdomlist
3714 else:
3715 if i in lanti4prots:
3716 detecteddomainsdict[i] = [["LE-DUF domain",lanti4scores[lanti4prots.index(i)]]]
3717 if detecteddomainsdict.has_key(i):
3718 detdomlist = detecteddomainsdict[i]
3719 if i in lanti5prots:
3720 detdomlist.append(["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]])
3721 detecteddomainsdict[i] = detdomlist
3722 else:
3723 if i in lanti5prots:
3724 detecteddomainsdict[i] = [["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]]]
3725 if detecteddomainsdict.has_key(i):
3726 detdomlist = detecteddomainsdict[i]
3727 if i in lanti6prots:
3728 detdomlist.append(["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]])
3729 detecteddomainsdict[i] = detdomlist
3730 else:
3731 if i in lanti6prots:
3732 detecteddomainsdict[i] = [["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]]]
3733 if detecteddomainsdict.has_key(i):
3734 detdomlist = detecteddomainsdict[i]
3735 if i in lanti7prots:
3736 detdomlist.append(["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]])
3737 detecteddomainsdict[i] = detdomlist
3738 else:
3739 if i in lanti7prots:
3740 detecteddomainsdict[i] = [["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]]]
3741 if detecteddomainsdict.has_key(i):
3742 detdomlist = detecteddomainsdict[i]
3743 if i in lanti8prots:
3744 detdomlist.append(["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]])
3745 detecteddomainsdict[i] = detdomlist
3746 else:
3747 if i in lanti8prots:
3748 detecteddomainsdict[i] = [["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]]]
3749 if detecteddomainsdict.has_key(i):
3750 detdomlist = detecteddomainsdict[i]
3751 if i in lanti9prots:
3752 detdomlist.append(["MA-DUF domain",lanti9scores[lanti9prots.index(i)]])
3753 detecteddomainsdict[i] = detdomlist
3754 else:
3755 if i in lanti9prots:
3756 detecteddomainsdict[i] = [["MA-DUF domain",lanti9scores[lanti9prots.index(i)]]]
3757 if detecteddomainsdict.has_key(i):
3758 detdomlist = detecteddomainsdict[i]
3759 if i in lanti10prots:
3760 detdomlist.append(["MA-EPI domain",lanti10scores[lanti10prots.index(i)]])
3761 detecteddomainsdict[i] = detdomlist
3762 else:
3763 if i in lanti10prots:
3764 detecteddomainsdict[i] = [["MA-EPI domain",lanti10scores[lanti10prots.index(i)]]]
3765 if detecteddomainsdict.has_key(i):
3766 detdomlist = detecteddomainsdict[i]
3767 if i in lanti11prots:
3768 detdomlist.append(["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]])
3769 detecteddomainsdict[i] = detdomlist
3770 else:
3771 if i in lanti11prots:
3772 detecteddomainsdict[i] = [["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]]]
3773 if detecteddomainsdict.has_key(i):
3774 detdomlist = detecteddomainsdict[i]
3775 if i in lanti12prots:
3776 detdomlist.append(["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]])
3777 detecteddomainsdict[i] = detdomlist
3778 else:
3779 if i in lanti12prots:
3780 detecteddomainsdict[i] = [["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]]]
3781 if detecteddomainsdict.has_key(i):
3782 detdomlist = detecteddomainsdict[i]
3783 if i in lanti13prots:
3784 detdomlist.append(["MA-NIS domain",lanti13scores[lanti13prots.index(i)]])
3785 detecteddomainsdict[i] = detdomlist
3786 else:
3787 if i in lanti13prots:
3788 detecteddomainsdict[i] = [["MA-NIS domain",lanti13scores[lanti13prots.index(i)]]]
3789 if detecteddomainsdict.has_key(i):
3790 detdomlist = detecteddomainsdict[i]
3791 if i in lanti14prots:
3792 detdomlist.append(["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]])
3793 detecteddomainsdict[i] = detdomlist
3794 else:
3795 if i in lanti14prots:
3796 detecteddomainsdict[i] = [["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]]]
3797 #Bacteriocin proteins, various cut-offs
3798 bcinprots = []
3799 if 1 in geneclustertypes or 8 in geneclustertypes:
3800 bcin1prots = parsehmmoutput(50,hmmoutputfolder + "strepbact.txt")[0]
3801 bcin2prots = parsehmmoutput(90,hmmoutputfolder + "Antimicrobial14.txt")[0]
3802 bcin3prots = parsehmmoutput(23,hmmoutputfolder + "Bacteriocin_IId.txt")[0]
3803 bcin4prots = parsehmmoutput(92,hmmoutputfolder + "BacteriocIIc_cy.txt")[0]
3804 bcin5prots = parsehmmoutput(40,hmmoutputfolder + "Bacteriocin_II.txt")[0]
3805 bcin6prots = parsehmmoutput(24,hmmoutputfolder + "Lactococcin.txt")[0]
3806 bcin7prots = parsehmmoutput(31,hmmoutputfolder + "Antimicrobial17.txt")[0]
3807 bcin8prots = parsehmmoutput(25,hmmoutputfolder + "Lactococcin_972.txt")[0]
3808 bcin9prots = parsehmmoutput(27,hmmoutputfolder + "Bacteriocin_IIc.txt")[0]
3809 bcin10prots = parsehmmoutput(78,hmmoutputfolder + "LcnG-beta.txt")[0]
3810 bcin11prots = parsehmmoutput(56,hmmoutputfolder + "Bacteriocin_IIi.txt")[0]
3811 bcin12prots = parsehmmoutput(98,hmmoutputfolder + "Subtilosin_A.txt")[0]
3812 bcin13prots = parsehmmoutput(27,hmmoutputfolder + "Cloacin.txt")[0]
3813 bcin14prots = parsehmmoutput(25,hmmoutputfolder + "Linocin_M18.txt")[0]
3814 bcin15prots = parsehmmoutput(150,hmmoutputfolder + "TIGR03603.txt")[0]
3815 bcin16prots = parsehmmoutput(440,hmmoutputfolder + "TIGR03604.txt")[0]
3816 bcin17prots = parsehmmoutput(200,hmmoutputfolder + "TIGR03605.txt")[0]
3817 bcin18prots = parsehmmoutput(18,hmmoutputfolder + "TIGR03651.txt")[0]
3818 bcin19prots = parsehmmoutput(35,hmmoutputfolder + "TIGR03678.txt")[0]
3819 bcin20prots = parsehmmoutput(400,hmmoutputfolder + "TIGR03693.txt")[0]
3820 bcin21prots = parsehmmoutput(16,hmmoutputfolder + "TIGR03798.txt")[0]
3821 bcin22prots = parsehmmoutput(150,hmmoutputfolder + "TIGR03882.txt")[0]
3822 bcin23prots = parsehmmoutput(50,hmmoutputfolder + "TIGR03601.txt")[0]
3823 bcin24prots = parsehmmoutput(50,hmmoutputfolder + "TIGR03602.txt")[0]
3824 bcin25prots = parsehmmoutput(20,hmmoutputfolder + "mvnA.txt")[0]
3825 bcin26prots = parsehmmoutput(20,hmmoutputfolder + "thiostrepton.txt")[0]
3826 bcin1scores = parsehmmoutput(50,hmmoutputfolder + "strepbact.txt")[1]
3827 bcin2scores = parsehmmoutput(90,hmmoutputfolder + "Antimicrobial14.txt")[1]
3828 bcin3scores = parsehmmoutput(23,hmmoutputfolder + "Bacteriocin_IId.txt")[1]
3829 bcin4scores = parsehmmoutput(92,hmmoutputfolder + "BacteriocIIc_cy.txt")[1]
3830 bcin5scores = parsehmmoutput(40,hmmoutputfolder + "Bacteriocin_II.txt")[1]
3831 bcin6scores = parsehmmoutput(24,hmmoutputfolder + "Lactococcin.txt")[1]
3832 bcin7scores = parsehmmoutput(31,hmmoutputfolder + "Antimicrobial17.txt")[1]
3833 bcin8scores = parsehmmoutput(25,hmmoutputfolder + "Lactococcin_972.txt")[1]
3834 bcin9scores = parsehmmoutput(27,hmmoutputfolder + "Bacteriocin_IIc.txt")[1]
3835 bcin10scores = parsehmmoutput(78,hmmoutputfolder + "LcnG-beta.txt")[1]
3836 bcin11scores = parsehmmoutput(56,hmmoutputfolder + "Bacteriocin_IIi.txt")[1]
3837 bcin12scores = parsehmmoutput(98,hmmoutputfolder + "Subtilosin_A.txt")[1]
3838 bcin13scores = parsehmmoutput(27,hmmoutputfolder + "Cloacin.txt")[1]
3839 bcin14scores = parsehmmoutput(25,hmmoutputfolder + "Linocin_M18.txt")[1]
3840 bcin15scores = parsehmmoutput(150,hmmoutputfolder + "TIGR03603.txt")[1]
3841 bcin16scores = parsehmmoutput(440,hmmoutputfolder + "TIGR03604.txt")[1]
3842 bcin17scores = parsehmmoutput(200,hmmoutputfolder + "TIGR03605.txt")[1]
3843 bcin18scores = parsehmmoutput(18,hmmoutputfolder + "TIGR03651.txt")[1]
3844 bcin19scores = parsehmmoutput(35,hmmoutputfolder + "TIGR03678.txt")[1]
3845 bcin20scores = parsehmmoutput(400,hmmoutputfolder + "TIGR03693.txt")[1]
3846 bcin21scores = parsehmmoutput(16,hmmoutputfolder + "TIGR03798.txt")[1]
3847 bcin22scores = parsehmmoutput(150,hmmoutputfolder + "TIGR03882.txt")[1]
3848 bcin23scores = parsehmmoutput(50,hmmoutputfolder + "TIGR03601.txt")[1]
3849 bcin24scores = parsehmmoutput(50,hmmoutputfolder + "TIGR03602.txt")[1]
3850 bcin25scores = parsehmmoutput(20,hmmoutputfolder + "mvnA.txt")[1]
3851 bcin26scores = parsehmmoutput(20,hmmoutputfolder + "thiostrepton.txt")[1]
3852 bcinprots = bcin1prots + bcin2prots + bcin3prots + bcin4prots + bcin5prots + bcin6prots + bcin7prots + bcin8prots + bcin9prots + bcin10prots + bcin11prots + bcin12prots + bcin13prots + bcin14prots + bcin15prots + bcin16prots + bcin17prots + bcin18prots + bcin19prots + bcin20prots + bcin21prots + bcin22prots + bcin23prots + bcin24prots + bcin25prots + bcin26prots
# Record every bacteriocin-related HMM hit in detecteddomainsdict and reduce
# bcinprots to its unique members (first-seen order is kept).
#
# bcinprots is the plain concatenation of the 26 per-model hit lists, so a
# protein matched by several models occurs in it several times. Each protein
# is therefore annotated only on its first occurrence; annotating it on every
# occurrence would append the same [label, score] pairs to its entry again
# for each additional model it matched.
#
# Each table row is (hit list, parallel score list, label stored in
# detecteddomainsdict). Rows are in model order, which is also the order in
# which the labels are appended for one protein.
# NOTE(review): "TGIR03604" / "TGIR03605" look like typos for "TIGR..." (the
# HMM result files are named TIGR03604.txt / TIGR03605.txt). The labels are
# left unchanged here in case the text is matched elsewhere in the file.
bcin_models = [
    (bcin1prots, bcin1scores, "Putative Streptomyces bacteriocin"),
    (bcin2prots, bcin2scores, "Antimicrobial14 domain"),
    (bcin3prots, bcin3scores, "Bacteriocin_IId domain"),
    (bcin4prots, bcin4scores, "BacteriocIIc_cy domain"),
    (bcin5prots, bcin5scores, "Bacteriocin_II domain"),
    (bcin6prots, bcin6scores, "Lactococcin"),
    (bcin7prots, bcin7scores, "Antimicrobial17 domain"),
    (bcin8prots, bcin8scores, "Lactococcin_972 domain"),
    (bcin9prots, bcin9scores, "Bacteriocin_IIc domain"),
    (bcin10prots, bcin10scores, "LcnG-beta domain"),
    (bcin11prots, bcin11scores, "Bacteriocin_IIi domain"),
    (bcin12prots, bcin12scores, "Subtilosin_A domain"),
    (bcin13prots, bcin13scores, "Cloacin domain"),
    (bcin14prots, bcin14scores, "Linocin_M18 domain"),
    (bcin15prots, bcin15scores, "TIGR03603: bacteriocin biosynthesis cyclodehydratase"),
    (bcin16prots, bcin16scores, "TGIR03604: bacteriocin biosynthesis docking scaffold"),
    (bcin17prots, bcin17scores, "TGIR03605: SagB-type dehydrogenase"),
    (bcin18prots, bcin18scores, "TIGR03651: bacteriocin, circularin A/uberolysin family"),
    (bcin19prots, bcin19scores, "TIGR03678: bacteriocin, microcyclamide/patellamide family"),
    (bcin20prots, bcin20scores, "TIGR03693: thiazole-containing bacteriocin maturation protein"),
    (bcin21prots, bcin21scores, "TIGR03798: bacteriocin propeptide"),
    (bcin22prots, bcin22scores, "TIGR03882: bacteriocin biosynthesis cyclodehydratase"),
    (bcin23prots, bcin23scores, "TIGR03601: bacteriocin, BA_2677 family"),
    (bcin24prots, bcin24scores, "TIGR03602: bacteriocin protoxin, streptolysin S family"),
    (bcin25prots, bcin25scores, "Bacteriocin, microviridin family"),
    (bcin26prots, bcin26scores, "Thiopeptide, thiostrepton-like"),
]
bcinprots2 = []
for i in bcinprots:
    if i in bcinprots2:
        # Already annotated on an earlier occurrence; skip the repeat.
        continue
    bcinprots2.append(i)
    for model_prots, model_scores, model_label in bcin_models:
        if i in model_prots:
            # setdefault creates the per-protein list on first use and
            # returns the existing one otherwise.
            detecteddomainsdict.setdefault(i, []).append(
                [model_label, model_scores[model_prots.index(i)]])
bcinprots = bcinprots2
# Beta-lactam section: gather the proteins reported by parsehmmoutput for the
# BLS (cut-off 250), CAS (250) and tabtoxin (500) result files into
# lactamprots, and store a [label, score] pair for every hit in
# detecteddomainsdict. Only runs when cluster type 1 or 9 was requested;
# otherwise lactamprots stays empty.
lactamprots = []
if 1 in geneclustertypes or 9 in geneclustertypes:
    bls = parsehmmoutput(250, hmmoutputfolder + "BLS.txt")
    blsprots, blsscores = bls[0], bls[1]
    for i in blsprots:
        lactamprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["Beta-lactam synthase", blsscores[blsprots.index(i)]])

    cas = parsehmmoutput(250, hmmoutputfolder + "CAS.txt")
    casprots, casscores = cas[0], cas[1]
    for i in casprots:
        # Proteins already collected from an earlier model are not re-added.
        if i not in lactamprots:
            lactamprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["Clavulanic acid synthase-like", casscores[casprots.index(i)]])

    tabtoxin = parsehmmoutput(500, hmmoutputfolder + "tabtoxin.txt")
    tabtoxinprots, tabtoxinscores = tabtoxin[0], tabtoxin[1]
    for i in tabtoxinprots:
        if i not in lactamprots:
            lactamprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["Tabtoxin synthase-like", tabtoxinscores[tabtoxinprots.index(i)]])
# Aminoglycoside / aminocyclitol section (model set taken from Flatt & Mahmud
# et al. 2007). For each of the nine HMM result files the hits are appended to
# amglyccyclprots (no de-duplication is done here) and a [label, score] pair
# is stored per hit in detecteddomainsdict. Only runs when cluster type 1 or
# 10 was requested; otherwise amglyccyclprots stays empty.
amglyccyclprots = []
if 1 in geneclustertypes or 10 in geneclustertypes:
    strH = parsehmmoutput(200, hmmoutputfolder + "strH_like.txt")
    strhprots, strhscores = strH[0], strH[1]
    for i in strhprots:
        amglyccyclprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["StrH-like glycosyltransferase", strhscores[strhprots.index(i)]])

    strK1 = parsehmmoutput(800, hmmoutputfolder + "strK_like1.txt")
    strk1prots, strk1scores = strK1[0], strK1[1]
    for i in strk1prots:
        amglyccyclprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["StrK-like phosphatase", strk1scores[strk1prots.index(i)]])

    strK2 = parsehmmoutput(650, hmmoutputfolder + "strK_like2.txt")
    strk2prots, strk2scores = strK2[0], strK2[1]
    for i in strk2prots:
        amglyccyclprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["StrK-like phosphatase, model 2", strk2scores[strk2prots.index(i)]])

    neoL = parsehmmoutput(50, hmmoutputfolder + "neoL_like.txt")
    neolprots, neolscores = neoL[0], neoL[1]
    for i in neolprots:
        amglyccyclprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["NeoL-like deacetylase", neolscores[neolprots.index(i)]])

    DOIS = parsehmmoutput(500, hmmoutputfolder + "DOIS.txt")
    doisprots, doisscores = DOIS[0], DOIS[1]
    for i in doisprots:
        amglyccyclprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["2-deoxy-scyllo-inosose synthase", doisscores[doisprots.index(i)]])

    valA = parsehmmoutput(600, hmmoutputfolder + "valA_like.txt")
    valaprots, valascores = valA[0], valA[1]
    for i in valaprots:
        amglyccyclprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["2-epi-5-epi-valiolone synthase, ValA-like", valascores[valaprots.index(i)]])

    spcFG = parsehmmoutput(200, hmmoutputfolder + "spcFG_like.txt")
    spcfgprots, spcfgscores = spcFG[0], spcFG[1]
    for i in spcfgprots:
        amglyccyclprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["SpcF/SpcG-like glycosyltransferase", spcfgscores[spcfgprots.index(i)]])

    spcDK_glyc = parsehmmoutput(600, hmmoutputfolder + "spcDK_like_glyc.txt")
    spcdkglycprots, spcdkglycscores = spcDK_glyc[0], spcDK_glyc[1]
    for i in spcdkglycprots:
        amglyccyclprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["SpcD/SpcK-like thymidylyltransferase", spcdkglycscores[spcdkglycprots.index(i)]])

    salQ = parsehmmoutput(480, hmmoutputfolder + "salQ.txt")
    salqprots, salqscores = salQ[0], salQ[1]
    for i in salqprots:
        amglyccyclprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["2-epi-5-epi-valiolone synthase, SalQ-like", salqscores[salqprots.index(i)]])
# Aminocoumarin section: hits from the novK, novJ, novI, novH and
# spcDK_like_cou result files are appended to aminocoumarinprots (duplicates
# are not filtered) and each hit gets a [label, score] pair in
# detecteddomainsdict. Only runs when cluster type 1 or 11 was requested;
# otherwise aminocoumarinprots stays empty.
aminocoumarinprots = []
if 1 in geneclustertypes or 11 in geneclustertypes:
    novK = parsehmmoutput(200, hmmoutputfolder + "novK.txt")
    novkprots, novkscores = novK[0], novK[1]
    for i in novkprots:
        aminocoumarinprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["NovK-like reductase", novkscores[novkprots.index(i)]])

    novJ = parsehmmoutput(350, hmmoutputfolder + "novJ.txt")
    novjprots, novjscores = novJ[0], novJ[1]
    for i in novjprots:
        aminocoumarinprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["NovJ-like reductase", novjscores[novjprots.index(i)]])

    novI = parsehmmoutput(600, hmmoutputfolder + "novI.txt")
    noviprots, noviscores = novI[0], novI[1]
    for i in noviprots:
        aminocoumarinprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["NovI-like cytochrome P450", noviscores[noviprots.index(i)]])

    novH = parsehmmoutput(750, hmmoutputfolder + "novH.txt")
    novhprots, novhscores = novH[0], novH[1]
    for i in novhprots:
        aminocoumarinprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["NovH-like protein", novhscores[novhprots.index(i)]])

    spcDK_like_cou = parsehmmoutput(600, hmmoutputfolder + "spcDK_like_cou.txt")
    spcDK_like_cou_prots = spcDK_like_cou[0]
    spcDK_like_cou_scores = spcDK_like_cou[1]
    for i in spcDK_like_cou_prots:
        aminocoumarinprots.append(i)
        detecteddomainsdict.setdefault(i, []).append(
            ["SpcD/SpcK-like thymidylyltransferase, aminocoumarins group",
             spcDK_like_cou_scores[spcDK_like_cou_prots.index(i)]])
# Siderophore section: siderophoreprots becomes the hit list of the
# IucA_IucC result file (the only model parsed in this section) and every hit
# gets an "IucA-IucC domain" [label, score] pair in detecteddomainsdict.
# Only runs when cluster type 1 or 12 was requested; otherwise
# siderophoreprots stays empty.
siderophoreprots = []
if 1 in geneclustertypes or 12 in geneclustertypes:
    siderophore = parsehmmoutput(30, hmmoutputfolder + "IucA_IucC.txt")
    siderophoreprots, siderophorescores = siderophore[0], siderophore[1]
    for i in siderophoreprots:
        detecteddomainsdict.setdefault(i, []).append(
            ["IucA-IucC domain", siderophorescores[siderophoreprots.index(i)]])
# Ectoine section: ectprots becomes the hit list of the ectoine_synt result
# file and every hit gets an "Ectoine synthase" [label, score] pair in
# detecteddomainsdict. Only runs when cluster type 1 or 13 was requested;
# otherwise ectprots stays empty.
ectprots = []
if 1 in geneclustertypes or 13 in geneclustertypes:
    ect = parsehmmoutput(35, hmmoutputfolder + "ectoine_synt.txt")
    ectprots, ectscores = ect[0], ect[1]
    for i in ectprots:
        detecteddomainsdict.setdefault(i, []).append(
            ["Ectoine synthase", ectscores[ectprots.index(i)]])
# Butyrolactone section: butyrprots becomes the hit list of the AfsA result
# file and every hit gets an "AfsA butyrolactone synthesis domain"
# [label, score] pair in detecteddomainsdict. Only runs when cluster type 1
# or 14 was requested; otherwise butyrprots stays empty.
butyrprots = []
if 1 in geneclustertypes or 14 in geneclustertypes:
    butyr = parsehmmoutput(25, hmmoutputfolder + "AfsA.txt")
    butyrprots, butyrscores = butyr[0], butyr[1]
    for i in butyrprots:
        detecteddomainsdict.setdefault(i, []).append(
            ["AfsA butyrolactone synthesis domain", butyrscores[butyrprots.index(i)]])
# Indole section: indoleprots becomes the hit list of the indsynth result
# file and every hit gets a "StaD-like chromopyrrolic acid synthase domain"
# [label, score] pair in detecteddomainsdict. Only runs when cluster type 1
# or 15 was requested; otherwise indoleprots stays empty.
indoleprots = []
if 1 in geneclustertypes or 15 in geneclustertypes:
    indole = parsehmmoutput(100, hmmoutputfolder + "indsynth.txt")
    indoleprots, indolescores = indole[0], indole[1]
    for i in indoleprots:
        detecteddomainsdict.setdefault(i, []).append(
            ["StaD-like chromopyrrolic acid synthase domain",
             indolescores[indoleprots.index(i)]])
# Nucleoside antibiotic section: all nine HMM result files are parsed first,
# then their hits are merged (without duplicates, in model order) into
# nucleoprots and a [label, score] pair is stored per hit in
# detecteddomainsdict. Only runs when cluster type 1 or 16 was requested;
# otherwise nucleoprots stays empty.
nucleoprots = []
if 1 in geneclustertypes or 16 in geneclustertypes:
    lipm = parsehmmoutput(50, hmmoutputfolder + "LipM.txt")
    lipmprots, lipmscores = lipm[0], lipm[1]
    lipu = parsehmmoutput(30, hmmoutputfolder + "LipU.txt")
    lipuprots, lipuscores = lipu[0], lipu[1]
    lipv = parsehmmoutput(375, hmmoutputfolder + "LipV.txt")
    lipvprots, lipvscores = lipv[0], lipv[1]
    toyb = parsehmmoutput(175, hmmoutputfolder + "ToyB.txt")
    toybprots, toybscores = toyb[0], toyb[1]
    tund = parsehmmoutput(200, hmmoutputfolder + "TunD.txt")
    tundprots, tundscores = tund[0], tund[1]
    pur6 = parsehmmoutput(200, hmmoutputfolder + "pur6.txt")
    pur6prots, pur6scores = pur6[0], pur6[1]
    pur10 = parsehmmoutput(600, hmmoutputfolder + "pur10.txt")
    pur10prots, pur10scores = pur10[0], pur10[1]
    nikj = parsehmmoutput(200, hmmoutputfolder + "nikJ.txt")
    nikjprots, nikjscores = nikj[0], nikj[1]
    niko = parsehmmoutput(400, hmmoutputfolder + "nikO.txt")
    nikoprots, nikoscores = niko[0], niko[1]

    # One row per model: (hit list, parallel score list, stored label).
    nucleo_models = [
        (lipmprots, lipmscores, "LipM-like nucleotidyltransferase"),
        (lipuprots, lipuscores, "LipU-like protein"),
        (lipvprots, lipvscores, "LipV-like dehydrogenase"),
        (toybprots, toybscores, "ToyB-like synthase"),
        (tundprots, tundscores, "TunD-like putative N-acetylglucosamine transferase"),
        (pur6prots, pur6scores, "Pur6-like synthetase"),
        (pur10prots, pur10scores, "Pur10-like oxidoreductase"),
        (nikjprots, nikjscores, "NikJ-like protein"),
        (nikoprots, nikoscores, "NikO-like enolpyruvyl transferase"),
    ]
    for model_prots, model_scores, model_label in nucleo_models:
        for i in model_prots:
            if i not in nucleoprots:
                nucleoprots.append(i)
            detecteddomainsdict.setdefault(i, []).append(
                [model_label, model_scores[model_prots.index(i)]])
# Phosphoglycolipid section: phosphoprots becomes the hit list of the MoeO5
# result file and every hit gets a "MoeO5-like prenyl-3-phosphoglycerate
# synthase" [label, score] pair in detecteddomainsdict. Only runs when
# cluster type 1 or 17 was requested; otherwise phosphoprots stays empty.
phosphoprots = []
if 1 in geneclustertypes or 17 in geneclustertypes:
    phosphogl = parsehmmoutput(65, hmmoutputfolder + "MoeO5.txt")
    phosphoprots, phosphoscores = phosphogl[0], phosphogl[1]
    for i in phosphoprots:
        detecteddomainsdict.setdefault(i, []).append(
            ["MoeO5-like prenyl-3-phosphoglycerate synthase",
             phosphoscores[phosphoprots.index(i)]])
# Melanin section: melaninprots becomes the hit list of the melC result file
# and every hit gets a "MelC-like melanin synthase" [label, score] pair in
# detecteddomainsdict. Only runs when cluster type 1 or 18 was requested;
# otherwise melaninprots stays empty.
melaninprots = []
if 1 in geneclustertypes or 18 in geneclustertypes:
    melanin = parsehmmoutput(40, hmmoutputfolder + "melC.txt")
    melaninprots, melaninscores = melanin[0], melanin[1]
    for i in melaninprots:
        detecteddomainsdict.setdefault(i, []).append(
            ["MelC-like melanin synthase", melaninscores[melaninprots.index(i)]])
4455 #Extract other putative secondary metabolite biosynthesis proteins
4456 otherprots = []
4457 amp_t_prots = []
4458 if 1 in geneclustertypes or 19 in geneclustertypes:
# Detect stand-alone adenylation proteins: proteins with an adenylation
# domain (AMP-binding or A-OX hit) combined with either a PP-binding domain
# (and no condensation domain) or an NAD_binding_4 domain. Qualifying proteins
# that are not in aminocoumarinprots are appended to otherprots and
# amp_t_prots, and their domains are recorded in detecteddomainsdict.
pptb = parsehmmoutput(20, hmmoutputfolder + "PP-binding.txt")
pptbprots, pptbscores = pptb[0], pptb[1]
cond = parsehmmoutput(20, hmmoutputfolder + "Condensation.txt")
amp = parsehmmoutput(20, hmmoutputfolder + "AMP-binding.txt")
ampprots, ampscores = amp[0], amp[1]
ampox = parsehmmoutput(50, hmmoutputfolder + "A-OX.txt")
ampoxprots, ampoxscores = ampox[0], ampox[1]
nad4 = parsehmmoutput(40, hmmoutputfolder + "NAD_binding_4.txt")
nad4prots, nad4scores = nad4[0], nad4[1]
cprots = cond[0]

# aprots = union of AMP-binding and A-OX hits. It must be a copy: extending
# the AMP-binding hit list itself would leave ampprots longer than ampscores,
# so an A-OX-only protein would be looked up in ampscores (IndexError) and
# never be labelled as "Adenylation domain with integrated oxidase".
aprots = list(ampprots)
for i in ampoxprots:
    if i not in aprots:
        aprots.append(i)

# Proteins carrying both a condensation and an adenylation domain; these are
# excluded from the PP-binding loop below.
nrpsprots2 = []
for i in cprots:
    if i in aprots:
        nrpsprots2.append(i)

tprots = pptb[0]
for i in tprots:
    if i in aprots and i not in nrpsprots2 and i not in aminocoumarinprots:
        otherprots.append(i)
        amp_t_prots.append(i)
        detdomlist = detecteddomainsdict.setdefault(i, [])
        detdomlist.append(["PP-binding domain", pptbscores[pptbprots.index(i)]])
        # i is in aprots, so exactly one of the two branches applies;
        # a plain AMP-binding hit takes precedence over an A-OX hit.
        if i in ampprots:
            detdomlist.append(["Adenylation domain", ampscores[ampprots.index(i)]])
        elif i in ampoxprots:
            detdomlist.append(["Adenylation domain with integrated oxidase",
                               ampoxscores[ampoxprots.index(i)]])

for i in nad4prots:
    if i in aprots and i not in aminocoumarinprots:
        otherprots.append(i)
        amp_t_prots.append(i)
        detdomlist = detecteddomainsdict.setdefault(i, [])
        detdomlist.append(["NAD-binding domain 4", nad4scores[nad4prots.index(i)]])
        if i in ampprots:
            detdomlist.append(["Adenylation domain", ampscores[ampprots.index(i)]])
        elif i in ampoxprots:
            detdomlist.append(["Adenylation domain with integrated oxidase",
                               ampoxscores[ampoxprots.index(i)]])
def _record_other_hits(prots, scores, label):
    """Register every protein in prots as an 'other' signature protein.

    Each protein is added to otherprots (once) and a [label, score] pair is
    appended to its entry in detecteddomainsdict, creating the entry when
    needed. The score is looked up through the first position of the protein
    in prots.
    """
    for prot in prots:
        if prot not in otherprots:
            otherprots.append(prot)
        detecteddomainsdict.setdefault(prot, []).append([label, scores[prots.index(prot)]])


# Single-HMM signatures. The first argument of parsehmmoutput is passed
# through unchanged (presumably a score cutoff -- confirm against
# parsehmmoutput); element 0 of its result is used as the protein list and
# element 1 as the matching scores.
lmbu = parsehmmoutput(50, hmmoutputfolder + "LmbU.txt")
lmbuprots = lmbu[0]
lmbuscores = lmbu[1]
_record_other_hits(lmbuprots, lmbuscores, "LmbU-like protein")

goadsporin = parsehmmoutput(500, hmmoutputfolder + "goadsporin_like.txt")
goadsporinprots = goadsporin[0]
goadsporinscores = goadsporin[1]
_record_other_hits(goadsporinprots, goadsporinscores, "Goadsporin-like protein")

neocarzinostat = parsehmmoutput(28, hmmoutputfolder + "Neocarzinostat.txt")
neocarzinostatprots = neocarzinostat[0]
neocarzinostatscores = neocarzinostat[1]
_record_other_hits(neocarzinostatprots, neocarzinostatscores, "Neocarzinostatin-like protein")

cyanobactin = parsehmmoutput(80, hmmoutputfolder + "cyanobactin_synth.txt")
cyanobactinprots = cyanobactin[0]
cyanobactinscores = cyanobactin[1]
_record_other_hits(cyanobactinprots, cyanobactinscores, "Cyanobactin protease")

cycdipeptide = parsehmmoutput(110, hmmoutputfolder + "cycdipepsynth.txt")
cycdipeptideprots = cycdipeptide[0]
cycdipeptidescores = cycdipeptide[1]
_record_other_hits(cycdipeptideprots, cycdipeptidescores, "Cyclodipeptide synthase")

fom1 = parsehmmoutput(750, hmmoutputfolder + "fom1.txt")
fom1prots = fom1[0]
fom1scores = fom1[1]
_record_other_hits(fom1prots, fom1scores, "Fom1-like phosphomutase")

bcpb = parsehmmoutput(400, hmmoutputfolder + "bcpB.txt")
bcpbprots = bcpb[0]
bcpbscores = bcpb[1]
_record_other_hits(bcpbprots, bcpbscores, "BcpB-like phosphomutase")

frbd = parsehmmoutput(350, hmmoutputfolder + "frbD.txt")
frbdprots = frbd[0]
frbdscores = frbd[1]
_record_other_hits(frbdprots, frbdscores, "FrbD-like phosphomutase")

mite = parsehmmoutput(400, hmmoutputfolder + "mitE.txt")
miteprots = mite[0]
mitescores = mite[1]
_record_other_hits(miteprots, mitescores, "MitE-like CoA-ligase")

vlmb = parsehmmoutput(250, hmmoutputfolder + "vlmB.txt")
vlmbprots = vlmb[0]
vlmbscores = vlmb[1]
_record_other_hits(vlmbprots, vlmbscores, "Valanimycin biosynthesis VlmB domain")

prnb = parsehmmoutput(200, hmmoutputfolder + "prnB.txt")
prnbprots = prnb[0]
prnbscores = prnb[1]
_record_other_hits(prnbprots, prnbscores, "Pyrrolnitrin biosynthesis PrnB domain")
# Throw away detections for classes that were not requested. A class is kept
# when its own code or code 1 is present in geneclustertypes (code 1
# presumably means "all types" -- confirm against the option parsing).
allrequested = 1 in geneclustertypes
if not allrequested and 5 not in geneclustertypes:
    nrpsprots = []
if not allrequested and 4 not in geneclustertypes:
    t3pksprots = []
if not allrequested and 3 not in geneclustertypes:
    t2pksprots = []
if not allrequested and 2 not in geneclustertypes:
    # Code 2 covers the type I, type IV and trans-AT PKS lists together.
    t1pksprots = []
    t4pksprots = []
    transatpksprots = []
# Assemble all core secondary metabolite proteins: the union of every
# per-class list, without duplicates, sorted.
allsecmetprots = []
for protgroup in [t1pksprots, transatpksprots, t2pksprots, t3pksprots,
                  t4pksprots, nrpsprots, terpeneprots, lantprots, bcinprots,
                  lactamprots, amglyccyclprots, siderophoreprots, ectprots,
                  butyrprots, indoleprots, nucleoprots, phosphoprots,
                  melaninprots, aminocoumarinprots, otherprots]:
    for prot in protgroup:
        if prot not in allsecmetprots:
            allsecmetprots.append(prot)
allsecmetprots.sort()

# Nothing detected: report it in the log and on stderr, then stop with exit
# status 1.
if not allsecmetprots:
    logfile.write("No secondary metabolite biosynthesis gene clusters detected in this nucleotide file.\n")
    logfile.close()
    sys.stderr.write("No secondary metabolite biosynthesis gene clusters detected in this nucleotide file.\n")
    sys.exit(1)
4727
elapsed = time.time() - starttime

# Extract approximate gene clusters based on hmmsearch results, create list of
# core PKS / NRPS genes for further analysis (less strict parameters are used
# here than in gene cluster detection, to include all PKS/NRPS domains).
# Create nucleotide fasta files with sec met gene clusters.
logfile.write("Extracting gene clusters from gbk/embl file using detected signature genes...\n")
clusterblastdir = genomename + "/clusterblast/"
fastafile = open(clusterblastdir + "geneclusterprots.fasta", "w")
txtfile = open(clusterblastdir + "geneclusters.txt", "w")

# Spreadsheet with one bold header row; the homology column only exists when
# clusterblast is switched on.
wb = Workbook()
font1 = Font()
font1.bold = True
style1 = XFStyle()
style1.font = font1
ws0 = wb.add_sheet('0')
headertitles = ["Input accession number", "Input name", "Gene cluster type", "Gene cluster genes"]
if clusterblast == "y":
    headertitles.append("Compound with gene cluster of highest homology")
for column, title in enumerate(headertitles):
    ws0.write(0, column, title, style1)

protcodes = allsecmetprots
nuccode = genomename
# Read the complete annotated input file; carriage returns are turned into
# newlines so the text can be split on "\n" further down.
# NOTE(review): gbkfile is deliberately left open here, as in the original;
# confirm that nothing later still needs the handle before closing it.
gbkfile = open(infile, "r")
output = gbkfile.read()
output = output.replace("\r", "\n")

# Extract description of nucleotide from gbk/embl file.
# nucname always gets a value: the placeholder is kept when the file name
# matches neither extension family or the expected header field is missing
# (previously an unmatched extension left nucname undefined).
nucname = "input_nucleotide"
infile_lowered = infile.lower()
description = None
if ".gb" in infile_lowered or ".genbank" in infile_lowered:
    # GenBank: the text between "DEFINITION " and "ACCESSION ", joined into
    # one line.
    fields = output.split("ACCESSION ")[0].split("DEFINITION ")
    if len(fields) > 1:
        description = fields[1].replace("\n", "")
elif ".emb" in infile_lowered:
    # EMBL: the remainder of the line after the first "DE ".
    fields = output.split("DE ")
    if len(fields) > 1:
        description = fields[1].split("\n")[0]
if description is not None:
    # Collapse runs of spaces into one. The pattern must be a double space:
    # replacing a single space by a single space would loop forever.
    while "  " in description:
        description = description.replace("  ", " ")
    nucname = description
genelist = proteins[2]
genedict = proteins[3]

# Save all locations of query proteins on the nucleotide in a list. Start is
# the smaller and end the larger of the two stored coordinates.
protstartlocations = []
protendlocations = []
for protcode in protcodes:
    if protcode not in genelist:
        continue
    coordinates = [int(genedict[protcode][0]), int(genedict[protcode][1])]
    protstart_abs = min(coordinates)
    protend_abs = max(coordinates)
    protstartlocations.append(protstart_abs)
    protendlocations.append(protend_abs)

# Identify clusters of genes based on protein locations on the nucleotide:
# walk over the sorted starts and open a new cluster whenever the gap to the
# previous sorted end is larger than 20000.
clusterstarts = []
clusterends = []
protstartlocations.sort()
protendlocations.sort()
nrlocations = len(protstartlocations)
for position, genestart in enumerate(protstartlocations):
    if position == 0:
        clusterstarts.append(str(genestart))
        if len(protendlocations) == 1:
            # Only one signature gene: it is a cluster on its own.
            clusterends.append(protendlocations[0])
        continue
    previousend = protendlocations[position - 1]
    if position == nrlocations - 1:
        # Last gene: always closes the final cluster.
        if genestart < previousend + 20000:
            clusterends.append(str(protendlocations[position]))
        else:
            clusterends.append(str(previousend))
            clusterstarts.append(str(genestart))
            clusterends.append(str(protendlocations[position]))
    elif genestart > previousend + 20000:
        clusterends.append(str(previousend))
        clusterstarts.append(str(genestart))
if protstartlocations:
    lastendlocation = protstartlocations[-1]
# Extend clusters with 20kb on each side of the identified core genes; the
# start is clamped at 0 (the end is not clamped here).
clusterstarts2 = [max(0, int(rawstart) - 20000) for rawstart in clusterstarts]
clusterstarts = clusterstarts2
clusterends2 = [int(rawend) + 20000 for rawend in clusterends]
clusterends = clusterends2
# For each secondary metabolite gene cluster: collect its genes, determine the
# cluster type, write the proteins to fasta and the summary to txt / xls, and
# store everything in clusterinfo.


def _gene_touches_region(genestart, geneend, regionstart, regionend):
    """Return True when either gene boundary lies strictly inside the region."""
    return (regionstart < genestart < regionend) or (regionstart < geneend < regionend)


# (signature protein list, type label) in the order in which labels are
# appended to a combined type name. Type IV PKS hits are labelled "t1pks",
# exactly as before.
clustertypetable = [
    (t1pksprots, "t1pks"),
    (transatpksprots, "transatpks"),
    (t2pksprots, "t2pks"),
    (t3pksprots, "t3pks"),
    (t4pksprots, "t1pks"),
    (nrpsprots, "nrps"),
    (terpeneprots, "terpene"),
    (lantprots, "lant"),
    (bcinprots, "bcin"),
    (lactamprots, "blactam"),
    (amglyccyclprots, "amglyccycl"),
    (siderophoreprots, "siderophore"),
    (ectprots, "ectoine"),
    (indoleprots, "indole"),
    (nucleoprots, "nucleoside"),
    (phosphoprots, "phosphoglycolipid"),
    (butyrprots, "butyrolactone"),
    (melaninprots, "melanin"),
    (aminocoumarinprots, "aminocoumarin"),
]

# Amount removed from each (non-boundary) side of clusters whose type is one
# of these typically short single-class types.
shortclustertrim = {
    "t3pks": 5000, "t2pks": 5000,
    "bcin": 10000, "siderophore": 10000, "lant": 10000, "terpene": 10000,
    "butyrolactone": 17000, "melanin": 17000, "ectoine": 17000,
}

a = 0
clusterinfo = {}
geneclusters = []
geneclustergenes = []
allcoregenes = []

if clusterstarts:
    # Done once, before the first cluster: replace the last field of every
    # gene record by the gene identifier, so record[4] can be used as the id.
    for geneid in genelist:
        geneinfo = genedict[geneid][:-1]
        geneinfo.append(geneid)
        genedict[geneid] = geneinfo

for rawstart in clusterstarts:
    cstart = int(rawstart)
    cend = int(clusterends[a])
    a += 1
    clusternr = a
    geneclusters.append(clusternr)

    # Genes with at least one boundary inside the cluster region.
    clustergenes = []
    for geneid in genelist:
        geneinfo = genedict[geneid]
        if _gene_touches_region(int(geneinfo[0]), int(geneinfo[1]), cstart, cend):
            clustergenes.append(geneinfo)

    # Determine type of cluster: the first matching label replaces "other",
    # further labels are appended with "-" unless already contained in the
    # name. (The former stripping of a leading "other-" could never apply,
    # because the first label overwrites "other", and has been removed.)
    # NOTE: the name `type` shadows the builtin but is kept because other
    # parts of the script assign and read the same module-level name.
    type = "other"
    z = 0
    for geneinfo in clustergenes:
        geneid = geneinfo[4]
        for typeprots, label in clustertypetable:
            if geneid not in typeprots:
                continue
            if z == 0:
                type = label
            elif label not in type:
                type = type + "-" + label
            z = 1

    # Shorten gene cluster if type is among typically short gene cluster types
    if cend > dnaseqlength:
        cend = dnaseqlength
    trim = shortclustertrim.get(type)
    if trim is not None:
        if cstart != 0:
            cstart = cstart + trim
        if cend != dnaseqlength:
            cend = cend - trim
        clustergenes = [geneinfo for geneinfo in clustergenes
                        if _gene_touches_region(int(geneinfo[0]), int(geneinfo[1]), cstart, cend)]

    # For all clustergenes, write info to fasta
    for geneinfo in clustergenes:
        start = str(geneinfo[0])
        end = str(geneinfo[1])
        strand = geneinfo[2]
        accession = geneinfo[4]
        seq = seqdict[accession]
        ann = geneinfo[3].replace(" ", "_")
        name = nuccode + "|c" + str(a) + "|" + start + "-" + end + "|" + strand + "|" + accession + "|" + ann
        fastafile.write(">" + name + "\n" + seq + "\n")
        if accession not in geneclustergenes:
            geneclustergenes.append(accession)

    # Write gene cluster info to separate txt file: two ";"-terminated lists,
    # the gene ids and the matching accessiondict entries.
    clustergeneids = [geneinfo[4] for geneinfo in clustergenes]
    txtfile.write(nuccode + "\t" + nucname + "\t" + "c" + str(a) + "\t" + type + "\t")
    txtfile.write("".join([geneid + ";" for geneid in clustergeneids]))
    txtfile.write("\t")
    txtfile.write("".join([accessiondict[geneid] + ";" for geneid in clustergeneids]))
    txtfile.write("\n")

    # Same information in the spreadsheet. A cell that cannot hold the value
    # gets a pointer to the txt file instead; only ordinary errors are caught
    # so that Ctrl-C / SystemExit are no longer swallowed.
    ws0.write(a, 0, genomic_accnr)
    try:
        ws0.write(a, 1, nucname)
    except Exception:
        ws0.write(a, 1, "Name to long to be contained in Excel cell; see txt file in downloadable zip archive.")
    ws0.write(a, 2, type)
    xlsgenesfield = ";".join(clustergeneids)
    try:
        ws0.write(a, 3, xlsgenesfield)
    except Exception:
        ws0.write(a, 3, "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive.")

    # Write gene cluster info to clusterinfo dictionary
    coregenes = [geneid for geneid in clustergeneids if geneid in allsecmetprots]
    allcoregenes.extend(coregenes)
    clusterinfo[clusternr] = [type, cstart, cend, coregenes, clustergenes]

# Close fasta and txt files (the workbook is not saved here)
fastafile.close()
txtfile.close()
5057
# Analysis of core PKS/NRPS genes: detect subgroups and predict specificities
# and final products.
logfile.write("Analysing core PKS/NRPS genes...\n")

# Clusters whose type name mentions one of the PKS / NRPS labels.
pksnrpslabels = ("t1pks", "t4pks", "transatpks", "nrps")
pksnrpsgeneclusters = []
for clusternumber in geneclusters:
    clustertypename = clusterinfo[clusternumber][0]
    if any([label in clustertypename for label in pksnrpslabels]):
        pksnrpsgeneclusters.append(clusternumber)

# All PKS / NRPS signature proteins plus the stand-alone A-domain proteins.
pksnrpscoregenes = []
for protgroup in (t1pksprots, transatpksprots, t4pksprots, nrpsprots, amp_t_prots):
    pksnrpscoregenes.extend(protgroup)

# Going through a dictionary keyed by gene removes duplicates; the genes are
# then ordered by their first stored coordinate.
pksnrpsgenestartdict = dict([(geneid, int(genedict[geneid][0])) for geneid in pksnrpscoregenes])
pksnrpscoregenes = sortdictkeysbyvalues(pksnrpsgenestartdict)
nrpsnames = []
nrpsseqs = []
pksnrpsnames = []
pksnrpsseqs = []
pksnames = []
pksseqs = []
calnames = []
calseqs = []
krnames = []
krseqs = []
nrpspkstypedict = {}
domaindict = {}


def _classify_nrpspks_protein(domainlist, nrKSdomains, modKSscore, traKSscore, eneKSscore, iterKSscore):
    """Return the NRPS/PKS type label for one protein.

    domainlist  -- names of the domains detected in the protein
    nrKSdomains -- number of "PKS_KS" domains among them
    *KSscore    -- number of KS hits of the modular, trans-AT, enediyne and
                   iterative kind

    An empty domainlist yields "PKS/NRPS-like protein".
    """
    has_c = ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist
             or "Condensation_Starter" in domainlist or "Cglyc" in domainlist
             or "Condensation_Dual" in domainlist)
    has_a = "AMP-binding" in domainlist
    has_ks = "PKS_KS" in domainlist
    has_at = "PKS_AT" in domainlist
    nrps_only = not has_ks and not has_at
    pks_only = not has_c and not has_a

    if "Cglyc" in domainlist and "Epimerization" in domainlist and has_a and nrps_only:
        return "Glycopeptide NRPS"
    if has_c and has_a and nrps_only:
        return "NRPS"
    # Fixed grouping: a hybrid needs an NRPS-type domain AND a PKS-type
    # domain. Without the parentheses every protein with a condensation
    # domain ended up here, whether or not it had any PKS domain.
    if (has_c or has_a) and (has_ks or has_at):
        return "Hybrid PKS-NRPS"
    if (pks_only and has_ks and not has_at and "Trans-AT_docking" in domainlist
            and traKSscore > modKSscore and traKSscore > iterKSscore and traKSscore > eneKSscore):
        return "Type I Trans-AT PKS"
    if pks_only and has_ks and has_at:
        if (iterKSscore > modKSscore and iterKSscore > traKSscore
                and iterKSscore > eneKSscore and nrKSdomains < 3):
            return "Type I Iterative PKS"
        if (eneKSscore > modKSscore and eneKSscore > traKSscore
                and eneKSscore > iterKSscore and nrKSdomains < 3):
            return "Type I Enediyne PKS"
        # NOTE(review): exactly 3 KS domains satisfies neither "< 3" above nor
        # "> 3" here; thresholds kept as they were -- confirm intent.
        if ((modKSscore > eneKSscore and modKSscore > traKSscore and modKSscore > iterKSscore)
                or nrKSdomains > 3):
            return "Type I Modular PKS"
        return "PKS-like protein"
    if (has_c or has_a) and nrps_only:
        return "NRPS-like protein"
    return "PKS/NRPS-like protein"


if len(pksnrpscoregenes) > 0:
    # Write PKS / NRPS core genes to FASTA file
    for geneid in pksnrpscoregenes:
        pksnrpsnames.append(geneid)
        pksnrpsseqs.append(seqdict[geneid])
    writefasta(pksnrpsnames,pksnrpsseqs,genomename + "/nrpspks_proteins.fasta")
    # Analyse for abMotifs
    hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 0.1 -o " + genomename + "/nrpspks/abmotifshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/abmotifshmm.txt "+ hmms_path +"abmotifs.hmm " + genomename + "/nrpspks_proteins.fasta"
    os.system(hmmsearch)
    mhmmlengthsdict = hmmlengths(hmms_path+"abmotifs.hmm")
    motifdict = hmmscanparse(genomename + "/nrpspks/abmotifshmm_output.txt",mhmmlengthsdict)
    # Analyse for C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains
    hmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/nrpspkshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/nrpspkshmm.txt "+ hmms_path +"nrpspksdomains.hmm " + genomename + "/nrpspks_proteins.fasta"
    os.system(hmmsearch)
    hmmlengthsdict = hmmlengths(hmms_path+"nrpspksdomains.hmm")
    domaindict = hmmscanparse(genomename + "/nrpspks/nrpspkshmm_output.txt",hmmlengthsdict)
    # Analyse KS domains & PKS/NRPS protein domain composition to detect NRPS/PKS types
    kshmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/kshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/kshmm.txt " + hmms_path + "ksdomains.hmm " + genomename + "/nrpspks_proteins.fasta"
    os.system(kshmmsearch)
    kshmmlengthsdict = hmmlengths(hmms_path+"ksdomains.hmm")
    ksdomaindict = hmmscanparse(genomename + "/nrpspks/kshmm_output.txt",kshmmlengthsdict)

    # structure of domaindict: domaindict[genename] = [[name,start,end,evalue,score],[name,start,end,evalue,score], etc.]
    # The type is computed exactly once per protein, also when no domain was
    # found, so a label from a previous protein can no longer leak in.
    for k in pksnrpscoregenes:
        domainlist = [domain[0] for domain in domaindict[k]]
        nrKSdomains = domainlist.count("PKS_KS")
        kshitnames = [kshit[0] for kshit in ksdomaindict[k]]
        modKSscore = kshitnames.count("Modular-KS")
        traKSscore = kshitnames.count("Trans-AT-KS")
        eneKSscore = kshitnames.count("Enediyne-KS")
        iterKSscore = kshitnames.count("Iterative-KS")
        nrpspkstypedict[k] = _classify_nrpspks_protein(
            domainlist, nrKSdomains, modKSscore, traKSscore, eneKSscore, iterKSscore)

    # Write data to output file; the file is closed even if a lookup fails.
    nrpspksdomainsfile = open(genomename + "/nrpspks/nrpspksdomains.txt","w")
    try:
        for k in pksnrpscoregenes:
            j = domaindict[k]
            l = motifdict[k]
            nrpspksdomainsfile.write(">> " + k + "\n")
            nrpspksdomainsfile.write(">> " + nrpspkstypedict[k] + "\n")
            nrpspksdomainsfile.write("name\tstart\tend\te-value\tscore\n")
            for i in j:
                nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (i[0], i[1], i[2], i[3], i[4]) )
            nrpspksdomainsfile.write("** Motifs: **\n")
            for i in l:
                nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (i[0], i[1], i[2], i[3], i[4]) )
            nrpspksdomainsfile.write("\n\n")
    finally:
        nrpspksdomainsfile.close()
5177
elapsed = time.time() - starttime

# Predict NRPS A domain specificities with NRPSPredictor and Minowa et al. method
logfile.write("Predicting NRPS A domain substrate specificities by NRPSPredictor\n")
# NRPSPredictor input: every AMP-binding / A-OX domain together with the 120
# residues that follow it (the END coordinate is extended, i.e. the extra
# residues lie C-terminal of the domain). Sequences are named
# <gene>_A<number>, numbered per gene.
for k in pksnrpscoregenes:
    nr = 0
    for domain in domaindict[k]:
        if domain[0] == "AMP-binding" or domain[0] == "A-OX":
            nr += 1
            start = int(domain[1])
            end = int(domain[2]) + 120
            nrpsnames.append(k + "_A" + str(nr))
            nrpsseqs.append(seqdict[k][start:end])

if len(nrpsnames) > 0:
    writefasta(nrpsnames,nrpsseqs,"NRPSPredictor2/nrpsseqs.fasta")
    # Windows gets its own command lines; every other platform uses the POSIX
    # ones. Comparing against the literal 'linux2' left the command variables
    # undefined on e.g. 'linux3' or 'darwin'.
    on_windows = sys.platform == 'win32'
    os.chdir("NRPSPredictor2/")
    try:
        # Get NRPSPredictor2 code predictions, output sig file for input for NRPSPredictor2 SVMs
        if on_windows:
            nrpspred2codecommand = 'nrpscodepred nrpsseqs.fasta input.sig nrpscodes.txt > nul'
        else:
            nrpspred2codecommand = 'python nrpscodepred.py nrpsseqs.fasta input.sig nrpscodes.txt > /dev/null'
        os.system(nrpspred2codecommand)
        # Run NRPSPredictor2 SVM
        currentdir = os.getcwd()
        if on_windows:
            nrpspred2command = 'java -Ddatadir="' + currentdir + '\\data" -cp build/NRPSpredictor2.jar;lib/java-getopt-1.0.13.jar;lib/Utilities.jar;lib/libsvm.jar org.roettig.NRPSpredictor2.NRPSpredictor2 -i input.sig -r ..\\' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1'
        else:
            nrpspred2command = './NRPSpredictor2.sh -i input.sig -r ../' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1'
        # The tool's stdout is read and discarded, and the pipe is closed
        # explicitly so the script waits for the process to finish instead of
        # relying on garbage collection of the pipe object.
        predictorpipe = os.popen(nrpspred2command)
        predictorpipe.read()
        predictorpipe.close()
        # Copy NRPSPredictor results
        if on_windows:
            copycommand = 'copy/y nrpscodes.txt ..\\' + nrpspredictoroutputfolder.replace("/","\\") + ' > nul'
        else:
            copycommand = 'cp nrpscodes.txt ../' + nrpspredictoroutputfolder + " > /dev/null"
        os.system(copycommand)
    finally:
        # Always return to the previous working directory.
        os.chdir("..")
elapsed = time.time() - starttime
# The following part, up to the next timing checkpoint, takes about 500 s,
# probably mainly because of inefficient minowa_A code.
# Minowa method: extract AMP-binding domain, and run Minowa_A
if len(nrpsnames) > 0:
    # Trailing newline added so this entry does not run into the next log line.
    logfile.write("Predicting NRPS A domain substrate specificities by Minowa et al. method\n")
    # Same domains as for NRPSPredictor, but cut at the reported domain end.
    nrpsnames2 = []
    nrpsseqs2 = []
    for k in pksnrpscoregenes:
        nr = 0
        for domain in domaindict[k]:
            if domain[0] in ["AMP-binding", "A-OX"]:
                nr += 1
                start = int(domain[1])
                end = int(domain[2])
                nrpsnames2.append(k + "_A" + str(nr))
                nrpsseqs2.append(seqdict[k][start:end])
    writefasta(nrpsnames2,nrpsseqs2,minowanrpsoutputfolder + "nrpsseqs.fasta")
    # Windows calls the minowa_A executable; every other platform runs the
    # python script (previously only the literal 'linux2' was recognised and
    # the command stayed undefined elsewhere).
    if sys.platform == 'win32':
        minowanrpscommand = "minowa_A ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt"
    else:
        minowanrpscommand = "python minowa_A.py ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt"
    os.chdir("Minowa/")
    try:
        os.system(minowanrpscommand)
    finally:
        # Always return to the previous working directory.
        os.chdir("..")
5250
elapsed = time.time() - starttime
# Predict PKS AT domain specificities with Minowa et al. method and PKS code (NP searcher / ClustScan / own?)
# Every PKS_AT domain is cut out and named <gene>_AT<number>, numbered per gene.
for k in pksnrpscoregenes:
    nr = 0
    for domain in domaindict[k]:
        if domain[0] == "PKS_AT":
            nr += 1
            start = int(domain[1])
            end = int(domain[2])
            pksnames.append(k + "_AT" + str(nr))
            pksseqs.append(seqdict[k][start:end])

if len(pksnames) > 0:
    writefasta(pksnames,pksseqs,pkssignatureoutputfolder + "pksseqs.fasta")
    writefasta(pksnames,pksseqs,minowapksoutputfolder + "pksseqs.fasta")
    # Windows calls the executables; every other platform runs the python
    # scripts (previously only the literal 'linux2' was recognised and the
    # commands stayed undefined elsewhere).
    on_windows = sys.platform == 'win32'

    # Run PKS signature analysis
    elapsed = time.time() - starttime
    sys.stdout.write("Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences\n")
    logfile.write("Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences\n")
    if on_windows:
        pkspredcommand = "PKS_analysis ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt"
    else:
        pkspredcommand = "python PKS_analysis.py ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt"
    os.chdir("pkssignatures/")
    try:
        os.system(pkspredcommand)
    finally:
        # Always return to the previous working directory.
        os.chdir("..")

    # Minowa method: run Minowa_AT
    elapsed = time.time() - starttime
    sys.stdout.write("Predicting PKS AT domain substrate specificities by Minowa et al. method\n")
    logfile.write("Predicting PKS AT domain substrate specificities by Minowa et al. method\n")
    if on_windows:
        minowapkscommand = "minowa_AT ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt"
    else:
        minowapkscommand = "python minowa_AT.py ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt"
    os.chdir("Minowa/")
    try:
        os.system(minowapkscommand)
    finally:
        os.chdir("..")
5293
# Predict PKS CAL domain specificities with the Minowa et al. method.
elapsed = (time.time() - starttime)
print("Predicting CAL domain substrate specificities by Minowa et al. method")
logfile.write("Predicting CAL domain substrate specificities by Minowa et al. method\n")
# Collect every "CAL_domain" hit as "<gene>_CAL<n>" with its sequence slice.
for k in pksnrpscoregenes:
    nr = 0
    for i in domaindict[k]:
        if i[0] == "CAL_domain":
            nr += 1
            start = int(i[1])
            end = int(i[2])
            calnames.append(k + "_CAL" + str(nr))
            calseqs.append(seqdict[k][start:end])
if len(calnames) > 0:
    writefasta(calnames, calseqs, minowacaloutputfolder + "calseqs.fasta")
    # Any platform other than Windows runs the Python script; previously only
    # 'linux2' was accepted and other platforms left the command unassigned.
    if sys.platform == 'win32':
        minowacalcommand = "minowa_CAL ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt"
    else:
        minowacalcommand = "python minowa_CAL.py ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt"
    # Paths in the command are "../"-relative, so run it from inside Minowa/
    # and always return to the previous directory afterwards.
    os.chdir("Minowa/")
    try:
        os.system(minowacalcommand)
    finally:
        os.chdir("..")
5320
elapsed = (time.time() - starttime)
# Predict PKS KR domain activity and stereochemistry using the pattern
# published in ClustScan.
print("Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al.")
logfile.write("Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al.\n")
# Collect every "PKS_KR" hit as "<gene>_KR<n>" with its sequence slice.
for k in pksnrpscoregenes:
    nr = 0
    for i in domaindict[k]:
        if i[0] == "PKS_KR":
            nr += 1
            start = int(i[1])
            end = int(i[2])
            krnames.append(k + "_KR" + str(nr))
            krseqs.append(seqdict[k][start:end])
if len(krnames) > 0:
    writefasta(krnames, krseqs, kranalysisoutputfolder + "krseqs.fasta")
    # Any platform other than Windows runs the Python script; previously only
    # 'linux2' was accepted and other platforms left the command unassigned.
    if sys.platform == 'win32':
        kranalysiscommand = "kr_analysis ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt"
    else:
        kranalysiscommand = "python kr_analysis.py ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt"
    # Paths in the command are "../"-relative, so run it from inside
    # kr_analysis/ and always return to the previous directory afterwards.
    os.chdir("kr_analysis/")
    try:
        os.system(kranalysiscommand)
    finally:
        os.chdir("..")
5347
# Read and parse all substrate specificity prediction output files.
# Each *_preds dict maps a domain name ("<gene>_A<n>") to the predicted
# substrate; each *_details dict maps it to an HTML fragment for the pop-up.
minowa_nrps_preds = {}
minowa_nrps_preds_details = {}
nrps_svm_preds = {}
nrps_svm_preds_details = {}
nrps_code_preds = {}
nrps_code_preds_details = {}
# Renames some Minowa top-hit labels before they are lower-cased and stored.
substratetransdict2 = {'pipecolate':'pip','fOHOrn':'orn','beta-Lys':'blys','5NhOrn':'orn','OHOrn':'orn','Aad':'Aaa','bOHTyr':'bht'}
if len(nrpsnames) > 0:
    # Files are opened with "with" so the handles are closed again; the old
    # code rebound the handle name to the text and never closed the file.
    with open(minowanrpsoutputfolder + "nrpspredoutput.txt", "r") as predfile:
        minowa_a_file = predfile.read()
    minowa_a_file = minowa_a_file.replace("\r", "\n")
    # Records are separated by a line holding two backslashes; the text
    # before the first separator is skipped.
    parts = minowa_a_file.split("\\\\\n")[1:]
    for i in parts:
        partlines = i.split("\n")
        acc = partlines[0]
        # The third line of a record starts with the top-hit substrate name.
        tophit = partlines[2].split("\t")[0]
        if tophit in substratetransdict2:
            tophit = substratetransdict2[tophit]
        minowa_nrps_preds[acc] = tophit.lower()
        minowa_nrps_preds_details[acc] = "<b>Minowa HMM method A-domain<br>Substrate specificity prediction top hits:</b><br>\n" + partlines[1] + "<br>\n" + partlines[2] + "<br>\n" + partlines[3] + "<br>\n" + partlines[4] + "<br><br>\n\n"
    # Both NRPSPredictor2 files are opened before either is parsed, as before.
    with open(nrpspredictoroutputfolder + "nrpspredictor2.out", "r") as svmfile:
        with open(nrpspredictoroutputfolder + "nrpscodes.txt", "r") as codefile:
            nrpspredictorfile1 = svmfile.read()
            nrpspredictorfile2 = codefile.read()
    nrpspredictorfile1 = nrpspredictorfile1.replace("\r", "\n")
    # Drop the first line and the empty string after the final newline.
    lines = nrpspredictorfile1.split("\n")[1:-1]
    for k in lines:
        tabs = k.split("\t")
        nrps_svm_preds[tabs[0]] = tabs[6]
        nrps_svm_preds_details[tabs[0]] = "<b> NRPSPredictor2 SVM prediction details:</b><br>\n8 Angstrom 34 AA code:<br>\n" + tabs[1] + "<br>\nPredicted physicochemical class:<br>\n" + tabs[3] + "<br>\nLarge clusters prediction:<br>\n" + tabs[4] + "<br>\nSmall clusters prediction:<br>\n" + tabs[5] + "<br>\nSingle AA prediction:<br>\n" + tabs[6] + "<br><br>\n\n"
    nrpspredictorfile2 = nrpspredictorfile2.replace("\r", "\n")
    lines = nrpspredictorfile2.split("\n")[:-1]
    for k in lines:
        tabs = k.split("\t")
        nrps_code_preds[tabs[0]] = tabs[1]
        nrps_code_preds_details[tabs[0]] = "<b> NRPSPredictor2 Stachelhaus code prediction:</b><br>\n" + tabs[1] + "<br><br>\n\n"
# PKS AT-domain predictions: *_preds maps "<gene>_AT<n>" to a substrate code,
# *_details maps it to an HTML fragment for the pop-up window.
minowa_pks_preds_details = {}
minowa_pks_preds = {}
pks_code_preds = {}
pks_code_preds_details = {}
# Translates Minowa AT top-hit names into short codes.
# NOTE(review): 'Ethylmalonyl-CoA' appears twice in this literal; Python keeps
# the later entry, so it maps to 'Ethyl_mal' and not 'emal' - confirm which
# value is intended.
substratetransdict = {'Malonyl-CoA':'mal','Methylmalonyl-CoA':'mmal','Methoxymalonyl-CoA':'mxmal','Ethylmalonyl-CoA':'emal','Isobutyryl-CoA':'isobut','2-Methylbutyryl-CoA':'2metbut','trans-1,2-CPDA':'trans-1,2-CPDA','Acetyl-CoA':'Acetyl-CoA','Benzoyl-_CoA':'benz','Propionyl-CoA':'prop','3-Methylbutyryl-CoA':'3metbut','Ethylmalonyl-CoA':'Ethyl_mal','CE-Malonyl-CoA':'cemal','2-Rhyd-Malonyl-CoA':'2Rhydmal','CHC-CoA':'CHC-CoA','inactive':'inactive'}
if len(pksnames) > 0:
    # Files are read inside "with" so the handles are closed again.
    with open(minowapksoutputfolder + "pkspredoutput.txt", "r") as predfile:
        minowa_at_file = predfile.read()
    minowa_at_file = minowa_at_file.replace("\r", "\n")
    # Records are separated by a line holding two backslashes.
    parts = minowa_at_file.split("\\\\\n")[1:]
    for i in parts:
        partlines = i.split("\n")
        acc = partlines[0]
        # Top hits without a translation are stored as "pk".
        tophit = substratetransdict.get(partlines[2].split("\t")[0], "pk")
        minowa_pks_preds[acc] = tophit
        minowa_pks_preds_details[acc] = "<b>Minowa HMM method AT-domain<br>Substrate specificity prediction top hits:</b><br>\n" + partlines[1] + "<br>\n" + partlines[2] + "<br>\n" + partlines[3] + "<br>\n" + partlines[4] + "<br><br>\n\n"
    with open(pkssignatureoutputfolder + "pkspredoutput.txt", "r") as predfile:
        pkssignaturefile = predfile.read()
    pkssignaturefile = pkssignaturefile.replace("\r", "\n")
    # Records are separated by "//" lines; empty lines inside a record are
    # ignored.
    parts = pkssignaturefile.split("//\n")[1:]
    for i in parts:
        partlines = [j for j in i.split("\n") if j != ""]
        acc = partlines[0].split("\t")[0]
        # NOTE(review): a record with exactly one hit line (two lines in
        # total) takes the "N/A" branch - confirm that "> 2" is intended.
        if len(partlines) > 2:
            tophit = (partlines[1].split("\t")[0]).split("__")[1]
            pks_code_preds[acc] = tophit
            codes = []
            prots = []
            scores = []
            # At most the first three hit lines are reported.
            for hit in partlines[1:4]:
                fields = hit.split("\t")
                codes.append(fields[0])
                prots.append(fields[1].replace("_AT", " (AT").replace("__", "): "))
                scores.append(fields[2])
            # Under the guard above there are always two or three hits, so a
            # single join replaces the former per-count string builders.
            hitlines = [codes[n] + " - " + prots[n] + " : (" + scores[n] + "% identity)" for n in range(len(prots))]
            pks_code_preds_details[acc] = "<b>PKS Active Site Signature method<br>AT-domain substrate specificity prediction top hits:</b><br>\nCode:" + partlines[0].split("\t")[1] + "<br>\n" + "<br>\n".join(hitlines) + "<br><br>\n\n"
        else:
            pks_code_preds[acc] = "N/A"
            pks_code_preds_details[acc] = "<b>PKS Active Site Signature method<br>No AT-domain substrate specificity prediction hits above 40% identity.<br>\n\n"
# CAL-domain predictions from the Minowa output: "<gene>_CAL<n>" -> top hit
# and -> HTML fragment for the pop-up window.
minowa_cal_preds = {}
minowa_cal_preds_details = {}
if len(calnames) > 0:
    # Read inside "with" so the handle is closed again.
    with open(minowacaloutputfolder + "calpredoutput.txt", "r") as predfile:
        minowa_cal_file = predfile.read()
    minowa_cal_file = minowa_cal_file.replace("\r", "\n")
    # Records are separated by a line holding two backslashes.
    parts = minowa_cal_file.split("\\\\\n")[1:]
    for i in parts:
        partlines = i.split("\n")
        acc = partlines[0]
        tophit = partlines[2].split("\t")[0]
        minowa_cal_preds[acc] = tophit
        minowa_cal_preds_details[acc] = "<b>Minowa HMM method<br>CAL-domain substrate specificity prediction top hits:</b><br>\n" + partlines[1] + "<br>\n" + partlines[2] + "<br>\n" + partlines[3] + "<br>\n" + partlines[4] + "<br><br>\n\n"
# KR-domain predictions: one tab-separated line per domain holding the name,
# the activity prediction and the stereochemistry prediction.
kr_activity_preds = {}
kr_stereo_preds = {}
if len(krnames) > 0:
    with open(kranalysisoutputfolder + "krpredoutput.txt", "r") as predfile:
        krfile = predfile.read()
    krfile = krfile.replace("\r", "\n")
    # [:-1] drops the empty string after the final newline.
    krlines = krfile.split("\n")[:-1]
    for i in krlines:
        tabs = i.split("\t")
        kr_activity_preds[tabs[0]] = tabs[1]
        kr_stereo_preds[tabs[0]] = tabs[2]
5462
# Combine substrate specificity predictions into a consensus prediction.
# consensuspreds maps each domain name to the monomer that will be used when
# the structure is assembled.
consensuspreds = {}
# Monomer names that may be stored as a consensus; any other value is
# replaced by the generic "pk" / "nrp" placeholder below.
available_smiles_parts = ['GLY','ALA','VAL','LEU','ILE','MET','PRO','PHE','TRP','SER','THR','ASN','GLN','TYR','CYS','LYS','ARG','HIS','ASP','GLU','MPRO','ORN','PGLY','DAB','BALA','AEO','DHA','PIP','BMT','gly','ala','val','leu','ile','met','pro','phe','trp','ser','thr','asn','gln','tyr','cys','lys','arg','his','asp','glu','aaa','mpro','dhb','2hiva','orn','pgly','dab','bala','aeo','4mha','pico','phg','dha','scy','pip','bmt','adds','aad','abu','hiv','dhpg','bht','3-me-glu','4pPro','ala-b','ala-d','dht','Sal','tcl','lys-b','hpg','hyv-d','iva','vol','mal','mmal','mxmal','emal','nrp','pk','Gly','Ala','Val','Leu','Ile','Met','Pro','Phe','Trp','Ser','Thr','Asn','Gln','Tyr','Cys','Lys','Arg','His','Asp','Glu','Mpro','23Dhb','34Dhb','2Hiva','Orn','Pgly','Dab','Bala','Aeo','4Mha','Pico','Aaa','Dha','Scy','Pip','Bmt','Adds','DHpg','DHB','nrp','pk']
for i in pksnrpscoregenes:
    # Per-gene counters that rebuild the "<gene>_A<n>" / "_AT<n>" / "_CAL<n>"
    # names used when the predictor inputs were written.
    nra = nrat = nrcal = 0
    for k in domaindict[i]:
        domaintype = k[0]
        if domaintype == "PKS_AT":
            nrat += 1
            key = i + "_AT" + str(nrat)
            preds = [minowa_pks_preds[key], pks_code_preds[key]]
            # A value counts as consensus when more than one method reports it.
            agreed = [l for l in preds if preds.count(l) > 1]
            if agreed and agreed[-1] in available_smiles_parts:
                consensuspreds[key] = agreed[-1]
            else:
                consensuspreds[key] = "pk"
        elif domaintype == "AMP-binding" or domaintype == "A-OX":
            nra += 1
            key = i + "_A" + str(nra)
            preds = [minowa_nrps_preds[key], nrps_svm_preds[key], nrps_code_preds[key]]
            agreed = [l for l in preds if preds.count(l) > 1]
            if agreed and agreed[-1] in available_smiles_parts:
                consensuspreds[key] = agreed[-1]
            else:
                consensuspreds[key] = "nrp"
        elif domaintype == "CAL_domain":
            nrcal += 1
            key = i + "_CAL" + str(nrcal)
            # Only one method exists for CAL domains, so its hit is taken
            # directly when it is a known monomer.
            calhit = minowa_cal_preds[key]
            if calhit in available_smiles_parts:
                consensuspreds[key] = calhit
            else:
                consensuspreds[key] = "pk"
5510
# Write all prediction details to one HTML file per domain; the files are
# used as pop-up windows.
# domainnamesdict: gene -> list of its domain names, in domaindict order.
domainnamesdict = {}
for i in pksnrpscoregenes:
    domainnamesdict[i] = [k[0] for k in domaindict[i]]
# Identical page header for every pop-up.
htmlheader = '<html>\n<head>\n<title>Prediction details</title>\n<STYLE type="text/css">\nbody{\n text-align:left;\n background-color:white;\n font-family: Tahoma, sans-serif;\n font-size: 0.8em;\n color: #810E15;\n}\n</STYLE>\n</head>\n<body>'
htmlfooter = '\n</body>\n</html>'
for i in pksnrpscoregenes:
    genedomains = domainnamesdict[i]
    # Genes without any AT, A or CAL domain get no pop-up pages.
    if not ("PKS_AT" in genedomains or "AMP-binding" in genedomains or "A-OX" in genedomains or "CAL_domain" in genedomains):
        continue
    j = domaindict[i]
    nrat = 0
    nra = 0
    nrcal = 0
    nrkr = 0
    for k in j:
        if k[0] == "PKS_AT":
            nrat += 1
            domainname = i + "_AT" + str(nrat)
            htmloutfile = open(substrspecsfolder + domainname + ".html", "w")
            htmloutfile.write(htmlheader)
            htmloutfile.write(minowa_pks_preds_details[domainname])
            htmloutfile.write(pks_code_preds_details[domainname])
            htmloutfile.write("<b><i>Consensus Predictions: " + consensuspreds[domainname] + "</b></i>")
            htmloutfile.write(htmlfooter)
            htmloutfile.close()
        elif k[0] == "AMP-binding" or k[0] == "A-OX":
            nra += 1
            domainname = i + "_A" + str(nra)
            htmloutfile = open(substrspecsfolder + domainname + ".html", "w")
            htmloutfile.write(htmlheader)
            htmloutfile.write(nrps_svm_preds_details[domainname])
            htmloutfile.write(nrps_code_preds_details[domainname])
            htmloutfile.write(minowa_nrps_preds_details[domainname])
            htmloutfile.write("<b><i>Consensus Prediction: '" + consensuspreds[domainname] + "'</b></i>")
            htmloutfile.write(htmlfooter)
            htmloutfile.close()
        elif k[0] == "CAL_domain":
            nrcal += 1
            domainname = i + "_CAL" + str(nrcal)
            htmloutfile = open(substrspecsfolder + domainname + ".html", "w")
            htmloutfile.write(htmlheader)
            htmloutfile.write(minowa_cal_preds_details[domainname])
            htmloutfile.write(htmlfooter)
            htmloutfile.close()
5556
# Gene-order and monomer-order prediction.  For every gene cluster that holds
# PKS/NRPS core genes this section (1) sorts those genes into PKS, NRPS and
# hybrid lists, (2) for PKS-only clusters with more than three PKS genes
# scores possible gene orders from docking-domain residues and writes the
# scores to an HTML table, and (3) builds a space-separated monomer string
# that is stored in compound_pred_dict[genecluster].
# NOTE(review): the original indentation is not visible in this view, so the
# nesting of the statements below cannot be confirmed from here.
5557 elapsed = (time.time() - starttime)
5558 #print "5541Time since start: " + str(elapsed)
5559 #Predict biosynthetic gene order in gene cluster using starter domains, thioesterase domains, gene order and docking domains
5560 compound_pred_dict = {}
5561 dockingdomainanalysis = []
5562 nrpspksclusters = []
5563 a = 1
5564 for i in geneclusters:
5565 genecluster = i
5566 clustercoregenes = clusterinfo[i][3]
5567 clusterpksnrpsgenes = []
5568 for j in clustercoregenes:
5569 if j in pksnrpscoregenes:
5570 clusterpksnrpsgenes.append(j)
5571 if len(clusterpksnrpsgenes) > 0:
5572 nrpspksclusters.append(genecluster)
5573 pksgenes = 0
5574 clusterpksgenes = []
5575 nrpsgenes = 0
5576 clusternrpsgenes = []
5577 hybridgenes = 0
5578 clusterhybridgenes = []
# Classify each core gene by the text in nrpspkstypedict; entries containing
# "PKS/NRPS" are re-examined using the domain names actually present and only
# count as hybrid when neither the PKS-only nor the NRPS-only test matches.
5579 for j in clusterpksnrpsgenes:
5580 k = nrpspkstypedict[j]
5581 if "PKS" in k and "NRPS" not in k:
5582 pksgenes += 1
5583 clusterpksgenes.append(j)
5584 elif "PKS" not in k and "NRPS" in k:
5585 nrpsgenes += 1
5586 clusternrpsgenes.append(j)
5587 elif "PKS/NRPS" in k:
5588 if ("PKS_KS" in domainnamesdict[j] or "PKS_AT" in domainnamesdict[j]) and ("AMP-binding" not in domainnamesdict[j] and "A-OX" not in domainnamesdict[j] and "Condensation" not in domainnamesdict[j]):
5589 pksgenes += 1
5590 clusterpksgenes.append(j)
5591 elif ("PKS_KS" not in domainnamesdict[j] and "PKS_AT" not in domainnamesdict[j]) and ("AMP-binding" in domainnamesdict[j] or "A-OX" in domainnamesdict[j] or "Condensation" in domainnamesdict[j]):
5592 nrpsgenes += 1
5593 clusternrpsgenes.append(j)
5594 elif "PKS" in k and "NRPS" in k:
5595 hybridgenes += 1
5596 clusterhybridgenes.append(j)
5597 #If more than three PKS genes, use dock_dom_analysis if possible to identify order
# NOTE(review): dock_dom_analysis is assigned here but not read anywhere in
# the lines shown.
5598 dock_dom_analysis = "failed"
5599 if pksgenes > 3 and nrpsgenes == 0 and hybridgenes == 0:
5600 #print "Predicting PKS gene order by docking domain sequence analysis"
# NOTE(review): unlike the other logfile.write calls in this file, this
# message and the "succeeded" message further down have no trailing "\n".
5601 logfile.write("Predicting PKS gene order by docking domain sequence analysis")
5602 dockhtmlfile = open(htmlfolder + "docking_analysis" + str(genecluster) + ".html","w")
5603 #Find first and last genes based on starter module and TE / TD
# A gene with a "Thioesterase" or "TD" domain becomes the ending gene and a
# gene whose first two domains are PKS_AT, ACP becomes the starter gene; when
# a further candidate turns up the choice is reset to "".
5604 startergene = ""
5605 endinggene = ""
5606 for k in clusterpksgenes:
5607 if "Thioesterase" in domainnamesdict[k] or "TD" in domainnamesdict[k]:
5608 if endinggene == "":
5609 endinggene = k
5610 else:
5611 endinggene = ""
5612 if len(domainnamesdict[k]) >=2 and "PKS_AT" == domainnamesdict[k][0] and "ACP" == domainnamesdict[k][1]:
5613 if startergene == "":
5614 startergene = k
5615 else:
5616 startergene = ""
# Fallback starter search: first three domains PKS_KS, PKS_AT, ACP.
5617 if startergene == "":
5618 for k in clusterpksgenes:
5619 if len(domainnamesdict[k]) >=3 and "PKS_KS" == domainnamesdict[k][0] and "PKS_AT" == domainnamesdict[k][1] and "ACP" == domainnamesdict[k][2]:
5620 if startergene == "":
5621 startergene = k
5622 else:
5623 startergene = ""
5624 break
5625 #Extract N-terminal 50 residues of each non-starting protein, scan for docking domains using hmmsearch, parse output to locate interacting residues
# NOTE(review): the comment above mentions hmmsearch, but the code below runs
# a muscle profile alignment against nterm.fasta and then calls
# extractpositions() with positions [2,15] of "EryAIII_5_6_ref".
5626 ntermintresdict = {}
5627 ntermnames = []
5628 ntermseqs = []
5629 for k in clusterpksgenes:
5630 if k != startergene:
5631 ntermnames.append(k)
5632 seq = seqdict[k]
5633 ntermseqs.append(seq[:50])
5634 ntermfasta = "docking_analysis/input.fasta"
# z is a manual index into ntermnames / ntermseqs; input.fasta is rewritten
# for every sequence.
5635 z = 0
5636 for k in ntermnames:
5637 writefasta([ntermnames[z]],[ntermseqs[z]],ntermfasta)
5638 os.chdir("docking_analysis")
5639 os.system("muscle -profile -quiet -in1 nterm.fasta -in2 input.fasta -out muscle.fasta")
5640 intresidues = extractpositions("nterm.fasta","muscle.fasta",[2,15],"EryAIII_5_6_ref",ntermnames[z])
5641 ntermintresdict[ntermnames[z]] = intresidues
5642 os.chdir("..")
5643 z += 1
5644 #Extract C-terminal 100 residues of each non-ending protein, scan for docking domains using hmmsearch, parse output to locate interacting residues
# Same procedure for the last 100 residues, aligned against cterm.fasta and
# read at positions [55,64] of "EryAII_ref".
5645 ctermintresdict = {}
5646 ctermnames = []
5647 ctermseqs = []
5648 for k in clusterpksgenes:
5649 if k != endinggene:
5650 ctermnames.append(k)
5651 seq = seqdict[k]
5652 ctermseqs.append(seq[-100:])
5653 ctermfasta = "docking_analysis/input.fasta"
5654 z = 0
5655 for k in ctermnames:
5656 writefasta([ctermnames[z]],[ctermseqs[z]],ctermfasta)
5657 os.chdir("docking_analysis")
5658 os.system("muscle -profile -quiet -in1 cterm.fasta -in2 input.fasta -out muscle.fasta")
5659 intresidues = extractpositions("cterm.fasta","muscle.fasta",[55,64],"EryAII_ref",ctermnames[z])
5660 ctermintresdict[ctermnames[z]] = intresidues
5661 os.chdir("..")
5662 z += 1
5663 #If docking domains found in all, check for optimal order using interacting residues
5664 genes_to_order = []
5665 z = 0
5666 for k in clusterpksgenes:
5667 if k == startergene or k == endinggene:
5668 pass
5669 else:
5670 genes_to_order.append(k)
5671 z += 1
# Every permutation of the remaining genes is materialised as a list, so the
# number of candidate orders grows factorially with len(genes_to_order).
5672 possible_orders = list(itertools.permutations(genes_to_order,len(genes_to_order)))
5673 hydrophobic = ["A","V","I","L","F","W","Y","M"]
5674 positivecharge = ["H","K","R"]
5675 negativecharge = ["D","E"]
5676 other = ["C","G","P","S","T","N","Q","X","U"]
5677 possible_orders_scoredict = {}
# Score each order: for every pair of neighbouring genes compare the two
# extracted C-terminal residues of the first gene with the two N-terminal
# residues of the second; +1 for a hydrophobic/hydrophobic or opposite-charge
# pair, -1 for a like-charge pair.
5678 for k in possible_orders:
5679 score = 0
5680 interactions = []
5681 z = 0
5682 for l in k[:-1]:
5683 interactions.append([l,k[z + 1]])
5684 z += 1
5685 for l in interactions:
5686 res1a = ctermintresdict[l[0]][0]
5687 res1b = ntermintresdict[l[1]][0]
5688 res2a = ctermintresdict[l[0]][1]
5689 res2b = ntermintresdict[l[1]][1]
5690 if (res1a in hydrophobic and res1b in hydrophobic) or (res1a in positivecharge and res1b in negativecharge) or (res1a in negativecharge and res1b in positivecharge):
5691 score += 1
5692 if (res1a in positivecharge and res1b in positivecharge) or (res1a in negativecharge and res1b in negativecharge):
5693 score = score - 1
5694 if (res2a in hydrophobic and res2b in hydrophobic) or (res2a in positivecharge and res2b in negativecharge) or (res2a in negativecharge and res2b in positivecharge):
5695 score += 1
5696 if (res2a in positivecharge and res2b in positivecharge) or (res2a in negativecharge and res2b in negativecharge):
5697 score = score - 1
5698 possible_orders_scoredict[k] = score
# Orders sorted by descending score; runs of equal score are collected in
# ranked_orders_part and reversed before being appended to ranked_orders2.
# NOTE(review): this loop reuses the names i and a, overwriting the outer
# cluster loop variable and the counter set before it; the lines that follow
# use genecluster rather than i.
5699 ranked_orders = sortdictkeysbyvaluesrev(possible_orders_scoredict)
5700 ranked_orders_part = []
5701 ranked_orders2 = []
5702 a = 0
5703 ranked_orders_len = len(ranked_orders) - 1
5704 for i in ranked_orders:
5705 if a == 0:
5706 score = possible_orders_scoredict[i]
5707 ranked_orders_part.append(i)
5708 elif a == ranked_orders_len:
5709 ranked_orders_part.append(i)
5710 ranked_orders2 = ranked_orders2 + ranked_orders_part
5711 else:
5712 if possible_orders_scoredict[i] == score:
5713 ranked_orders_part.append(i)
5714 else:
5715 ranked_orders_part.reverse()
5716 ranked_orders2 = ranked_orders2 + ranked_orders_part
5717 score = possible_orders_scoredict[i]
5718 ranked_orders_part = []
5719 ranked_orders_part.append(i)
5720 a += 1
# Only the first 1000 ranked orders are kept; the starter and ending genes
# (when known) are put back around each permutation.
5721 ranked_orders = ranked_orders2[:1000]
5722 geneorders = ranked_orders
5723 geneorders2 = []
5724 for l in geneorders:
5725 geneorder = []
5726 if startergene != "":
5727 geneorder.append(startergene)
5728 [ geneorder.append(m) for m in l ]
5729 #for m in l:
5730 # geneorder.append(m)
5731 if endinggene != "":
5732 geneorder.append(endinggene)
5733 geneorders2.append(geneorder)
5734 geneorders = geneorders2
5735 if len(ranked_orders) == 1000:
5736 dockhtmlfile.write('<html>\n<head>\n<LINK href="style.css" rel="stylesheet" type="text/css">\n</head>\n<body>\nDocking domain analysis. Score for 1000 highest scoring gene orders:<br><br><table border=1>\n')
5737 else:
5738 dockhtmlfile.write('<html>\n<head>\n<LINK href="style.css" rel="stylesheet" type="text/css">\n</head>\n<body>\nDocking domain analysis. Scores for all possible gene orders:<br><br><table border=1>\n')
5739 dockhtmlfile.write('<tr><td><b>Gene order</b></td><td><b>Score</b></td></tr>\n')
# One table row per gene order; the score is looked up under the permutation
# tuple, i.e. the row without the starter/ending gene that was added above.
5740 for l in geneorders:
5741 string = "<tr><td>"
5742 for m in l:
5743 string = string + m + ","
5744 if startergene != "" and endinggene != "":
5745 string = string[:-1] + "</td><td>" + str(possible_orders_scoredict[tuple(l[1:-1])])
5746 elif startergene == "" and endinggene != "":
5747 string = string[:-1] + "</td><td>" + str(possible_orders_scoredict[tuple(l[:-1])])
5748 elif startergene != "" and endinggene == "":
5749 string = string[:-1] + "</td><td>" + str(possible_orders_scoredict[tuple(l[1:])])
5750 elif startergene == "" and endinggene == "":
5751 string = string[:-1] + "</td><td>" + str(possible_orders_scoredict[tuple(l)])
5752 dockhtmlfile.write(string + "</td></tr>\n")
5753 dockhtmlfile.write('\n</table></body></html>')
5754 dockhtmlfile.close()
5755 #print "Predicting PKS gene order by docking domain sequence analysis succeeded."
5756 #Write html outfile with docking domain analysis output
5757 #
5758 logfile.write("Predicting PKS gene order by docking domain sequence analysis succeeded.")
5759 dockingdomainanalysis.append(genecluster)
5760 #If NRPS genes, mixed NRPS/PKS genes, PKS genes without detected docking domains, or clusters with a 1-3 PKS genes, assume colinearity
# direction sums +1 / -1 over the strands of the core genes; a negative sum
# reverses the gene list, and the list is reversed (again) when its first
# gene carries a "Thioesterase" or "TD" domain.
# NOTE(review): geneorder is taken from clusterpksnrpsgenes here; in the
# lines shown the docking-based geneorders list is only written to the HTML
# table and does not feed the prediction string.
5761 direction = 0
5762 for k in clusterpksnrpsgenes:
5763 if strandsdict[k] == "+":
5764 direction += 1
5765 elif strandsdict[k] == "-":
5766 direction = direction - 1
5767 if direction < 0:
5768 clusterpksnrpsgenes.reverse()
5769 if "Thioesterase" in domainnamesdict[clusterpksnrpsgenes[0]] or "TD" in domainnamesdict[clusterpksnrpsgenes[0]]:
5770 clusterpksnrpsgenes.reverse()
5771 geneorder = clusterpksnrpsgenes
5772 #Generate substrates order from predicted gene order and consensus predictions
# Walk the genes in order and append the consensus monomer of every AT, A and
# CAL domain, separated by single spaces; the trailing space is cut off.
5773 prediction = ""
5774 for k in geneorder:
5775 domains = domainnamesdict[k]
5776 nra = 0
5777 nrat = 0
5778 nrcal = 0
5779 for l in domains:
5780 if "PKS_AT" in l:
5781 nrat += 1
5782 prediction = prediction + consensuspreds[k + "_AT" + str(nrat)] + " "
5783 if "AMP-binding" in l or "A-OX" in l:
5784 nra += 1
5785 prediction = prediction + consensuspreds[k + "_A" + str(nra)] + " "
5786 if "CAL_domain" in l:
5787 nrcal += 1
5788 prediction = prediction + consensuspreds[k + "_CAL" + str(nrcal)] + " "
5789 prediction = prediction[:-1]
5790 compound_pred_dict[genecluster] = prediction
5791 a += 1
5792
# Structure generation.  Runs from inside the NRPeditor directory: for every
# cluster whose prediction string holds more than one residue, a command line
# for the "main" program is built from the residues plus a final "TE", its
# output is read through os.popen, the text between "core peptide: " and
# "\ntermintype" is taken as the SMILES string, written to
# genecluster<N>.smi and passed on to depict_smile().  Clusters of type
# "ectoine" get a fixed SMILES string instead.  Clusters whose depiction
# reports "failed" are collected in failedstructures.
# NOTE(review): the original indentation is not visible in this view, so the
# nesting of the renumbering loop and of the elif branches cannot be
# confirmed from here.
5793 #Combine predictions into a prediction of the final chemical structure and generate images
5794 os.chdir("NRPeditor")
5795 failedstructures = []
5796 for i in geneclusters:
5797 genecluster = i
5798 if compound_pred_dict.has_key(genecluster):
5799 residues = compound_pred_dict[genecluster]
5800 nrresidues = len(residues.split(" "))
5801 if nrresidues > 1:
# NOTE(review): structcommand is only assigned for 'win32' and 'linux2'; on
# any other sys.platform value the lines below use an unassigned (or stale)
# name.
5802 if sys.platform == ('win32'):
5803 structcommand = 'main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
5804 elif sys.platform == ('linux2'):
5805 structcommand = './main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
# The inner loop reuses the name i; genecluster keeps the cluster number.
5806 for i in residues.split(" "):
5807 structcommand = structcommand + i + " "
5808 structcommand = structcommand + 'TE"'
5809 smilesinfo = os.popen(structcommand)
5810 smilesinfo = smilesinfo.read()
5811 smiles_string = (smilesinfo.split("core peptide: ")[1]).split("\ntermintype")[0]
5812 if sys.platform == ('linux2'):
# NOTE(review): str.replace() returns a new string and the result is not
# assigned, so the next line leaves smiles_string unchanged - confirm whether
# the replacement was meant to take effect.
5813 smiles_string.replace("[X]","[*:X]")
# Replace every "X" character by a running number starting at 1.
5814 smiles_string2 = ""
5815 a = 1
5816 for k in smiles_string:
5817 if k == "X":
5818 smiles_string2 = smiles_string2 + str(a)
5819 a += 1
5820 else:
5821 smiles_string2 = smiles_string2 + k
5822 smiles_string = smiles_string2
5823 smilesfile = open("genecluster" + str(genecluster) + ".smi","w")
5824 smilesfile.write(smiles_string)
5825 smilesfile.close()
5826 depictstatus = depict_smile(genecluster,structuresfolder)
5827 if depictstatus == "failed":
5828 failedstructures.append(genecluster)
5829 elif clusterinfo[genecluster][0] == "ectoine":
5830 smiles_string = "CC1=NCCC(N1)C(=O)O"
5831 smilesfile = open("genecluster" + str(genecluster) + ".smi","w")
5832 smilesfile.write(smiles_string)
5833 smilesfile.close()
5834 depictstatus = depict_smile(genecluster,structuresfolder)
5835 if depictstatus == "failed":
5836 failedstructures.append(genecluster)
5837 elif genecluster in failedstructures:
5838 del failedstructures[failedstructures.index(genecluster)]
5839 compound_pred_dict[genecluster] = "ectoine "
5840 os.chdir("..")
5841
5842 elapsed = (time.time() - starttime)
5843 #print "5826 Time since start: " + str(elapsed)
5844 #ClusterBlast
5845 if clusterblast == "y":
# Load gene cluster database into memory
logfile.write("ClusterBlast: Loading gene clusters database into memory...\n")

os.chdir(genomename + "/clusterblast")
# clusters maps "<accession>_<clusternr>" to
# [protein list, description, cluster type, tag list].
clusters = {}
bin_path = os.path.join(antismash_path, "clusterblast/geneclusters.bin")
if os.path.exists(bin_path):
    # The cache is written with the highest (binary) pickle protocol, so it
    # has to be opened in binary mode; text mode can corrupt or truncate the
    # data on Windows.  A cache written by the old text-mode code on Windows
    # should be deleted so that it is rebuilt.
    # NOTE(review): unpickling executes whatever the file contains - keep
    # this cache file out of reach of untrusted users.
    with open(bin_path, 'rb') as cachefile:
        clusters = cPickle.load(cachefile)
else:
    # Parse the tab-separated cluster table once and cache the result.
    with open(os.path.join(antismash_path, "clusterblast/geneclusters.txt"), "r") as clusterfile:
        for line in clusterfile:
            line = line.strip()
            tabs = line.split("\t")
            accession = tabs[0]
            clusterdescription = tabs[1]
            clusternr = tabs[2]
            clustertype = tabs[3]
            clustername = accession + "_" + clusternr
            clustertags = tabs[4].split(";")
            clusterprots = tabs[5].split(";")
            clusters[clustername] = [clusterprots, clusterdescription, clustertype, clustertags]
    with open(bin_path, 'wb') as cachefile:
        cPickle.dump(clusters, cachefile, -1)
# Load gene cluster database proteins info into memory
logfile.write("ClusterBlast: Loading gene cluster database proteins into memory...\n")
# All five dicts are keyed by the protein field of the FASTA header.
proteingeneclusters = {}
proteinlocations = {}
proteinstrands = {}
proteinannotations = {}
proteintags = {}
bin_path = os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta.bin")
if os.path.exists(bin_path):
    # Binary pickle protocol: the cache must be read and written in binary
    # mode (text mode can corrupt or truncate it on Windows).  A cache
    # written by the old text-mode code on Windows should be deleted so that
    # it is rebuilt.
    # NOTE(review): unpickling executes whatever the file contains - keep
    # this cache file out of reach of untrusted users.
    with open(bin_path, 'rb') as cachefile:
        (proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags) = cPickle.load(cachefile)
else:
    # Only header lines are used; their fields are separated by "|".
    with open(os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta"), "r") as fastafile:
        for line in fastafile:
            line = line.replace('\n', '')
            if line.startswith(">"):
                tabs = line.split("|")
                protein = tabs[6]
                locustag = tabs[4]
                # NOTE(review): the "h_" prefix depends on accessiondict,
                # which presumably belongs to the current input, yet the
                # result is stored in the shared cache - verify that a cache
                # built for one input is valid for the next.
                if locustag in accessiondict:
                    locustag = "h_" + locustag
                proteintags[protein] = locustag
                clustername = tabs[0] + "_" + tabs[1]
                proteingeneclusters[protein] = clustername
                location = tabs[2]
                proteinlocations[protein] = location
                strand = tabs[3]
                proteinstrands[protein] = strand
                annotation = tabs[5]
                proteinannotations[protein] = annotation
    with open(bin_path, 'wb') as cachefile:
        cPickle.dump([proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags], cachefile, -1)
# Internal homolog detection.  For every gene cluster the cluster's proteins
# are written to internal_input.fasta, blasted against themselves, and the
# hits are merged into groups; the list of groups is stored in
# internalhomologygroupsdict[clusternumber].  Proteins without a BLAST entry
# form a group of their own.
# NOTE(review): the original indentation is not visible in this view, so the
# nesting of the statements below cannot be confirmed from here.
5908 #Run BLAST on gene cluster proteins of each cluster on itself to find internal homologs, store groups of homologs - including singles - in a dictionary as a list of lists accordingly
5909 #print "Finding internal homologs in each gene cluster.."
5910 logfile.write("Finding internal homologs in each gene cluster..\n")
5911 internalhomologygroupsdict = {}
5912 for i in geneclusters:
# clusternumber keeps the cluster id because the inner loops reuse the name i.
5913 clusternumber = i
5914 #Create input fasta files for BLAST search
5915 queryclusterprotslist = clusterinfo[i][4]
5916 queryclusterprots = []
5917 for i in queryclusterprotslist:
5918 queryclusterprots.append(i[4])
5919 queryclusternames = []
5920 queryclusterseqs = []
5921 for i in queryclusterprots:
5922 seq = seqdict[i]
5923 name = fullnamedict[i]
5924 queryclusterseqs.append(seq)
5925 queryclusternames.append(name)
5926 writefasta(queryclusternames,queryclusterseqs,"internal_input.fasta")
5927 #Run and parse BLAST search
5928 makeblastdbcommand = "makeblastdb -in internal_input.fasta -out internal_input.fasta -dbtype prot"
5929 blastsearch = "blastp -db internal_input.fasta -query internal_input.fasta -outfmt 6 -max_target_seqs 1000 -evalue 1e-05 -out internal_input.out"
# With "--gui y" on the command line the tools are started through os.popen
# instead of os.system.
5930 if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
5931 os.popen(makeblastdbcommand)
5932 os.popen(blastsearch)
5933 else:
5934 os.system(makeblastdbcommand)
5935 os.system(blastsearch)
5936 #print "5920 makeblastdb finised"
5937 blastoutput = open("internal_input.out","r").read()
# Thresholds handed to blastparse().
5938 minseqcoverage = 25
5939 minpercidentity = 30
# NOTE(review): proteins is not reassigned in the lines shown, so this call
# appears to repeat the same work for every cluster.
5940 seqlengths = fastaseqlengths(proteins)
5941 iblastinfo = blastparse(blastoutput,minseqcoverage,minpercidentity,seqlengths,geneclustergenes)
5942 iblastdict = iblastinfo[0]
5943 iquerylist = iblastinfo[1]
5944 #find and store internal homologs
5945 groups = []
5946 for j in queryclusternames:
# The fifth "|"-separated field of the full name is used as the group member
# name.
5947 jsplit = j.split("|")[4]
5948 if iblastdict.has_key(j):
5949 hits = iblastdict[j][0]
5950 group = []
# Hit names are normalised: a leading "h_" is stripped, full "|"-separated
# names are reduced to their fifth field, anything else is used as is.
5951 for k in hits:
5952 if k[:2] == "h_":
5953 group.append(k[2:])
5954 elif k.count("|") > 4:
5955 group.append(k.split("|")[4])
5956 else:
5957 group.append(k)
5958 if jsplit not in group:
5959 group.append( jsplit )
# Merge the new group with existing groups that share a member.
# NOTE(review): groups is modified with del while it is being iterated, which
# makes the iteration skip the element that follows a deleted one - confirm
# that no overlapping group can be missed.
5960 x = 0
5961 for l in groups:
5962 for m in group:
5963 if m in l:
5964 del groups[x]
5965 [group.append(n) for n in l if n not in group]
5966 #for n in l:
5967 # if n not in group:
5968 # group.append(n)
5969 break
5970 x += 1
5971 group.sort()
5972 groups.append(group)
5973 else:
5974 groups.append([ jsplit ])
5975 internalhomologygroupsdict[clusternumber] = groups
5976
5977 #Run BLAST on gene cluster proteins of each cluster and parse output
# For every query cluster: split its proteins over nrcpus FASTA files, run
# runblast on each file in a separate process, concatenate the outputs and
# parse them with blastparse.
5978 #print "5961 Running NCBI BLAST+ gene cluster searches.."
5979 logfile.write("Running NCBI BLAST+ gene cluster searches..\n")
# NOTE(review): i is reused as loop variable by the nested loops below;
# clusternumber holds the cluster id.
5980 for i in geneclusters:
5981 clusternumber = i
5982 #print " Gene cluster " + str(clusternumber)
5983 #Create input fasta files for BLAST search
5984 queryclusterprotslist = clusterinfo[i][4]
5985 queryclusterprots = []
5986 for i in queryclusterprotslist:
5987 queryclusterprots.append(i[4])
5988 queryclusternames = []
5989 queryclusterseqs = []
5990 for i in queryclusterprots:
5991 seq = seqdict[i]
5992 name = fullnamedict[i]
5993 queryclusterseqs.append(seq)
5994 queryclusternames.append(name)
# Split the queries into nrcpus slices of equalpartsizes entries; the last
# slice also takes the remainder. With fewer proteins than nrcpus the
# leading slices are empty.
5995 equalpartsizes = int(len(queryclusternames)/nrcpus)
5996 for i in range(nrcpus):
5997 if i == 0:
5998 setnames = queryclusternames[:equalpartsizes]
5999 setseqs = queryclusterseqs[:equalpartsizes]
6000 elif i == (nrcpus - 1):
6001 setnames = queryclusternames[(i*equalpartsizes):]
6002 setseqs = queryclusterseqs[(i*equalpartsizes):]
6003 else:
6004 setnames = queryclusternames[(i*equalpartsizes):((i+1)*equalpartsizes)]
6005 setseqs = queryclusterseqs[(i*equalpartsizes):((i+1)*equalpartsizes)]
6006 writefasta(setnames,setseqs,"input" + str(i) + ".fasta")
6007 processes = []
6008 processnames = []
6009 for i in range(nrcpus):
6010 processes.append(Process(target=runblast, args=["input" + str(i) + ".fasta"]))
6011 [i.start() for i in processes]
# Fixed 10 second wait, then poll every 5 seconds until no process is alive.
# NOTE(review): the join() calls after the loop already block until each
# process has finished, so the polling looks redundant - confirm.
6012 time.sleep(10)
6013 while True:
6014 processrunning = "n"
6015 for i in processes:
6016 if i.is_alive():
6017 processrunning = "y"
6018 if processrunning == "y":
6019 time.sleep(5)
6020 else:
6021 break
6022 [i.join() for i in processes]
# Concatenate the per-process output files (inputN.out) in slice order.
# NOTE(review): the file objects opened here are never closed explicitly.
6023 blastoutput = ""
6024 for i in range(nrcpus):
6025 output = open("input" + str(i) + ".out","r")
6026 output = output.read()
6027 blastoutput = blastoutput + output
# The combined output is written to clusterblastoutput.txt one directory up,
# then the working directory is switched back to "clusterblast".
6028 os.chdir("..")
6029 blastoutputfile = open("./clusterblastoutput.txt","w")
6030 blastoutputfile.write(blastoutput)
6031 blastoutputfile.close()
6032 os.chdir("clusterblast")
6033 #print " Blast search finished. Parsing results..."
6034 logfile.write(" Blast search finished. Parsing results...\n")
6035 minseqcoverage = 25
6036 minpercidentity = 30
6037 seqlengths = fastaseqlengths(proteins)
# blastparse returns at least three items: the hit dictionary, the list of
# queries and the hit clusters (indices 0, 1 and 2).
6038 blastinfo = blastparse(blastoutput,minseqcoverage,minpercidentity,seqlengths,geneclustergenes)
6039 blastdict = blastinfo[0]
6040 querylist = blastinfo[1]
6041 #Remove queries without hits
6042 querylist2 = []
6043 for i in querylist:
6044 if blastdict.has_key(i):
6045 querylist2.append(i)
6046 else:
6047 pass
6048 querylist = querylist2
6049 hitclusters = blastinfo[2]
6050 #Score BLAST output on all gene clusters
6051 #Rank gene cluster hits based on 1) number of protein hits covering >25% sequence length or at least 100aa alignment, with >30% identity and 2) cumulative blast score
6052 #Find number of protein hits and cumulative blast score for each gene cluster
6053 #print " Scoring Blast outputs on database of gene clusters..."
6054 logfile.write(" Scoring Blast outputs on database of gene clusters...\n")
# hitclusterdict: hit cluster -> sorting score
# hitclusterdata: hit cluster -> list of per-hit rows written to the report
6055 hitclusterdict = {}
6056 hitclusterdata = {}
6057 for i in hitclusters:
6058 hitclusterdatalist = []
6059 nrhits = float(0)
6060 nrcoregenehits = float(0)
6061 cumblastscore = float(0)
# hitpositions holds [index of query in querylist, index of hit gene in the
# hit cluster] pairs; hitposcorelist holds one 1/0 flag per counted hit.
6062 hitpositions = []
6063 hitposcorelist = []
6064 for j in querylist:
6065 querynrhits = 0
6066 querycumblastscore = float(0)
6067 nrhitsplus = "n"
6068 for k in blastdict[j][0]:
# Only hits that belong to the cluster currently being scored are used.
6069 if i == blastdict[j][1][k][0]:
6070 if [querylist.index(j),clusters[i][0].index(blastdict[j][1][k][9])] not in hitpositions:
6071 nrhitsplus = "y"
6072 querynrhits += 1
# The blast score is scaled by 1e-6 so that it only contributes to the
# fractional part of the sorting score.
6073 blastscore = float(blastdict[j][1][k][6]) / 1000000
6074 querycumblastscore = querycumblastscore + blastscore
6075 hitclusterdatalist.append([j,k,blastdict[j][1][k][5],blastdict[j][1][k][6],blastdict[j][1][k][7],blastdict[j][1][k][8]])
6076 hitclusterdata[i] = hitclusterdatalist
6077 hitpositions.append([querylist.index(j),clusters[i][0].index(blastdict[j][1][k][9])])
6078 if nrhitsplus == "y":
6079 nrhits += 1
# Queries listed in allcoregenes add 0.1 to nrcoregenehits.
6080 if j.split("|")[4] in allcoregenes:
6081 nrcoregenehits += 0.1
6082 for hit in range(querynrhits):
6083 hitposcorelist.append(1)
6084 else:
6085 for hit in range(querynrhits):
6086 hitposcorelist.append(0)
6087 cumblastscore = cumblastscore + float(querycumblastscore)
6088 query_givenscores_querydict = {}
6089 query_givenscores_hitdict = {}
6090 #Find groups of hits
# hitgroupsdict: query index -> list of hit gene indices for that query
6091 hitgroupsdict = {}
6092 for p in hitpositions:
6093 if not hitgroupsdict.has_key(p[0]):
6094 hitgroupsdict[p[0]] = [p[1]]
6095 else:
6096 hitgroupsdict[p[0]].append(p[1])
6097 #Calculate synteny score; give score only if more than one hits (otherwise no synteny possible), and only once for every query gene and every hit gene
6098 synteny_score = 0
# z indexes the entry following p in hitpositions.
6099 z = 1
6100 if nrhits > 1:
6101 for p in hitpositions[:-1]:
6102 tandem = "n"
6103 #Check if a gene homologous to this gene has already been scored for synteny in the previous entry
6104 if p[1] in hitgroupsdict[hitpositions[z][0]]:
6105 tandem = "y"
6106 #Score entry
6107 if ((not query_givenscores_querydict.has_key(p[0])) or query_givenscores_querydict[p[0]] == 0) and ((not query_givenscores_hitdict.has_key(p[1])) or query_givenscores_hitdict[p[1]] == 0) and tandem == "n":
6108 q = hitpositions[z]
# A pair scores when the query indices differ by less than 2 and the hit
# indices differ by the same amount; one extra point is added when either
# entry is flagged 1 in hitposcorelist.
6109 if (abs(p[0] - q[0]) < 2) and abs(p[0]-q[0]) == abs(p[1]-q[1]):
6110 synteny_score += 1
6111 if hitposcorelist[z - 1] == 1 or hitposcorelist[z] == 1:
6112 synteny_score += 1
6113 query_givenscores_querydict[p[0]] = 1
6114 query_givenscores_hitdict[p[1]] = 1
6115 else:
6116 query_givenscores_querydict[p[0]] = 0
6117 query_givenscores_hitdict[p[1]] = 0
6118 z += 1
6119 #Give bonus to gene clusters with >0 core gene hits
6120 if nrcoregenehits > 0:
6121 corebonus = 3
6122 else:
6123 corebonus = 0
6124 #sorting score is based on number of hits (discrete values) & cumulative blast score (behind comma values)
# NOTE(review): nrcoregenehits (steps of 0.1) is added as well, so the
# fractional part is not the cumulative blast score alone - the report
# writer below reads the fraction back as "Cumulative BLAST score"; confirm.
6125 sortingscore = nrhits + synteny_score + corebonus + nrcoregenehits + cumblastscore
6126 hitclusterdict[i] = sortingscore
6127 #Sort gene clusters
# rankedclusters and rankedclustervalues are the keys and the values of
# hitclusterdict in the same descending-score order.
6128 rankedclusters = sortdictkeysbyvaluesrev(hitclusterdict)
6129 rankedclustervalues = sortdictkeysbyvaluesrevv(hitclusterdict)
6130 #Output for each hit: table of genes and locations of input cluster, table of genes and locations of hit cluster, table of hits between the clusters
# The report cluster<N>.txt is re-read further down by the visualization
# code, which splits on the exact section headings written here.
6131 #print " Writing output file..."
6132 logfile.write(" Writing output file...\n")
6133 #os.chdir("..")
6134 #os.chdir(genomename)
6135 #os.chdir("clusterblast")
6136 out_file = open("cluster" + str(clusternumber) + ".txt","w")
6137 out_file.write("ClusterBlast scores for " + infile)
6138 out_file.write("\n\nTable of genes, locations, strands and annotations of query cluster:\n")
6139 #out_file.write("\n")
6140 #out_file.write("Table of genes, locations, strands and annotations of query cluster:")
6141 #out_file.write("\n")
# One row per query protein: id plus the first four fields of proteins[3][id].
6142 for i in queryclusterprots:
6143 out_file.write("%s\t%s\t%s\t%s\t%s\t\n" % (i, proteins[3][i][0], proteins[3][i][1], proteins[3][i][2], proteins[3][i][3]))
# The string literal below is disabled code kept as a bare string statement.
6144 """out_file.write(i)
6145 out_file.write("\t")
6146 out_file.write(proteins[3][i][0])
6147 out_file.write("\t")
6148 out_file.write(proteins[3][i][1])
6149 out_file.write("\t")
6150 out_file.write(proteins[3][i][2])
6151 out_file.write("\t")
6152 out_file.write(proteins[3][i][3])
6153 out_file.write("\t")
6154 out_file.write("\n")"""
6155 out_file.write("\n\nSignificant hits: \n")
6156 #out_file.write("\n")
6157 #out_file.write("Significant hits: ")
6158 #out_file.write("\n")
# Numbered list of at most 100 top-ranked hit clusters.
6159 z = 0
6160 for i in rankedclusters[:100]:
6161 #out_file.write(str(z+1) + ". " + i + "\t" + clusters[i][1])
6162 #out_file.write("\n")
6163 out_file.write("%s. %s\t%s\n" % ((z+1), i, clusters[i][1]) )
6164 z += 1
6165 out_file.write("\n\n")
6166 #out_file.write("\n")
6167 z = 0
6168 out_file.write("Details:")
6169 for i in rankedclusters[:100]:
# The sorting score is split at the decimal point: integer part is reported
# as the number of hits, fractional digits as the cumulative BLAST score.
6170 value = str(rankedclustervalues[z])
6171 nrhits = value.split(".",1)[0]
# NOTE(review): nrhits is a string here; in Python 2 a str compares greater
# than any int, so this condition is always true - int(nrhits) > 0 was
# probably intended.
6172 if nrhits > 0:
6173 cumblastscore = str(int(float(value.split(".")[1])))
6174 out_file.write("\n\n>>\n\n%s. %s\nSource: %s\nType: %s\nNumber of proteins with BLAST hits to this cluster: %s\nCumulative BLAST score: %s\n\nTable of genes, locations, strands and annotations of subject cluster:\n" % (z+1, i, clusters[i][1], clusters[i][2], nrhits, cumblastscore))
6175 clusterproteins = clusters[i][0]
6176 #print 'clusterproteins\n\n', clusterproteins
# Disabled code kept as a bare string statement.
6177 """out_file.write("\n\n")
6178 out_file.write(">>")
6179 out_file.write("\n")
6180 cumblastscore = str(int(float(value.split(".")[1])))
6181 out_file.write("\n")
6182 out_file.write(str(z+1) + ". " + i)
6183 out_file.write("\n")
6184 out_file.write("Source: " + clusters[i][1])
6185 out_file.write("\n")
6186 out_file.write("Type: " + clusters[i][2])
6187 out_file.write("\n")
6188 out_file.write("Number of proteins with BLAST hits to this cluster: " + nrhits)
6189 out_file.write("\n")
6190 out_file.write("Cumulative BLAST score: " + cumblastscore)
6191 out_file.write("\n")
6192 out_file.write("\n")
6193 out_file.write("Table of genes, locations, strands and annotations of subject cluster:")
6194 out_file.write("\n")
6195 clusterproteins = clusters[i][0]"""
6196
# Subject cluster table: locus tag (or the protein id when the tag is
# "no_locus_tag"), protein id, start, end, strand, annotation. Proteins
# missing from any of the three lookup dictionaries are left out.
6197 for j in clusterproteins:
6198 #print '##########asdfasdf######', j, '---'+proteinlocations.keys()[0]+ '---', proteinannotations.has_key(j), proteinstrands.has_key(j), proteinlocations.has_key(j)
6199 if proteinlocations.has_key(j) and proteinannotations.has_key(j) and proteinstrands.has_key(j):
6200 if proteintags[j] == "no_locus_tag":
6201 out_file.write(j)
6202 else:
6203 out_file.write(proteintags[j])
6204 out_file.write( "\t%s\t%s\t%s\t%s\t%s\n" % (j, proteinlocations[j].split("-")[0], proteinlocations[j].split("-")[1], proteinstrands[j], proteinannotations[j]) )
# Disabled code kept as a bare string statement.
6205 """out_file.write("\t")
6206 out_file.write(j)
6207 out_file.write("\t")
6208 out_file.write(proteinlocations[j].split("-")[0])
6209 out_file.write("\t")
6210 out_file.write(proteinlocations[j].split("-")[1])
6211 out_file.write("\t")
6212 out_file.write(proteinstrands[j])
6213 out_file.write("\t")
6214 out_file.write(proteinannotations[j])
6215 out_file.write("\n")
6216 """
6217
6218 out_file.write("\nTable of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):\n")
6219 if i in hitclusterdata.keys():
6220 tabledata = hitclusterdata[i]
# Each row: the query name reduced to its fifth "|" field, then the
# remaining columns unchanged, all tab-terminated.
6221 for x in tabledata:
6222 w = 0
6223 for y in x:
6224 if w == 0:
6225 out_file.write( "%s\t" % y.split("|")[4] )
6226 #out_file.write("\t")
6227 w += 1
6228 else:
6229 out_file.write("%s\t" % y)
6230 #out_file.write("\t")
6231 out_file.write("\n")
6232 else:
# NOTE(review): the next line is a bare string expression and has no effect;
# nothing is written or logged when a cluster has no hit data.
6233 "data not found"
6234 out_file.write("\n")
6235 out_file.write("\n")
6236 z += 1
6237 #os.chdir("..")
6238 #os.chdir("..")
6239 #os.chdir("clusterblast")
# NOTE(review): with the indentation lost it is unclear whether the chdir
# and close below run once per cluster or once after the loop - confirm that
# every cluster<N>.txt is closed and the working directory stays consistent.
6240 os.chdir("..")
6241 out_file.close()
6242
6243 elapsed = (time.time() - starttime)
6244 #print "Time since start: " + str(elapsed)
6245 #smCOG analysis
# When enabled (smcogs == "y"): run hmmscan against smcogs.hmm, write the
# hits per gene to smcogs/smcogs.txt, run smcog_analysis in parallel and
# collect the generated tree images in smcogtreedict.
6246 smcogtreedict = {}
6247 if smcogs == "y":
6248 #print "Performing smCOG analysis"
6249 logfile.write("Performing smCOG analysis\n")
# NOTE(review): the command is assembled by string concatenation and run
# through the shell; paths containing spaces or shell metacharacters would
# break it.
6250 hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 1E-6 -o " + "./smcogs/smcogshmm_output.txt" + " --noali --tblout " + "./smcogs/smcogshmm.txt "+ hmms_path +"smcogs.hmm " + "./clusterblast/geneclusterprots.fasta"
6251 #print hmmsearch
6252 os.system(hmmsearch)
6253 #print 'finised'
6254 smcoghmmlengthsdict = hmmlengths(hmms_path+"smcogs.hmm")
6255 smcogdict = hmmscanparse("./smcogs/smcogshmm_output.txt", smcoghmmlengthsdict)
# Re-key the results by the fifth "|"-separated field of the query name.
6256 smcogdict2 = {}
6257 for i in smcogdict.keys():
6258 newkey = i.split("|")[4]
6259 smcogdict2[newkey] = smcogdict[i]
6260 smcogdict = smcogdict2
6261 #Write output
6262 #os.chdir(genomename)
6263 os.chdir("smcogs")
6264 smcogfile = open("smcogs.txt","w")
6265 for k in geneclustergenes:
6266 if k not in pksnrpscoregenes:
# NOTE(review): raises KeyError when hmmscanparse returned no entry for k -
# confirm every cluster gene is present in smcogdict.
6267 l = smcogdict[k]
6268 smcogfile.write(">> " + k + "\n")
6269 smcogfile.write("name\tstart\tend\te-value\tscore\n")
6270 smcogfile.write("** smCOG hits **\n")
6271 for i in l:
6272 smcogfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n")
6273 smcogfile.write("\n\n")
6274 smcogfile.close()
# NOTE(review): two chdir("..") calls follow a single chdir("smcogs") -
# confirm the working directory expected by the "smcogtree" step below.
6275 os.chdir("..")
6276 os.chdir("..")
6277 #smCOG phylogenetic tree construction
6278 #print "Calculating and drawing phylogenetic trees of cluster genes with smCOG members"
# NOTE(review): unlike the other log messages this one has no trailing
# newline, so the next log entry continues on the same line.
6279 logfile.write("Calculating and drawing phylogenetic trees of cluster genes with smCOG members")
6280 os.chdir("smcogtree")
6281 smcoganalysisgenes = []
6282 #for k in geneclustergenes:
6283 # if k not in pksnrpscoregenes:
6284 # smcoganalysisgenes.append(k)
6285 [smcoganalysisgenes.append(k) for k in geneclustergenes if k not in pksnrpscoregenes]
# Split the genes into nrcpus slices; the last slice takes the remainder.
6286 smcogsets = []
6287 equalpartsizes = int(len(smcoganalysisgenes)/nrcpus)
6288 for i in range(nrcpus):
6289 if i == 0:
6290 geneslist = smcoganalysisgenes[:equalpartsizes]
6291 elif i == (nrcpus - 1):
6292 geneslist = smcoganalysisgenes[(i*equalpartsizes):]
6293 else:
6294 geneslist = smcoganalysisgenes[(i*equalpartsizes):((i+1)*equalpartsizes)]
6295 smcogsets.append(geneslist)
# One smcog_analysis process per slice; z is passed as the slice number.
6296 processes = []
6297 processnames = []
6298 z = 0
6299 for k in smcogsets:
6300 processes.append(Process(target=smcog_analysis, args=[k,z,accessiondict,seqdict,smcogdict,smcogsoutputfolder]))
6301 z += 1
6302 for k in processes:
6303 k.start()
# Poll every 5 seconds until no process is alive, then join them all.
6304 time.sleep(1)
6305 while True:
6306 processrunning = "n"
6307 for k in processes:
6308 if k.is_alive():
6309 processrunning = "y"
6310 if processrunning == "y":
6311 time.sleep(5)
6312 else:
6313 break
6314 for k in processes:
6315 k.join()
6316 os.chdir("..")
# Collect the PNG files from smcogsoutputfolder: smcogtreedict maps the file
# name up to ".png" to the file name itself.
6317 currentpath = os.getcwd()
6318 os.chdir(smcogsoutputfolder)
6319 dircontents = getdircontents()
6320 for k in dircontents:
6321 #POTENTIAL PERFORMANCE gain: for k in glob.glob('*.png'):
6322 if ".png" in k:
6323 tag = k.split(".png")[0]
6324 smcogtreedict[tag] = tag + ".png"
6325 os.chdir(currentpath)
6326
6327
6328 ##Visualization
6329 #Read in ClusterBlast data
6330 #Read in PubMed / PubChem links of database gene clusters
# Only needed when ClusterBlast was run. The three dictionaries are keyed by
# database cluster accession and filled by the cache/links loader that follows.
6331 if clusterblast == "y":
# Step out of the genome output directory when the script is still inside it.
# NOTE(review): this is a substring test on the path, so any parent directory
# containing genomename also matches - confirm.
6332 if genomename in os.getcwd():
6333 os.chdir('..')
6334 pubmed_dict = {}
6335 pubchem_dict = {}
6336 known_compound_dict = {}
6337 #pubfile = open(antismash_path + "pubmed_pubchem_links.txt","r")
6338 #pubfile = pubfile.read()
6339 #publines = pubfile.split("\n")
6340 #for i in publines:
# Load the PubMed / PubChem / known-compound links of the database gene
# clusters from the pickled cache, or build them from the tab-separated
# links file (accession, PubMed id, PubChem id, compound name) and write the
# cache for the next run. Empty columns are not stored.
# NOTE(review): unpickling executes whatever the cache file contains; the
# cache must only ever be written by this script.
bin_path = os.path.join(antismash_path, "pubmed_pubchem_links.bin")
if os.path.exists(bin_path):
    # The cache is written with a binary pickle protocol (-1), so it has to be
    # read in binary mode.
    with open(bin_path, 'rb') as cachefile:
        (pubmed_dict, pubchem_dict, known_compound_dict) = cPickle.load(cachefile)
else:
    # os.path.join keeps this path consistent with bin_path above and also
    # works when antismash_path has no trailing separator.
    with open(os.path.join(antismash_path, "pubmed_pubchem_links.txt"), "r") as linksfile:
        for line in linksfile:
            line = line.replace('\n', '')
            # A blank line (e.g. at the end of the file) has no columns to read
            if not line.strip():
                continue
            tabs = line.split("\t")
            acc = tabs[0]
            if tabs[1] != "":
                pubmed_dict[acc] = tabs[1]
            if tabs[2] != "":
                pubchem_dict[acc] = tabs[2]
            if tabs[3] != "":
                known_compound_dict[acc] = tabs[3]
    with open(bin_path, 'wb') as cachefile:
        cPickle.dump([pubmed_dict, pubchem_dict, known_compound_dict], cachefile, -1)
6356 #print "Writing visualization SVGs and XHTML"
6357 logfile.write("Writing visualization SVGs and XHTML\n")
# queryclusterdata: running cluster counter -> [number of listed top hits,
# hitclusterdata]; nrhitgeneclusters: cluster number -> number of hit
# clusters prepared for display.
6358 queryclusterdata = {}
6359 nrhitgeneclusters = {}
6360 cblastclusternr = 1
6361 #print os.getcwd()
# Re-read each cluster<N>.txt report and split it on the section headings
# that the report writer produced.
6362 if clusterblast == "y":
6363 for x in geneclusters:
6364 clusterblastfile = open(clusterblastoutputfolder + "cluster" + str(x) + ".txt","r")
6365 #print clusterblastfile
# clusterblastfile is rebound from the file object to its text content;
# the file object is never closed explicitly.
6366 clusterblastfile = clusterblastfile.read()
6367 clusterblastfile = clusterblastfile.replace("\r","\n")
6368 toptenhitclusters = []
6369 #Identify top ten hits for visualization
6370 hitlines = ((clusterblastfile.split("Significant hits: \n")[1]).split("\nDetails:")[0]).split("\n")
6371 #print '\n\n#######hitlines\n', hitlines
# cb_accessiondict maps the display rank b to the accession taken from the
# hit line. b only advances for lines that do not contain genomic_accnr
# (or when genomic_accnr is empty); up to ten lines are kept for display,
# shortened to 80 characters.
6372 a = 0
6373 cb_accessiondict = {}
6374 b = 1
6375 for i in hitlines:
6376 if " " in i:
6377 cb_accessiondict[b] = (i.split("\t")[0]).split(" ")[1]
6378 if genomic_accnr == "" or genomic_accnr not in i:
6379 b += 1
6380 if a < 10:
6381 if len(i) < 80:
6382 toptenhitclusters.append(i)
6383 elif len(i) >= 80:
6384 j = i[0:77] + "..."
6385 toptenhitclusters.append(j)
6386 a += 1
6387 #print clusterblastfile
# One entry per ">>"-separated hit cluster section of the report.
6388 details = (clusterblastfile.split("\nDetails:")[1]).split(">>")[1:]
6389 #print details
6390 nrhitclusters = len(toptenhitclusters)
6391 #Save query gene cluster data
6392 querylines = ((clusterblastfile.split("Table of genes, locations, strands and annotations of query cluster:\n")[1]).split("\n\n\nSignificant hits:")[0]).split("\n")
6393 queryclustergenes = []
6394 queryclustergenesdetails = {}
6395 for i in querylines:
6396 tabs = i.split("\t")
6397 queryclustergenes.append(tabs[0])
6398 queryclustergenesdetails[tabs[0]] = [tabs[1],tabs[2],tabs[3],tabs[4]]
6399 #For every gene cluster, store hit genes and details
6400 colorgroupsdict = {}
6401 hitclusterdata = {}
6402 hitclusternr = 1
6403 compound_found = "n"
6404 nrhitgeneclusters[x] = 0
6405 for i in details:
6406 hitclustergenes = []
6407 hitclustergenesdetails = {}
6408 #Only calculate for first ten hit gene clusters
# Sections that mention genomic_accnr are skipped entirely.
6409 if genomic_accnr == "" or genomic_accnr not in i:
6410 if hitclusternr <= 10:
6411 nrhitgeneclusters[x] = hitclusternr
6412 accession = cb_accessiondict[hitclusternr]
6413 hitclustergeneslines = ((i.split("Table of genes, locations, strands and annotations of subject cluster:\n")[1]).split("\n\nTable of Blast hits ")[0]).split("\n")
6414 #print '***********\n', i, '\n'
6415 #print hitclustergeneslines
# Subject gene row: column 0 is the key; the stored details are columns
# 2-5 followed by column 1.
6416 for j in hitclustergeneslines:
6417 tabs = j.split("\t")
6418 hitclustergenes.append(tabs[0])
6419 hitclustergenesdetails[tabs[0]] = [tabs[2],tabs[3],tabs[4],tabs[5],tabs[1]]
6420
6421 blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n")
6422 querygeneswithhits = []
6423 coregeneswithhits = []
6424
6425
# blasthitdict: query gene -> hit genes; revblasthitdict: hit gene -> query
# genes; blastdetailsdict: "query_|_|_hit" -> [column 5, column 3] of the
# hit row.
6426 blasthitdict = {}
6427 blastdetailsdict = {}
6428 querygenes = []
6429 revblasthitdict = {}
6430 hitgenes = []
6431
6432
6433 for k in blasthitslines:
6434 tabs = k.split("\t")
6435 if tabs[0] not in querygeneswithhits:
6436 querygeneswithhits.append(tabs[0])
6437 if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits:
6438 coregeneswithhits.append(tabs[0])
6439
6440
# NOTE(review): both branches below repeat the same reverse-dictionary and
# details bookkeeping; only the first statement of each branch differs.
6441 if blasthitdict.has_key(tabs[0]):
6442 hits = blasthitdict[tabs[0]]
6443 hits.append(tabs[1])
6444 blasthitdict[tabs[0]] = hits
6445 if revblasthitdict.has_key(tabs[1]):
6446 revhits = revblasthitdict[tabs[1]]
6447 revhits.append(tabs[0])
6448 revblasthitdict[tabs[1]] = revhits
6449 else:
6450 revblasthitdict[tabs[1]] = [tabs[0]]
6451 blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
6452 if tabs[0] not in querygenes:
6453 querygenes.append(tabs[0])
6454 hitgenes.append(tabs[1])
6455 else:
6456 blasthitdict[tabs[0]] = [tabs[1]]
6457 if revblasthitdict.has_key(tabs[1]):
6458 revhits = revblasthitdict[tabs[1]]
6459 revhits.append(tabs[0])
6460 revblasthitdict[tabs[1]] = revhits
6461 else:
6462 revblasthitdict[tabs[1]] = [tabs[0]]
6463 blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
6464 if tabs[0] not in querygenes:
6465 querygenes.append(tabs[0])
6466 hitgenes.append(tabs[1])
6467
6468
6469
# The first known compound whose accession occurs in the section text is
# written to column 4 of the worksheet row for this cluster, provided more
# than two query genes and at least one core gene have hits.
6470 for k in known_compound_dict.keys():
6471 if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0:
6472 ws0.write(x,4,known_compound_dict[k])
6473 compound_found = "y"
# The string literal below is disabled code kept as a bare string statement.
6474 """blasthitdict = {}
6475 blastdetailsdict = {}
6476 querygenes = []
6477 revblasthitdict = {}
6478 hitgenes = []
6479 for i in blasthitslines:
6480 tabs = i.split("\t")
6481 if blasthitdict.has_key(tabs[0]):
6482 hits = blasthitdict[tabs[0]]
6483 hits.append(tabs[1])
6484 blasthitdict[tabs[0]] = hits
6485 if revblasthitdict.has_key(tabs[1]):
6486 revhits = revblasthitdict[tabs[1]]
6487 revhits.append(tabs[0])
6488 revblasthitdict[tabs[1]] = revhits
6489 else:
6490 revblasthitdict[tabs[1]] = [tabs[0]]
6491 blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
6492 if tabs[0] not in querygenes:
6493 querygenes.append(tabs[0])
6494 hitgenes.append(tabs[1])
6495 else:
6496 blasthitdict[tabs[0]] = [tabs[1]]
6497 if revblasthitdict.has_key(tabs[1]):
6498 revhits = revblasthitdict[tabs[1]]
6499 revhits.append(tabs[0])
6500 revblasthitdict[tabs[1]] = revhits
6501 else:
6502 revblasthitdict[tabs[1]] = [tabs[0]]
6503 blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
6504 if tabs[0] not in querygenes:
6505 querygenes.append(tabs[0])
6506 hitgenes.append(tabs[1])
6507 """
6508 #Make groups of genes for coloring
6509 colorgroups = []
6510 internalgroups = internalhomologygroupsdict[x]
# NOTE(review): this loop rebinds i, the variable of the enclosing loop over
# details; i is not read again before the next iteration rebinds it.
6511 for i in internalgroups:
6512 querygenes_and_hits = []
6513 for j in i:
6514 #Make list of query gene and its hits
6515 additionalhits = []
6516 #For each hit, check if it was also hit by another gene; if so, only add it to the group if this hit had the lowest blast score
6517 otherscores = []
6518 queryscore = 0
6519 if blasthitdict.has_key(j):
6520 for k in blasthitdict[j]:
# NOTE(review): keys are matched by substring ("k in l", "j in l"), so gene
# names that are substrings of other names can match the wrong entry; the
# scores are strings read from the report, so max() compares them as text,
# and the hit is kept when the query's score is the maximum (the comment
# above says "lowest") - confirm.
6521 for l in blastdetailsdict.keys():
6522 if k in l and j in l:
6523 queryscore = blastdetailsdict[l][1]
6524 elif k in l and j not in l:
6525 otherscores.append(blastdetailsdict[l][1])
6526 allscores = otherscores + [queryscore]
6527 if queryscore == max(allscores):
6528 additionalhits.append(k)
6529 #Add additional hits to the querygenes_and_hits list that will form a colorgroup
6530 querygenes_and_hits = querygenes_and_hits + additionalhits
6531 if j not in querygenes_and_hits:
6532 querygenes_and_hits.append(j)
6533 if len(querygenes_and_hits) > 0:
6534 colorgroups.append(querygenes_and_hits)
6535 colorgroupsdict[hitclusternr] = colorgroups
6536 hitclusterdata[hitclusternr] = [colorgroupsdict,hitclustergenes,hitclustergenesdetails,queryclustergenes,queryclustergenesdetails,toptenhitclusters,accession]
6537 hitclusternr += 1
# Hit clusters 11-50 are only scanned for a known-compound match.
6538 elif hitclusternr > 10 and hitclusternr <= 50:
6539 blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n")
6540 querygeneswithhits = []
6541 coregeneswithhits = []
6542 for k in blasthitslines:
6543 tabs = k.split("\t")
6544 if tabs[0] not in querygeneswithhits:
6545 querygeneswithhits.append( tabs[0] )
6546 if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits:
6547 coregeneswithhits.append(tabs[0])
6548 for k in known_compound_dict.keys():
6549 if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0:
6550 ws0.write(x,4,known_compound_dict[k])
6551 compound_found = "y"
6552 hitclusternr += 1
# NOTE(review): the results are stored under the running counter
# cblastclusternr (1, 2, ...), but later code looks them up by cluster
# number - confirm geneclusters is always 1..n in order.
6553 queryclusterdata[cblastclusternr] = [nrhitclusters,hitclusterdata]
6554 cblastclusternr += 1
6555 wb.save(genomename + "/" + genomename + ".geneclusters.xls")
#Gather and store data on each gene cluster
# qgeneclusterdata maps each cluster number to a 20-item list; the SVG and
# XHTML writers unpack it by position, so the order of the items in the final
# assignment of the loop must not change.
gtrcoglist = ['SMCOG1045','SMCOG1062','SMCOG1102']
transportercoglist = ['SMCOG1000','SMCOG1005','SMCOG1011','SMCOG1020','SMCOG1029','SMCOG1033','SMCOG1035','SMCOG1044','SMCOG1065','SMCOG1067','SMCOG1069','SMCOG1074','SMCOG1085','SMCOG1096','SMCOG1106','SMCOG1118','SMCOG1131','SMCOG1166','SMCOG1169','SMCOG1184','SMCOG1202','SMCOG1205','SMCOG1214','SMCOG1234','SMCOG1243','SMCOG1245','SMCOG1252','SMCOG1254','SMCOG1288']
qgeneclusterdata = {}
if smcogs == "y":
    # Reduce each gene's smCOG hit list to the name of its first listed hit.
    # Names of the form "ID:description" are split: the ID is kept per gene,
    # the description is stored in smcogdescriptions under that ID. Genes
    # without hits are dropped. After this step smcogdict maps gene -> string.
    smcogdict2 = {}
    smcogdescriptions = {}
    for i in smcogdict.keys():
        if len(smcogdict[i]) > 0 and len(smcogdict[i][0]) > 0 and ":" in smcogdict[i][0][0]:
            smcogdict2[i] = (smcogdict[i][0][0]).split(":")[0]
            smcogdescriptions[(smcogdict[i][0][0]).split(":")[0]] = (smcogdict[i][0][0]).split(":")[1]
        elif len(smcogdict[i]) > 0:
            smcogdict2[i] = smcogdict[i][0][0]
    smcogdict = smcogdict2
for genecluster in geneclusters:
    clustergenes = clusterinfo[genecluster][4]
    # Keep only the gene identifier (index 4) of every gene record
    clustergenes2 = [i[4] for i in clustergenes]
    clustergenes = clustergenes2
    clusternr = 1
    clustertype = clusterinfo[genecluster][0]
    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []
    for j in clustergenes:
        annotations[j] = proteins[3][j][3]
        starts.append(int(proteins[3][j][0]))
        ends.append(int(proteins[3][j][1]))
        strands.append(proteins[3][j][2])
        # Core genes are drawn dark red, all others grey
        if j in allcoregenes:
            colors.append("#810E15")
        else:
            colors.append("grey")
        if j in pksnrpscoregenes:
            pksnrpsprots.append(j)
        if smcogs == "y":
            if j in smcogdict and len(smcogdict[j]) > 0:
                # smcogdict[j] is the smCOG ID string itself (see the
                # reduction above). Indexing it with [0] only yields its
                # first character, which can never equal one of the IDs in
                # the lists, so compare the whole string.
                if smcogdict[j] in gtrcoglist:
                    gtrs.append(j)
                if smcogdict[j] in transportercoglist:
                    transporters.append(j)
    clustersize = max(ends) - min(starts)
    # Without ClusterBlast there are no hit clusters for any query cluster
    if clusterblast == "n":
        nrhitgeneclusters = {}
        for i in geneclusters:
            nrhitgeneclusters[i] = 0
    hitgeneclusters = range(1,(nrhitgeneclusters[genecluster] + 1))
    hitgeneclusterdata = {}
    hitgeneclusterdata[genecluster] = [hitgeneclusters]
    pksnrpsprotsnames = nrpspkstypedict
    pksnrpsdomains = {}
    domlist = []
    domsdetails = {}
    substrspecnrpspredictordict = {}
    substrspecminowadict = {}
    substrspecpkssigdict = {}
    substrspecconsensusdict = {}
    krpredictionsdict = {}
    for i in pksnrpsprots:
        domlist = []
        domsdetails = {}
        doms = domaindict[i]
        for j in doms:
            # Number repeated domain types within one protein: the first
            # free suffix gives e.g. PKS_AT1, PKS_AT2, ...
            nr = 1
            while j[0] + str(nr) in domlist:
                nr += 1
            domname = j[0] + str(nr)
            domlist.append(domname)
            domsdetails[domname] = [j[1],j[2]]
            # Attach the substrate / activity predictions made for this
            # domain; the prediction dictionaries are keyed "<gene>_<abbrev><nr>".
            if "AMP-binding" in domname or "A-OX" in domname:
                domname2 = i + "_" + "A" + str(nr)
                substrspecminowadict[domname2] = minowa_nrps_preds[i + "_A" + str(nr)]
                substrspecnrpspredictordict[domname2] = [nrps_code_preds[i + "_A" + str(nr)],nrps_svm_preds[i + "_A" + str(nr)]]
                substrspecconsensusdict[domname2] = consensuspreds[i + "_A" + str(nr)]
            if "PKS_AT" in domname:
                domname2 = i + "_" + "AT" + str(nr)
                substrspecminowadict[domname2] = minowa_pks_preds[i + "_AT" + str(nr)]
                substrspecpkssigdict[domname2] = pks_code_preds[i + "_AT" + str(nr)]
                substrspecconsensusdict[domname2] = consensuspreds[i + "_AT" + str(nr)]
            # The original repeated this CAL_domain branch twice with
            # identical assignments; once is sufficient.
            if "CAL_domain" in domname:
                domname2 = i + "_" + "CAL" + str(nr)
                substrspecminowadict[domname2] = minowa_cal_preds[i + "_CAL" + str(nr)]
                substrspecconsensusdict[domname2] = consensuspreds[i + "_CAL" + str(nr)]
            if "PKS_KR" in domname:
                domname2 = i + "_" + "KR" + str(nr)
                krpredictionsdict[domname2] = [kr_activity_preds[i + "_KR" + str(nr)],kr_stereo_preds[i + "_KR" + str(nr)]]
        pksnrpsdomains[i] = [domlist,domsdetails]
    if genecluster in compound_pred_dict:
        structpred = compound_pred_dict[genecluster]
    else:
        structpred = "N/A"
    qgeneclusterdata[genecluster] = [clustertype,clustersize,clustergenes,annotations,starts,ends,strands,pksnrpsprots,pksnrpsprotsnames,pksnrpsdomains,substrspecnrpspredictordict,substrspecminowadict,substrspecpkssigdict,substrspecconsensusdict,gtrs,transporters,colors,hitgeneclusterdata,structpred,krpredictionsdict]
6659
6660 #Create genecluster svg for each gene cluster
# geneposdict accumulates gene -> [start, end] over all clusters.
6661 geneposdict = {}
6662 for qclusternr in geneclusters:
# Unpack the 20-item record stored in qgeneclusterdata (positional layout).
6663 data = qgeneclusterdata[qclusternr]
6664 #Some of the below 23 lines may already be internal to script, scan to remove unnecessary data fetching
6665 clustertype = data[0]
6666 clustersize = data[1]
6667 genes = data[2]
6668 annotations = data[3]
6669 starts = data[4]
6670 ends = data[5]
6671 strands = data[6]
6672 pksnrpsprots = data[7]
6673 pksnrpsprotsnames = data[8]
6674 pksnrpsdomains = data[9]
6675 substrspecnrpspredictordict = data[10]
6676 substrspecminowadict = data[11]
6677 substrspecpkssigdict = data[12]
6678 substrspecconsensusdict = data[13]
6679 gtrs = data[14]
6680 transporters = data[15]
6681 colors = data[16]
6682 hitgeneclusterdata = data[17]
6683 structpred = data[18]
6684 krpredictionsdict = data[19]
# relativepositions returns the relative starts (index 0) and ends (index 1).
6685 relpositions = relativepositions(starts,ends,clustersize)
6686 rel_starts = relpositions[0]
6687 rel_ends = relpositions[1]
6688 y = 0
6689 for i in genes:
6690 geneposdict[i] = [starts[y],ends[y]]
6691 y += 1
# The drawing object returned by geneclustersvg is serialised with getXML()
# into genecluster<N>.svg in svgfolder.
6692 s = geneclustersvg(genes,rel_starts,rel_ends,strands,geneposdict,pksnrpsprots,pksnrpsdomains,qclusternr)
6693 outfile = open(svgfolder + "genecluster" + str(qclusternr) + ".svg","w")
6694 outfile.write(s.getXML())
6695 outfile.close()
6696 #Create ClusterBlast svg
6697 if clusterblast == "y":
# clusterblastpositiondata: "<cluster>_<hit>" or "<cluster>_all" -> the
# second item returned by clusterblastresults.
6698 clusterblastpositiondata = {}
6699 #Create alignment svg for each pair of hit&query
6700 for i in geneclusters:
# Hit cluster numbers 1..n, n being the number of listed top hits.
6701 hitclusters = range(queryclusterdata[i][0] + 1)[1:]
6702 #Create svgs for pairwise gene cluster alignment
6703 colorschemedict,rgbcolorscheme = calculate_colorgroups(i,hitclusters,queryclusterdata,internalhomologygroupsdict)
6704 for k in hitclusters:
6705 cresults = clusterblastresults(i,[k],queryclusterdata,colorschemedict,rgbcolorscheme)
6706 s = cresults[0]
6707 clusterblastpositiondata[str(i) + "_"+str(k)] = cresults[1]
6708 outfile = open(svgfolder + "clusterblast" + str(i) + "_" + str(k) + ".svg","w")
6709 outfile.write(s.getXML())
6710 outfile.close()
6711 #Create svgs for multiple gene cluster alignment
6712 cresults = clusterblastresults(i,hitclusters,queryclusterdata,colorschemedict,rgbcolorscheme)
6713 s = cresults[0]
6714 clusterblastpositiondata[str(i) + "_all"] = cresults[1]
6715 outfile = open(svgfolder + "clusterblast" + str(i) + "_all.svg","w")
6716 outfile.write(s.getXML())
6717 outfile.close()
6718
6719 #Create folder for SEARCHGTR HTML files, load search form template
6720 formtemplate = open("search_form.html","r")
6721 formtemplate = formtemplate.read()
6722 formtemplate = formtemplate.replace("\r","\n")
6723 formtemplateparts = formtemplate.split("FASTASEQUENCE")
6724 #Create HTML file with gene cluster info in hidden div tags
6725 htmlfile = open("empty.xhtml","r")
6726 html = htmlfile.read()
6727 html = html.replace("\r","\n")
6728 htmlparts = html.split("<SPLIT HERE>")
6729 htmloutfile = open(genomename + "/display.xhtml","w")
6730 htmloutfile.write(htmlparts[0])
#Add lines to preload all svgs up front
for qclusternr in geneclusters:
    htmloutfile.write("".join([' loadsvg(', str(qclusternr), ');\n']))
if clusterblast == "y":
    cblastclusters = [1,2,3,4,5,6,7,8,9,10]
    #One loadcblastsvg() call per (query cluster, hit cluster) pair; hits are numbered from 1
    for qclusternr in geneclusters:
        nrhitclusters = queryclusterdata[qclusternr][0]
        for j in range(nrhitclusters):
            htmloutfile.write("".join([' loadcblastsvg(', str(qclusternr), ',', str(j+1), ');\n']))
#For each gene cluster, add hidden div tags for gene names, add hidden div tags for NRPS/PKS domains, add hidden div tags for ClusterBLAST depictions
htmloutfile.write(htmlparts[1])
#Emit one .hide() call per NRPS/PKS domain div; the counter restarts for every cluster
for qclusternr in geneclusters:
    data = qgeneclusterdata[qclusternr]
    pksnrpsprots = data[7]
    pksnrpsprotsnames = data[8]
    pksnrpsdomains = data[9]
    a = 0
    for i in pksnrpsprots:
        for j in pksnrpsdomains[i][0]:
            htmloutfile.write("".join([' $("#b', str(qclusternr), '_00', str(a), '_div").hide();\n']))
            a = a + 1
htmloutfile.write(htmlparts[2])
#Add top menu
#gifdict maps a cluster type to the number passed to the over2()/out2() handlers
gifdict = {"t1pks":"16","t2pks":"17","t3pks":"18","t4pks":"20","nrps":"10","amglyccycl":"1","bcin":"2","blactam":"3","butyrolactone":"4","ectoine":"5","terpene":"19","indole":"7","lant":"8","melanin":"9","nucleoside":"12","other":"13","phosphoglycolipid":"14","siderophore":"15"}
htmloutfile.write('<img border="0" align="top" src="images/empty.png" name="img0_" />\n')
menubutton_nr = 1
nrclustercolumns = 1
for i in geneclusters:
    #Known type -> its own number; unknown type containing "-" -> "6"; anything else -> "13"
    typenr = gifdict.get(qgeneclusterdata[i][0])
    if typenr is None:
        if "-" in qgeneclusterdata[i][0]:
            typenr = "6"
        else:
            typenr = "13"
    htmloutfile.write('<a href="javascript:displaycluster(' + str(i) + ')"><img align="top" border="0" src="images/img' + str(i) + '_1.png" name="img' + str(i) + '_" onmouseover="over(' + str(i) + '),over2(0,' + typenr + ')" onmouseout="out(' + str(i) + '),out2(0,' + typenr + ')"/></a>\n')
    #Start a new button row after the 22nd and after the 49th button
    if menubutton_nr in (22, 49):
        htmloutfile.write('<br/>')
        nrclustercolumns = nrclustercolumns + 1
    menubutton_nr = menubutton_nr + 1
6770
#Add gene cluster description
#For every query gene cluster this loop writes
# - a stand-alone HTML page listing the cluster's genes and detected signature domains
# - the cluster's section of the main HTML output: description panel, gene pop-ups,
#   predicted-structure side panel, output file links and NRPS/PKS domain pop-ups
#Vertical positions are shifted by 28 px for every row of buttons in the top menu.
htmloutfile.write(htmlparts[3])
extrapixelsdict = {}
for qclusternr in geneclusters:
    #Unpack the per-cluster record
    data = qgeneclusterdata[qclusternr]
    clustertype = data[0]
    clustersize = data[1]
    genes = data[2]
    annotations = data[3]
    starts = data[4]
    ends = data[5]
    strands = data[6]
    pksnrpsprots = data[7]
    pksnrpsprotsnames = data[8]
    pksnrpsdomains = data[9]
    substrspecnrpspredictordict = data[10]
    substrspecminowadict = data[11]
    substrspecpkssigdict = data[12]
    substrspecconsensusdict = data[13]
    gtrs = data[14]
    transporters = data[15]
    colors = data[16]
    hitgeneclusterdata = data[17]
    structpred = data[18]
    krpredictionsdict = data[19]
    relpositions = relativepositions(starts,ends,clustersize)
    rel_starts = relpositions[0]
    rel_ends = relpositions[1]
    #Create genes overview pop-up HTMLs
    genepopupoutfile = open(htmlfolder + "geneclustergenes" + str(qclusternr) + '.html',"w")
    try:
        genepopupoutfile.write('<html>\n<head>\n<LINK href="style.css" rel="stylesheet" type="text/css">\n</head>\n<body>\nOverview of gene cluster genes:<br><br><table border=1>\n')
        genepopupoutfile.write('<tr><td><b>Gene</b></td><td><b>Annotation</b></td><td><b>Start position</b></td><td><b>End position</b></td><td><b>Strand</b></td></tr>\n')
        #starts/ends/strands run parallel to genes, so use the running index instead of
        #genes.index(i), which is quadratic and returns the first hit for a repeated gene name
        for genenr, i in enumerate(genes):
            genepopupoutfile.write('<tr><td>' + i + '</td><td>' + annotations[i].replace("_"," ") + '</td><td>' + str(starts[genenr]) + '</td><td>' + str(ends[genenr]) + '</td><td>' + strands[genenr] + '</td></tr>\n')
        genepopupoutfile.write('\n</table><br><br><br>Biosynthetic gene cluster signature gene domains detected: <br><br>\n')
        genepopupoutfile.write('<table border=1><tr><td><b>Gene</b></td><td><b>Detected domains</b></td><td><b>Bit scores</b></td>\n')
        for i in genes:
            if i in allcoregenes:
                detected_doms = detecteddomainsdict[i]
                for j in detected_doms:
                    genepopupoutfile.write('<tr><td>' + i + '</td><td>' + str(j[0]) + '</td><td>' + str(j[1]) + '</td>\n')
        genepopupoutfile.write('\n</table><br><br><br>')
        genepopupoutfile.write('\n</body>\n</html>\n')
    finally:
        genepopupoutfile.close()
    #Add gene cluster description on top
    #Only the section of cluster 1 is visible initially
    if qclusternr == 1:
        htmloutfile.write('<div id="genecluster'+ str(qclusternr) + '">')
    else:
        htmloutfile.write('\n\n<div id="genecluster'+ str(qclusternr) + '" style="display:none">')
    #Add menu bars 1 & 2
    htmloutfile.write('<div id="bartext1" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(113 + nrclustercolumns * 28) + 'px; left:30px;"><b>Gene cluster description</b></div>')
    htmloutfile.write('<div id="bartext2" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(263 + nrclustercolumns * 28) + 'px; left:30px;"><b>PKS/NRPS domain annotation</b></div>')
    htmloutfile.write('<div id="descrbar1" style="position:absolute; z-index:1; top:' + str(110 + nrclustercolumns * 28) + 'px;"><img src="images/bar.png" height="25" width="' + str(int(0.75 * screenwidth)) + '"/></div>\n')
    htmloutfile.write('<div class="help" id="help1" style="position:absolute; z-index:1; top:' + str(112 + nrclustercolumns * 28) + 'px; left:' + str(int(screenwidth * 0.75) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#panel1" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
    htmloutfile.write('<div id="descrbar2" style="position:absolute; z-index:1; top:' + str(260 + nrclustercolumns * 28) + 'px;"><img src="images/bar.png" height="25" width="' + str(int(0.75 * screenwidth)) + '"/></div>\n')
    htmloutfile.write('<div class="help" id="help2" style="position:absolute; z-index:1; top:' + str(262 + nrclustercolumns * 28) + 'px; left:' + str(int(screenwidth * 0.75) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#panel2" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
    if screenwidth < 1280:
        htmloutfile.write('<div class="clusterdescr" style="font-size:0.7em; position:absolute; top:' + str(125 + nrclustercolumns * 28) + 'px; left:' + str(12) + 'px;">\n')
    else:
        htmloutfile.write('<div class="clusterdescr" style="font-size:0.8em; position:absolute; top:' + str(120 + nrclustercolumns * 28) + 'px; left:' + str(12) + 'px;">\n')
    htmloutfile.write("<br/>Gene Cluster " + str(qclusternr) + ". Type = " + clustertype + ". Location: "+ str(starts[0]) + " - " + str(ends[-1]) + " nt. Click on genes for more information.")
    if len(genomic_accnr) > 4:
        htmloutfile.write('&nbsp;&nbsp;<a href="http://www.ncbi.nlm.nih.gov/nuccore/' + genomic_accnr + '" target="_blank">GBK</a>')
    #Genes overview pop-up.
    if len(clustertype) > 20:
        htmloutfile.write('<br/>')
    htmloutfile.write('&nbsp;&nbsp;&nbsp;&nbsp;<a href="html/geneclustergenes' + str(qclusternr) + '.html" onclick=\'window.open("html/geneclustergenes' + str(qclusternr) + '.html","popup","width=800,height=800,scrollbars=yes,resizable=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Genes and detection info overview</a>')
    htmloutfile.write("</div>\n\n")
    htmloutfile.write('<div id="display' + str(qclusternr) + '">\n')
    #One spacer image per additional row of top-menu buttons
    if nrclustercolumns > 1:
        spacers = nrclustercolumns - 1
        for i in range(spacers):
            htmloutfile.write('<img src="images/spacer.png"/>\n')
    htmloutfile.write('</div>\n')
    #Add gene pop-ups
    a = 0
    for i in genes:
        htmloutfile.write('<div id="a' + str(qclusternr) + '_00' + str(a) + '_div" class="hidden popup" style="position:absolute; z-index:2; top:' + str(185 + nrclustercolumns * 28) + 'px; left:' + str(int(((rel_starts[a] + rel_ends[a])/2)*0.875)) + 'px;">\n')
        htmloutfile.write(annotations[i].replace("_"," ").replace("&","&amp;") + "\n")
        if smcogs == "y":
            if i in smcogdict:
                smcog = smcogdict[i]
                htmloutfile.write("<br/>smCOG: " + smcog + " (" + smcogdescriptions[smcog].replace("_"," ").replace("&","&amp;") + ")\n")
                if smcog in gtrcoglist:
                    #Write a per-gene SEARCHGTr form: template part 0, gene name and sequence, template part 1
                    formfileloc = searchgtrfolder + i + ".html"
                    formfile = open(formfileloc,"w")
                    try:
                        specificformtemplate = formtemplateparts[0].replace("GlycTr",i)
                        formfile.write(specificformtemplate)
                        formfile.write(i + "\n" + seqdict[i])
                        formfile.write(formtemplateparts[1])
                    finally:
                        formfile.close()
                    htmloutfile.write("<br/><a href=\"searchgtr/" + i + ".html\" target=\"_blank\"> Run SEARCHGTr on this gene </a>\n")
                if smcog in transportercoglist:
                    link = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;program=blastp;sequence=sequence%0A" + seqdict[i]
                    htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> TransportDB BLAST on this gene </a>\n")
            else:
                htmloutfile.write("<br/>smCOG: -\n")
        link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp&amp;QUERY=" + seqdict[i] + "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch"
        htmloutfile.write("<br/>Location: " + str(starts[a]) + "-" + str(ends[a]) + "\n")
        htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this gene </a><br/>\n")
        #Genomic context window: 10 kb on either side of the gene, clamped to the sequence
        browse_start = starts[a] - 10000
        browse_end = ends[a] + 10000
        if browse_start < 0:
            browse_start = 0
        if browse_end > dnaseqlength:
            browse_end = dnaseqlength
        if genomic_accnr != "none" and genomic_accnr != "":
            htmloutfile.write('<a href="http://www.ncbi.nlm.nih.gov/projects/sviewer/?Db=gene&amp;DbFrom=protein&amp;Cmd=Link&amp;noslider=1&amp;id=' + genomic_accnr + '&amp;from=' + str(browse_start) + '&amp;to=' + str(browse_end) + '" target=\"_blank\">View genomic context</a><br/>\n')
        if smcogs == "y":
            #The tree may be registered under the gene name without its last ".suffix" or under the full name
            if i.rpartition(".")[0] in smcogtreedict:
                htmloutfile.write('<a href="smcogs/' + smcogtreedict[i.rpartition(".")[0]] + '" onclick=\'window.open("smcogs/' + smcogtreedict[i.rpartition(".")[0]] + '","popup","width=1280,height=1500,resizable=yes,scrollbars=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>View smCOG seed phylogenetic tree with this gene</a>\n')
            elif i in smcogtreedict:
                htmloutfile.write('<a href="smcogs/' + smcogtreedict[i] + '" onclick=\'window.open("smcogs/' + smcogtreedict[i] + '","popup","width=1280,height=1500,resizable=yes,scrollbars=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>View smCOG seed phylogenetic tree with this gene</a>\n')
        htmloutfile.write("</div>\n\n")
        htmloutfile.write('<div id="a' + str(qclusternr) + '_00' + str(a) + '_divtext" class="hidden genenames" style="position:absolute; top:' + str(162 + nrclustercolumns * 28) + 'px; left:' + str(float((rel_starts[a]+rel_ends[a])/2)*0.9375) + 'px;">\n')
        htmloutfile.write(i)
        htmloutfile.write("</div>\n\n")
        a += 1
    #Early calculation of nr of domains to be able to fit structure prediction information of large NRPSs/PKSs
    #NOTE(review): the substring tests used here ("KR", "AT", "CAL") are looser than the
    #"PKS_KR"/"PKS_AT"/"CAL_domain" tests used for the domain pop-ups further down - confirm
    #that both classify every domain name identically
    pksnrpsdomainnr = 0
    krdomainnr = 0
    adomainnr = 0
    for i in pksnrpsprots:
        doms = pksnrpsdomains[i][0]
        first = "no"
        nra = 0
        nrat = 0
        nrkr = 0
        nrcal = 0
        for j in doms:
            if "AMP-binding" in j or "A-OX" in j:
                j = "A"
                nra += 1
                adomainnr += 1
                z = nra
            if "KR" in j:
                j = "KR"
                nrkr += 1
                krdomainnr += 1
                z = nrkr
            if "AT" in j and "docking" not in j:
                j = "AT"
                nrat += 1
                pksnrpsdomainnr += 1
                z = nrat
            if "CAL" in j:
                j = "CAL"
                nrcal += 1
                pksnrpsdomainnr += 1
                z = nrcal
    #Extra height needed when the prediction list is longer than the default panel
    pixels = adomainnr * 50 + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16) + 375
    extrapixels = pixels - (676 + len(pksnrpsprots) * 99)
    if extrapixels < 0:
        extrapixels = 0
    extrapixelsdict[qclusternr] = extrapixels
    #Add picture of predicted chemical structure
    htmloutfile.write('<div id="verticalbar1" style="position:absolute; left:' + str(int(screenwidth * 0.75) + 12) + 'px; top:' + str(106 + nrclustercolumns * 28) + 'px;"><img src="images/linefill.png" height="' + str(1126 + len(pksnrpsprots) * 99 + extrapixels) + '" width="2"/></div>\n')
    htmloutfile.write('<div id="verticalbar2" style="position:absolute; left:' + str(int(screenwidth * 0.98)) + 'px; top:0px;"><img src="images/linefill.png" height="' + str(1288 + len(pksnrpsprots) * 99 + nrclustercolumns * 28 + extrapixels) + '" width="2"/></div>\n')
    htmloutfile.write('<div id="horizbar1" style="position:absolute; left:0px; top:' + str(92 + nrclustercolumns * 28) + 'px;"><img src="images/linefill.png" height="2" width="' + str(screenwidth * 0.98) + '"/></div>\n')
    htmloutfile.write('<div id="horizbar2" style="position:absolute; left:0px; top:82px;"><img src="images/linefill.png" height="2" width="' + str(screenwidth * 0.98) + '"/></div>\n')
    htmloutfile.write('<div id="horizbar3" style="position:absolute; left:0px; top:' + str(1223 + len(pksnrpsprots) * 99 + nrclustercolumns * 28 + extrapixels) + 'px;"><img src="images/linefill.png" height="2" width="' + str(screenwidth * 0.98) + '"/></div>\n')
    if screenwidth < 1280:
        htmloutfile.write('<div id="bartext4" style="color:#FFFFFF; font-size:0.8em; position:absolute; z-index:2; top:' + str(114 + nrclustercolumns * 28) + 'px; left:' + str(int(screenwidth * 0.75) + 30) + 'px;"><b>Predicted core structure</b></div>\n')
    else:
        htmloutfile.write('<div id="bartext4" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(113 + nrclustercolumns * 28) + 'px; left:' + str(int(screenwidth * 0.75) + 30) + 'px;"><b>Predicted core structure</b></div>\n')
    htmloutfile.write('<div class="title" style="position:absolute; top:' + str(110 + nrclustercolumns * 28) + 'px; left:' + str(screenwidth * 0.75 + 20) + 'px;">\n')
    htmloutfile.write('<div id="descrbar4" style="right:25px; position:absolute; z-index:1; top:0px; left:0px;"><img src="images/bar.png" height="25" width="' + str(int(0.21 * screenwidth)) + '"/></div>\n')
    htmloutfile.write('<div class="help" id="help4" style="position:absolute; z-index:1; top:2px; left:' + str(int(screenwidth * 0.2) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#sidepanel1" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
    #Link the structure picture only when generation did not fail and the prediction contains a space;
    #otherwise show the placeholder icon
    if qclusternr not in failedstructures and " " in structpred:
        htmloutfile.write('<br/><br/><a href="structures/genecluster' + str(qclusternr) + '.png" onclick=\'window.open("structures/genecluster' + str(qclusternr) + '.png","popup","width=600,height=300,scrollbars=yes,resizable=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'><img src="structures/genecluster' + str(qclusternr) + '_icon.png" border="1" width="' + str(int(screenwidth * 0.19)) + '" height="200" /></a>\n')
    else:
        htmloutfile.write('<br/><br/><img src="images/nostructure_icon.png" border="1" width="' + str(int(screenwidth * 0.19)) + '" height="200" />\n')
    htmloutfile.write('<div class="clusterdescr" style="font-size:0.8em;">\n')
    htmloutfile.write("Monomers prediction: " + structpred + "<br/>\n")
    if qclusternr in dockingdomainanalysis:
        htmloutfile.write('<a href="html/docking_analysis' + str(qclusternr) + '.html" onclick=\'window.open("html/docking_analysis' + str(qclusternr) + '.html","popup","width=600,height=1200,scrollbars=yes,resizable=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Docking domain analysis results.</a><br/>\n')
    #List the substrate specificity / KR predictions per protein; the protein name is
    #written once, right before the first prediction line of that protein
    nrpsfound = "no"
    pksnrpsdomainnr = 0
    adomainnr = 0
    krdomainnr = 0
    for i in pksnrpsprots:
        doms = pksnrpsdomains[i][0]
        first = "no"
        nra = 0
        nrat = 0
        nrkr = 0
        nrcal = 0
        for j in doms:
            if "AMP-binding" in j or "A-OX" in j:
                j = "A"
                nra += 1
                adomainnr += 1
                z = nra
            if "KR" in j:
                j = "KR"
                nrkr += 1
                krdomainnr += 1
                z = nrkr
            if "AT" in j and "docking" not in j:
                j = "AT"
                nrat += 1
                pksnrpsdomainnr += 1
                z = nrat
            if "CAL" in j:
                j = "CAL"
                nrcal += 1
                pksnrpsdomainnr += 1
                z = nrcal
            prediction = "no"
            #NOTE(review): z is only assigned for A/KR/AT/CAL domains; for any other domain it
            #keeps whatever an earlier assignment left behind - confirm this is intended
            domname = str(i) + "_" + str(j) + str(z)
            if domname in substrspecnrpspredictordict:
                nrpsfound = "yes"
                prediction = "yes"
                if first == "no":
                    first = "yes"
                    htmloutfile.write(i + ':<br/>')
                #A result of "nrp" is displayed as "?"
                if substrspecnrpspredictordict[domname][0] == "nrp":
                    htmloutfile.write('<font size="1">&nbsp;&nbsp;NRPSPredictor code prediction, '+ str(j) + str(z) + ': ?</font><br/>\n')
                else:
                    htmloutfile.write('<font size="1">&nbsp;&nbsp;NRPSPredictor code prediction, '+ str(j) + str(z) + ': ' + substrspecnrpspredictordict[domname][0] + '</font><br/>\n')
                if substrspecnrpspredictordict[domname][1] == "nrp":
                    htmloutfile.write('<font size="1">&nbsp;&nbsp;NRPSPredictor SVM prediction, '+ str(j) + str(z) + ': ?</font><br/>\n')
                else:
                    htmloutfile.write('<font size="1">&nbsp;&nbsp;NRPSPredictor SVM prediction, '+ str(j) + str(z) + ': ' + substrspecnrpspredictordict[domname][1] + '</font><br/>\n')
            if domname in substrspecminowadict:
                prediction = "yes"
                if first == "no":
                    first = "yes"
                    htmloutfile.write(i + ':<br/>')
                #Results of "nrp" or "pk" are displayed as "?"
                if substrspecminowadict[domname] == "nrp" or substrspecminowadict[domname] == "pk":
                    htmloutfile.write('<font size="1">&nbsp;&nbsp;Minowa prediction, '+ str(j) + str(z) + ': ?</font><br/>\n')
                else:
                    htmloutfile.write('<font size="1">&nbsp;&nbsp;Minowa prediction, '+ str(j) + str(z) + ': ' + substrspecminowadict[domname] + '</font><br/>\n')
            if domname in substrspecpkssigdict:
                prediction = "yes"
                if first == "no":
                    first = "yes"
                    htmloutfile.write(i + ':<br/>')
                #A result of "pk" is displayed as "?"
                if substrspecpkssigdict[domname] == "pk":
                    htmloutfile.write('<font size="1">&nbsp;&nbsp;PKS code prediction, '+ str(j) + str(z) + ': ?</font><br/>\n')
                else:
                    htmloutfile.write('<font size="1">&nbsp;&nbsp;PKS code prediction, '+ str(j) + str(z) + ': ' + substrspecpkssigdict[domname] + '</font><br/>\n')
            if domname in krpredictionsdict:
                if first == "no":
                    first = "yes"
                    htmloutfile.write(i + ':<br/>')
                htmloutfile.write('<font size="1">&nbsp;&nbsp;KR activity, '+ str(j) + str(z) + ': ' + krpredictionsdict[domname][0] + "</font><br/>\n")
                htmloutfile.write('<font size="1">&nbsp;&nbsp;KR stereochemistry, '+ str(j) + str(z) + ': ' + krpredictionsdict[domname][1] + "</font><br/>\n")
            #Add link to prediction details pop-up
            if prediction == "yes":
                htmloutfile.write('<font size="1">&nbsp;&nbsp;&nbsp;&nbsp;<a href="substrspecs/' + domname + '.html" onclick=\'window.open("substrspecs/' + domname + '.html","popup","width=500,height=400,scrollbars=yes,resizable=no,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Prediction details</a></font><br/>\n')
    if nrpsfound == "yes":
        htmloutfile.write('<br/><a href="http://bioinfo.lifl.fr/norine/form2.jsp" target="_blank">Perform Norine peptide search</a>')
    htmloutfile.write('</div>')
    #The output files panel is pushed down by the height of the prediction list above it
    sidepaneloffset = adomainnr * 50 + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16)
    if screenwidth < 1280:
        htmloutfile.write('<div id="bartext5" style="color:#FFFFFF; font-size:0.8em; position:absolute; z-index:2; top:' + str(624 + sidepaneloffset) + 'px; left:10px;"><b>File outputs</b></div>\n')
    else:
        htmloutfile.write('<div id="bartext5" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(623 + sidepaneloffset) + 'px; left:10px;"><b>Downloadable output files</b></div>\n')
    htmloutfile.write('<div id="descrbar5" style="right:25px; position:absolute; z-index:1; top:' + str(620 + sidepaneloffset) + 'px; left:0px;"><img src="images/bar.png" height="25" width="' + str(int(0.21 * screenwidth)) + '"/></div>\n')
    htmloutfile.write('<div class="help" id="help5" style="position:absolute; z-index:1; top:' + str(622 + sidepaneloffset) + 'px; left:' + str(int(screenwidth * 0.2) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#sidepanel2" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
    htmloutfile.write('<div class="text" id="outputinfo" style="font-size:0.8em; right:25px; position:absolute; z-index:1; top:' + str(655 + sidepaneloffset) + 'px; left:0px;">')
    if fullhmm == "y" or fullblast == "y":
        htmloutfile.write('<a href="' + oldgenomename + '.final.embl" target="_blank">Open EMBL summary file</a><br/><br/>')
        #htmloutfile.write('<a href="' + genomename + '.final.csv" target="_blank">Download CSV summary file</a><br/><br/>')
    if fullhmm == "y":
        htmloutfile.write('<a href="' + oldgenomename + '.cluster_prediction.png" onclick=\'window.open("' + oldgenomename + '.cluster_prediction.png","popup","width=1024,height=1400,scrollbars=0,resizable=0,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Sec. met. enriched genome regions</a><br/><br/>')
    htmloutfile.write('<a href="' + genomename + '.geneclusters.xls" target="_blank">Open XLS overview table</a><br/><br/>')
    htmloutfile.write('</div>')
    htmloutfile.write("</div>\n\n")
    #Add descriptions of NRPS/PKS genes
    htmloutfile.write('<div class="title" style="position:absolute; top:' + str(180) + 'px; left:' + str(12) + 'px;">\n')
    htmloutfile.write("</div>\n\n")
    #z is the 1-based row of the protein; every row is 84 px high
    z = 1
    for i in pksnrpsprots:
        htmloutfile.write('<div class="text" style="position:absolute; top:' + str(228 + 84 * z + nrclustercolumns * 28) + 'px; left:' + str(12) + 'px;">\n')
        htmloutfile.write(i + " (" + pksnrpsprotsnames[i].lower() + ")")
        htmloutfile.write("</div>\n\n")
        z += 1
    #Add NRPS/PKS domain pop-ups
    #Protein length is derived from the gene coordinates (3 nt per amino acid); the
    #longest protein of the cluster determines the horizontal scale
    longestprot = 0
    protlengthdict = {}
    for i in pksnrpsprots:
        protlength = (geneposdict[i][1] - geneposdict[i][0]) / 3
        protlengthdict[i] = protlength
        if protlength > longestprot:
            longestprot = protlength
    try:
        aa2pixelratio = longestprot * 0.75 / screenwidth
    except ZeroDivisionError:
        #Only a zero screen width can make this division fail; fall back to a fixed ratio
        aa2pixelratio = 0.1
    a = 0
    z = 1
    for i in pksnrpsprots:
        domainsdict = pksnrpsdomains[i][1]
        nra = 0
        nrat = 0
        nrkr = 0
        nrcal = 0
        for j in pksnrpsdomains[i][0]:
            startpos = domainsdict[j][0]
            endpos = domainsdict[j][1]
            htmloutfile.write('<div id="b' + str(qclusternr) + '_00' + str(a) + '_div" class="hidden popup" style="position:absolute; z-index:2; top:' + str(277 + 84 * z + nrclustercolumns * 28) + 'px; left:' + str( ( ( (endpos+startpos) / 2) / aa2pixelratio) * 0.9375 ) + 'px;">\n')
            htmloutfile.write("Domain " + j + " (" + i + ")")
            link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp&amp;QUERY=" + seqdict[i][startpos:endpos] + "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch"
            htmloutfile.write("<br/>Location: " + str(startpos) + "-" + str(endpos) + " AA\n")
            #Translate the domain name into the short name + per-protein counter used as
            #key of the prediction dictionaries
            if "AMP-binding" in j or "A-OX" in j:
                j = "A"
                nra += 1
                y = nra
            if "PKS_KR" in j:
                j = "KR"
                nrkr += 1
                y = nrkr
            if "PKS_AT" in j:
                j = "AT"
                nrat += 1
                y = nrat
            if "CAL_domain" in j:
                j = "CAL"
                nrcal += 1
                y = nrcal
            prediction = "no"
            #NOTE(review): y is only assigned for the four domain types above; for other
            #domains it keeps an earlier value - confirm this is intended
            domid = str(i) + "_" + str(j) + str(y)
            if domid in substrspecnrpspredictordict or domid in substrspecminowadict or domid in substrspecpkssigdict:
                htmloutfile.write("<br/>Predicted substrate: " + substrspecconsensusdict[domid] + "\n")
                if domid in substrspecnrpspredictordict:
                    htmloutfile.write("<br/>-NRPSPredictor code: " + substrspecnrpspredictordict[domid][0] + "\n")
                    htmloutfile.write("<br/>-NRPSPredictor SVM: " + substrspecnrpspredictordict[domid][1] + "\n")
                if domid in substrspecminowadict:
                    htmloutfile.write("<br/>-Minowa HMM: " + substrspecminowadict[domid] + "\n")
                if domid in substrspecpkssigdict:
                    htmloutfile.write("<br/>-PKS code: " + substrspecpkssigdict[domid] + "\n")
            if domid in krpredictionsdict:
                htmloutfile.write("<br/>KR activity: " + krpredictionsdict[domid][0] + "\n")
                htmloutfile.write("<br/>KR stereochemistry: " + krpredictionsdict[domid][1] + "\n")
            htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this domain </a>\n")
            htmloutfile.write("</div>\n\n")
            a += 1
        z += 1
    #Close the genecluster div of this cluster
    htmloutfile.write('</div>\n')
7129
7130 if clusterblast == "y":
7131 #Write ClusterBlast divs with pictures and description pop-up tags
7132 htmloutfile.write('<div id="clusterblastview" class="clusterdescr">\n\n')
7133 #Add menu bar 3
7134 htmloutfile.write('<div id="bartext3" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:3px; left:20px;"><b>Homologous gene clusters</b></div>')
7135 htmloutfile.write('<div id="descrbar3" style="position:absolute; z-index:1; top:0px;"><img src="images/bar.png" height="25" width="' + str(int(0.75*screenwidth)) + '"/></div>')
7136 htmloutfile.write('<div class="help" id="help3" style="position:absolute; z-index:1; top:2px; left:' + str(int(screenwidth * 0.75) - 30) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#panel3" target="_blank"><img border="0" src="images/help.png"/></a></div>')
7137 for qclusternr in geneclusters:
7138 nrhitclusters = queryclusterdata[qclusternr][0]
7139 hitclusterdata = queryclusterdata[qclusternr][1]
7140 if qclusternr == 1:
7141 htmloutfile.write('<div id="qcluster' + str(qclusternr) + '">\n<br/><br/>\n<div align="left">\n<form name="clusterform' + str(qclusternr) + '">\n<select name="selection' + str(qclusternr) + '" onchange="javascript:navigate(this);">\n')
7142 else:
7143 htmloutfile.write('<div id="qcluster' + str(qclusternr) + '" style="display:none">\n<br/><br/>\n<div align="left">\n<form name="clusterform' + str(qclusternr) + '">\n<select name="selection' + str(qclusternr) + '" onchange="javascript:navigate(this);">\n')
7144 htmloutfile.write('<option value="">Select gene cluster alignment</option>\n')
7145 for i in range(nrhitclusters):
7146 htmloutfile.write('<option value="javascript:displaycblastresults(' + str(qclusternr) + ',' + str(i+1) + ')">' + hitclusterdata[i+1][5][i].replace("&","&amp;") + '</option>\n')
7147 htmloutfile.write('</select>\n</form>\n\n</div>')
7148 htmloutfile.write('<div style="position:absolute; top:33px; left:' + str(screenwidth*0.625) + 'px;"><img src="images/button.gif" name="button' + str(qclusternr) + '" onclick="javascript:displaybutton(' + str(qclusternr) + ');"/></div>')
7149 clustersizes = []
7150 for i in range(nrhitclusters):
7151 hitclusterdata = queryclusterdata[qclusternr][1]
7152 queryclustergenes = hitclusterdata[1][3]
7153 queryclustergenesdetails = hitclusterdata[1][4]
7154 hitclusternumber = i + 1
7155 cluster_acc = hitclusterdata[hitclusternumber][6]
7156 hitclustergenes = hitclusterdata[hitclusternumber][1]
7157 hitclustergenesdetails = hitclusterdata[hitclusternumber][2]
7158 relpositiondata = clusterblastpositiondata[str(qclusternr) + "_" + str(i+1)]
7159 qrel_starts = relpositiondata[0][0]
7160 qrel_ends = relpositiondata[0][1]
7161 hrel_starts = relpositiondata[1][hitclusternumber ][0]
7162 hrel_ends = relpositiondata[1][hitclusternumber ][1]
7163 strandsbalance = relpositiondata[2][hitclusternumber]
7164 if strandsbalance < 0:
7165 hitclustergenes.reverse()
7166 if qclusternr == 1 and (i+1) == 1:
7167 htmloutfile.write('<div id="hitcluster' + str(qclusternr) + '_' + str(i+1) + '">\n')
7168 else:
7169 htmloutfile.write('<div id="hitcluster' + str(qclusternr) + '_' + str(i+1) + '" style="display:none">\n')
7170 #Insert gene cluster descriptions
7171 cdescription = hitclusterdata[i+1][5][i].replace("&","&amp;").replace("\t"," ").partition(" ")[2].partition(" ")[2].split(", whole")[0].split(", complete")[0]
7172 if len(nucname) < 80:
7173 qdescription = nucname
7174 else:
7175 qdescription = nucname[0:77] + "..."
7176 htmloutfile.write('<div id="descriptionquery" style="text-align:right; position:absolute; top:70px; right:50px; font-size:10px; font-style:italic">' + qdescription + '</div>\n')
7177 htmloutfile.write('<div id="description' + str(qclusternr) + '" style="text-align:right; position:absolute; top:137px; right:50px; font-size:10px; font-style:italic">' + cdescription + '</div>\n')
7178 #Insert pubmed/pubchem links
7179 htmloutfile.write('<div id="pub_pics" style="position:absolute; top:60px; left:' + str(int(screenwidth * 0.0)) + 'px; font-size:10px"> Hit cluster cross-links: \n')
7180 htmloutfile.write('&nbsp;&nbsp;<a href="http://www.ncbi.nlm.nih.gov/nuccore/' + cluster_acc.split(".")[0] + '" target="_blank"><img align="bottom" border="0" src="images/genbank.gif"/></a>\n')
7181 present = "n"
7182 for j in pubmed_dict.keys():
7183 if j in cluster_acc:
7184 present = "y"
7185 for j in pubchem_dict.keys():
7186 if j in cluster_acc:
7187 present = "y"
7188 if present == "y":
7189 for j in pubmed_dict.keys():
7190 if j in cluster_acc:
7191 pubmedstring = pubmed_dict[j]
7192 htmloutfile.write('&nbsp;&nbsp;<a href="http://www.ncbi.nlm.nih.gov/pubmed/' + pubmedstring + '" target="_blank"><img align="bottom" border="0" src="images/pubmed.gif"/></a>\n')
7193 for j in pubchem_dict.keys():
7194 if j in cluster_acc:
7195 pubchemstring = pubchem_dict[j]
7196 if "," in pubchemstring:
7197 htmloutfile.write('&nbsp;&nbsp;<a href="http://www.ncbi.nlm.nih.gov/sites/entrez?db=pccompound&amp;term=' + pubchemstring + '" target="_blank"><img align="bottom" border="0" src="images/struct.gif"/></a>\n')
7198 else:
7199 htmloutfile.write('&nbsp;&nbsp;<a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=' + pubchemstring + '" target="_blank"><img align="bottom" border="0" src="images/struct.gif"/></a>\n')
7200 htmloutfile.write('</div>\n\n')
# Create gene pop-ups: every gene gets a hidden pop-up <div> (description,
# location and BlastP link) followed by a hidden gene-name label <div>.
popup_id_tail = str(qclusternr) + "_" + str(hitclusternumber) + "_"
blast_link_head = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp&amp;QUERY="
blast_link_tail = "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch"

# Genes of the query cluster (element ids start with "q").
a = 0
for j in queryclustergenes:
    j_accession = accessiondict[j]
    gene_details = queryclustergenesdetails[j]
    popup_left = int(float(qrel_starts[a]) * 0.875)
    # The label is placed at the midpoint between the gene's start and end.
    label_left = int(float((float(qrel_starts[a]) + float(qrel_ends[a])) / 2) * 0.9375)
    link = blast_link_head + j_accession + blast_link_tail
    htmloutfile.write("".join([
        '<div id="q', popup_id_tail, str(a), '_div" class="hidden popup" style="position:absolute; top:113px; left:', str(popup_left), 'px;">\n',
        gene_details[3].replace("_", " ").replace("&", "&amp;"), "\n",
        "<br/>Location: ", str(gene_details[0]), "-", str(gene_details[1]), "\n",
        "<br/><a href=\"", link, "\" target=\"_blank\"> NCBI BlastP on this gene </a>\n",
        "</div>\n\n",
    ]))
    htmloutfile.write("".join([
        '<div id="q', popup_id_tail, str(a), '_divtext" class="hidden genenames" style="position:absolute; top:83px; left:', str(label_left), 'px;">\n',
        j,
        "</div>\n\n",
    ]))
    a += 1

# Genes of the hit cluster (element ids start with "h"); the accession is
# taken from field 4 of the gene's detail record.
a = 0
for j in hitclustergenes:
    gene_details = hitclustergenesdetails[j]
    j_accession = gene_details[4]
    popup_left = int(float(hrel_starts[a]) * 0.875)
    label_left = int(float((float(hrel_starts[a]) + float(hrel_ends[a])) / 2) * 0.9375)
    link = blast_link_head + j_accession + blast_link_tail
    htmloutfile.write("".join([
        '<div id="h', popup_id_tail, str(a), '_div" class="hidden popup" style="position:absolute; top:183px; left:', str(popup_left), 'px;">\n',
        gene_details[3].replace("_", " ").replace("&", "&amp;"), "\n",
        "<br/>Location: ", str(gene_details[0]), "-", str(gene_details[1]), "\n",
        "<br/><a href=\"", link, "\" target=\"_blank\"> NCBI BlastP on this gene </a>\n",
        "</div>\n\n",
    ]))
    htmloutfile.write("".join([
        '<div id="h', popup_id_tail, str(a), '_divtext" class="hidden genenames" style="position:absolute; top:153px; left:', str(label_left), 'px;">\n',
        j,
        "</div>\n\n",
    ]))
    a += 1
htmloutfile.write('</div>\n')
# Find new relative positions for display of all gene clusters in one picture.
# The "<qclusternr>_all" entry of clusterblastpositiondata is read as:
#   [0]    -> (starts, ends) of the query cluster genes
#   [1][n] -> (starts, ends) of the genes of hit cluster n
#   [2][n] -> strands balance of hit cluster n
relpositiondata = clusterblastpositiondata[str(qclusternr) + "_all"]
qrel_starts = relpositiondata[0][0]
qrel_ends = relpositiondata[0][1]
htmloutfile.write('<div id="hitcluster' + str(qclusternr) + '_all" style="display:none">\n')
# Names of 80 characters or more are cut to 77 characters plus "...".
if len(nucname) < 80:
    qdescription = nucname
else:
    qdescription = nucname[0:77] + "..."
htmloutfile.write('<div id="descriptionquery" style="text-align:right; position:absolute; top:60px; right:50px; font-size:10px; font-style:italic">' + qdescription + '</div>\n')
blast_link_head = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp&amp;QUERY="
blast_link_tail = "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch"
for i in range(nrhitclusters):
    hitclusterdata = queryclusterdata[qclusternr][1]
    queryclustergenes = hitclusterdata[1][3]
    queryclustergenesdetails = hitclusterdata[1][4]
    hitclusternumber = i + 1
    hrel_starts = relpositiondata[1][hitclusternumber][0]
    hrel_ends = relpositiondata[1][hitclusternumber][1]
    cluster_acc = hitclusterdata[hitclusternumber][6]
    hitclustergenes = hitclusterdata[hitclusternumber][1]
    hitclustergenesdetails = hitclusterdata[hitclusternumber][2]
    strandsbalance = relpositiondata[2][hitclusternumber]
    # Escape "&", drop the first two space-separated words and cut the text
    # at ", whole" / ", complete".
    cdescription = hitclusterdata[i+1][5][i].replace("&","&amp;").replace("\t"," ").partition(" ")[2].partition(" ")[2].split(", whole")[0].split(", complete")[0]
    # Each hit cluster row sits 57px below the previous one.
    htmloutfile.write('<div id="description' + str(qclusternr) + '" style="text-align:right; position:absolute; top:' + str(60 + (57 * hitclusternumber)) + 'px; right:50px; font-size:10px; font-style:italic">' + cdescription + '</div>\n')
    if hitclusternumber == 1:
        # The query cluster row is written once only, together with the
        # first hit cluster.
        for a, j in enumerate(queryclustergenes):
            # Use the protein accession for the BlastP query, exactly as the
            # per-hit-cluster pop-ups do, instead of the bare gene name.
            j_accession = accessiondict[j]
            gene_details = queryclustergenesdetails[j]
            popup_id = "all_" + str(qclusternr) + "_0_" + str(a)
            htmloutfile.write('<div id="' + popup_id + '_div" class="hidden popup" style="position:absolute; top:' + str(100) + 'px; left:' + str(int(float(qrel_starts[a])*0.875)) + 'px; z-index:2;">\n')
            htmloutfile.write(gene_details[3].replace("_"," ").replace("&","&amp;") + "\n")
            link = blast_link_head + j_accession + blast_link_tail
            htmloutfile.write("<br/>Location: " + str(gene_details[0]) + "-" + str(gene_details[1]) + "\n")
            htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this gene </a>\n")
            htmloutfile.write("</div>\n\n")
            htmloutfile.write('<div id="' + popup_id + '_divtext" class="hidden genenames" style="position:absolute; top:' + str(75) + 'px; left:' + str(int(float((float(qrel_starts[a])+float(qrel_ends[a]))/2)*0.9375)) + 'px;">\n')
            htmloutfile.write(j)
            htmloutfile.write("</div>\n\n")
    for a, j in enumerate(hitclustergenes):
        gene_details = hitclustergenesdetails[j]
        # Field 4 of the detail record is the accession used for hit genes
        # in the per-hit-cluster pop-ups; reuse it here for consistency.
        j_accession = gene_details[4]
        popup_id = "all_" + str(qclusternr) + "_" + str(hitclusternumber) + "_" + str(a)
        htmloutfile.write('<div id="' + popup_id + '_div" class="hidden popup" style="position:absolute; top:' + str(100 + 57 * hitclusternumber) + 'px; left:' + str(int(float(hrel_starts[a])*0.875)) + 'px; z-index:2;">\n')
        htmloutfile.write(gene_details[3].replace("_"," ").replace("&","&amp;") + "\n")
        link = blast_link_head + j_accession + blast_link_tail
        htmloutfile.write("<br/>Location: " + str(gene_details[0]) + "-" + str(gene_details[1]) + "\n")
        htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this gene </a>\n")
        htmloutfile.write("</div>\n\n")
        htmloutfile.write('<div id="' + popup_id + '_divtext" class="hidden genenames" style="position:absolute; top:' + str(75 + 56.75 * hitclusternumber) + 'px; left:' + str(int(float((float(hrel_starts[a])+float(hrel_ends[a]))/2)*0.9375)) + 'px;">\n')
        htmloutfile.write(j)
        htmloutfile.write("</div>\n\n")
# Close the "hitcluster<N>_all" container, then the enclosing container.
htmloutfile.write('</div>\n')
htmloutfile.write('</div>\n\n')
# With ClusterBlast enabled one more container <div> is still open; close it.
if clusterblast == "y":
    htmloutfile.write('</div>\n')

# Write one credits banner per gene cluster. Only the banner of cluster 1 is
# visible initially; the others start out with display:none.
for i in geneclusters:
    data = qgeneclusterdata[i]
    extrapixels = extrapixelsdict[i]
    pksnrpsprots = data[7]
    if i == 1:
        banner_visibility = ''
    else:
        banner_visibility = 'display:none; '
    # The vertical position grows with the number of PKS/NRPS proteins, the
    # number of cluster columns and the per-cluster extra pixels.
    banner_top = 1242 + int(len(pksnrpsprots) * 99) + nrclustercolumns * 28 + extrapixels
    banner_width = int(0.98 * screenwidth)
    htmloutfile.write('<div id="creditsbar' + str(i) + '" class="banner" style="' + banner_visibility + 'position:absolute; width:' + str(banner_width) + "px; align:'left'; height:75; top:" + str(banner_top) + 'px; left:0px; color:#810E15; z-index:-1;">')
    htmloutfile.write(
        '<div style="float:center; font-size:0.9em;">\n'
        '<div style="position:absolute; top:0px; left:30px;">\n'
        '<img src="images/ruglogo.gif" border="0"/>&nbsp;&nbsp;&nbsp;&nbsp;\n'
        '<img src="images/gbblogo.gif" border="0"/>&nbsp;&nbsp;&nbsp;&nbsp;\n'
        '<img src="images/tueblogo.gif" border="0"/>&nbsp;&nbsp;&nbsp;&nbsp;\n'
        '<img src="images/ucsflogo.gif" border="0"/>&nbsp;&nbsp;&nbsp;&nbsp;\n'
        '</div>\n'
        '<div style="position:absolute; top:0px; left:600px;">\n'
        'antiSMASH: Rapid identification, annotation and analysis of secondary metabolite biosynthesis gene clusters.\n'
        '<br/>Marnix H. Medema, Kai Blin, Peter Cimermancic, Victor de Jager, Piotr Zakrzewski, Michael A. Fischbach, Tilmann Weber, Rainer Breitling &amp; Eriko Takano\n'
        '<br/><i>Nucleic Acids Research</i> (2011), proposal submitted.\n'
        '</div>\n'
        '</div>\n'
        '</div>'
    )
# Add final part of HTML file
htmloutfile.write(htmlparts[-1])

# Copy accessory files for HTML viewing.
# Windows needs three separate "copy" calls; every other platform is handled
# with a single recursive "cp". Using else (rather than testing for 'linux2'
# only) keeps the copy commands defined on other platforms such as 'darwin',
# where they would otherwise be unassigned when os.system() is reached.
if sys.platform == 'win32':
    copycommand1 = "copy/y vis\\* " + genomename + " > nul"
    copycommand2 = "copy/y vis\\html\\* " + genomename + "\\html > nul"
    copycommand3 = "copy/y vis\\images\\* " + genomename + "\\images > nul"
else:
    copycommand1 = "cp -r vis/* " + genomename + " > /dev/null"
    # "true" is a shell no-op; it keeps the three-command layout uniform.
    copycommand2 = "true"
    copycommand3 = "true"
# NOTE(review): genomename is pasted into a shell command unquoted; names
# containing spaces or shell metacharacters will break these calls.
os.system(copycommand1)
os.system(copycommand2)
os.system(copycommand3)
7304
# Generate EMBL output.
# Writes <genomename>/embl_lines.txt in three parts separated by ">>" lines:
#   1. one tab-separated annotation line per gene in geneclustergenes,
#   2. misc_feature entries for every domain and motif of those genes,
#   3. one misc_feature entry per gene cluster.
emblfile = open(genomename + "/embl_lines.txt","w")

# Part 1: per-gene annotation lines.
# NOTE(review): prediction entries are matched with "i in k", so when the
# keys are strings a gene name that is a substring of another gene's key
# also matches -- confirm the key format.
for i in geneclustergenes:
    emblfile.write(i + "\t")
    if smcogs == "y":
        if i in smcogdict:
            emblfile.write("smCOG: " + smcogdict[i] + ":" + smcogdescriptions[smcogdict[i]] + "\t")
    if i in nrpspkstypedict:
        emblfile.write("NRPS/PKS type: " + nrpspkstypedict[i] + "\t")
    if i in domaindict:
        domains = domaindict[i]
        for j in domains:
            emblfile.write(j[0] + " (" + str(j[1]) + "-" + str(j[2]) + "); E-value:" + str(j[3]) + "; Bit score: " + str(j[4]) + "\t")
    # Each prediction table gets its own counter, restarted for every gene.
    nrat = 0
    for k in minowa_pks_preds.keys():
        if i in k:
            nrat += 1
            emblfile.write("AT-domain " + str(nrat) + " Minowa substrate specificity prediction: " + minowa_pks_preds[k] + "\t")
    nrat = 0
    for k in pks_code_preds.keys():
        if i in k:
            nrat += 1
            emblfile.write("AT-domain " + str(nrat) + " PKS code substrate specificity prediction: " + pks_code_preds[k] + "\t")
    nrcal = 0
    for k in minowa_cal_preds.keys():
        if i in k:
            nrcal += 1
            emblfile.write("CAL-domain " + str(nrcal) + " Minowa substrate specificity prediction: " + minowa_cal_preds[k] + "\t")
    nra = 0
    for k in minowa_nrps_preds.keys():
        if i in k:
            nra += 1
            emblfile.write("A-domain " + str(nra) + " Minowa substrate specificity prediction: " + minowa_nrps_preds[k] + "\t")
    nra = 0
    for k in nrps_code_preds.keys():
        if i in k:
            nra += 1
            emblfile.write("A-domain " + str(nra) + " Stachelhaus code substrate specificity prediction: " + nrps_code_preds[k] + "\t")
    nra = 0
    for k in nrps_svm_preds.keys():
        if i in k:
            nra += 1
            emblfile.write("A-domain " + str(nra) + " NRPSPredictor2 SVM substrate specificity prediction: " + nrps_svm_preds[k] + "\t")
    nrkr = 0
    for k in kr_activity_preds.keys():
        if i in k:
            nrkr += 1
            # KR domains are numbered with their own counter (nrkr), not
            # with the AT-domain counter left over from the loops above.
            emblfile.write("KR-domain " + str(nrkr) + " activity prediction: " + kr_activity_preds[k] + "\t")
            emblfile.write("KR-domain " + str(nrkr) + " predicted stereochemistry group: " + kr_stereo_preds[k] + "\t")
    if i in motifdict:
        l = motifdict[i]
        for m in l:
            emblfile.write("Motif " + str(m[0]) + " (" + str(m[1]) + "-" + str(m[2]) + "). E-value: " + str(m[3]) + "; Bit score: " + str(m[4]) + "\t")
    emblfile.write("\n")
emblfile.write("\n\n>>\n\n")

# Part 2: separate domain and motif entries.
# Domain/motif offsets (fields 1 and 2) are multiplied by 3 and added to the
# gene start on the "+" strand, or subtracted from the gene end on "-".
for i in geneclustergenes:
    strand = strandsdict[i]
    startpos = geneposdict[i][0]
    endpos = geneposdict[i][1]
    if i in domaindict:
        domains = domaindict[i]
        for j in domains:
            if strand == "+":
                emblfile.write("misc_feature\t" + str(startpos + j[1] * 3) + ".." + str(startpos + j[2] * 3) + "\t" + str(j[0]) + " domain;\tE-value: " + str(j[3]) + "\tBit score: " + str(j[4]) + "\t/colour=2\n")
            elif strand == "-":
                # Same field layout as the "+" strand line: a space before
                # "domain" and a tab before "Bit score".
                emblfile.write("misc_feature\tcomplement(" + str(endpos - j[2] * 3) + ".." + str(endpos - j[1] * 3) + ")\t" + str(j[0]) + " domain;\tE-value: " + str(j[3]) + "\tBit score: " + str(j[4]) + "\t/colour=2\n")
    if i in motifdict:
        l = motifdict[i]
        for m in l:
            if strand == "+":
                emblfile.write("misc_feature\t" + str(startpos + m[1] * 3) + ".." + str(startpos + m[2] * 3) + "\t" + str(m[0]) + " motif;\tE-value: " + str(m[3]) + "\tBit score: " + str(m[4]) + "\t/colour=6\n")
            elif strand == "-":
                emblfile.write("misc_feature\tcomplement(" + str(endpos - m[2] * 3) + ".." + str(endpos - m[1] * 3) + ")\t" + str(m[0]) + " motif;\tE-value: " + str(m[3]) + "\tBit score: " + str(m[4]) + "\t/colour=6\n")
emblfile.write("\n\n>>\n\n")

# Part 3: one feature per gene cluster.
for i in geneclusters:
    cstart = clusterinfo[i][1]
    # A cluster start of 0 is written as 1.
    if cstart == 0:
        cstart = 1
    cend = clusterinfo[i][2]
    emblfile.write("misc_feature\t" + str(cstart) + ".." + str(cend) + "\t" + clusterinfo[i][0] + " gene cluster\t/colour=13\n")
emblfile.close()
7387
# Close open html file
htmloutfile.close()

# Run whole-genome BLAST / HMM CLUSEAN modules & ClusterFinder.
# Windows gets its own commands; every other platform uses the POSIX ones.
# Branching with else (instead of testing for 'linux2' only) makes sure the
# input copy and the pipeline run also happen on platforms such as 'darwin'.
running_on_windows = sys.platform == 'win32'

# Copy the input file into the output directory, then work from inside it.
# NOTE(review): infile and genomename are pasted into the shell command
# unquoted; paths containing spaces will break the copy.
if running_on_windows:
    copycommand = "copy " + infile + " " + genomename + ' > nul'
else:
    copycommand = "cp " + infile + " " + genomename
os.system(copycommand)
os.chdir(genomename)

# Build the runPipeline.py argument string from the fullblast/fullhmm flags.
args = "--cpus %s " % nrcpus
if fullblast == "n":
    args += "--without-blast "
if fullhmm == "n":
    args += "--without-hmmer "
if fullhmm == "y":
    args += '--pfamdbpath %s ' % pfamdbpath
if fullblast == "y":
    args += '--blastdbpath %s ' % blastdbpath
logfile.write("Running CLUSEAN pipeline modules.\n")
if running_on_windows:
    os.system("python ..\\clusean\\scripts\\runPipeline.py %s" % args)
else:
    os.system(antismash_path + "clusean/scripts/runPipeline.py %s" % args)

# Return to the directory the script was started from.
os.chdir('..')

# Close log file
logfile.write("antiSMASH successfully finished in " + str(elapsed) + " seconds.\n")
logfile.close()