antismash: antismash.py comparison

comparison antismash.py @ 0:6a37d0a4510a default tip

initial uploaded

author	bjoern-gruening
date	Thu, 15 Mar 2012 05:23:03 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:6a37d0a4510a
+#!/usr/bin/env python
+## Copyright (c) 2010 Marnix H. Medema
+## University of Groningen
+## Department of Microbial Physiology / Groningen Bioinformatics Centre
+## License: GNU General Public License v3 or later
+## A copy of GNU GPL v3 should have been included in this software package in LICENSE.txt.
+##Functions necessary for this script
+import linecache, cPickle
+DEBUG = True
def invalidoptions(argument):
    """Report an invalid command-line option and terminate the program.

    A non-empty ``argument`` is echoed on stderr.  A usage hint is then
    printed on stdout, the problem is recorded in the module-level
    ``logfile`` (which is closed afterwards) and the process exits with
    status 1.
    """
    if len(argument) != 0:
        sys.stderr.write("Invalid options input:\n")
        sys.stderr.write(str(argument) + "\n")
    sys.stdout.write("From the command line, input antismash --help for more information.\n")
    log_entry = "Invalid options input: " + argument + "\n"
    logfile.write(log_entry)
    logfile.close()
    sys.exit(1)
def sortdictkeysbyvalues(dict):
    """Return the keys of ``dict`` ordered by ascending value.

    Entries are ordered as (value, key) pairs, so keys decide the order
    between equal values.
    """
    ranked = sorted((val, name) for name, val in dict.items())
    return [name for _, name in ranked]
def sortdictkeysbyvaluesrev(dict):
    """Return the keys of ``dict`` ordered by descending value.

    Entries are ordered as (value, key) pairs and then reversed, so for
    equal values the larger key comes first.
    """
    ranked = sorted((val, name) for name, val in dict.items())
    return [name for _, name in reversed(ranked)]
def sortdictkeysbyvaluesrevv(dict):
    """Return the values of ``dict`` in descending order.

    Sorting is done on (value, key) pairs, exactly like the key-returning
    sibling functions; only the value of each pair is returned.
    """
    ranked = sorted((val, name) for name, val in dict.items())
    return [val for val, _ in reversed(ranked)]
def get_sequence(fasta):
    """Read a single-record FASTA file and return its sequence as one string.

    Leading blank lines are skipped, the first remaining line is taken as
    the header and every following line is right-stripped and concatenated.

    fasta -- path of the FASTA file to read.

    Returns the concatenated sequence (the header is validated, not returned).

    If the header does not contain ">", if the sequence part contains ">"
    (i.e. more than one record), or if the file holds no header at all, an
    error is written to stderr and to the module-level ``logfile``, the
    logfile is closed and the process exits with status 1.
    """
    # The context manager closes the handle even if reading fails.
    with open(fasta, 'r') as handle:
        content = handle.readlines()
    # Skip leading blank lines; an empty or all-blank file yields header == ""
    # and therefore takes the regular "not properly formatted" path below
    # instead of failing with an IndexError.
    first = 0
    while first < len(content) and content[first].strip() == "":
        first += 1
    header = content[first] if first < len(content) else ""
    seq = "".join(line.rstrip() for line in content[first + 1:])
    if ">" not in header or ">" in seq:
        sys.stderr.write("FASTA file not properly formatted; should be single sequence starting with '>' and sequence name.\n")
        logfile.write("FASTA file not properly formatted; should started with '>' and sequence name on first line.\n")
        logfile.close()
        sys.exit(1)
    return seq
def complement(seq):
    """Return the base-wise complement of ``seq`` as a list of characters.

    a/c/g/t/n are complemented with their case preserved; every other
    symbol becomes a lower-case 'n'.  The order of the bases is kept.
    """
    pairs = dict(zip("acgtnACGTN", "tgcanTGCAN"))
    return [pairs.get(base, 'n') for base in seq]
def reverse_complement(seq):
    """Return the reverse complement of ``seq`` as a string.

    The bases are reversed first and then passed to ``complement``; the
    resulting characters are joined into a single string.
    """
    flipped = list(seq)[::-1]
    return "".join([str(base) for base in complement(flipped)])
def fastaseqlengths(proteins):
    """Map every sequence name to the length of its sequence.

    proteins -- indexable whose item 0 is the list of names and whose
                item 1 is the list of sequences, in matching order.

    Returns a dict {name: len(sequence)}.
    """
    names, seqs = proteins[0], proteins[1]
    # Index into seqs (rather than zip) so that a shorter sequence list
    # still raises IndexError.
    return dict((name, len(seqs[pos])) for pos, name in enumerate(names))
# Function that reads the fasta file into a dictionary
def fastadict(fasta):
    """Read a (multi-)FASTA file into a dict {name: sequence}.

    Carriage returns are treated as line breaks and every space is replaced
    by "_" before the text is split on whitespace, so a header containing
    spaces stays one token.  A token starting with ">" opens a new record
    whose name is the 67 characters following the ">"; all other tokens are
    appended to the sequence of the current record.

    fasta -- path of the FASTA file.

    Raises ValueError if sequence data appears before the first header.
    """
    # Context manager: the handle is closed even if reading fails.
    with open(fasta, "r") as handle:
        filetext = handle.read()
    tokens = filetext.replace("\r", "\n").strip().replace(' ', '_').split()
    dictseq = {}
    name = None
    for token in tokens:
        if token[0] == ">":
            name = token[1:68]
            # A repeated header restarts the record.
            dictseq[name] = ""
        elif name is None:
            raise ValueError("FASTA file %s contains sequence data before the first '>' header" % fasta)
        else:
            dictseq[name] += token
    return dictseq
# Function that extracts all sequence names from the fasta dictionary
def lnames(fastadict):
    """Return the names (keys) of a FASTA dictionary in sorted order.

    The (name, sequence) pairs are sorted, so the result lines up
    index-by-index with the list returned by ``lseqs``.
    """
    return [name for name, _ in sorted(fastadict.items())]
# Function that extracts all sequences from the fasta dictionary
def lseqs(fastadict):
    """Return the sequences (values) of a FASTA dictionary, ordered by name.

    The (name, sequence) pairs are sorted, so the result lines up
    index-by-index with the list returned by ``lnames``.
    """
    return [sequence for _, sequence in sorted(fastadict.items())]
def extractpositions(refmusclefile,newmusclefile,positions,refsequencename,querysequencename):
    """Extract the query residues aligned to given reference positions.

    refmusclefile     -- kept for interface compatibility; its contents
                         never influenced the result and it is no longer
                         read.
    newmusclefile     -- aligned FASTA file (read with ``fastadict``)
                         containing both the reference and the query.
    positions         -- 0-based positions counted on the ungapped
                         reference sequence.
    refsequencename   -- name of the reference record in newmusclefile.
    querysequencename -- name of the query record in newmusclefile.

    Returns the list of query characters found in the alignment columns in
    which the reference carries one of the requested positions.

    Raises ValueError if either sequence name is missing from the alignment.
    """
    muscle_dict = fastadict(newmusclefile)
    for wanted_name in (refsequencename, querysequencename):
        if wanted_name not in muscle_dict:
            raise ValueError("%r is not in the alignment %s" % (wanted_name, newmusclefile))
    refseq = muscle_dict[refsequencename]
    query_seq = muscle_dict[querysequencename]
    wanted = set(positions)
    residues = []
    ungapped = 0  # index on the reference with gap columns left out
    # Single pass over the alignment columns instead of repeatedly slicing
    # the first character off the reference string.
    for column, symbol in enumerate(refseq):
        if symbol == "-":
            continue
        if ungapped in wanted:
            residues.append(query_seq[column])
        ungapped += 1
    return residues
+def parsegenes(genes):
# Parse the CDS feature chunks of a GenBank/EMBL record into gene data.
#
# genes: sequence of strings. Each string is scanned for a location on its
#   first line(s) and for the codon_start=, protein_id=, gene=, locus_tag=,
#   translation= and product= qualifiers.
# Returns [genelist, genedict, joinlist, joindict, accessiondict] where
#   genedict[name]      = [start, end, strand, annotation, sequence]
#   joindict[name]      = list of location parts of a join(...) location
#   accessiondict[name] = protein_id value or "no_accession_number_found"
# Locations that cannot be parsed are collected in errorlocations; when any
#   were found they are printed to stderr, a message is written to the
#   logfile and the process exits with status 1.
# NOTE(review): relies on the names `sys` and `logfile` being defined
#   elsewhere in the file.
+genedict = {}
+genelist = []
+joinlist = []
+joindict = {}
+accessiondict = {}
+error = "n"
+errorlocations = []
+genenr = 0
+for i in genes:
# Keep only the text before a following "gene" feature key, if present.
+if "     gene            " in i:
+i = i.split("     gene            ")[0]
+elif "FT   gene            " in i:
+i = i.split("FT   gene            ")[0]
+join = "no"
+genenr += 1
+#Find gene location info for each gene
# A location whose first line does not end with ")" continues on later
# lines; those are gathered up to the closing ")" and whitespace is removed.
+if "complement" in i.split("\n")[0].lower() and i.split("\n")[0][-1] == ")":
+location = i.split("\n")[0]
+elif "complement" in i.split("\n")[0].lower() and i.split("\n")[0][-1] != ")":
+location = i.split("   /")[0]
+while ")" not in location.replace(" ","")[-3:]:
+location = location.rpartition("\n")[0]
+location = location.replace("\n","")
+location = location.replace(" ","")
+elif "join" in i.split("\n")[0].lower() and i.split("\n")[0][-1] == ")":
+location = i.split("\n")[0]
+elif "join" in i.split("\n")[0].lower() and i.split("\n")[0][-1] != ")":
+location = i.split("/")[0]
+while ")" not in location.replace(" ","")[-3:]:
+location = location.rpartition("\n")[0]
+location = location.replace("\n","")
+location = location.replace(" ","")
+else:
+location = i.split("\n")[0]
+original_location = location
+#location info found in gbk/embl file, now extract start and end positions
# Unbalanced parentheses: record the location as an error and skip the gene.
+if location.count("(") != location.count(")"):
+error = "y"
+errorlocations.append(original_location)
+continue
# Case 1: join(complement(...)) -- only a single complement(...) wrapping
# all parts is accepted; other shapes are recorded as errors.
+if "join(complement" in location.lower():
+location = location.lower()
+join = "yes"
+location2 = location.partition("join(")[2][:-1].replace("<","").replace(">","")
+if ("complement(" in location2[0:12] and location2[-1] != ")") or ")," in location2:
+error = "y"
+errorlocations.append(original_location)
+continue
+elif ("complement(" in location2[0:12] and location2[-1] == ")" and location2[12:-2].count(")") == 0 and location2[12:-2].count("(") == 0):
+location2 = location2.partition("complement(")[2][:-1]
+start = location2.split(",")[0]
+start = start.split("..")[0]
+start = start.replace("<","")
+end = location2.split(",")[-1]
+if ".." in end:
+end = end.split("..")[1]
+end = end.replace(">","")
+joinedparts = location2.split(",")
+joinedparts2 = []
+for j in joinedparts:
+newjoinedpart = j.replace("<","")
+newjoinedpart = newjoinedpart.replace(">","")
+joinedparts2.append(newjoinedpart)
+strand = "-"
+else:
+error = "y"
+errorlocations.append(original_location)
+continue
# Case 2: complement(...), optionally containing join(...) -> minus strand.
+elif "complement" in location.lower():
+location = location.lower()
+location = location.partition("complement(")[2][:-1]
+if "join(" in location.lower():
+join = "yes"
+location = location.lower()
+location2 = location.partition("join(")[2][:-1]
+start = location2.split(",")[0]
+start = start.split("..")[0]
+start = start.replace("<","")
+end = location2.split(",")[-1]
+if ".." in end:
+end = end.split("..")[1]
+end = end.replace(">","")
+joinedparts = location2.split(",")
+joinedparts2 = []
+for j in joinedparts:
+newjoinedpart = j.replace("<","")
+newjoinedpart = newjoinedpart.replace(">","")
+joinedparts2.append(newjoinedpart)
+else:
+start = location.split("..")[0]
+start = start.replace("<","")
+if ".." in location:
+end = location.split("..")[1]
+else:
+end = location
+end = end.replace(">","")
+strand = "-"
# Case 3: plain or join(...) location -> plus strand.
+else:
+if "join(" in location.lower():
+join = "yes"
+location = location.lower()
+location2 = location.partition("join(")[2][:-1]
+start = location2.split(",")[0]
+start = start.split("..")[0]
+start = start.replace("<","")
+end = location2.split(",")[-1]
+if ".." in end:
+end = end.split("..")[1]
+end = end.replace(">","")
+joinedparts = location2.split(",")
+joinedparts2 = []
+for j in joinedparts:
+newjoinedpart = j.replace("<","")
+newjoinedpart = newjoinedpart.replace(">","")
+joinedparts2.append(newjoinedpart)
+else:
+start = location.split("..")[0]
+start = start.replace("<","")
+if ".." in location:
+end = location.split("..")[1]
+else:
+end = location
+end = end.replace(">","")
+strand = "+"
# Make sure start <= end; coordinates that are not integers are recorded
# as a location error.
+try:
+if int(start) > int(end):
+start2 = end
+end2 = start
+start = start2
+end = end2
+except ValueError:
+error = "y"
+errorlocations.append(original_location)
+continue
+#Correct for alternative codon start positions
+if "codon_start=" in i.lower():
+temp = i.lower().split("codon_start=")[1].split()[0]
+if '"' in temp:
+# temp is "1" or "2" (with quotes); this form comes from biopython
+temp = temp[1]
+else:
+# without quotation marks ... 1 or 2
+temp = temp[0]
+codonstart = temp
+if strand == "+":
+start = str(int(start) +  (int(codonstart) - 1))
+elif strand == "-":
+end = str(int(end) - (int(codonstart) - 1))
+#Find gene name for each gene, preferably locus_tag, than gene, than protein_ID
# Three scans over the lines of the feature follow, in the order protein_id,
# gene, locus_tag. A later hit overwrites genename, which produces the
# preference order stated above. The protein_id value is also kept as accnr.
+a = 0
+b = 0
+genename = ""
+nrlines = len(i.split("\n"))
+while b == 0:
+line = i.split("\n")[a]
+if "protein_id=" in line:
+genename = (line.split("protein_id=")[1][1:-1]).replace(" ","_")
+genename = genename.replace("\\","_")
+genename = genename.replace("/","_")
+b += 1
+elif "protein_id=" in line.lower():
+genename = (line.lower().split("protein_id=")[1][1:-1]).replace(" ","_")
+genename = genename.replace("\\","_")
+genename = genename.replace("/","_")
+b += 1
+elif a == (nrlines - 1):
+genename = ""
+b += 1
+else:
+a += 1
+if len(genename) > 1:
+accnr = genename
+else:
+accnr = "no_accession_number_found"
+a = 0
+b = 0
+nrlines = len(i.split("\n"))
+while b == 0:
+line = i.split("\n")[a]
+if "gene=" in line:
+genename = (line.split("gene=")[1][1:-1]).replace(" ","_")
+genename = genename.replace("\\","_")
+genename = genename.replace("/","_")
+b += 1
+elif "gene=" in line.lower():
+genename = (line.lower().split("gene=")[1][1:-1]).replace(" ","_")
+genename = genename.replace("\\","_")
+genename = genename.replace("/","_")
+b += 1
+elif a == (nrlines - 1):
+b += 1
+else:
+a += 1
+a = 0
+b = 0
+nrlines = len(i.split("\n"))
+while b == 0:
+line = i.split("\n")[a]
+if "locus_tag=" in line:
+genename = (line.split("locus_tag=")[1][1:-1]).replace(" ","_")
+genename = genename.replace("\\","_")
+genename = genename.replace("/","_")
+b += 1
+elif "locus_tag=" in line.lower():
+genename = (line.lower().split("locus_tag=")[1][1:-1]).replace(" ","_")
+genename = genename.replace("\\","_")
+genename = genename.replace("/","_")
+b += 1
+elif a == (nrlines - 1):
# No qualifier supplied a name: fall back to a running-number identifier.
+if genename == "":
+genename = "prot_ID_" + str(genenr)
+b += 1
+else:
+a += 1
+#Find sequence for each gene
# b is a small state counter: 0 = translation= not seen yet, 1 = inside a
# multi-line translation, 2 = finished.
+a = 0                                             ###Not all gbks contain protein sequences as translations, therefore sequences from gene clusters are now extracted from the database at a later stage if sequence is not in gbk
+b = 0
+sequence = ""
+while b < 2:
+line = i.split("\n")[a]
+if "translation=" in line:
+sequence = line.split("translation=")[1][1:]
+b += 1
+a += 1
+if line.count('"') > 1:
+sequence = line.split("translation=")[1][1:-1]
+b = 2
+elif "translation=" in line.lower():
+sequence = line.lower().split("translation=")[1][1:]
+b += 1
+a += 1
+if line.count('"') > 1:
+sequence = line.lower().split("translation=")[1][1:-1]
+b = 2
+elif a == (nrlines - 2) or a == (nrlines - 1):
+sequence = ""
+b = 2
+elif b == 1:
+if '"' in line:
+seqline = line.replace(" ","")
+seqline = seqline.split('"')[0]
+sequence = sequence + seqline
+b += 1
+else:
+seqline = line.replace(" ","")
+sequence = sequence + seqline
+a += 1
+else:
+a += 1
+sequence = sequence.upper()
+#Quality-check sequence
# A translation containing any of these characters is discarded (set to "").
+forbiddencharacters = ["'",'"','=',';',':','[',']','>','<','|','\\',"/",'*','-','_','.',',','?',')','(','^','#','!','`','~','+','{','}','@','$','%','&']
+for z in forbiddencharacters:
+if z in sequence:
+sequence = ""
+#Find annotation for each gene
# Only the text on the product= line itself is used; spaces become "_".
+a = 0
+b = 0
+while b == 0:
+line = i.split("\n")[a]
+if "product=" in line:
+annotation = line.split("product=")[1][1:]
+annotation = annotation.replace(" ","_")
+if annotation[-1] == '"':
+annotation = annotation[:-1]
+b += 1
+elif "product=" in line.lower():
+annotation = line.lower().split("product=")[1][1:]
+annotation = annotation.replace(" ","_")
+if annotation[-1] == '"':
+annotation = annotation[:-1]
+b += 1
+elif a == (nrlines - 1):
+annotation = "not_annotated"
+b += 1
+else:
+a += 1
+accessiondict[genename] = accnr
+if join == "yes":
+joinlist.append(genename)
+joindict[genename] = joinedparts2
+#Save data to dictionary
+if len(genename) > 1:
+genedict[genename] = [start,end,strand,annotation,sequence]
+genelist.append(genename)
# Report all collected location errors and abort.
# NOTE(review): presumably this check runs once after the loop over genes;
# the nesting cannot be confirmed from this unindented listing.
+if error == "y":
+errorinfo = "\n".join(errorlocations)
+print >> sys.stderr, "Exit: locations in GBK/EMBL file not properly formatted:\n" + errorinfo
+logfile.write("Exit: GBK file not properly formatted, no sequence found or no CDS annotation found.\n")
+logfile.close()
+sys.exit(1)
+return [genelist, genedict, joinlist, joindict, accessiondict]
def cleandnaseq(dnaseq):
    """Normalise a raw DNA string taken from a GenBank/EMBL sequence section.

    Spaces, tabs, newlines, digits and "/" are removed, u/U become t/T and
    the ambiguity codes r, y, w, s, m, k, h, b, v and d (either case) are
    replaced by a lower-case "n".
    """
    # Layout characters and position numbers carry no sequence information.
    for unwanted in " \t\n0123456789/":
        dnaseq = dnaseq.replace(unwanted, "")
    # RNA to DNA, keeping the case.
    dnaseq = dnaseq.replace("u", "t").replace("U", "T")
    # Ambiguity codes collapse to "n" regardless of their case.
    for code in "rywsmkhbvd":
        dnaseq = dnaseq.replace(code, "n").replace(code.upper(), "n")
    return dnaseq
def _join_part_bounds(part):
    """Return (start, end) as ints for one location part ("a..b" or "a")."""
    pieces = part.split("..")
    first = int(pieces[0])
    last = int(pieces[1]) if len(pieces) > 1 else int(pieces[0])
    return first, last


def extractprotfasta(genelist,genedict,dnaseq,rc_dnaseq,joinlist,joindict,accessiondict):
    """Collect a protein sequence and a FASTA-style name for every gene.

    genelist      -- gene names in output order.
    genedict      -- {name: [start, end, strand, annotation, translation]}.
    dnaseq        -- nucleotide sequence of the record.
    rc_dnaseq     -- reverse complement of ``dnaseq``.
    joinlist      -- names of genes whose location is a join of parts.
    joindict      -- {name: [location part, ...]} for the joined genes.
    accessiondict -- passed through unchanged into the result.

    A translation longer than five residues in genedict is used as is;
    otherwise the gene is cut out of the DNA (plus strand) or of its reverse
    complement (minus strand) and passed to ``translate``.

    Returns [names, seqs, genelist, genedict, accessiondict], with names of
    the form "input|c1|start-end|strand|genename|annotation".

    Raises ValueError for a gene without usable translation whose strand is
    neither "+" nor "-" (previously the protein of the preceding gene was
    silently reused).
    """
    names = []
    seqs = []
    rc_length = len(rc_dnaseq)
    for genename in genelist:
        gene = genedict[genename]
        #If suitable translation found in gbk, use that
        if len(gene[4]) > 5:
            protseq = gene[4]
        #If no suitable translation found in gbk, extract from DNA sequence
        else:
            y = int(gene[0])
            z = int(gene[1])
            if gene[2] == "+":
                if genename in joinlist:
                    pieces = []
                    for part in joindict[genename]:
                        partstart, partend = _join_part_bounds(part)
                        pieces.append(dnaseq[(partstart - 1):partend])
                    geneseq = "".join(pieces)
                else:
                    geneseq = dnaseq[(y - 1):z]
            elif gene[2] == "-":
                if genename in joinlist:
                    pieces = []
                    # reversed() walks the parts back to front WITHOUT
                    # reversing the caller's list in place, so joindict is
                    # left untouched and repeated calls give equal results.
                    for part in reversed(joindict[genename]):
                        partstart, partend = _join_part_bounds(part)
                        pieces.append(rc_dnaseq[(rc_length - partend):(rc_length - partstart + 1)])
                    geneseq = "".join(pieces)
                else:
                    geneseq = rc_dnaseq[(rc_length - z):(rc_length - y + 1)]
            else:
                raise ValueError("Unknown strand %r for gene %s" % (gene[2], genename))
            protseq = translate(geneseq)
        name = "input" + "|" + "c1" + "|" + gene[0] + "-" + gene[1] + "|" + gene[2] + "|" + genename + "|" + gene[3]
        seqs.append(protseq)
        names.append(name)
    proteins = [names,seqs,genelist,genedict,accessiondict]
    return proteins
def gbk2proteins(gbkfile):
    """Extract proteins, accession and DNA length from a GenBank file.

    gbkfile -- path of the GenBank file.

    The text before "\\nORIGIN" is split into CDS features, which are parsed
    by ``parsegenes``; the text after it is cleaned with ``cleandnaseq`` and
    used as the DNA sequence from which ``extractprotfasta`` takes proteins
    that lack a translation.

    Returns [proteins, accession, dnaseqlength]; ``proteins`` is the list
    returned by ``extractprotfasta`` and ``accession`` is the LOCUS name, or
    "" when it does not look like an accession number (shorter than 4
    characters, fewer than 3 digits or no letter).

    Exits with status 1 (message on stderr) when the file has no CDS feature
    or no ORIGIN section, or when fewer than half of the sequence characters
    are A/C/G/T.  A ValueError raised by ``parsegenes`` is reported on
    stderr and re-raised.

    Relies on the names ``sys``, ``logfile`` and ``ascii_letters`` being
    defined elsewhere in the file.
    """
    # Context manager: the original never closed this handle.
    with open(gbkfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r", "\n")
    if "     CDS             " not in filetext or "\nORIGIN" not in filetext:
        sys.stderr.write("Exit: GBK file not properly formatted, no sequence found or no CDS annotation found.\n")
        logfile.write("Exit: GBK file not properly formatted, no sequence found or no CDS annotation found.\n")
        logfile.close()
        sys.exit(1)
    sections = filetext.split("\nORIGIN")
    cdspart = sections[0]
    #Extract DNA sequence and calculate reverse complement of it
    dnaseq = cleandnaseq(sections[1])
    unambiguous = sum(dnaseq.count(base) for base in "AaCcGgTt")
    if unambiguous < (0.5 * len(dnaseq)):
        sys.stderr.write("Protein GBK/EMBL file provided. Please provide nucleotide GBK/EMBL file.\n")
        sys.exit(1)
    dnaseqlength = len(dnaseq)
    rc_dnaseq = reverse_complement(dnaseq)
    #Extract genes
    genes = cdspart.split("     CDS             ")[1:]
    try:
        genesdetails = parsegenes(genes)
    except ValueError:
        sys.stderr.write("Could not parse genes from GBK/EMBL file. Please check if your GBK/EMBL file is valid.\n")
        # The statements that used to follow this re-raise could never run
        # and were removed; the runtime behaviour (message, then the
        # ValueError propagates) is unchanged.
        raise
    genelist = genesdetails[0]
    genedict = genesdetails[1]
    joinlist = genesdetails[2]
    joindict = genesdetails[3]
    accessiondict = genesdetails[4]
    #Locate all genes on DNA sequence and translate to protein sequence
    proteins = extractprotfasta(genelist,genedict,dnaseq,rc_dnaseq,joinlist,joindict,accessiondict)
    # Take the accession from the first usable LOCUS line of the first record.
    accession = ""
    for line in filetext.split("\n//")[0].split("\n"):
        if accession == "" and "LOCUS       " in line:
            accession = line.split("LOCUS       ")[1].split(" ")[0]
            if len(accession) < 4:
                accession = ""
    #Test if accession number is probably real GenBank/RefSeq acc nr
    nrletters = sum(1 for char in accession if char in ascii_letters)
    nrnumbers = sum(1 for char in accession if char in "0123456789")
    if nrnumbers < 3 or nrletters < 1:
        accession = ""
    return [proteins,accession,dnaseqlength]
def embl2proteins(emblfile,sequence):
    """Extract proteins, accession and DNA length from an EMBL file.

    emblfile -- path of the EMBL file.
    sequence -- nucleotide sequence used when the file has no "SQ" section;
                ignored when the file carries its own sequence.

    The text before "\\nSQ  " is split into CDS features, which are parsed
    by ``parsegenes``; the DNA sequence is cleaned with ``cleandnaseq`` and
    used by ``extractprotfasta`` for genes that lack a translation.

    Returns [proteins, accession, dnaseqlength]; ``accession`` is the first
    entry of the AC line, or "" when it does not look like an accession
    number (shorter than 4 characters, fewer than 3 digits or no letter).

    Exits with status 1 (message on stderr) when the file has no CDS
    feature, when neither an SQ section nor ``sequence`` is available, when
    fewer than half of the sequence characters are A/C/G/T, or when
    ``parsegenes`` raises ValueError.

    Relies on the names ``sys``, ``logfile`` and ``ascii_letters`` being
    defined elsewhere in the file.
    """
    with open(emblfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r", "\n")
    sections = filetext.split("\nSQ  ")
    has_sq_section = len(sections) > 1
    if "FT   CDS " not in filetext or (not has_sq_section and len(sequence) < 1):
        logfile.write("Exit: EMBL file not properly formatted, no sequence found or no CDS annotation found.\n")
        sys.stderr.write("Exit: EMBL file not properly formatted, no sequence found or no CDS annotation found.\n\n")
        logfile.close()
        sys.exit(1)
    cdspart = sections[0]
    #Extract DNA sequence and calculate reverse complement of it
    if has_sq_section:
        # Drop the remainder of the SQ header line, keep the sequence lines.
        dnaseq = "".join(sections[1].split("\n")[1:])
    else:
        # The guard above lets a file without SQ section through when a
        # sequence was supplied; previously this path failed with an
        # IndexError instead of using that sequence.
        dnaseq = sequence
    dnaseq = cleandnaseq(dnaseq)
    unambiguous = sum(dnaseq.count(base) for base in "AaCcGgTt")
    if unambiguous < (0.5 * len(dnaseq)):
        sys.stderr.write("Protein GBK/EMBL file provided. Please provide nucleotide GBK/EMBL file.\n")
        sys.exit(1)
    dnaseqlength = len(dnaseq)
    rc_dnaseq = reverse_complement(dnaseq)
    #Extract genes
    genes = cdspart.split("FT   CDS             ")[1:]
    try:
        genesdetails = parsegenes(genes)
    except ValueError as e:
        sys.stderr.write("Could not parse genes from GBK/EMBL file. Please check if your GBK/EMBL file is valid.\n")
        sys.stderr.write("Error was: %s\n" % e)
        sys.exit(1)
    genelist = genesdetails[0]
    genedict = genesdetails[1]
    joinlist = genesdetails[2]
    joindict = genesdetails[3]
    accessiondict = genesdetails[4]
    #Locate all genes on DNA sequence and translate to protein sequence
    proteins = extractprotfasta(genelist,genedict,dnaseq,rc_dnaseq,joinlist,joindict,accessiondict)
    # Take the accession from the first usable AC line before the sequence.
    accession = ""
    for line in filetext.split("SQ   ")[0].split("\n"):
        if accession == "" and "AC   " in line:
            accession = line.split("AC   ")[1].replace(" ", "").split(";")[0]
            if len(accession) < 4:
                accession = ""
    #Test if accession number is probably real GenBank/RefSeq acc nr
    nrletters = sum(1 for char in accession if char in ascii_letters)
    nrnumbers = sum(1 for char in accession if char in "0123456789")
    if nrnumbers < 3 or nrletters < 1:
        accession = ""
    return [proteins,accession,dnaseqlength]
def translate(sequence):
    """Translate a nucleotide string into a protein string.

    The sequence is read in consecutive triplets from its first character;
    an incomplete trailing triplet is ignored.  The first triplet always
    yields "M".  Every other triplet is looked up in the standard genetic
    code (all-upper-case or all-lower-case codons); anything not found there
    (ambiguous bases, mixed case) yields "X".  One trailing "*" is removed.
    """
    #Translation table standard genetic code; according to http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
    bases = "TCAG"
    amino_acids = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"
    codon_table = {}
    slot = 0
    for first in bases:
        for second in bases:
            for third in bases:
                codon = first + second + third
                codon_table[codon] = amino_acids[slot]
                codon_table[codon.lower()] = amino_acids[slot]
                slot += 1
    usable_length = len(sequence) - (len(sequence) % 3)
    residues = []
    for offset in range(0, usable_length, 3):
        if offset == 0:
            # The start codon is reported as methionine whatever it is.
            residues.append("M")
        else:
            residues.append(codon_table.get(sequence[offset:offset + 3], "X"))
    protseq = "".join(residues)
    if protseq.endswith("*"):
        protseq = protseq[:-1]
    return protseq
def writefasta(names,seqs,file):
    """Write names and sequences to ``file`` in FASTA format.

    names -- list of record names (written after ">").
    seqs  -- list of sequences, in the same order as ``names``.
    file  -- path of the output file; it is overwritten.

    IOError, OSError and NotImplementedError are not propagated: a message
    is written to stderr and to the module-level ``logfile`` instead.
    """
    try:
        # The context manager closes the file even when a write fails; the
        # original left the handle open in that case.
        with open(file, "w") as out_file:
            for position, name in enumerate(names):
                out_file.write(">%s\n%s\n" % (name, seqs[position]))
    except (IOError, OSError, NotImplementedError):
        sys.stderr.write("FASTA file not created.\n")
        logfile.write("FASTA file not created.\n")
def parsehmmoutput(cutoff,file):
    """Collect the hits of a column-aligned HMM search table above a cutoff.

    cutoff -- only hits whose score is greater than this value are kept.
    file   -- path of the result table.

    The third line of the file is used as a ruler: the width of the name
    column is the number of characters up to and including the first
    non-dash character that follows a run of at least two dashes.  For every
    line that is neither a comment ("#") nor empty, the fifth "|"-separated
    field of the name column is taken as the accession and the six
    characters starting 76 columns after the name column as the score.

    Returns [proteins, scores], two lists in matching order.
    """
    # Read the file exactly once and close it.  The ruler line used to come
    # from linecache, which keeps returning the old content when the same
    # path has been rewritten since an earlier call.
    with open(file, "r") as handle:
        lines = handle.readlines()
    protlines = [line.strip() for line in lines if len(line) > 1 and not line.startswith('#')]
    measuringline = lines[2] if len(lines) > 2 else ""
    dashes = 0
    y = 0
    for char in measuringline:
        y += 1
        if char == "-":
            dashes += 1
        elif dashes > 1:
            break
    proteins = []
    scores = []
    for line in protlines:
        accession = line[0:y].split("|")[4]
        score = float(line[(y+76):(y+82)].replace(" ",""))
        if score > cutoff and len(accession) > 1:
            proteins.append(accession)
            scores.append(score)
    return [proteins,scores]
def sortonsecondvalueoflist(first,second):
    """Comparison function ordering two lists by their second element.

    Both second elements are converted with ``int`` before comparing.
    Previously only ``first[1]`` was converted, so a string coordinate in
    ``second`` was compared against an integer.

    Returns 1, -1 or 0 when ``first[1]`` is greater than, smaller than or
    equal to ``second[1]``.
    """
    f = int(first[1])
    s = int(second[1])
    return (f > s) - (f < s)
def hmmlengths(hmmfile):
    """Return {model name: model length} for every model in an HMM file.

    hmmfile -- path of the HMM file.  Records are separated by "//"; in each
               record the rest of the line after "NAME  " is the name and
               the rest of the line after "LENG  " is the length.
    """
    # Context manager: the original never closed this handle.
    with open(hmmfile, "r") as handle:
        filetext = handle.read()
    filetext = filetext.replace("\r", "\n")
    hmmlengthsdict = {}
    # The text after the last "//" is not a record.
    for record in filetext.split("//")[:-1]:
        name = record.split("NAME  ")[1].split("\n", 1)[0]
        length = record.split("LENG  ")[1].split("\n", 1)[0]
        hmmlengthsdict[name] = int(length)
    return hmmlengthsdict
+def hmmscanparse(hmmscanoutputfile,hmmlengthsdict):
# Parse a hmmscan text report into {protein name: list of domains}.
#
# hmmscanoutputfile: path of the report; it is split on "Query:       ".
# hmmlengthsdict: {domain name: model length}, used for the merge and
#   completeness filters below.
# Every domain is a list [domainname, start, end, evalue, score].
# NOTE(review): the file opened here is never closed.
# NOTE(review): the loop variable `i` is reused by the inner loops.
+domaindict = {}
+file = open(hmmscanoutputfile,"r")
+filetext = file.read()
+filetext = filetext.replace("\r","\n")
+outputs = filetext.split("Query:       ")[1:]
+for i in outputs:
# The protein name is the first space-delimited word after "Query:".
+protname = i.split("\n", 1)[0]
+protname = protname.split(" ", 1)[0]
+domainresults = i.split("Domain annotation for each model:\n")[1]
+domainresults = domainresults.split("\n\nInternal pipeline statistics summary:")[0]
+domains = domainresults.split(">> ")
+domainlist = []
+#Find all domains
+for i in domains:
+tokens = i.split('\n')
+domainname = tokens[0]
+domainname = domainname.split(" ", 1)[0]
# presumably skips the table header lines and the trailing blank lines of
# each ">>" section -- TODO confirm against the hmmscan output format
+domainresults = tokens[3:-2]
+for i in domainresults:
+tabs = i.split(" ")
+tabs2 = []
+[tabs2.append(tab) for tab in tabs if tab != '']
+#for i in tabs:
+#  if i != "":
+#    tabs2.append(i)
+tabs = tabs2
# Whitespace-separated columns used: 2 = score, 5 = evalue (kept as a
# string), 12 and 13 = start and end.
+start = int(tabs[12])
+end = int(tabs[13])
+evalue = tabs[5]
+score = float(tabs[2])
+domainlist.append([domainname,start,end,evalue,score])
# Order by start coordinate (comparison-function form of list.sort).
+domainlist.sort(sortonsecondvalueoflist)
+#Purify domain list to remove overlapping domains, only keeping those with the highest scores
# A domain overlaps its predecessor when it starts more than maxoverlap
# positions before the predecessor's end; the higher-scoring one is kept.
+if len(domainlist) > 1:
+domainlist2 = [domainlist[0]]
+for i in domainlist[1:]:
+maxoverlap = 20
+if i[1] < (domainlist2[-1][2] - maxoverlap):
+if i[4] < domainlist2[-1][4]:
+pass
+elif i[4] > domainlist2[-1][4]:
+del domainlist2[-1]
+domainlist2.append(i)
+else:
+domainlist2.append(i)
+domainlist = domainlist2
+#Merge domain fragments which are really one domain
# Two consecutive hits of the same domain are merged when at least one is
# shorter than 75% of the model and together they are shorter than 150%.
# NOTE(review): the merged evalue and score are stored as str, whereas the
# unmerged score is a float.
+if len(domainlist) > 1:
+domainlist2 = [domainlist[0]]
+for i in domainlist[1:]:
+alilength1 = int(domainlist2[-1][2]) - int(domainlist2[-1][1])
+alilength2 = int(i[2]) - int(i[1])
+domainlength = hmmlengthsdict[i[0]]
+if i[0] == domainlist2[-1][0] and (alilength1 < (0.75 * domainlength) or alilength2 < (0.75 * domainlength)) and (alilength1 + alilength2) < (1.5 * domainlength):
+name = i[0]
+start = domainlist2[-1][1]
+end = i[2]
+evalue = str(float(domainlist2[-1][3]) * float(i[3]))
+score = str(float(domainlist2[-1][4]) + float(i[4]))
+del domainlist2[-1]
+domainlist2.append([name,start,end,evalue,score])
+else:
+domainlist2.append(i)
+domainlist = domainlist2
+#Remove incomplete domains (covering less than 60% of total domain hmm length)
# NOTE(review): guarded by len(domainlist) > 1, so a single remaining
# domain is not checked for completeness -- confirm that this is intended.
+if len(domainlist) > 1:
+domainlist2 = []
+for i in domainlist:
+alilength = int(i[2]) - int(i[1])
+domainlength = hmmlengthsdict[i[0]]
+if alilength > (0.6 * domainlength):
+domainlist2.append(i)
+domainlist = domainlist2
+#Save domainlist to domaindict
+domaindict[protname] = domainlist
+return domaindict
+def blastparse(blasttext,minseqcoverage,minpercidentity,seqlengths,geneclustergenes):
# Parse tab-separated blast output into per-query hit tables.
#
# blasttext: blast output as one string, one hit per line, tab-separated.
#   Columns used: 0 = query, 1 = subject, 2 = percent identity,
#   3 = alignment length, 10 = evalue, 11 = bit score.
# minseqcoverage, minpercidentity: thresholds applied in the second loop.
# seqlengths: {query name: sequence length} used to compute coverage.
# geneclustergenes: subject names that get the prefix "h_".
# Returns [blastdict, querylist, hitclusters] where
#   blastdict[query] = [subjectlist, querydict] and
#   querydict[subject] = [subject_genecluster, subject_start, subject_end,
#     subject_strand, subject_annotation, perc_ident, blastscore,
#     perc_coverage, evalue, locustag].
+blastdict = {}
+querylist = []
+hitclusters = []
+blastlines = blasttext.split("\n")[:-1]
+#Filter for best blast hits (of one query on each subject)
# Only the first line of every query/subject pair is kept.
+query_subject_combinations = []
+blastlines2 = []
+for i in blastlines:
+tabs = i.split("\t")
+query = tabs[0]
+subject = tabs[1]
+query_subject_combination = query + "_" + subject
+if query_subject_combination in query_subject_combinations:
+pass
+else:
+query_subject_combinations.append(query_subject_combination)
+blastlines2.append(i)
+blastlines = blastlines2
+#Filters blastlines to get rid of hits that do not meet criteria
# NOTE(review): `query` is not reassigned in this loop, so it still holds
# the query of the last line of the previous loop; the coverage is
# therefore computed with that query's length for every line.
# NOTE(review): perc_coverage is not assigned in this loop when seqlengths
# has no entry for `query`.
+blastlines2 = []
+for i in blastlines:
+tabs = i.split("\t")
+perc_ident = int(tabs[2].split(".",1)[0])
+alignmentlength = float(tabs[3])
+evalue = str(tabs[10])
+blastscore = int(tabs[11].split(".",1)[0])
+if seqlengths.has_key(query):
+perc_coverage = (float(tabs[3]) / seqlengths[query]) * 100
+if perc_ident > minpercidentity and (perc_coverage > minseqcoverage or alignmentlength > 40):
+blastlines2.append(i)
+blastlines = blastlines2
+#Goes through the blastlines. For each query, creates a querydict and hitlist, and adds these to the blastdict when finding the next query
+firstquery = "y"
+for i in blastlines:
+tabs = i.split("\t")
+query = tabs[0]
# The subject column is "|"-separated: 0 and 1 form the gene cluster name,
# 2 = "start-end", 3 = strand, 4 = subject name, 5 = annotation,
# 6 = locus tag (when present).
+second_column_split = tabs[1].split("|")
+subject = second_column_split[4]
+if subject == "no_locus_tag":
+subject = second_column_split[6]
+if subject in geneclustergenes:
+subject = "h_" + subject
+if len(second_column_split) > 6:
+locustag = second_column_split[6]
+else:
+locustag = ""
+subject_genecluster = second_column_split[0] + "_" + second_column_split[1]
+subject_start = (second_column_split[2]).split("-")[0]
+subject_end = (second_column_split[2]).split("-")[1]
+subject_strand  = second_column_split[3]
+subject_annotation = second_column_split[5]
+perc_ident = int(tabs[2].split(".")[0])
+alignmentlength = float(tabs[3])
+evalue = str(tabs[10])
+blastscore = int(tabs[11].split(".", 1)[0])
+if seqlengths.has_key(query):
+perc_coverage = (float(tabs[3]) / seqlengths[query]) * 100
+else:
# NOTE(review): seqdict is not defined in this function -- presumably a
# module-level name; verify.
+seqlength = len(seqdict[query.split("|")[4]])
+perc_coverage = (float(tabs[3]) / seqlength) * 100
+if firstquery == "y": #Only until the first blastline with good hit
+firstquery = "n"
+querylist.append(query)
+subjectlist = []
+querydict = {}
+subjectlist.append(subject)
+querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
+if subject_genecluster not in hitclusters:
+hitclusters.append(subject_genecluster)
+last_query = query
# NOTE(review): the last line is recognised by comparing line text with
# blastlines[-1].
+elif i == blastlines[-1]: #Only for the last blastline
+if query not in querylist:
+subjectlist.append(subject)
+querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
+blastdict[query] = [subjectlist,querydict]
+querylist.append(query)
+if subject_genecluster not in hitclusters:
+hitclusters.append(subject_genecluster)
+else:
+subjectlist.append(subject)
+querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
+blastdict[query] = [subjectlist,querydict]
+else: #For all but the first and last blastlines
+if query not in querylist:
# A new query starts: store the finished tables under the previous query.
+blastdict[last_query] = [subjectlist,querydict]
+querylist.append(query)
+subjectlist = []
+querydict = {}
+subjectlist.append(subject)
+querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
+if subject_genecluster not in hitclusters:
+hitclusters.append(subject_genecluster)
+last_query = query
+else:
+subjectlist.append(subject)
+querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
+if subject_genecluster not in hitclusters:
+hitclusters.append(subject_genecluster)
+return [blastdict,querylist,hitclusters]
def getdircontents():
    """Return the names of the entries in the current working directory."""
    entries = os.listdir('.')
    return entries
def _gene_arrow(start,end,strand,color,base,height):
    """Build an arrow-shaped polygon representing one gene.

    start, end -- horizontal coordinates of the gene; they are swapped when
                  given in descending order.
    strand     -- "+" (arrow tip at `end`) or "-" (arrow tip at `start`).
    color      -- fill colour passed on to the polygon.
    base       -- y coordinate of the lower edge of the arrow.
    height     -- total height of the arrow; the tip sits at half this height.

    Returns the polygon object produced by ShapeBuilder.createPolygon.
    Raises ValueError when strand is neither "+" nor "-".
    """
    if strand not in ("+", "-"):
        # Without this check an unknown strand would only surface later as an
        # unbound-variable error, because no outline would have been built.
        raise ValueError("strand must be '+' or '-', got %r" % (strand,))
    halfheight = height/2
    if start > end:
        start, end = end, start
    top = base - height
    middle = base - halfheight
    if (end - start) < halfheight:
        # Gene is narrower than the arrow head: draw a bare triangle.
        if strand == "+":
            pointsAsTuples = [(start,base),
                              (end,middle),
                              (start,top),
                              (start,base)]
        else:
            pointsAsTuples = [(start,middle),
                              (end,top),
                              (end,base),
                              (start,middle)]
    else:
        # Rectangular body plus a head that is halfheight wide.
        if strand == "+":
            arrowstart = end - halfheight
            pointsAsTuples = [(start,base),
                              (arrowstart,base),
                              (end,middle),
                              (arrowstart,top),
                              (start,top),
                              (start,base)]
        else:
            arrowstart = start + halfheight
            pointsAsTuples = [(start,middle),
                              (arrowstart,top),
                              (end,top),
                              (end,base),
                              (arrowstart,base),
                              (start,middle)]
    oh = ShapeBuilder()
    return oh.createPolygon(points=oh.convertTupleArrayToPoints(pointsAsTuples),strokewidth=1, stroke='black', fill=color)
def _gene_label(start,end,name,y,screenwidth):
    """Create the italic text label for a gene, centred between start and end.

    The y and screenwidth arguments are accepted but never read: the label is
    always placed at the same fixed vertical position.
    Returns the styled text element.
    """
    label_style = StyleBuilder()
    label_style.setFontFamily(fontfamily="Verdana")
    label_style.setFontStyle(fontstyle='italic')
    label_style.setFontSize('10px')
    label_style.setFilling('#600000')
    # Fixed vertical placement: a base of 35 plus half of a height of 10.
    row_base = 35
    row_height = 10
    label = text(name, ((start + end)/2), row_base + row_height/2)
    label.set_style(label_style.getStyle())
    return label
def relativepositions(starts,ends,largestclustersize):
    """Convert absolute start/end positions into drawing coordinates.

    Every position is taken relative to the first entry of starts, divided by
    largestclustersize and scaled to 75% of the module-level screenwidth, then
    truncated to an int.

    Returns [rel_starts, rel_ends].
    """
    leftboundary = int(starts[0])

    def to_screen(position):
        # Fraction of the largest cluster covered, mapped onto the drawing width.
        fraction = float(float(int(position) - int(leftboundary)) / largestclustersize)
        return int(fraction * screenwidth * 0.75)

    rel_starts = [to_screen(position) for position in starts]
    rel_ends = [to_screen(position) for position in ends]
    return [rel_starts,rel_ends]
def startendsitescheck(starts,ends):
    """Make sure every start site is not larger than its end site.

    Pairs are compared as integers; a pair given in descending order is
    swapped. The original values (not the int conversions) are returned.

    Returns [starts, ends] as two new lists.
    """
    ordered_starts = []
    ordered_ends = []
    for index, start in enumerate(starts):
        if int(start) > int(ends[index]):
            lower, upper = ends[index], start
        else:
            lower, upper = start, ends[index]
        ordered_starts.append(lower)
        ordered_ends.append(upper)
    return [ordered_starts, ordered_ends]
def RadialGradient(startcolor,stopcolor,gradientname):
    """Return a defs() container holding a two-stop radial gradient.

    The gradient gets the id gradientname and runs from startcolor at offset
    0% to stopcolor at offset 100%, both stops with opacity 1.
    """
    container = defs()
    gradient = radialGradient()
    gradient.set_id(gradientname)
    for offset, color in (("0%", startcolor), ("100%", stopcolor)):
        gradient_stop = stop(offset=offset)
        gradient_stop.set_stop_color(color)
        gradient_stop.set_stop_opacity(1)
        gradient.addElement(gradient_stop)
    container.addElement(gradient)
    return container
def LinearGradient(startcolor,stopcolor,gradientname):
    """Return a defs() container holding a two-stop linear gradient.

    The gradient gets the id gradientname and runs from startcolor at offset
    0% to stopcolor at offset 100%, both stops with opacity 1.
    """
    container = defs()
    gradient = linearGradient()
    gradient.set_id(gradientname)
    stops = (("0%", startcolor), ("100%", stopcolor))
    for offset, color in stops:
        gradient_stop = stop(offset=offset)
        gradient_stop.set_stop_color(color)
        gradient_stop.set_stop_opacity(1)
        gradient.addElement(gradient_stop)
    container.addElement(gradient)
    return container
def generate_rgbscheme(nr):
    """Return a list of at least nr colour strings for colouring gene groups.

    nr is rounded up to the nearest supported palette size
    (1, 2, 4, 8, 12, 18, 24, 32, 48 or 64):

    * sizes 1, 2 and 4 use the named colours red/green/blue/yellow;
    * sizes 8 to 32 use "rgb(r,g,b)" strings taken from an x*y*z grid of
      channel values, returned in a fixed hand-picked order;
    * sizes 48 and 64 use the same kind of grid in random order;
    * for nr above 64 the 64-colour grid is repeated (at least 20 times, and
      as often as needed to reach nr entries) and shuffled, so colours repeat.
    """
    usablenumbers = [1,2,4,8,12,18,24,32,48,64]
    # Number of distinct values per channel for every grid-based palette size.
    lengthsdict = {1:[1,1,1],2:[1,1,2],4:[1,2,2],8:[2,2,2],12:[2,2,3],18:[2,3,3],24:[3,3,3],32:[3,3,4],48:[3,4,4],64:[4,4,4]}
    namedcolors = {1: ["red"],
                   2: ["red", "green"],
                   4: ["red", "green", "blue", "yellow"]}
    # Fixed output orderings of the grid colours. Every index of a palette
    # occurs exactly once, so no two groups are handed the same colour.
    orderings = {
        8: [4,1,2,5,6,7,3,0],
        12: [6,3,5,9,7,2,11,4,8,1,10,0],
        18: [9,6,2,14,15,8,12,10,3,5,7,11,4,1,16,13,0,17],
        24: [15,12,9,6,5,0,21,1,16,14,8,17,2,23,22,3,13,7,10,4,18,20,19,11],
        32: [21,19,27,6,8,1,14,7,20,13,9,30,4,23,18,12,5,29,24,17,11,31,2,28,22,15,26,3,0,16,10,25],
    }

    # Smallest supported palette size that can hold nr colours.
    closestnr = None
    for candidate in usablenumbers:
        if candidate >= nr:
            closestnr = candidate
            break
    toohigh = closestnr is None
    if toohigh:
        closestnr = 64

    if closestnr in namedcolors:
        return list(namedcolors[closestnr])

    x, y, z = lengthsdict[closestnr]
    # All three channels start from the same offset, which is derived from z;
    # floor division keeps the channel values integral.
    firstpoint = (255 // z) // 2
    xpoints = [firstpoint + step * (255 // x) for step in range(x)]
    ypoints = [firstpoint + step * (255 // y) for step in range(y)]
    zpoints = [firstpoint + step * (255 // z) for step in range(z)]
    colorlist = ["rgb(%s,%s,%s)" % (i, j, k)
                 for i in xpoints for j in ypoints for k in zpoints]

    # The grids used for 24 and 32 colours are larger than needed;
    # three entries are dropped from each.
    if closestnr == 24:
        colorlist = colorlist[:15] + colorlist[18:]
    if closestnr == 32:
        colorlist = colorlist[:21] + colorlist[24:]

    if closestnr in orderings:
        return [colorlist[index] for index in orderings[closestnr]]

    # 48 or 64 colours (or more requested than any palette holds): random order.
    if toohigh:
        copies = max(20, -(-nr // len(colorlist)))
        colorlist = colorlist * copies
    random.shuffle(colorlist)
    return colorlist
# geneclustersvg: build the svg drawing for one query gene cluster.
# The drawing has two parts:
#   1. a grey backbone line with one _gene_arrow per entry of genes, and
#   2. for every protein in pksnrpsprots, a grey line scaled to the longest
#      protein plus one rounded rectangle (and optional text label) per domain.
# Parameters read here: genes (only its length), rel_starts, rel_ends, strands,
# geneposdict[prot] -> indexable with [0] and [1], pksnrpsprots (iterable of
# keys), pksnrpsdomains[prot] -> [domain names, {domain name: indexable [0],[1]}],
# qclusternr (used in element ids).
# Also reads screenwidth and colors, which are not parameters and must come
# from module scope.
# Returns the svg object.
# NOTE(review): indentation was lost in this rendering of the file, so the exact
# nesting of a few statements (flagged below) cannot be confirmed from here.
+def geneclustersvg(genes,rel_starts,rel_ends,strands,geneposdict,pksnrpsprots,pksnrpsdomains,qclusternr):
+nrgenes = len(genes)
+#Define relative start and end positions for plotting
+s = svg(x = 0, y = 0, width = (screenwidth * 0.75), height = (259 + 99 * len(pksnrpsprots)))
+viewbox = "0 -30 " + str(screenwidth * 0.8) + " " + str(185 + 70 * len(pksnrpsprots))
+s.set_viewBox(viewbox)
+s.set_preserveAspectRatio("none")
+#Add line behind gene arrows
+oh = ShapeBuilder()
+group = g()
+group.addElement(oh.createLine(10,60,10 + (screenwidth * 0.75),60, strokewidth = 2, stroke = "grey"))
+s.addElement(group)
+#Add gene arrows
+a = 0
+y = 0
+for x in range(nrgenes):
+group = g()
+#group.addElement(_gene_label(rel_starts[a],rel_ends[a],genes[a],y,screenwidth))
+group.addElement(_gene_arrow(10 + rel_starts[a],10 + rel_ends[a],strands[a],colors[a],65,10))
+#Can be used for domains
+#   group.addElement(oh.createRect(rel_starts[a],45,(rel_ends[a]-rel_starts[a]),10, strokewidth = 2, stroke = "black", fill="#237845"))
+group.set_id("a" + str(qclusternr) + "_00%s"%x)
+s.addElement(group)
# y toggles between 0 and 1, but its only consumer is the commented-out
# _gene_label call above.
+if y == 0:
+y = 1
+elif y == 1:
+y = 0
+a += 1
+#Add domain depictions
+oh = ShapeBuilder()
+group = g()
+#Determine longest protein to decide on scaling
+longestprot = 0
+protlengthdict = {}
+for i in pksnrpsprots:
# Span between the two geneposdict values divided by 3 (presumably nucleotides
# to amino acids -- TODO confirm).
+protlength = (geneposdict[i][1] - geneposdict[i][0]) / 3
+protlengthdict[i] = protlength
+if protlength > longestprot:
+longestprot = protlength
# z selects the row of the current protein (y = 125 + z * 60); w numbers the
# domain groups for their "b<qclusternr>_00<w>" ids; the *nr counters number the
# gradient ids per domain type.
+z = 1
+w = 0
+ksnr = 1
+atnr = 1
+dhnr = 1
+krnr = 1
+ernr = 1
+acpnr = 1
+cnr = 1
+enr = 1
+anr = 1
+pcpnr = 1
+tenr = 1
+othernr = 1
+for i in pksnrpsprots:
+domains = pksnrpsdomains[i][0]
+domainsdict = pksnrpsdomains[i][1]
+protlength = protlengthdict[i]
# NOTE(review): the line is added to whatever `group` currently refers to. On
# the first protein that is the empty group created above; on later proteins it
# looks like the last domain group of the previous protein, which has already
# been added to s -- confirm against the properly indented file.
+group.addElement(oh.createLine(10,(125 + z * 60 ),10 + ((float(protlength) / float(longestprot)) * (screenwidth * 0.75)),(125 + z * 60 ), strokewidth = 1, stroke = "grey"))
+s.addElement(group)
# Bare except: any error in this expression falls back to a ratio of 0.1.
+try:
+aa2pixelratio = longestprot * 0.75 / screenwidth
+except:
+aa2pixelratio = 0.1
+#print 'logestprot', longestprot
+#print 'scrennwidth', screenwidth
+#print aa2pixelratio
+myStyle = StyleBuilder()
+myStyle.setFontFamily(fontfamily="MS Reference Sans Serif")
+myStyle.setFontWeight(fontweight='bold')
+myStyle.setFontSize('12px')
# One branch per domain type, selected by substring tests on the domain name j;
# the first matching test wins, so the order of the branches matters.
# Every branch follows the same recipe:
#   c, d -- two LinearGradient definitions (fill and outline, colours reversed)
#   e    -- rounded rectangle positioned from startpos/endpos and aa2pixelratio
#   f    -- text label: default 12px font; 8px when the rectangle is 20 to <100
#           wide; the string "notext" (no label) when it is narrower than 20
#   then a new group with id "b<qclusternr>_00<w>" is filled and added to s,
#   and the per-type counter is incremented.
+for j in domains:
+startpos = domainsdict[j][0]
+endpos = domainsdict[j][1]
+if "PKS_KS" in j:
+c = LinearGradient("#08B208","#81F781","KS_domain"+str(qclusternr) + "_" + str(ksnr))
+d = LinearGradient("#81F781","#08B208","KS_line"+str(qclusternr) + "_" + str(ksnr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#KS_line' + str(qclusternr) + "_" + str(ksnr) + ")",fill="url(#KS_domain" + str(qclusternr) + "_" + str(ksnr) + ")")
+f = text("KS",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A2A0A')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("KS",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#3B0B0B')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+ksnr += 1
+elif "PKS_AT" in j:
+c = LinearGradient("#DC0404","#F78181","AT_domain"+str(qclusternr) + "_" + str(atnr))
+d = LinearGradient("#F78181","#DC0404","AT_line"+str(qclusternr) + "_" + str(atnr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#AT_line' + str(qclusternr) + "_" + str(atnr) + ")",fill="url(#AT_domain" + str(qclusternr) + "_" + str(atnr) + ")")
+f = text("AT",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#2A1B0A')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("AT",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#2A1B0A')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+atnr += 1
+elif "PKS_DH" in j:
+c = LinearGradient("#B45F04","#F7BE81","DH_domain"+str(qclusternr) + "_" + str(dhnr))
+d = LinearGradient("#F7BE81","#B45F04","DH_line"+str(qclusternr) + "_" + str(dhnr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#DH_line' + str(qclusternr) + "_" + str(dhnr) + ")",fill="url(#DH_domain" + str(qclusternr) + "_" + str(dhnr) + ")")
+f = text("DH",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#3B0B0B')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("DH",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#3B0B0B')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+dhnr += 1
+elif "PKS_KR" in j:
+c = LinearGradient("#089E4B","#81F781","KR_domain"+str(qclusternr) + "_" + str(krnr))
+d = LinearGradient("#81F781","#089E4B","KR_line"+str(qclusternr) + "_" + str(krnr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#KR_line' + str(qclusternr) + "_" + str(krnr) + ")",fill="url(#KR_domain" + str(qclusternr) + "_" + str(krnr) + ")")
+f = text("KR",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A2A1B')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("KR",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A2A1B')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+krnr += 1
+elif "PKS_ER" in j:
+c = LinearGradient("#089E85","#81F7F3","ER_domain"+str(qclusternr) + "_" + str(ernr))
+d = LinearGradient("#81F7F3","#089E85","ER_line"+str(qclusternr) + "_" + str(ernr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#ER_line' + str(qclusternr) + "_" + str(ernr) + ")",fill="url(#ER_domain" + str(qclusternr) + "_" + str(ernr) + ")")
+f = text("ER",((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A2A29')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("ER",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A2A29')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+ernr += 1
+elif "ACP" in j:
+c = LinearGradient("#084BC6","#81BEF7","ACP_domain"+str(qclusternr) + "_" + str(acpnr))
+d = LinearGradient("#81BEF7","#084BC6","ACP_line"+str(qclusternr) + "_" + str(acpnr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#ACP_line' + str(qclusternr) + "_" + str(acpnr) + ")",fill="url(#ACP_domain" + str(qclusternr) + "_" + str(acpnr) + ")")
+f = text("ACP",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A1B2A')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("ACP",((-2 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A1B2A')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+acpnr += 1
# "C" in j is a plain substring test, so names containing ACP, PCP, NRPS-COM or
# CAL are excluded explicitly here.
+elif ("C" in j or "Heterocyclization" in j ) and "ACP" not in j and "PCP" not in j and "NRPS-COM" not in j and "CAL" not in j:
+c = LinearGradient("#393989","#8181F7","C_domain"+str(qclusternr) + "_" + str(cnr))
+d = LinearGradient("#8181F7","#393989","C_line"+str(qclusternr) + "_" + str(cnr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#C_line' + str(qclusternr) + "_" + str(cnr) + ")",fill="url(#C_domain" + str(qclusternr) + "_" + str(cnr) + ")")
+f = text("C",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A0A2A')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("C",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A0A2A')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+cnr += 1
+elif "Epimerization" in j and "ER" not in j and "TE" not in j:
+c = LinearGradient("#393989","#8181F7","E_domain"+str(qclusternr) + "_" + str(enr))
+d = LinearGradient("#8181F7","#393989","E_line"+str(qclusternr) + "_" + str(enr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#E_line' + str(qclusternr) + "_" + str(enr) + ")",fill="url(#E_domain" + str(qclusternr) + "_" + str(enr) + ")")
+f = text("E",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A0A2A')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("E",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A0A2A')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+enr += 1
+elif ("AMP" in j or "A-OX" in j):
+c = LinearGradient("#56157F","#BE81F7","A_domain"+str(qclusternr) + "_" + str(anr))
+d = LinearGradient("#BE81F7","#56157F","A_line"+str(qclusternr) + "_" + str(anr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#A_line' + str(qclusternr) + "_" + str(anr) + ")",fill="url(#A_domain" + str(qclusternr) + "_" + str(anr) + ")")
+f = text("A",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#1B0A2A')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("A",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#1B0A2A')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+anr += 1
+elif "PCP" in j:
+c = LinearGradient("#084BC6","#81BEF7","PCP_domain"+str(qclusternr) + "_" + str(pcpnr))
+d = LinearGradient("#81BEF7","#084BC6","PCP_line"+str(qclusternr) + "_" + str(pcpnr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#PCP_line' + str(qclusternr) + "_" + str(pcpnr) + ")",fill="url(#PCP_domain" + str(qclusternr) + "_" + str(pcpnr) + ")")
+f = text("PCP",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0A1B2A')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+f = text("PCP",((-2 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0A1B2A')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+pcpnr += 1
# Thioesterase and TD domains share the TE gradients and counter; only the
# label text differs.
+elif "Thioesterase" in j or "TD" in j:
+c = LinearGradient("#750072","#F5A9F2","TE_domain"+str(qclusternr) + "_" + str(tenr))
+d = LinearGradient("#F5A9F2","#750072","TE_line"+str(qclusternr) + "_" + str(tenr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#TE_line' + str(qclusternr) + "_" + str(tenr) + ")",fill="url(#TE_domain" + str(qclusternr) + "_" + str(tenr) + ")")
+if "Thioesterase" in j:
+f = text("TE",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#2A0A29')
+else:
+f = text("TD",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#2A0A29')
+if ((endpos-startpos) / aa2pixelratio) < 100 and ((endpos-startpos) / aa2pixelratio) >= 20:
+myStyle.setFontSize('8px')
+if "Thioesterase" in j:
+f = text("TE",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#2A0A29')
+else:
# NOTE(review): the small-font TD label uses "+ 4" where every other small-font
# label uses "+ 3" -- possibly unintended.
+f = text("TD",((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#2A0A29')
+elif ((endpos-startpos) / aa2pixelratio) < 20:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+tenr += 1
# Fallback for every other domain name: grey gradients, and the label is the
# domain name with all digits removed.
+else:
+c = LinearGradient("#929292","#DBDBDB","other_domain"+str(qclusternr) + "_" + str(othernr))
+d = LinearGradient("#DBDBDB","#929292","other_line"+str(qclusternr) + "_" + str(othernr))
+e = oh.createRect(str(10 + startpos / aa2pixelratio),str((125 + z * 60 ) - 8),str((endpos-startpos) / aa2pixelratio),15,8,strokewidth=1,stroke='url(#other_line' + str(qclusternr) + "_" + str(othernr) + ")",fill="url(#other_domain" + str(qclusternr) + "_" + str(othernr) + ")")
+domname = (((((((((j.replace("0","")).replace("1","")).replace("2","")).replace("3","")).replace("4","")).replace("5","")).replace("6","")).replace("7","")).replace("8","")).replace("9","")
# The horizontal label offset depends on the length of the name; names longer
# than 3 characters or rectangles narrower than 100 switch to smaller fonts, and
# rectangles narrower than 60 get no label at all.
# NOTE(review): none of the first three tests assigns f when domname is empty.
+if len(domname) == 1:
+f = text(domname,((startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0B0B0B')
+elif len(domname) == 2:
+f = text(domname,((-4 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0B0B0B')
+elif len(domname) == 3:
+f = text(domname,((-12 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 4),fill='#0B0B0B')
+if len(domname) > 3 or ((endpos-startpos) / aa2pixelratio) < 100:
+myStyle.setFontSize('8px')
+f = text(domname,((-16 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0B0B0B')
+if len(domname) > 4 and ((endpos-startpos) / aa2pixelratio) < 100:
+myStyle.setFontSize('6px')
+f = text(domname,((-16 + startpos / aa2pixelratio) + 0.5 * ((endpos-startpos) / aa2pixelratio)), ((125 + z * 60 ) + 3),fill='#0B0B0B')
+if ((endpos-startpos) / aa2pixelratio) < 60:
+f = "notext"
+if f != "notext":
+f.set_style(myStyle.getStyle())
+myStyle.setFontSize('12px')
+group = g()
+group.addElement(c)
+group.addElement(d)
+group.addElement(e)
+if f != "notext":
+group.addElement(f)
+group.set_id("b" + str(qclusternr) + "_00%s"%w)
+s.addElement(group)
+othernr += 1
# NOTE(review): presumably w advances once per domain and z once per protein;
# the nesting of these two lines and of the addElement call below is not
# recoverable from this rendering -- verify against the indented file.
+w += 1
+z += 1
+s.addElement(group)
+return s
# calculate_colorgroups: derive a gene -> colour-index mapping for one query cluster.
# Reads queryclusterdata[queryclusternumber] (index 0: number of hit clusters,
# index 1: per-hit-cluster data) and internalhomologygroupsdict[queryclusternumber].
# Returns the tuple (colorschemedict, rgbcolorscheme):
#   colorschemedict -- maps a gene to an integer colour index
#   rgbcolorscheme  -- list from generate_rgbscheme() with "#FFFFFF" appended last
# NOTE(review): indentation was lost in this rendering of the file, so the exact
# nesting of some statements below cannot be confirmed from here.
+def calculate_colorgroups(queryclusternumber,hitclusternumbers,queryclusterdata,internalhomologygroupsdict):
+#Extract data and generate color scheme
# nrhitclusters, queryclustergenesdetails, colorgroupsdict and
# colorgroupslengthlist are filled in below but never read in this function.
+nrhitclusters = queryclusterdata[queryclusternumber][0]
+hitclusterdata = queryclusterdata[queryclusternumber][1]
+queryclustergenes = hitclusterdata[1][3]
+queryclustergenesdetails = hitclusterdata[1][4]
+colorgroupsdict = {}
+colorgroupslengthlist = []
+colorgroupslist = []
+for hitclusternumber in hitclusternumbers:
+colorgroups = hitclusterdata[hitclusternumber][0][hitclusternumber]
+colorgroupsdict[hitclusternumber] = colorgroups
+colorgroupslengthlist.append(len(colorgroups))
+colorgroupslist.append(colorgroups)
# Merge colour groups across hit clusters: for every gene of an internal
# homology group, the members of each colour group containing that gene are
# collected (without duplicates) into one metagroup. Metagroups with more than
# one member are kept, each only once.
+metacolorgroups = []
+internalgroups = internalhomologygroupsdict[queryclusternumber]
+for i in internalgroups:
+metagroup = []
+for j in i:
+for m in colorgroupslist:
+for l in m:
+if j in l:
+#for k in l:
+#  if k not in metagroup:
+#    metagroup.append(k)
+[metagroup.append(k) for k in l if k not in metagroup]
+if len(metagroup) > 1 and metagroup not in metacolorgroups:
+metacolorgroups.append(metagroup)
+#Generate RGB scheme
+rgbcolorscheme = generate_rgbscheme(len(metacolorgroups))
# The extra last entry is the colour used for genes without a colour group
# (callers index it with [-1]).
+rgbcolorscheme.append("#FFFFFF")
+#Create colorschemedict in which all genes that are hits of the same query gene get the same color
+colorschemedict = {}
+z = 0
+for i in queryclustergenes:
+for j in metacolorgroups:
+if i in j:
+for l in j:
# dict.has_key exists in Python 2 only; genes that already have an index keep it.
+if colorschemedict.has_key(l):
+pass
+else:
+colorschemedict[l] = z
+#[colorschemedict[l] = z for l in j if not coloschemedict.has_key(l)]
# z only advances once the current index has actually been handed out.
# NOTE(review): whether this check runs per query gene or per metagroup depends
# on the lost indentation -- verify against the indented file.
+if z in colorschemedict.values():
+z += 1
+return colorschemedict,rgbcolorscheme
# clusterblastresults: draw the query gene cluster and the given hit gene
# clusters as rows of gene arrows in one svg object.
# Reads queryclusterdata[queryclusternumber][1] (per-hit-cluster data), the
# gene -> colour-index mapping colorschemedict, the colour list rgbcolorscheme
# (its last entry is used for genes without a colour index) and the
# module-level name screenwidth.
# Returns [s, [qdata, hdata, strandsbalancedict]] where s is the svg object,
# qdata is [rel_starts, rel_ends, strands, colors] for the query cluster,
# hdata maps each hit cluster number to the same kind of list, and
# strandsbalancedict maps each hit cluster number to its strand balance.
# NOTE(review): indentation was lost in this rendering of the file, so the exact
# nesting of some statements below cannot be confirmed from here.
+def clusterblastresults(queryclusternumber,hitclusternumbers,queryclusterdata,colorschemedict,rgbcolorscheme):
+#print "Generating svg for cluster",queryclusternumber
+#Extract data and generate color scheme
+nrhitclusters = queryclusterdata[queryclusternumber][0]
+hitclusterdata = queryclusterdata[queryclusternumber][1]
+queryclustergenes = hitclusterdata[1][3]
+queryclustergenesdetails = hitclusterdata[1][4]
+colorgroupsdict = {}
+colorgroupslengthlist = []
+colorgroupslist = []
+for hitclusternumber in hitclusternumbers:
+colorgroups = hitclusterdata[hitclusternumber][0][hitclusternumber]
+colorgroupsdict[hitclusternumber] = colorgroups
+colorgroupslengthlist.append(len(colorgroups))
+colorgroupslist.append(colorgroups)
+#Find out whether hit gene cluster needs to be inverted compared to query gene cluster
# Strand balance per hit cluster: +1 for every hit-cluster gene that shares a
# colour group with a query gene and lies on the same strand, -1 when the
# strands differ. A negative total triggers the inversion further below.
+strandsbalancedict = {}
+for m in hitclusternumbers:
+hitclustergenesdetails = hitclusterdata[m][2]
+strandsbalance = 0
+for i in queryclustergenes:
+refstrand = queryclustergenesdetails[i][2]
+for j in colorgroupsdict[m]:
+if i in j:
+for k in j:
+if k in hitclusterdata[m][1] and hitclustergenesdetails[k][2] == refstrand:
+strandsbalance += 1
+elif k in hitclusterdata[m][1] and hitclusterdata[m][2][k][2] != refstrand:
+strandsbalance = strandsbalance - 1
+strandsbalancedict[m] = strandsbalance
+#Generate coordinates for SVG figure
+qnrgenes = len(queryclustergenes)
+qstarts =[]
+qends = []
+qstrands =[]
+qcolors = []
+for i in queryclustergenes:
+qgenedata = queryclustergenesdetails[i]
# Both branches put the larger of the two values into qstarts;
# startendsitescheck() below then swaps each pair into ascending order.
+if qgenedata[0] > qgenedata[1]:
+qstarts.append(qgenedata[0])
+qends.append(qgenedata[1])
+else:
+qstarts.append(qgenedata[1])
+qends.append(qgenedata[0])
+qstrands.append(qgenedata[2])
# Colour entries are either an index into rgbcolorscheme or the marker "white".
+if colorschemedict.has_key(i):
+qcolors.append(colorschemedict[i])
+else:
+qcolors.append("white")
+qstarts_ends = startendsitescheck(qstarts,qends)
+qstarts = qstarts_ends[0]
+qends = qstarts_ends[1]
+hdata = {}
+for m in hitclusternumbers:
+hitclustergenes = hitclusterdata[m][1]
+hitclustergenesdetails = hitclusterdata[m][2]
+hnrgenes = len(hitclustergenes)
+hstarts =[]
+hends = []
+hstrands =[]
+hcolors = []
+for i in hitclustergenes:
+hgenedata = hitclustergenesdetails[i]
+if hgenedata[0] > hgenedata[1]:
+hstarts.append(hgenedata[0])
+hends.append(hgenedata[1])
+else:
+hstarts.append(hgenedata[1])
+hends.append(hgenedata[0])
+hstrands.append(hgenedata[2])
+if colorschemedict.has_key(i):
+hcolors.append(colorschemedict[i])
+else:
+hcolors.append("white")
+#Invert gene cluster if needed
# Inversion mirrors every coordinate around 100000000 (stored as strings),
# flips the strands and reverses the order of all four lists.
+if strandsbalancedict[m] < 0:
+hstarts2 = []
+hends2 = []
+hstrands2 = []
+for i in hstarts:
+hstarts2.append(str(100000000 - int(i)))
+hstarts = hstarts2
+hstarts.reverse()
+for i in hends:
+hends2.append(str(100000000 - int(i)))
+hends = hends2
+hends.reverse()
# NOTE(review): a strand value other than "+" or "-" is dropped here, which
# would leave hstrands shorter than the other lists.
+for i in hstrands:
+if i == "+":
+hstrands2.append("-")
+elif i == "-":
+hstrands2.append("+")
+hstrands = hstrands2
+hstrands.reverse()
+hcolors.reverse()
+hstarts_ends = startendsitescheck(hstarts,hends)
+hstarts = hstarts_ends[0]
+hends = hstarts_ends[1]
+hdata[m] = [hstarts,hends,hstrands,hcolors]
+#Find cluster size of largest cluster of query & all hit clusters assessed
# Cluster size = last end minus first start of the respective lists.
+clustersizes = []
+for m in hitclusternumbers:
+hclustersize = int(hdata[m][1][-1]) - int(hdata[m][0][0])
+clustersizes.append(hclustersize)
+qclustersize = int(qends[-1]) - int(qstarts[0])
+clustersizes.append(qclustersize)
+largestclustersize = max(clustersizes)
# smallestclustersize is computed but not read again in this function.
+smallestclustersize = min(clustersizes)
+#Find relative positions
+qrelpositions = relativepositions(qstarts,qends,largestclustersize)
+qrel_starts = qrelpositions[0]
+qrel_ends = qrelpositions[1]
+qdata = [qrel_starts,qrel_ends,qstrands,qcolors]
+hdata2 = {}
+qdata2 = []
+for m in hitclusternumbers:
+hclustersize = int(hdata[m][1][-1]) - int(hdata[m][0][0])
+hrelpositions = relativepositions(hdata[m][0],hdata[m][1],largestclustersize)
+hrel_starts = hrelpositions[0]
+hrel_ends = hrelpositions[1]
+#Center-align smallest gene cluster
# When this hit cluster is the largest one, the query positions are shifted by
# half the size difference; otherwise the hit positions are shifted.
# NOTE(review): qrel_starts/qrel_ends are reassigned inside this loop, so the
# query shift looks like it would be applied once per hit cluster whose size
# equals the largest size -- confirm whether that is intended.
+if largestclustersize == hclustersize:
+qrel_ends2 = []
+qrel_starts2 = []
+for i in qrel_starts:
+qrel_starts2.append(int(i) + int(float(float((largestclustersize - qclustersize) / 2) / largestclustersize) * screenwidth * 0.75))
+for i in qrel_ends:
+qrel_ends2.append(int(i) + int(float(float((largestclustersize - qclustersize) / 2) / largestclustersize) * screenwidth * 0.75))
+qrel_ends = qrel_ends2
+qrel_starts = qrel_starts2
+else:
+hrel_ends2 = []
+hrel_starts2 = []
+for i in hrel_starts:
+hrel_starts2.append(int(i) + int(float(float((largestclustersize - hclustersize) / 2) / largestclustersize) * screenwidth * 0.75))
+for i in hrel_ends:
+hrel_ends2.append(int(i) + int(float(float((largestclustersize - hclustersize) / 2) / largestclustersize) * screenwidth * 0.75))
+hrel_ends = hrel_ends2
+hrel_starts = hrel_starts2
+hdata2[m] = [hrel_starts,hrel_ends,hdata[m][2],hdata[m][3]]
+qdata2 = [qrel_starts,qrel_ends,qdata[2],qdata[3]]
+hdata = hdata2
+qdata = qdata2
+s = svg(x = 0, y = 0, width = (screenwidth * 0.75), height = (270 + len(hitclusternumbers) * 50))
+viewbox = "0 0 " + str(screenwidth * 0.8) + " " + str(180 + len(hitclusternumbers) * 50)
+s.set_viewBox(viewbox)
+s.set_preserveAspectRatio("none")
+#Add line behind query gene cluster gene arrows
+oh = ShapeBuilder()
+group = g()
+group.addElement(oh.createLine(10,35,10 + (screenwidth * 0.75),35, strokewidth = 1, stroke = "grey"))
+s.addElement(group)
+#Add query gene cluster gene arrows
+a = 0
+y = 0
+for x in range(qnrgenes):
+group = g()
+#group.addElement(_gene_label(rel_starts[a],rel_ends[a],genes[a],y,screenwidth))
+if qcolors[a] == "white":
+group.addElement(_gene_arrow(10 + qrel_starts[a],10 + qrel_ends[a],qstrands[a],rgbcolorscheme[-1],40,10))
+else:
+group.addElement(_gene_arrow(10 + qrel_starts[a],10 + qrel_ends[a],qstrands[a],rgbcolorscheme[qcolors[a]],40,10))
+#Can be used for domains
+#group.addElement(oh.createRect(rel_starts[a],45,(rel_ends[a]-rel_starts[a]),10, strokewidth = 2, stroke = "black", fill="#237845"))
# Element ids: "q<query>_<hit>_<n>" when exactly one hit cluster is drawn,
# "all_<query>_0_<n>" otherwise.
+if len(hitclusternumbers) == 1:
+group.set_id("q" + str(queryclusternumber) + "_" + str(hitclusternumbers[0]) + "_" + "%s"%x)
+else:
+group.set_id("all_" + str(queryclusternumber) + "_0_" + "%s"%x)
+s.addElement(group)
# y toggles between 0 and 1, but its only consumer is the commented-out
# _gene_label call above.
+if y == 0:
+y = 1
+elif y == 1:
+y = 0
+a += 1
# Each hit cluster gets its own row, 50 units below the previous one, in the
# order of hitclusternumbers.
+for m in hitclusternumbers:
+#Add line behind hit gene cluster gene arrows
# NOTE(review): the line is added to whatever `group` currently refers to,
# which looks like the last gene-arrow group created above (already added to s)
# -- confirm against the indented file.
+group.addElement(oh.createLine(10,35 + 50 * (hitclusternumbers.index(m) + 1),10 + (screenwidth * 0.75),35 + 50 * (hitclusternumbers.index(m) + 1), strokewidth = 1, stroke = "grey"))
+s.addElement(group)
+#Add hit gene cluster gene arrows
+hitclustergenes = hitclusterdata[m][1]
+hnrgenes = len(hitclustergenes)
+hrel_starts = hdata[m][0]
+hrel_ends = hdata[m][1]
+hstrands = hdata[m][2]
+hcolors = hdata[m][3]
+a = 0
+y = 0
+for x in range(hnrgenes):
+group = g()
+#group.addElement(_gene_label(rel_starts[a],rel_ends[a],genes[a],y,screenwidth))
+if hcolors[a] == "white":
+group.addElement(_gene_arrow(10 + hrel_starts[a],10 + hrel_ends[a],hstrands[a],rgbcolorscheme[-1],40 + 50 * (hitclusternumbers.index(m) + 1),10))
+else:
+group.addElement(_gene_arrow(10 + hrel_starts[a],10 + hrel_ends[a],hstrands[a],rgbcolorscheme[hcolors[a]],40 + 50 * (hitclusternumbers.index(m) + 1),10))
+#Can be used for domains
+#   group.addElement(oh.createRect(rel_starts[a],45,(rel_ends[a]-rel_starts[a]),10, strokewidth = 2, stroke = "black", fill="#237845"))
+if len(hitclusternumbers) == 1:
+group.set_id("h" + str(queryclusternumber) + "_" + str(m) + "_" + "%s"%x)
+else:
+group.set_id("all_" + str(queryclusternumber) + "_" + str(m) + "_" + "%s"%x)
+s.addElement(group)
+if y == 0:
+y = 1
+elif y == 1:
+y = 0
+a += 1
+return [s,[qdata,hdata,strandsbalancedict]]
def runblast(query):
    """Run blastp for *query* against the ClusterBlast protein database.

    The search is run against ``clusterblast/geneclusterprots.fasta`` below
    the module-level ``antismash_path`` and writes tabular output
    (``-outfmt 6``) to ``<query up to its first '.'>.out``.

    query -- path of the protein FASTA file to search with.

    Returns None. A missing ``blastp`` executable is reported on stderr
    instead of raising, matching the best-effort behaviour of the former
    ``os.system`` call.
    """
    # Local imports keep this block independent of the import order used
    # further down in the file.
    import subprocess
    import sys

    # The output name is derived exactly as before (text up to the first
    # '.'), because callers presumably rebuild the same name to read results.
    outfile = query.split(".")[0] + ".out"
    command = [
        "blastp",
        "-db", antismash_path + "clusterblast/geneclusterprots.fasta",
        "-query", query,
        "-outfmt", "6",
        "-max_target_seqs", "1000",
        "-evalue", "1e-05",
        "-out", outfile,
    ]
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact instead of letting the shell re-split them.
    try:
        subprocess.call(command)
    except OSError:
        sys.stderr.write("Could not start blastp for " + query + "\n")
def _smcog_remove_quietly(path):
    """Delete *path*, ignoring the error raised when it does not exist."""
    try:
        os.remove(path)
    except OSError:
        pass


def _smcog_run_with_timeout(command, timeout=300, poll_interval=2):
    """Run *command* (an argument list) and return its combined stdout/stderr.

    The process is killed once it has run longer than *timeout* seconds.
    Returns None when the program could not be started at all.
    """
    import tempfile

    # Output goes to a temporary file instead of a pipe: polling a process
    # whose pipe nobody reads can stall once the pipe buffer is full.
    capture = tempfile.TemporaryFile()
    try:
        try:
            process = subprocess.Popen(command, stdout=capture,
                                       stderr=subprocess.STDOUT)
        except OSError:
            return None
        deadline = time.time() + timeout
        # "is None" (not "== 0") so a failing run ends the wait immediately
        # instead of being polled until the timeout expires.
        while process.poll() is None:
            if time.time() > deadline:
                try:
                    process.kill()
                except OSError:
                    pass  # process exited between poll() and kill()
                process.wait()
                break
            time.sleep(poll_interval)
        capture.seek(0)
        output = capture.read()
    finally:
        capture.close()
    if not isinstance(output, str):
        output = output.decode("utf-8", "replace")
    return output


def _smcog_succeeded(output):
    """Tell whether tool *output* exists and mentions no exception."""
    return (output is not None and "exception" not in output
            and "Exception" not in output)


def _smcog_read_alignment(path):
    """Read a FASTA alignment; return (names, seqs) with one string per record.

    Sequence lines of a record are joined. Blank lines (including those left
    over from CRLF line ends) are ignored. A missing file gives ([], []).
    """
    names = []
    seqs = []
    try:
        with open(path, "r") as handle:
            text = handle.read()
    except IOError:
        return names, seqs
    for line in text.replace("\r", "\n").split("\n"):
        if not line:
            continue
        if line[0] == ">":
            names.append(line[1:])
            seqs.append("")
        elif seqs:
            seqs[-1] += line
    return names, seqs


def _smcog_conserved_window(seqs):
    """Return (first, last) slice bounds of the conserved alignment core."""
    lenseqs = len(seqs[0])
    nrseqs = len(seqs)
    #Create list system to store conservation of residues
    columns = [dict.fromkeys("ABCDEFGHIJKLMNPQRSTUVWXYZ-", 0)
               for _ in range(lenseqs)]
    for seq in seqs:
        for position, residue in enumerate(seq[:lenseqs]):
            column = columns[position]
            # .get() tolerates residue codes outside the table above.
            column[residue] = column.get(residue, 0) + 1

    def is_shared(column):
        ranked = sortdictkeysbyvaluesrev(column)
        # NOTE(review): this tests the count of the SECOND most frequent
        # residue (ranked[1]); the ">33% shared" description suggests
        # ranked[0] was intended. Kept unchanged because altering it would
        # change the trimmed alignments and therefore the resulting trees.
        return ranked[0] != "-" and column[ranked[1]] > (nrseqs // 3)

    #Find first amino acid shared
    first = 0
    for position, column in enumerate(columns):
        if is_shared(column):
            first = position
            break
    #Find last amino acid shared
    last = lenseqs
    for offset, column in enumerate(reversed(columns)):
        if is_shared(column):
            last = lenseqs - offset
            break
    return first, last


def smcog_analysis(inputgenes,inputnr,accessiondict,seqdict,smcogdict,smcogsoutputfolder):
    """Build an smCOG phylogenetic tree image for every gene in *inputgenes*.

    For each gene that has at least one smCOG hit, the protein is aligned to
    the seed alignment of its best smCOG with muscle (profile mode), the
    alignment is trimmed to its conserved core, a tree is computed with
    FastTree and rendered to PNG with TreeGraph. The PNG is copied to
    ``../<smcogsoutputfolder>`` and the intermediate files are removed.

    inputgenes         -- iterable of gene identifiers (keys of the dicts)
    inputnr            -- number used to keep the working file names of
                          parallel runs apart (input<nr>.fasta, tree<nr>.nwk, ...)
    accessiondict      -- unused; kept for interface compatibility
    seqdict            -- gene identifier -> protein sequence
    smcogdict          -- gene identifier -> list of smCOG hits; the name of the
                          first hit up to ':' selects the seed alignment
    smcogsoutputfolder -- output folder, relative to the parent directory

    Returns None. External tool failures skip the affected step or gene
    rather than raising.
    """
    import shutil

    suffix = str(inputnr)
    query_fasta = "input" + suffix + ".fasta"
    aligned_fasta = "muscle" + suffix + ".fasta"
    trimmed_fasta = "trimmed_alignment" + suffix + ".fasta"
    nwkfile = "tree" + suffix + ".nwk"
    xtgfile = "tree" + suffix + ".xtg"
    # Only Windows differs; any other platform uses the bundled binary. The
    # former exact test for 'linux2' left the command undefined elsewhere.
    if sys.platform == 'win32':
        fasttree = "fasttree"
    else:
        fasttree = "./FastTree"
    treegraph = ["java", "-Djava.awt.headless=true", "-jar", "TreeGraph.jar"]

    for k in inputgenes:
        tag = k
        #create input.fasta file with single query sequence to be used as input for MSA
        writefasta([tag],[seqdict[k]],query_fasta)
        if len(smcogdict[k]) == 0:
            continue
        smcog = (smcogdict[k][0][0]).split(":")[0]

        #Align to multiple sequence alignment, output as fasta file
        # Remove the previous gene's alignment first so that a failed muscle
        # run cannot silently reuse it.
        _smcog_remove_quietly(aligned_fasta)
        try:
            subprocess.call(["muscle", "-quiet", "-profile",
                             "-in1", str(smcog).lower() + "_muscle.fasta",
                             "-in2", query_fasta,
                             "-out", aligned_fasta])
        except OSError:
            pass  # muscle missing: handled by the empty-alignment check below
        names, seqs = _smcog_read_alignment(aligned_fasta)
        if not seqs:
            continue

        #Trim alignment: remove all positions before the first and after the
        #last conserved position
        first, last = _smcog_conserved_window(seqs)
        writefasta(names,[seq[first:last] for seq in seqs],trimmed_fasta)

        #Draw phylogenetic tree with fasttree 2.1.1
        try:
            with open(nwkfile, "w") as tree_out:
                subprocess.call([fasttree, "-quiet", "-fastest", "-noml",
                                 trimmed_fasta], stdout=tree_out)
        except (IOError, OSError):
            pass  # TreeGraph below then fails on the missing/empty tree

        #Convert tree to XTG and draw PNG image using TreeGraph
        image_name = tag.split(".")[0] + ".png"
        output = _smcog_run_with_timeout(
            treegraph + ["-convert", nwkfile, "-xtg", xtgfile])
        if _smcog_succeeded(output):
            output = _smcog_run_with_timeout(
                treegraph + ["-image", xtgfile, image_name])
            if _smcog_succeeded(output):
                try:
                    shutil.copy(image_name,
                                os.path.join("..", smcogsoutputfolder))
                except (IOError, OSError):
                    pass  # best effort, like the former shell copy

        # Always clean up; files that were never produced are skipped.
        for leftover in (image_name, xtgfile, trimmed_fasta):
            _smcog_remove_quietly(leftover)
def depict_smile(genecluster,structuresfolder):
    """Render the SMILES file of a gene cluster to PNG images with indigo-depict.

    Reads ``genecluster<N>.smi`` in the current directory and produces
    ``genecluster<N>.png`` plus a 200x150 ``genecluster<N>_icon.png``. When
    the full-size image was created, both images are copied to
    ``../<structuresfolder>`` and the images and the SMILES file are removed
    from the current directory.

    genecluster      -- gene cluster number (used in the file names)
    structuresfolder -- destination folder, relative to the parent directory

    Returns "success" when the full-size image was produced, else "failed".
    """
    import shutil

    base = "genecluster" + str(genecluster)
    smiles_file = base + ".smi"
    image_file = base + ".png"
    icon_file = base + "_icon.png"

    # Only Windows differs; any other platform runs the bundled binary. The
    # former exact tests for 'win32'/'linux2' left the commands undefined on
    # every other sys.platform value.
    if sys.platform == 'win32':
        indigo = "indigo-depict"
    else:
        indigo = "./indigo-depict"
    os.system(indigo + " " + smiles_file + " " + icon_file + " -query -w 200 -h 150")
    os.system(indigo + " " + smiles_file + " " + image_file + " -query")

    if image_file not in getdircontents():
        return "failed"

    destination = os.path.join("..", structuresfolder)
    for produced in (image_file, icon_file):
        try:
            shutil.copy(produced, destination)
        except (IOError, OSError):
            # Best effort, like the former shell copy, but leave a trace.
            sys.stderr.write("Could not copy " + produced + " to " + destination + "\n")
    # The SMILES file is now removed on every platform; the former Linux
    # branch built the delete command for it but never ran it.
    for leftover in (image_file, icon_file, smiles_file):
        try:
            os.remove(leftover)
        except OSError:
            pass
    return "success"
+##Core script
+import os
+from os import system
+import sys
+import multiprocessing
+import time
+from multiprocessing import Process, freeze_support
+import random
+import string
+import itertools
+from pysvg.filter import *
+from pysvg.gradient import *
+from pysvg.linking import *
+from pysvg.script import *
+from pysvg.shape import *
+from pysvg.structure import *
+from pysvg.style import *
+from pysvg.text import *
+from pysvg.builders import *
+from string import ascii_letters
+from pyExcelerator import *
+from pyExcelerator.Workbook import *
+import signal
+import subprocess
+starttime = time.time()
+os.environ['NRPS2BASEDIR'] = os.path.join(os.getcwd(), 'NRPSPredictor2')
#Fix sys.argv input
def fix_quoted_options(arguments):
    """Return *arguments* with multi-word quoted arguments split per word.

    An argument containing more than one double quote is split on spaces.
    A piece that starts with a quote gets a closing quote appended, and a
    piece that ends with one gets an opening quote prepended. All other
    arguments are passed through unchanged.
    """
    fixed = []
    for argument in arguments:
        if argument.count('"') > 1:
            for piece in argument.split(' '):
                if not piece:
                    # Consecutive spaces give empty pieces; indexing them
                    # used to raise IndexError.
                    continue
                if piece[0] == '"':
                    piece = piece + '"'
                elif piece[-1] == '"':
                    piece = '"' + piece
                fixed.append(piece)
        else:
            fixed.append(argument)
    return fixed


options = fix_quoted_options(sys.argv)
sys.argv = options
#Redirect stdout and stderr if GUI-executed
if "--gui" in sys.argv and len(sys.argv) < (sys.argv.index("--gui") + 2):
    # --gui was given as the last argument, i.e. without its y/n value.
    sys.stderr.write("Invalid options input: --gui without n or y\n")
    sys.stdout.write("From the command line, input antismash --help for more information.\n")
    logfile = open("antismash.log","w")
    try:
        logfile.write("Invalid options input: --gui without n or y\n")
    finally:
        logfile.close()
    sys.exit(1)
if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
    # Deliberately left open: this file replaces stdout/stderr for the rest
    # of the run.
    stdoutfile = open("stdout.txt","w")
    sys.stdout = stdoutfile
    sys.stderr = stdoutfile
+if __name__ == '__main__':
+import shutil
+hmmsearch_path = 'hmmsearch'
+hmmscan_path = 'hmmscan'
+antismash_path = '/home/galaxy/bin/antismash-1.1.0/'
+hmms_path = antismash_path + '/hmms/'
+shutil.copytree(antismash_path + '/NRPSPredictor2/', './NRPSPredictor2/')
+shutil.copytree(antismash_path + '/Minowa/', './Minowa/')
+shutil.copytree(antismash_path + '/pkssignatures/', './pkssignatures/')
+shutil.copytree(antismash_path + '/kr_analysis/', './kr_analysis/')
+shutil.copytree(antismash_path + '/docking_analysis/', './docking_analysis/')
+shutil.copytree(antismash_path + '/NRPeditor/', './NRPeditor/')
+shutil.copy(antismash_path + '/search_form.html', './')
+shutil.copy(antismash_path + '/empty.xhtml', './')
+shutil.copytree(antismash_path + '/vis/', './vis/')
+shutil.copytree(antismash_path + '/smcogtree/', './smcogtree/')
+# add freeze support
+freeze_support()
+#Open logfile
+logfile = open("antismash.log","w")
+#Identify screen width
+if sys.platform == ('win32'):
+import ctypes
+user32 = ctypes.windll.user32
+screenwidth = user32.GetSystemMetrics(0)
+if sys.platform == ('linux2'):
+screenwidth = 1024
+#  res = os.popen("xrandr | grep \* | cut -d' ' -f4")  ###FOR SERVER USE###
+#  res = res.read()                                    ###FOR SERVER USE###
+#  screenwidth = int(res.split("x")[0])                ###FOR SERVER USE###
+if screenwidth < 1024:
+screenwidth = 1024
+#temporary for testing
+screenwidth = 1024
+#Reads input
+inputinstructions = "antiSMASH 1.1.0 arguments:\n\nUsage: antismash <query fasta/embl/gbk file>  [options]\n\nOptions (x is an integer number, list x,y,z is a list of integer numbers separated by commas):\n\n--gtransl <x>  : GenBank translation table used for Glimmer (only for FASTA inputs, default: 1)\n1.  The Standard Code\n2.  The Vertebrate Mitochondrial Code\n3.  The Yeast Mitochondrial Code\n4.  The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code\n5.  The Invertebrate Mitochondrial Code\n6.  The Ciliate, Dasycladacean and Hexamita Nuclear Code\n9.  The Echinoderm and Flatworm Mitochondrial Code\n10. The Euplotid Nuclear Code\n11. The Bacterial, Archaeal and Plant Plastid Code\n12. The Alternative Yeast Nuclear Code\n13. The Ascidian Mitochondrial Code\n14. The Alternative Flatworm Mitochondrial Code\n15. Blepharisma Nuclear Code\n16. Chlorophycean Mitochondrial Code\n21. Trematode Mitochondrial Code\n22. Scenedesmus Obliquus Mitochondrial Code\n23. 
Thraustochytrium Mitochondrial Code\n--genomeconf <l/c>  : Genome configuration used for Glimmer: linear / circular (only for FASTA inputs, default: l)\n--minglength <x>  : Glimmer minimal gene length (range 30-120, only for FASTA inputs, default: 90)\n--taxon <p/e>  : Taxonomy: prokaryotic / eukaryotic (default: p)\n--cores <x>  : Number of parallel CPUs to use for threading (default: all)\n--clusterblast <y/n> : Include ClusterBlast gene cluster comparison analysis (default:y)\n--smcogs <y/n> : Include smCOG analysis for functional prediction of genes (default:y)\n--fullblast <y/n> : Include genome-wide BLAST analysis (default:n)\n--fullhmm <y/n> : Include genome-wide PFAM HMM analysis (default:n)\n--blastdbpath <path> : Specify folder containing CLUSEAN blast database (default:clusean/db)\n--pfamdbpath <path> : Specify folder containing PFAM database (default:clusean/db)\n--geneclustertypes <x,y,z> : Gene cluster types to scan for (default:1):\n1 = all\n2 = type I polyketide synthases\n3 = type II polyketide synthases\n4 = type III polyketide synthases\n5 = nonribosomal peptide synthetases\n6 = terpene synthases\n7 = lantibiotics\n8 = bacteriocins\n9 = beta-lactams\n10 = aminoglycosides / aminocyclitols\n11 = aminocoumarins\n12 = siderophores\n13 = ectoines\n14 = butyrolactones\n15 = indoles\n16 = nucleosides\n17 = phosphoglycolipids\n18 = melanins\n19 = others\n--help  : this help screen\n"
+#Check input file format
+if len(sys.argv) < 2 or len(sys.argv[1]) < 1:
+print >> sys.stderr, "Please supply valid name for input file."
+print "Usage: antismash <query fasta/embl/gbk file>  [options]"
+print "From the command line, input antismash --help for more information."
+logfile.write("Input format error. Please supply valid name for infile.\n")
+logfile.write("Usage: antismash <query fasta/embl/gbk file>  [options]\n")
+logfile.write("From the command line, input antismash --help for more information.\n")
+logfile.close()
+sys.exit(1)
+if sys.argv[1] != "--help":
+if len(sys.argv[1].split(".")) < 2 or (sys.argv[1].split(".")[-1] != "embl" and sys.argv[1].split(".")[-1] != "EMBL" and sys.argv[1].split(".")[-1] != "emb" and sys.argv[1].split(".")[-1] != "EMB" and sys.argv[1].split(".")[-1] != "genbank" and sys.argv[1].split(".")[-1] != "GENBANK" and sys.argv[1].split(".")[-1] != "gbk" and sys.argv[1].split(".")[-1] != "GBK" and sys.argv[1].split(".")[-1] != "gb" and sys.argv[1].split(".")[-1] != "GB" and sys.argv[1].split(".")[-1] != "fasta" and sys.argv[1].split(".")[-1] != "FASTA" and sys.argv[1].split(".")[-1] != "fas" and sys.argv[1].split(".")[-1] != "FAS" and sys.argv[1].split(".")[-1] != "fa" and sys.argv[1].split(".")[-1] != "FA"):
+print >> sys.stderr, "No EMBL/GBK/FASTA file submitted as input. Please supply a valid file with .embl / .gbk / .fasta extension. "
+print "Usage: antismash <query fasta/embl/gbk file>  [options]"
+print "From the command line, input antismash --help for more information."
+logfile.write("Input format error. Please supply a valid file with .embl / .gbk / .fasta extension.\n")
+logfile.write("Usage: antismash <query fasta/embl/gbk file>  [options]\n")
+logfile.write("From the command line, input antismash --help for more information.\n")
+logfile.close()
+sys.exit(1)
+#Define input filename and make fixes if necessary
+infile = sys.argv[1]
+try:
+testfile = open(infile,"r").read()
+except(IOError):
+print >> sys.stderr, "Please supply valid name for input file."
+print "Usage: antismash <query fasta/embl/gbk file>  [options]"
+print "From the command line, input antismash --help for more information."
+logfile = open("antismash.log","w")
+logfile.write("Input format error. Please supply valid name for infile.\n")
+logfile.write("Usage: antismash <query fasta/embl/gbk file>  [options]\n")
+logfile.write("From the command line, input antismash --help for more information.\n")
+logfile.close()
+sys.exit(1)
+#Parse absolute paths if found
+absolutepath = "n"
+if "/" in infile or "\\" in infile:
+absolutepath = "y"
+lastpos1 = infile.rfind("\\")
+lastpos2 = infile.rfind("/")
+lastpos = max([lastpos1,lastpos2])
+originpath = infile[:(lastpos + 1)]
+infile = infile[(lastpos + 1):]
+if sys.platform == ('win32'):
+copycommand = 'copy/y "' + originpath + infile + '" ' + infile + ' > nul'
+os.system(copycommand)
+if sys.platform == ('linux2'):
+copycommand = 'cp ' + originpath + infile + " . > /dev/null"
+os.system(copycommand)
+#genomename = ".".join(infile.split(".")[:-1])
+#for i in genomename:
+#  if i in '!"#$%&()*+,./:;=>?@[]^`{|}' or i in "'":
+#    genomename = genomename.replace(i,"")
+#    if "/" in genomename:
+#      genomename = genomename.rpartition("/")[2]
+#    if "\\" in genomename:
+#      genomename = genomename.rpartition("\\")[2]
+genomename = os.path.splitext(os.path.basename(infile))[0]
+if sys.platform == ('linux2'):
+if genomename !=  infile.split(".")[-2]:
+oldinfile = infile.replace("(","\\(").replace(")","\\)").replace("*","\\*").replace("&","\\&").replace("!","\\!").replace("$","\\$").replace("{","\\{").replace("}","\\}").replace("|","\\|").replace("`","\\`").replace("'","\\'").replace('"','\\"').replace('?','\\?')
+infile = genomename + "." + infile.split(".")[-1]
+if "/" in genomename:
+genomename = genomename.rpartition("/")[2]
+if "\\" in genomename:
+genomename = genomename.rpartition("\\")[2]
+os.system("cp " + oldinfile + " " + infile)
+#Define outputfolder
+if absolutepath == "y":
+if sys.platform == ('win32'):
+dir1 = os.popen("dir/w/ad " + originpath)
+dir2 = os.popen("dir/w/ad")
+dir1 = dir1.read()
+dir2 = dir2.read()
+if sys.platform == ('linux2'):
+dir1 = os.popen("ls")
+dir2 = os.popen("ls " + originpath)
+dir1 = dir1.read()
+dir2 = dir2.read()
+parts = dir1.split(" ") + dir2.split(" ")
+else:
+if sys.platform == ('win32'):
+dir = os.popen("dir/w/ad")
+dir = dir.read()
+if sys.platform == ('linux2'):
+dir = os.popen("ls")
+dir = dir.read()
+parts = dir.split(" ")
+parts2 = []
+for i in parts:
+partparts = i.split("\n")
+for i in partparts:
+i = i.replace("[","")
+i = i.replace("]","")
+parts2.append(i)
+parts = parts2
+oldgenomename = genomename
+if genomename in parts:
+genomename = genomename + "_" + str(0)
+while genomename in parts:
+finalpart = genomename.split("_")[-1]
+allnumbers = "y"
+for i in finalpart:
+if i not in ["0","1","2","3","4","5","6","7","8","9"]:
+allnumbers = "n"
+if allnumbers == "y" and int(finalpart) in range(0,1000):
+newgenomename = ""
+for i in genomename.split("_")[:-1]:
+newgenomename = newgenomename + "_" + i
+newgenomename = newgenomename + "_" + str(int(finalpart) + 1)
+genomename = newgenomename[1:]
+genomename = genomename.replace("__","_")
+#Output results folder name for output checking by GUI
+resultslocfile = open("resultsfolder.txt","w")
+resultslocfile.write(os.getcwd() + os.sep + genomename)
+resultslocfile.close()
+#Implement defaults
+glimmertransl_table = str(1)
+genomeconf = "l"
+minglength = str(90)
+cores = "all"
+taxon = "p"
+clusterblast = "y"
+smcogs = "y"
+fullblast = "n"
+fullhmm = "n"
+if sys.platform == ('win32'):
+blastdbpath = '"' + os.getcwd() + "/clusean/db" + '"'
+if sys.platform == ('linux2'):
+blastdbpath = os.getcwd() + "/clusean/db"
+if sys.platform == ('win32'):
+pfamdbpath = '"' + os.getcwd() + "/clusean/db/" + '"'
+if sys.platform == ('linux2'):
+pfamdbpath = os.getcwd() + "/clusean/db/"
+geneclustertypes = [1]
+#Read user-specified options which may override defaults
+if len(sys.argv) > 2 or sys.argv[1] == "--help":
+options = sys.argv
+if "--" in options[-1] and sys.argv[1] != "--help":
+invalidoptions(options[-1])
+#identify option identifiers
+identifiers = []
+for i in options:
+if "--" in i:
+if i not in identifiers:
+identifiers.append(i)
+else:
+invalidoptions("No '--' in given options or option given twice.")
+for i in identifiers:
+if i != "--help":
+value = options[options.index(i) + 1].strip()
+if i == "--gtransl":
+for k in value:
+if k not in ["0","1","2","3","4","5","6","7","8","9"]:
+invalidoptions(i + "input is no number")
+if int(value) in range(1,24) and int(value) != 7 and int(value) != 8 and int(value) != 17 and int(value) != 18 and int(value) != 19 and int(value) != 20:
+glimmertransl_table = value
+else:
+invalidoptions(i)
+elif i == "--genomeconf":
+if value == "l" or value == "c":
+genomeconf = value
+else:
+invalidoptions(i)
+elif i == "--minglength":
+for k in value:
+if k not in ["0","1","2","3","4","5","6","7","8","9"]:
+invalidoptions(i)
+if int(value) in range(30,91):
+minglength = value
+else:
+print >> sys.stderr, "Invalid options input: minimal gene length should be a number between 30-90."
+logfile = open("antismash.log","w")
+logfile.write("Invalid options input: minimal gene length should be a number between 30-90.\n")
+logfile.close()
+sys.exit(1)
+elif i == "--cores":
+for k in value:
+if k not in ["0","1","2","3","4","5","6","7","8","9"]:
+invalidoptions(i)
+if int(value) in range(1,1000):
+cores = int(value)
+else:
+invalidoptions(i)
+elif i == "--taxon":
+if value == "p" or value == "e":
+taxon = value
+else:
+invalidoptions(i)
+elif i == "--clusterblast":
+if value == "y" or value == "n":
+clusterblast = value
+else:
+invalidoptions(i)
+elif i == "--smcogs":
+if value == "y" or value == "n":
+smcogs = value
+else:
+invalidoptions(i)
+elif i == "--fullblast":
+if value == "y" or value == "n":
+fullblast = value
+else:
+invalidoptions(i)
+elif i == "--fullhmm":
+if value == "y" or value == "n":
+fullhmm = value
+else:
+invalidoptions(i)
+elif i == "--glimmer_prediction":
+glimmer_prediction_path = value
+elif i == "--blastdbpath":
+if sys.platform == ('win32'):
+if options[options.index(i) + 1][0] != '"':
+value = '"' + options[options.index(i) + 1] + '"'
+else:
+value = options[options.index(i) + 1]
+if ":\\" in value:
+blastdbpath = value
+elif "\\" in value or "/" in value:
+if value[0] == "\\" or value[0] == "/":
+blastdbpath = os.getcwd() + value
+else:
+blastdbpath = os.getcwd() + "\\" + value
+else:
+blastdbpath = os.getcwd() + "\\" + value
+if sys.platform == ('linux2'):
+value = options[options.index(i) + 1]
+if "\\" in value or "/" in value:
+value = value.replace("\\","/")
+if value[0] == "/":
+blastdbpath = value
+else:
+blastdbpath = os.getcwd() + "/" + value
+else:
+blastdbpath = os.getcwd() + "/" + value
+elif i == "--pfamdbpath":
+if sys.platform == ('win32'):
+if options[options.index(i) + 1][0] != '"':
+value = '"' + options[options.index(i) + 1] + '"'
+else:
+value = options[options.index(i) + 1]
+if ":\\" in value:
+pfamdbpath = value
+elif "\\" in value or "/" in value:
+if value[0] == "\\" or value[0] == "/":
+pfamdbpath = os.getcwd() + value
+else:
+pfamdbpath = os.getcwd() + "\\" + value
+else:
+pfamdbpath = os.getcwd() + "\\" + value
+if sys.platform == ('linux2'):
+value = options[options.index(i) + 1]
+if "\\" in value or "/" in value:
+value = value.replace("\\","/")
+if value[0] == "/":
+pfamdbpath = value
+else:
+pfamdbpath = os.getcwd() + "/" + value
+else:
+pfamdbpath = os.getcwd() + "/" + value
+elif i == "--geneclustertypes":
+if "," not in value and value not in ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19"]:
+invalidoptions(i)
+else:
+types = value.split(",")
+types2 = []
+if "1" in types:
+types2 = [1]
+for j in types:
+if int(j) not in range(1,20):
+invalidoptions(i)
+else:
+types2.append(int(j))
+geneclustertypes = types2
+elif i == "--help":
+print inputinstructions
+sys.exit()
+elif i == "--gui":
+pass
+else:
+invalidoptions(i)
+#Determine number of CPUs used
+if cores == "all":
+try:
+nrcpus = multiprocessing.cpu_count()
+except(IOError,OSError,NotImplementedError):
+nrcpus = 1
+else:
+try:
+nrcpus = multiprocessing.cpu_count()
+except(IOError,OSError,NotImplementedError):
+nrcpus = 1
+if cores < nrcpus:
+nrcpus = cores
+#Create directory structure needed for file storage
+try:
+os.mkdir(genomename)
+except(IOError,OSError):
+pass
+hmmoutputfolder = genomename + "/hmmoutput/"
+try:
+os.mkdir(hmmoutputfolder)
+except(IOError,OSError):
+pass
+nrpspksoutputfolder = genomename + "/nrpspks/"
+try:
+os.mkdir(nrpspksoutputfolder)
+except(IOError,OSError):
+pass
+nrpspredictoroutputfolder = nrpspksoutputfolder + "nrpspredictor/"
+try:
+os.mkdir(nrpspredictoroutputfolder)
+except(IOError,OSError):
+pass
+minowanrpsoutputfolder = nrpspksoutputfolder + "minowanrpspred/"
+try:
+os.mkdir(minowanrpsoutputfolder)
+except(IOError,OSError):
+pass
+minowapksoutputfolder = nrpspksoutputfolder + "minowapkspred/"
+try:
+os.mkdir(minowapksoutputfolder)
+except(IOError,OSError):
+pass
+minowacaloutputfolder = nrpspksoutputfolder + "minowacalpred/"
+try:
+os.mkdir(minowacaloutputfolder)
+except(IOError,OSError):
+pass
+pkssignatureoutputfolder = nrpspksoutputfolder + "pkssignatures/"
+try:
+os.mkdir(pkssignatureoutputfolder)
+except(IOError,OSError):
+pass
+kranalysisoutputfolder = nrpspksoutputfolder + "kr_analysis/"
+try:
+os.mkdir(kranalysisoutputfolder)
+except(IOError,OSError):
+pass
+clusterblastoutputfolder = genomename + "/clusterblast/"
+try:
+os.mkdir(clusterblastoutputfolder)
+except(IOError,OSError):
+pass
+smcogsoutputfolder = genomename + "/smcogs/"
+try:
+os.mkdir(smcogsoutputfolder)
+except(IOError,OSError):
+pass
+substrspecsfolder = genomename + "/substrspecs/"
+try:
+os.mkdir(substrspecsfolder)
+except(IOError,OSError):
+pass
+structuresfolder = genomename + "/structures/"
+try:
+os.mkdir(structuresfolder)
+except(IOError,OSError):
+pass
+svgfolder = genomename + "/svg/"
+try:
+os.mkdir(svgfolder)
+except(IOError,OSError):
+pass
+searchgtrfolder = genomename + "/searchgtr/"
+try:
+os.mkdir(searchgtrfolder)
+except(IOError,OSError):
+pass
+htmlfolder = genomename + "/html/"
+try:
+os.mkdir(htmlfolder)
+except(IOError,OSError):
+pass
+imagesfolder = genomename + "/images/"
+try:
+os.mkdir(imagesfolder)
+except(IOError,OSError):
+pass
+#If input is unannotated GBK/EMBL file, convert to FASTA and use that as input
+if "     CDS             " not in open(infile,"r").read() and "FT   CDS " not in open(infile,"r").read():
+if infile.split(".")[-1] == "embl" or infile.split(".")[-1] == "EMBL" or infile.split(".")[-1] == "emb" or infile.split(".")[-1] == "EMB":
+filetext = open(infile,"r").read()
+if "\nSQ" not in filetext:
+print >> sys.stderr, "Exit: EMBL file not properly formatted, no sequence found."
+logfile = open("antismash.log","w")
+logfile.write("Exit: EMBL file not properly formatted, no sequence found.\n")
+logfile.close()
+sys.exit(1)
+dnaseq = filetext.split("\nSQ")[1]
+dnaseq = cleandnaseq(dnaseq)
+sequence = dnaseq
+if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)):
+print >> sys.stderr, "Protein EMBL file provided. Please provide nucleotide EMBL file."
+sys.exit(1)
+fastafile = open(infile.rpartition(".")[0] + ".fasta","w")
+fastafile.write(">" + infile.rpartition(".")[0] + "|\n")
+fastafile.write(sequence)
+fastafile.close()
+infile = fastafile
+elif infile.split(".")[-1] == "gbk" or infile.split(".")[-1] == "GBK" or infile.split(".")[-1] == "gb" or infile.split(".")[-1] == "GB" or infile.split(".")[-1] == "genbank" or infile.split(".")[-1] == "GENBANK":
+filetext = open(infile,"r").read()
+if "\nORIGIN" not in filetext:
+print >> sys.stderr, "Exit: GBK file not properly formatted, no sequence found."
+logfile = open("antismash.log","w")
+logfile.write("Exit: GBK file not properly formatted, no sequence found.\n")
+logfile.close()
+sys.exit(1)
+dnaseq = filetext.split("\nORIGIN")[1]
+dnaseq = cleandnaseq(dnaseq)
+sequence = dnaseq
+if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)):
+print >> sys.stderr, "Protein GBK file provided. Please provide nucleotide GBK file."
+sys.exit(1)
+fastafile = open(infile.rpartition(".")[0] + ".fasta","w")
+fastafile.write(">" + infile.rpartition(".")[0] + "\n")
+fastafile.write(sequence)
+fastafile.close()
+infile = infile.rpartition(".")[0] + ".fasta"
+#If input is unannotated fasta file, predict genes with Glimmer and create EMBL file. If input is EMBL or GBK file, read input embl/gbk and create input fasta file, read input protein info into memory
+annotated = "y"
+if infile.split(".")[-1] == "fasta" or infile.split(".")[-1] == "FASTA" or infile.split(".")[-1] == "FAS" or infile.split(".")[-1] == "fas" or infile.split(".")[-1] == "FA" or infile.split(".")[-1] == "fa":
+annotated = "n"
+#Check input file formatting
+sequence = get_sequence(infile)
+if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)):
+print >> sys.stderr, "Protein FASTA file provided. Please provide nucleotide FASTA file."
+sys.exit(1)
+nucleotides = ["A","a","C","c","G","g","T","t","N","n"]
+badsequence = "n"
+sequence_name = open(infile,"r").read().partition(">")[2].partition("\n")[0]
+for i in sequence:
+if i not in nucleotides:
+badsequence = "y"
+if badsequence == "y":
+cleaned_sequence = cleandnaseq(sequence)
+badsequence = "n"
+for i in cleaned_sequence:
+if i not in nucleotides:
+badsequence = "y"
+if badsequence == "n":
+writefasta([sequence_name],[cleaned_sequence],infile.rpartition(".")[0] + "_f.fasta")
+infile = infile.rpartition(".")[0] + "_f.fasta"
+else:
+print >>sys.stderr, "Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file."
+logfile = open("antismash.log","w")
+logfile.write("Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file.\n")
+logfile.close()
+sys.exit(1)
+revseq = reverse_complement(sequence)
+seqlength = len(sequence)
+#Print Glimmer notification
+#if taxon == "p":
+#  print "Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome..."
+#elif taxon == "e":
+#  print "Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome..."
+logfile = open("antismash.log","w")
+if taxon == "p":
+logfile.write("Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome...\n")
+elif taxon == "e":
+logfile.write("Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome...\n")
+#logfile.close()
+loginfo = open("antismash.log","r").read()
+#logfile.close()
+#Copying file and changing to folder to prepare for Glimmer3 prediction
+os.mkdir( os.path.join(os.getcwd(), genomename, "geneprediction"))
+if sys.platform == ('win32'):
+os.system("copy/y " + infile + " geneprediction > nul")
+if sys.platform == ('linux2'):
+os.system("cp " + infile + " geneprediction > /dev/null")
+os.chdir( os.path.join(os.getcwd(), genomename, "geneprediction"))
+fastafile = '../../'+infile
+#Find DNA sequence length
+seq = get_sequence(fastafile)
+dnaseqlength = len(seq)
+#Run Glimmer for prokaryotic sequences, GlimmerHMM for eukaryotic sequences
# Prokaryote branch: load externally produced Glimmer predictions.
+if taxon == "p":
# The bare string literal that follows is disabled code (the former in-process
# tigr-glimmer invocation). Its own first line states that prediction is done
# in Galaxy instead; its last line is German for "End of the Glimmer prediction".
+"""
+GlimmerPrediction, not needed since we can predict it in galaxy on our own
+if genomeconf == "l":
+if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+os.popen("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
+else:
+os.system("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
+else:
+if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+os.popen("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
+else:
+os.system("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
+if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+os.popen("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train")
+else:
+os.system("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train")
+if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+os.popen("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train")
+else:
+os.system("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train")
+if genomeconf == "l":
+if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+os.popen("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
+else:
+os.system("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
+else:
+if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+os.popen("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
+else:
+os.system("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
+#Convert glimmer predictions into EMBL with sequence
+glfile = fastafile.rpartition(".")[0] + ".predict"
+Ende der Glimmer-Prediction
+"""
# The prediction table is read from glimmer_prediction_path (assigned outside
# this section) rather than from a locally generated .predict file.
+glfile = glimmer_prediction_path
# EMBL output path: fastafile with everything after its last "." replaced by "embl".
+emblfile = fastafile.rpartition(".")[0] + ".embl"
# Read the whole prediction file. The bare except treats ANY exception as a
# failed prediction. NOTE(review): `file` is not closed in this section.
+try:
+file = open(glfile,"r")
+filetext = file.read()
+except:
# Failure path: message to stderr, then antismash.log is reopened in "w" mode
# (which truncates it) so that it holds only this error line; exit status 1.
+print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11."
+logfile = open("antismash.log","w")
+logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11.\n")
+logfile.close()
+sys.exit(1)
# A prediction file without the substring "orf" anywhere is treated as
# "no genes found"; same truncate-log-and-exit handling as above.
+if "orf" not in filetext:
+print >> sys.stderr, "Glimmer gene prediction failed: no genes found."
+logfile = open("antismash.log","w")
+logfile.write("Glimmer gene prediction failed: no genes found.\n")
+logfile.close()
+sys.exit(1)
# --- Parse the Glimmer prediction table into ORF names and EMBL coordinates ---
# Carriage returns become newlines, then the first and last split entries are
# dropped (presumably the header line and the trailing empty string -- confirm
# against the .predict format).
+filetext = filetext.replace("\r","\n")
+lines = filetext.split("\n")
+lines = lines[1:-1]
# orfnames / starts / ends are consumed by the EMBL feature writer that follows.
# strands is only appended to, and starts2 / ends2 are never touched, anywhere
# in this section.
+orfnames = []
+starts = []
+ends = []
+strands = []
+starts2 = []
+ends2 = []
# firstline stays "y" until the first line with more than 3 columns and a
# "+" or "-" frame has been handled.
+firstline = "y"
+for i in lines:
# Split on single spaces and discard the empty fields produced by runs of
# spaces. The inner loop reuses the name i.
+columns = i.split(" ")
+columns2 = []
+for i in columns:
+if i != "":
+columns2.append(i)
+columns = columns2
# The first character of the 4th column is taken as the strand sign.
+if len(columns) > 3:
+frame = columns[3][0]
+strands.append(frame)
+else:
+frame = ""
# First ORF, forward strand.
+if firstline == "y" and frame == "+" and len(columns) > 3:
+orfname = str(columns[0])
+orfnames.append(orfname)
# Circular genome (genomeconf == "c") with start > end and start beyond the
# halfway point: the ORF is recorded as two coordinate pairs,
# gstart..columns[2] and columns[1]..dnaseqlength.
# gstart is (end % 3) + 1, with a result of 3 replaced by 0.
# NOTE(review): this appends two entries to starts/ends but only one to
# orfnames, while the feature writer indexes starts/ends by ORF position --
# the lists look misaligned after this case; confirm.
+if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)):
+gstart = (int(columns[2]) % 3) + 1
+if gstart == 3:
+gstart = 0
+starts.append(str(gstart))
+ends.append(columns[2])
+starts.append(columns[1])
+ends.append(str(dnaseqlength))
+else:
+starts.append(columns[1])
+ends.append(columns[2])
+firstline = "n"
# First ORF, reverse strand: same logic, with coordinates wrapped as
# "complement(" + start and end + ")".
+elif firstline == "y" and frame == "-" and len(columns) > 3:
+orfname = str(columns[0])
+orfnames.append(orfname)
+if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)):
+gstart = (int(columns[2]) % 3) + 1
+if gstart == 3:
+gstart = 0
+starts.append("complement(" + str(gstart))
+ends.append(columns[2] + ")")
+starts.append("complement(" + columns[1])
+ends.append(str(dnaseqlength) + ")")
+else:
+complstart = "complement(" + str(columns[1])
+starts.append(complstart)
+complend = str(columns[2]) + ")"
+ends.append(str(complend))
+firstline = "n"
# All later ORFs: columns 1 and 2 are stored as given (no wrap-around handling).
+elif frame == "+" and len(columns) > 3:
+orfname = str(columns[0])
+orfnames.append(orfname)
+starts.append(columns[1])
+ends.append(columns[2])
+elif frame == "-" and len(columns) > 3:
+orfname = str(columns[0])
+orfnames.append(orfname)
+complstart = "complement(" + str(columns[1])
+starts.append(complstart)
+complend = str(columns[2]) + ")"
+ends.append(str(complend))
# Nothing parsed: report "Error 10", overwrite the log with the message, exit 1.
+if len(orfnames) == 0:
+print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10."
+logfile = open("antismash.log","w")
+logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10.\n")
+logfile.close()
+sys.exit(1)
# --- Write the EMBL header and gene/CDS features for the Glimmer ORFs ---
+out_file = open(emblfile,"w")
# a is the running index into starts / ends while iterating orfnames.
+a = 0
+#print "Writing EMBL file with Glimmer-predicted genes..."
# The log is reopened in "w" mode (truncating it); the earlier snapshot
# (loginfo) is written back, followed by the new progress line.
+logfile = open("antismash.log","w")
+logfile.write(loginfo)
+logfile.write("Writing EMBL file with Glimmer-predicted genes...\n")
+#logfile.close()
# NOTE(review): the log is re-read while logfile is still open and has not been
# flushed or closed here, so the snapshot may miss the lines just written -- confirm.
+loginfo = open("antismash.log","r").read()
+#logfile.close()
# ID line: division PRO for taxon "p", FUN otherwise.
# NOTE(review): if this code sits inside the `if taxon == "p":` branch, as the
# later `elif taxon == "e":` suggests, the else arms here cannot be reached.
+if taxon == "p":
+out_file.write("ID   A01; SV 1; linear; DNA; STD; PRO; " + str(dnaseqlength) + " BP.\nXX\n")
+else:
+out_file.write("ID   A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n")
# Fixed placeholder accession "A01"; the description line carries genomename.
+out_file.write("AC   A01;\nXX\n")
+out_file.write("DE   " + genomename + ";\nXX\n")
+out_file.write("KW   none;\nXX\n")
+out_file.write("OS   unknown;\n")
+if taxon == "p":
+out_file.write("OC   Eubacteria;\nXX\n")
+else:
+out_file.write("OC   Fungi;\nXX\n")
+out_file.write("RN   [1]\n")
+out_file.write("RT   ;\n")
+out_file.write("RL   Unknown.\nXX\n")
+out_file.write("FH   Key             Location/Qualifiers\nFH\n")
+out_file.write("FT   source          1.." + str(dnaseqlength) + "\n")
# One "gene" and one "CDS" feature per ORF, both with the location
# starts[a]..ends[a] and a /gene qualifier holding the ORF name.
+for i in orfnames:
+out_file.write("FT   gene            ")
+out_file.write(starts[a])
+out_file.write("..")
+out_file.write(ends[a])
+out_file.write("\n")
+out_file.write('FT                   /gene="' + i + '"\n')
+out_file.write("FT   CDS             ")
+out_file.write(starts[a])
+out_file.write("..")
+out_file.write(ends[a])
+out_file.write("\n")
+out_file.write('FT                   /gene="' + i + '"\n')
+a += 1
# Eukaryote branch: load externally produced GlimmerHMM predictions.
+elif taxon == "e":
# The bare string literal that follows is disabled code (the former in-process
# glimmerhmm call); its first line says the tool is run externally in Galaxy.
+"""
+GlimmerHMM is executed extern ... in galaxy and will be provided through glimmer_prediction_path
+if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+os.popen("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g")
+else:
+os.system("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g")
+"""
+#Convert glimmerhmm predictions into EMBL with sequence
+#glfile = fastafile.rpartition(".")[0] + ".predict"
# Prediction file path is supplied from outside this section.
+glfile = glimmer_prediction_path
# EMBL output path: fastafile with everything after its last "." replaced by "embl".
+emblfile = fastafile.rpartition(".")[0] + ".embl"
# Read the whole file and delete every "\r". The bare except treats ANY
# exception as a failed prediction. NOTE(review): `file` is not closed here.
+try:
+file = open(glfile,"r")
+filetext = file.read().replace("\r","")
+except:
# Failure path: message to stderr, antismash.log overwritten ("w" truncates)
# with only this error line, exit status 1.
+print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9."
+logfile = open("antismash.log","w")
+logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9.\n")
+logfile.close()
+sys.exit(1)
# A file without the substring "CDS" anywhere is treated as "no genes found".
+if "CDS" not in filetext:
+print >> sys.stderr, "GlimmerHMM gene prediction failed: no genes found."
+logfile = open("antismash.log","w")
+logfile.write("GlimmerHMM gene prediction failed: no genes found.\n")
+logfile.close()
+sys.exit(1)
# --- Parse the GlimmerHMM output into ORF names and EMBL location strings ---
# Every "\r" was already deleted when the file was read, so this replace has
# nothing left to change. The first two and the last split entries are then
# dropped (presumably header lines and the trailing empty string -- confirm).
+filetext = filetext.replace("\r","\n")
+lines = filetext.split("\n")
+lines = lines[2:-1]
# positions[k] is the EMBL location string for orfnames[k].
# firstline is assigned here but not read anywhere in this section.
+orfnames = []
+positions = []
+firstline = "y"
# x is used as the index of the current entry in lines (see lines[x + 1]);
# orfnr counts the genes completed so far.
+x = 0
+orfnr = 0
# starts / ends collect the coordinates belonging to the gene currently being read.
+starts = []
+ends = []
+for i in lines:
# Tab-separated record: column 3 = start, column 4 = end, column 6 = strand
# (0-based indices; presumably GFF-style output -- confirm).
+columns = i.split("\t")
+if len(columns) > 1:
# Very first record: remember the strand, and store the coordinates unless the
# line mentions "mRNA".
+if x == 0:
+strand = columns[6]
+if "mRNA" not in i:
+starts.append(columns[3])
+ends.append(columns[4])
# Last record of a gene: either the final line overall, or the next line
# mentions "mRNA". Store its coordinates and emit the finished gene.
+elif x == (len(lines) - 1) or "mRNA" in lines[x + 1]:
+strand = columns[6]
+starts.append(columns[3])
+ends.append(columns[4])
+orfnr += 1
# Build "orf" + the gene number left-padded with zeros to five digits.
# NOTE(review): no branch exists for numbers of six or more digits; in that
# case orfname would keep its previous value -- confirm this cannot occur.
+if len(str(orfnr)) == 1:
+orfname = "orf0000" + str(orfnr)
+elif len(str(orfnr)) == 2:
+orfname = "orf000" + str(orfnr)
+elif len(str(orfnr)) == 3:
+orfname = "orf00" + str(orfnr)
+elif len(str(orfnr)) == 4:
+orfname = "orf0" + str(orfnr)
+elif len(str(orfnr)) == 5:
+orfname = "orf" + str(orfnr)
+orfnames.append(orfname)
# Forward strand: "start..end" for a single segment, otherwise
# "join(s1..e1,s2..e2,...)".
+if strand == "+":
+if len(starts) == 1:
+pos = starts[0] + ".." + ends[0]
+positions.append(pos)
+else:
+pos = "join("
+y = 0
# y indexes ends in step with starts. The comma is skipped when the current
# start string equals the last start string (a comparison by value, not by
# position). The loop variable reuses the name i.
+for i in starts:
+pos = pos + i + ".." + ends[y]
+if i != starts[-1]:
+pos = pos + ","
+y += 1
+pos = pos + ")"
+positions.append(pos)
# Reverse strand: same construction wrapped in "complement(...)".
+elif strand == "-":
+if len(starts) == 1:
+pos = "complement(" + starts[0] + ".." + ends[0] + ")"
+positions.append(pos)
+else:
+pos = "complement(join("
+y = 0
+for i in starts:
+pos = pos + i + ".." + ends[y]
+if i != starts[-1]:
+pos = pos + ","
+y += 1
+pos = pos + "))"
+positions.append(pos)
# Reset the coordinate buffers for the next gene.
+starts = []
+ends = []
# Any other record that does not mention "mRNA": accumulate its coordinates.
+elif "mRNA" not in i:
+starts.append(columns[3])
+ends.append(columns[4])
+x += 1
# Nothing parsed: report "Error 12", overwrite the log with the message, exit 1.
+if len(orfnames) == 0:
+print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error: 12"
+logfile = open("antismash.log","w")
+logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 12\n")
+logfile.close()
+sys.exit(1)
# --- Write the EMBL header and gene/CDS features for the GlimmerHMM genes ---
+out_file = open(emblfile,"w")
# a is the running index into positions while iterating orfnames.
+a = 0
+#print "Writing EMBL file with GlimmerHMM-predicted genes..."
# The log is reopened in "w" mode (truncating it); the earlier snapshot
# (loginfo) is written back, followed by the new progress line.
+logfile = open("antismash.log","w")
+logfile.write(loginfo)
+logfile.write("Writing EMBL file with GlimmerHMM-predicted genes...\n")
+#logfile.close()
# NOTE(review): the log is re-read while logfile is still open and has not been
# flushed or closed here, so the snapshot may miss the lines just written -- confirm.
+loginfo = open("antismash.log","r").read()
+#logfile.close()
# Header with fixed placeholder values: accession "A01", division FUN,
# classification "Fungi"; the description line carries genomename.
+out_file.write("ID   A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n")
+out_file.write("AC   A01;\nXX\n")
+out_file.write("DE   " + genomename + ";\nXX\n")
+out_file.write("KW   none;\nXX\n")
+out_file.write("OS   unknown;\n")
+out_file.write("OC   Fungi;\nXX\n")
+out_file.write("RN   [1]\n")
+out_file.write("RT   ;\n")
+out_file.write("RL   Unknown.\nXX\n")
+out_file.write("FH   Key             Location/Qualifiers\nFH\n")
+out_file.write("FT   source          1.." + str(dnaseqlength) + "\n")
# One "gene" and one "CDS" feature per ORF, both using the prebuilt location
# string positions[a] and a /gene qualifier holding the ORF name.
+for i in orfnames:
+out_file.write("FT   gene            ")
+out_file.write(positions[a])
+out_file.write("\n")
+out_file.write('FT                   /gene="' + i + '"\n')
+out_file.write("FT   CDS             ")
+out_file.write(positions[a])
+out_file.write("\n")
+out_file.write('FT                   /gene="' + i + '"\n')
+a += 1
# --- Write the EMBL sequence (SQ) section and close the file ---
# SQ header line: total length, case-insensitive counts of A, C, G and T, and
# "other" = length minus the sum of those four counts.
+out_file.write("XX\nSQ   Sequence " + str(dnaseqlength) + " BP; " + str(seq.count("a") + seq.count("A")) + " A; " + str(seq.count("c") + seq.count("C")) + " C; " + str(seq.count("g") + seq.count("G")) + " G; " + str(seq.count("t") + seq.count("T")) + " T; " + str(dnaseqlength - (seq.count("a") + seq.count("A") + seq.count("c") + seq.count("C") + seq.count("g") + seq.count("G") + seq.count("t") + seq.count("T"))) + " other;\n")
# seq2 is a second name for the same string; it is not read in this section.
+seq2 = seq
# Each sequence line starts with five spaces.
+out_file.write("     ")
# Cut the sequence into groups of grouplen (10) characters. The list holds the
# SAME iterator object grouplen times, so each imap call pulls 10 consecutive
# characters and joins them into one group. Only the part of the sequence whose
# length is a multiple of grouplen is processed this way.
# itertools.imap exists only in Python 2.
+grouplen=10
+textlen = len(seq)
+end = textlen - (textlen % grouplen)
+repeated_iterator = [iter(itertools.islice(seq, 0, end))] * grouplen
+parts = list(itertools.imap(lambda *chars: ''.join(chars),*repeated_iterator))
# Append the leftover tail (shorter than grouplen) as a final group.
+if dnaseqlength%grouplen != 0:
+parts.append(seq[-1 * (dnaseqlength%grouplen):])
# w is the 1-based number of the group being written.
+w = 1
+for l in parts:
+out_file.write(l + " ")
# Last group: pad with spaces, write the total length right-aligned, then the
# "//" terminator. The branches differ only in how many padding spaces are
# written, depending on how full the final line and the final group are.
+if w == len(parts):
+if w%6 == 0 and dnaseqlength%60 != 0:
+out_file.write((" " * (10 - dnaseqlength%grouplen) + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
+elif dnaseqlength%60 == 0:
+out_file.write((" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
+elif w%6 == 5 and dnaseqlength%grouplen == 0:
+out_file.write(("           " + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
+elif dnaseqlength%grouplen != 0:
+out_file.write(" " * (10 - dnaseqlength%grouplen) + "          " * (6 - len(parts)%6) + " " * (6 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
+else:
+out_file.write("          " * (6 - len(parts)%6) + " " * (5 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
# After every 6th group (60 characters): write the running position w*10
# right-aligned in 10 columns, then start a new line with five spaces.
+elif w%6 == 0:
+out_file.write((" " * (10 - len(str(w * 10)))) + str(w * 10) + "\n     ")
+w += 1
+out_file.close()
# --- Return to the start directory and read proteins from the new EMBL file ---
+os.chdir("../../")
# Dropping the first six characters removes the "../../" prefix that fastafile
# (and therefore emblfile) was given before the chdir. From here on infile
# refers to the generated EMBL file.
+infile = emblfile[6:]
+emblfile = emblfile[6:]
# Output folder name depends on the predictor that was used.
# NOTE(review): glimmeroutputfolder is not assigned for any other taxon value,
# so the mkdir below would hit an undefined or stale name in that case.
+if taxon == "p":
+glimmeroutputfolder = genomename + "/glimmer/"
+elif taxon == "e":
+glimmeroutputfolder = genomename + "/glimmerhmm/"
# Best effort: IOError/OSError from mkdir (for example when the folder already
# exists) is ignored.
+try:
+os.mkdir(glimmeroutputfolder)
+except(IOError,OSError):
+pass
# NOTE(review): `sequence` is not assigned anywhere in the visible FASTA path;
# confirm it is defined earlier in the script.
+proteins = embl2proteins(infile,sequence)
# The result is unpacked by index: [1] -> genomic_accnr, [2] -> dnaseqlength,
# [0] -> the protein data, which then replaces `proteins`.
+genomic_accnr = proteins[1]
+dnaseqlength = proteins[2]
+proteins = proteins[0]
# Elements 0 and 1 of the protein data are handed to writefasta (presumably
# names and sequences -- confirm against writefasta).
+writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
# Alternative to the gene-prediction path above: the matching `if` lies before
# this section (presumably the test for a FASTA input -- confirm).
+else:
+#print "Reading embl/gbk file and creating input FASTA file for gene cluster detection..."
+logfile.write("Reading embl/gbk file and creating input FASTA file for gene cluster detection...\n")
# EMBL input: the text after the last "." in infile is embl/EMBL/emb/EMB
# (only these exact spellings; mixed case does not match).
+if infile.split(".")[-1] == "embl" or infile.split(".")[-1] == "EMBL" or infile.split(".")[-1] == "emb" or infile.split(".")[-1] == "EMB":
# An empty string is passed as the sequence argument of embl2proteins.
+sequence = ""
+proteins = embl2proteins(infile,sequence)
# Result unpacked by index: [1] -> genomic_accnr, [2] -> dnaseqlength,
# [0] -> the protein data, which then replaces `proteins`.
+genomic_accnr = proteins[1]
+dnaseqlength = proteins[2]
+proteins = proteins[0]
+writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
# GenBank input: gbk/GBK/gb/GB/genbank/GENBANK; same unpacking as above.
# NOTE(review): for any other extension neither branch assigns `proteins` here.
+elif infile.split(".")[-1] == "gbk" or infile.split(".")[-1] == "GBK" or infile.split(".")[-1] == "gb" or infile.split(".")[-1] == "GB" or infile.split(".")[-1] == "genbank" or infile.split(".")[-1] == "GENBANK":
+proteins = gbk2proteins(infile)
+genomic_accnr = proteins[1]
+dnaseqlength = proteins[2]
+proteins = proteins[0]
+writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
# --- Build per-protein lookup dictionaries from the parsed protein data ---
# Element 4 of the protein data is kept as accessiondict.
+accessiondict = proteins[4]
# All three dictionaries are keyed by the short protein name.
+seqdict = {}
+fullnamedict = {}
+strandsdict = {}
# z indexes proteins[1] in step with the iteration over proteins[0].
+z = 0
+for i in proteins[0]:
# Each entry of proteins[0] is a "|"-separated string: field 4 is used as the
# protein name and field 3 as the strand.
+name = i.split("|")[4]
+seq = proteins[1][z]
+seqdict[name] = seq
+strand = i.split("|")[3]
+strandsdict[name] = strand
+fullnamedict[name] = i
+z += 1
# Seconds elapsed since starttime (set outside this section).
+elapsed = (time.time() - starttime)
+#print "2968Time since start: " + str(elapsed)
+#Run hmmsearch on proteins from input file and parse output
+#print "Performing HMM search on proteins for detection of signature genes..."
+logfile.write("Performing HMM search on proteins for detection of signature genes...\n")
# File names of the profile HMMs to search with; each is appended to hmms_path.
+hmmslist = ["AMP-binding.hmm","BLS.hmm","CAS.hmm","Chal_sti_synt_C.hmm","Chal_sti_synt_N.hmm","Condensation.hmm","ene_KS.hmm","hyb_KS.hmm","itr_KS.hmm","mod_KS.hmm","tra_KS.hmm","LANC_like.hmm","ATd.hmm","PKS_AT.hmm","PKS_KS.hmm","PP-binding.hmm","t2clf.hmm","t2ks.hmm","t2ks2.hmm","Terpene_synth.hmm","Terpene_synth_C.hmm","strH_like.hmm","neoL_like.hmm","DOIS.hmm","valA_like.hmm","spcFG_like.hmm","spcDK_like_cou.hmm","spcDK_like_glyc.hmm","strK_like1.hmm","strK_like2.hmm","bt1fas.hmm","ft1fas.hmm","t2fas.hmm","hglD.hmm","hglE.hmm","fabH.hmm","AfsA.hmm","IucA_IucC.hmm","ectoine_synt.hmm","phytoene_synt.hmm","Lant_dehyd_N.hmm","Lant_dehyd_C.hmm","Antimicrobial18.hmm","Gallidermin.hmm","L_biotic_typeA.hmm","LE-DUF.hmm","LE-LAC481.hmm","LE-LanBC.hmm","LE-MER+2PEP.hmm","MA-2PEPA.hmm","MA-DUF.hmm","MA-EPI.hmm","MA-LAC481.hmm","MA-NIS+EPI.hmm","MA-NIS.hmm","indsynth.hmm","A-OX.hmm","LmbU.hmm","MoeO5.hmm","LipM.hmm","LipU.hmm","LipV.hmm","ToyB.hmm","TunD.hmm","melC.hmm","strepbact.hmm","goadsporin_like.hmm","Antimicrobial14.hmm","Bacteriocin_IId.hmm","BacteriocIIc_cy.hmm","Bacteriocin_II.hmm","Lactococcin.hmm","Antimicrobial17.hmm","Lactococcin_972.hmm","Bacteriocin_IIc.hmm","LcnG-beta.hmm","Bacteriocin_IIi.hmm","Subtilosin_A.hmm","Cloacin.hmm","Neocarzinostat.hmm","Linocin_M18.hmm","TIGR03603.hmm","TIGR03604.hmm","TIGR03605.hmm","TIGR03731.hmm","TIGR03651.hmm","TIGR03678.hmm","TIGR03693.hmm","TIGR03798.hmm","TIGR03882.hmm","TIGR03601.hmm","TIGR03602.hmm","tabtoxin.hmm","cycdipepsynth.hmm","cyanobactin_synth.hmm","fom1.hmm","bcpB.hmm","frbD.hmm","mitE.hmm",'Lycopene_cycl.hmm','terpene_cyclase.hmm','NapT7.hmm','fung_ggpps.hmm','fung_ggpps2.hmm','dmat.hmm','trichodiene_synth.hmm','novK.hmm','novJ.hmm','novI.hmm','novH.hmm','pur6.hmm','pur10.hmm','nikJ.hmm','nikO.hmm','mvnA.hmm','thiostrepton.hmm','NAD_binding_4.hmm','vlmB.hmm','salQ.hmm','prnB.hmm']
# One hmmsearch run per model against <genomename>/genome_proteins.fasta.
# The text before the first "." of the model file name is the output base name:
# full report -> <genomename>/hmmoutput/<base>_output.txt (-o),
# tabular hits -> <genomename>/hmmoutput/<base>.txt (--tblout).
# The command is a plain shell string and the exit status of os.system is not checked.
+for i in hmmslist:
+hmmsearch = hmmsearch_path + " " + "--cpu " + str(nrcpus) + " -o " + genomename + "/hmmoutput/" + i.split(".")[0] + "_output.txt" + " --noali --tblout " + genomename + "/hmmoutput/" + i.split(".")[0] + ".txt " + hmms_path + i + " " + genomename + "/genome_proteins.fasta"
+os.system(hmmsearch)
+#print "Parsing HMM outputs..."
+logfile.write("Parsing HMM outputs...\n")
# Maps a protein name to a list of [domain label, score] pairs; filled by the
# detection sections that follow.
+detecteddomainsdict = {}
+#Extract type I PKS proteins, KS cut-off: 50; AT cut-off: 20; exclude those sequences that score higher on type I FAS HMMs, type IV hglE-like KS domains
# NOTE(review): the comment above states an AT cut-off of 20, but the code
# below passes 50 for PKS_AT.txt -- confirm which value is intended.
+t1pksprots = []
+transatpksprots = []
# Runs only when cluster type 1, 2, 3 or 4 was requested.
+if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
# parsehmmoutput(cutoff, path): its result is used as [0] = protein names and
# [1] = scores at matching positions.
+ks = parsehmmoutput(50,hmmoutputfolder + "PKS_KS.txt")
+at = parsehmmoutput(50,hmmoutputfolder + "PKS_AT.txt")
+ft1fasks = parsehmmoutput(50,hmmoutputfolder + "ft1fas.txt")
+bt1fasks = parsehmmoutput(50,hmmoutputfolder + "bt1fas.txt")
+hgleks = parsehmmoutput(50,hmmoutputfolder + "hglE.txt")
+hgldks = parsehmmoutput(50,hmmoutputfolder + "hglD.txt")
+fabhks = parsehmmoutput(50,hmmoutputfolder + "fabH.txt")
+pksksprots = ks[0]
+pksatprots = at[0]
+pksatscores = at[1]
+pksksscores = ks[1]
+bt1fasprots = bt1fasks[0]
+bt1fasscores = bt1fasks[1]
+ft1fasprots = ft1fasks[0]
+ft1fasscores = ft1fasks[1]
+hgleprots = hgleks[0]
+hglescores = hgleks[1]
+hgldprots = hgldks[0]
+hgldscores = hgldks[1]
+fabhprots = fabhks[0]
+fabhscores = fabhks[1]
# For every KS hit: set exclude to "y" when any competing model (bt1fas,
# ft1fas, hglD, hglE, fabH) scores strictly higher than the PKS_KS model for
# the same protein.
+for i in pksksprots:
+exclude = "n"
+score = pksksscores[pksksprots.index(i)]
+if i in bt1fasprots:
+bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+if float(score) < float(bt1fasscore):
+exclude = "y"
+if i in ft1fasprots:
+ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+if float(score) < float(ft1fasscore):
+exclude = "y"
+if i in hgldprots:
+hgldscore = hgldscores[hgldprots.index(i)]
+if float(score) < float(hgldscore):
+exclude = "y"
+if i in hgleprots:
+hglescore = hglescores[hgleprots.index(i)]
+if float(score) < float(hglescore):
+exclude = "y"
+if i in fabhprots:
+fabhscore = fabhscores[fabhprots.index(i)]
+if float(score) < float(fabhscore):
+exclude = "y"
# A type I PKS needs an AT hit as well as the non-excluded KS hit. Both domain
# labels and scores are recorded (dict.has_key is Python 2 only).
+if i in pksatprots and exclude == "n":
+t1pksprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]])
+detdomlist.append(["PKS acyltransferase domain",pksatscores[pksatprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]],["PKS acyltransferase domain",pksatscores[pksatprots.index(i)]]]
+#Extract trans-AT PKSs: proteins with KS hits but without AT hits, and with trans-AT specific ATd-hits
# Cut-offs passed here: ATd 65, tra_KS 50.
# This section reads pksksprots / pksksscores / t1pksprots from the type I
# section before it.
+atd = parsehmmoutput(65,hmmoutputfolder + "ATd.txt")
+traks = parsehmmoutput(50,hmmoutputfolder + "tra_KS.txt")
+traksprots = traks[0]
+atdprots = atd[0]
+atdscores = atd[1]
# A KS-hit protein counts as trans-AT when it also has an ATd hit and a tra_KS
# hit and was not already classified as type I PKS.
# NOTE(review): the heading says "without AT hits", but the test is
# `i not in t1pksprots`, not absence from pksatprots -- confirm this is intended.
+for i in pksksprots:
+if i in atdprots and i in traksprots and i not in t1pksprots:
+transatpksprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]])
+detdomlist.append(["Trans-AT PKS AT-docking domain",atdscores[atdprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]],["Trans-AT PKS AT-docking domain",atdscores[atdprots.index(i)]]]
+#Extract type II PKS & CLF proteins, KS-cut-off: 50, t2KS/clf score > modKS,eneKS,itrKS,traKS,t1fas,t2fas,hgle scores
+t2pksprots = []
# Runs only when cluster type 1, 2, 3 or 4 was requested.
+if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
# Candidate models: t2ks and t2clf use cut-off 50, t2ks2 uses cut-off 450.
+t2ks = parsehmmoutput(50,hmmoutputfolder + "t2ks.txt")
+t2ks2 = parsehmmoutput(450,hmmoutputfolder + "t2ks2.txt")
+t2clf = parsehmmoutput(50,hmmoutputfolder + "t2clf.txt")
# Competing models, all with cut-off 50. Several of these output files were
# already parsed by the type I / trans-AT sections and are parsed again here.
+eneks = parsehmmoutput(50,hmmoutputfolder + "ene_KS.txt")
+hybks = parsehmmoutput(50,hmmoutputfolder + "hyb_KS.txt")
+modks = parsehmmoutput(50,hmmoutputfolder + "mod_KS.txt")
+itrks = parsehmmoutput(50,hmmoutputfolder + "itr_KS.txt")
+traks = parsehmmoutput(50,hmmoutputfolder + "tra_KS.txt")
+t2fasks = parsehmmoutput(50,hmmoutputfolder + "t2fas.txt")
+ft1fasks = parsehmmoutput(50,hmmoutputfolder + "ft1fas.txt")
+bt1fasks = parsehmmoutput(50,hmmoutputfolder + "bt1fas.txt")
+hgleks = parsehmmoutput(50,hmmoutputfolder + "hglE.txt")
+hgldks = parsehmmoutput(50,hmmoutputfolder + "hglD.txt")
+fabhks = parsehmmoutput(50,hmmoutputfolder + "fabH.txt")
# Unpack each result: index 0 -> protein names, index 1 -> scores.
+t2ksprots = t2ks[0]
+t2ks2prots = t2ks2[0]
+t2clfprots = t2clf[0]
+eneksprots = eneks[0]
+hybksprots = hybks[0]
+modksprots = modks[0]
+itrksprots = itrks[0]
+traksprots = traks[0]
+t2fasprots = t2fasks[0]
+t2ksscores = t2ks[1]
+t2ks2scores = t2ks2[1]
+t2clfscores = t2clf[1]
+eneksscores = eneks[1]
+hybksscores = hybks[1]
+modksscores = modks[1]
+itrksscores = itrks[1]
+traksscores = traks[1]
+t2fasscores = t2fasks[1]
+bt1fasprots = bt1fasks[0]
+bt1fasscores = bt1fasks[1]
+ft1fasprots = ft1fasks[0]
+ft1fasscores = ft1fasks[1]
+hgleprots = hgleks[0]
+hglescores = hgleks[1]
+hgldprots = hgldks[0]
+hgldscores = hgldks[1]
+fabhprots = fabhks[0]
+fabhscores = fabhks[1]
# Type II ketosynthase candidates (t2ks model). type2 starts as "y" and is set
# to "n" when any competing model (ene_KS, hyb_KS, mod_KS, itr_KS, tra_KS,
# bt1fas, ft1fas, t2fas, hglE, fabH) scores strictly higher for the same protein.
+for i in t2ksprots:
+type2 = "y"
+score = t2ksscores[t2ksprots.index(i)]
+if i in eneksprots:
+enescore = eneksscores[eneksprots.index(i)]
+if float(enescore) > float(score):
+type2 = "n"
+if i in hybksprots:
+hybscore = hybksscores[hybksprots.index(i)]
+if float(hybscore) > float(score):
+type2 = "n"
+if i in modksprots:
+modscore = modksscores[modksprots.index(i)]
+if float(modscore) > float(score):
+type2 = "n"
+if i in itrksprots:
+itrscore = itrksscores[itrksprots.index(i)]
+if float(itrscore) > float(score):
+type2 = "n"
+if i in traksprots:
+trascore = traksscores[traksprots.index(i)]
+if float(trascore) > float(score):
+type2 = "n"
+if i in bt1fasprots:
+bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+if float(bt1fasscore) > float(score):
+type2 = "n"
+if i in ft1fasprots:
+ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+if float(ft1fasscore) > float(score):
+type2 = "n"
+if i in t2fasprots:
+t2fasscore = t2fasscores[t2fasprots.index(i)]
+if float(t2fasscore) > float(score):
+type2 = "n"
+if i in hgleprots:
+hglescore = hglescores[hgleprots.index(i)]
+if float(hglescore) > float(score):
+type2 = "n"
+if i in fabhprots:
+fabhscore = fabhscores[fabhprots.index(i)]
+if float(fabhscore) > float(score):
+type2 = "n"
# Accept only proteins not yet listed as type II and not classified as type I;
# record the domain label together with the t2ks score.
+if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
+t2pksprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Type II ketosynthase",t2ksscores[t2ksprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Type II ketosynthase",t2ksscores[t2ksprots.index(i)]]]
# Chain length factor candidates (t2clf model). Same competing-model filter as
# the t2ks loop: type2 becomes "n" when any other KS/FAS model scores strictly higher.
+for i in t2clfprots:
+type2 = "y"
+score = t2clfscores[t2clfprots.index(i)]
+if i in eneksprots:
+enescore = eneksscores[eneksprots.index(i)]
+if float(enescore) > float(score):
+type2 = "n"
+if i in hybksprots:
+hybscore = hybksscores[hybksprots.index(i)]
+if float(hybscore) > float(score):
+type2 = "n"
+if i in modksprots:
+modscore = modksscores[modksprots.index(i)]
+if float(modscore) > float(score):
+type2 = "n"
+if i in itrksprots:
+itrscore = itrksscores[itrksprots.index(i)]
+if float(itrscore) > float(score):
+type2 = "n"
+if i in traksprots:
+trascore = traksscores[traksprots.index(i)]
+if float(trascore) > float(score):
+type2 = "n"
+if i in bt1fasprots:
+bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+if float(bt1fasscore) > float(score):
+type2 = "n"
+if i in ft1fasprots:
+ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+if float(ft1fasscore) > float(score):
+type2 = "n"
+if i in t2fasprots:
+t2fasscore = t2fasscores[t2fasprots.index(i)]
+if float(t2fasscore) > float(score):
+type2 = "n"
+if i in hgleprots:
+hglescore = hglescores[hgleprots.index(i)]
+if float(hglescore) > float(score):
+type2 = "n"
+if i in fabhprots:
+fabhscore = fabhscores[fabhprots.index(i)]
+if float(fabhscore) > float(score):
+type2 = "n"
# A protein already added to t2pksprots by the t2ks loop is skipped here, so it
# does not receive the "Chain length factor" label.
+if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
+t2pksprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Chain length factor",t2clfscores[t2clfprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Chain length factor",t2clfscores[t2clfprots.index(i)]]]
# Type II ketosynthase candidates from the second model (t2ks2, parsed with
# cut-off 450). Same competing-model filter as the two loops before it.
+for i in t2ks2prots:
+type2 = "y"
+score = t2ks2scores[t2ks2prots.index(i)]
+if i in eneksprots:
+enescore = eneksscores[eneksprots.index(i)]
+if float(enescore) > float(score):
+type2 = "n"
+if i in hybksprots:
+hybscore = hybksscores[hybksprots.index(i)]
+if float(hybscore) > float(score):
+type2 = "n"
+if i in modksprots:
+modscore = modksscores[modksprots.index(i)]
+if float(modscore) > float(score):
+type2 = "n"
+if i in itrksprots:
+itrscore = itrksscores[itrksprots.index(i)]
+if float(itrscore) > float(score):
+type2 = "n"
+if i in traksprots:
+trascore = traksscores[traksprots.index(i)]
+if float(trascore) > float(score):
+type2 = "n"
+if i in bt1fasprots:
+bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+if float(bt1fasscore) > float(score):
+type2 = "n"
+if i in ft1fasprots:
+ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+if float(ft1fasscore) > float(score):
+type2 = "n"
+if i in t2fasprots:
+t2fasscore = t2fasscores[t2fasprots.index(i)]
+if float(t2fasscore) > float(score):
+type2 = "n"
+if i in hgleprots:
+hglescore = hglescores[hgleprots.index(i)]
+if float(hglescore) > float(score):
+type2 = "n"
+if i in fabhprots:
+fabhscore = fabhscores[fabhprots.index(i)]
+if float(fabhscore) > float(score):
+type2 = "n"
# Proteins already accepted by the t2ks or t2clf loops are skipped here.
+if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
+t2pksprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Type II ketosynthase, model 2",t2ks2scores[t2ks2prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Type II ketosynthase, model 2",t2ks2scores[t2ks2prots.index(i)]]]
+#Extract type III PKS proteins
+t3pksprots = []
# Runs only when cluster type 1, 2, 3 or 4 was requested.
+if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
# Chalcone/stilbene synthase models: N-terminal cut-off 63, C-terminal cut-off 35.
+t3n = parsehmmoutput(63,hmmoutputfolder + "Chal_sti_synt_N.txt")
+t3c = parsehmmoutput(35,hmmoutputfolder + "Chal_sti_synt_C.txt")
+t3nprots = t3n[0]
+t3nscores = t3n[1]
+t3cprots = t3c[0]
+t3cscores = t3c[1]
# C-terminal hits first: accepted unless the protein is already listed as
# type I, II or III.
+for i in t3cprots:
+if i not in t3pksprots and i not in t1pksprots and i not in t2pksprots:
+t3pksprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Chalcone/stilbene synthase,C-terminus",t3cscores[t3cprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Chalcone/stilbene synthase,C-terminus",t3cscores[t3cprots.index(i)]]]
# N-terminal hits: a protein already accepted through its C-terminal hit is
# skipped by the `i not in t3pksprots` test, so only the C-terminus label is
# recorded for it.
+for i in t3nprots:
+if i not in t3pksprots and i not in t1pksprots and i not in t2pksprots:
+t3pksprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Chalcone/stilbene synthase,N-terminus",t3nscores[t3nprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Chalcone/stilbene synthase,N-terminus",t3nscores[t3nprots.index(i)]]]
+#Extract 'type IV' hglE-like PKS proteins, cut-off:50; only if not already scored as type 1-3 PKS, and not if FAS HMM has higher score
+t4pksprots = []
# Runs only when cluster type 1, 2, 3 or 4 was requested. The hglE / hglD /
# bt1fas / ft1fas / fabH lists are the ones unpacked by the earlier PKS
# sections; only t2fas is parsed again here.
+if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
+t2fasks = parsehmmoutput(50,hmmoutputfolder + "t2fas.txt")
+t2fasprots = t2fasks[0]
+t2fasscores = t2fasks[1]
# HglE-like hits: type4 becomes "n" when bt1fas, ft1fas, t2fas or fabH scores
# strictly higher for the same protein.
+for i in hgleprots:
+type4 = "y"
+score = hglescores[hgleprots.index(i)]
+if i in bt1fasprots:
+bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+if float(bt1fasscore) > float(score):
+type4 = "n"
+if i in ft1fasprots:
+ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+if float(ft1fasscore) > float(score):
+type4 = "n"
+if i in t2fasprots:
+t2fasscore = t2fasscores[t2fasprots.index(i)]
+if float(t2fasscore) > float(score):
+type4 = "n"
+if i in fabhprots:
+fabhscore = fabhscores[fabhprots.index(i)]
+if float(fabhscore) > float(score):
+type4 = "n"
# Accept only proteins not already classified as type I/II/III or trans-AT PKS.
+if i not in t1pksprots and i not in t2pksprots and i not in t3pksprots and i not in transatpksprots and type4 == "y":
+t4pksprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Atypical PKS domain, HglE-like",hglescores[hgleprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Atypical PKS domain, HglE-like",hglescores[hgleprots.index(i)]]]
# HglD-like hits: same filter, with an extra `i not in t4pksprots` test so a
# protein accepted through its HglE hit is not added a second time.
+for i in hgldprots:
+type4 = "y"
+score = hgldscores[hgldprots.index(i)]
+if i in bt1fasprots:
+bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+if float(bt1fasscore) > float(score):
+type4 = "n"
+if i in ft1fasprots:
+ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+if float(ft1fasscore) > float(score):
+type4 = "n"
+if i in t2fasprots:
+t2fasscore = t2fasscores[t2fasprots.index(i)]
+if float(t2fasscore) > float(score):
+type4 = "n"
+if i in fabhprots:
+fabhscore = fabhscores[fabhprots.index(i)]
+if float(fabhscore) > float(score):
+type4 = "n"
+if i not in t1pksprots and i not in t2pksprots and i not in t3pksprots and i not in transatpksprots and type4 == "y" and i not in t4pksprots:
+t4pksprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Atypical PKS domain, HglD-like",hgldscores[hgldprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Atypical PKS domain, HglD-like",hgldscores[hgldprots.index(i)]]]
+#Extract NRPS proteins, C cut-off: 20; A cut-off:20, both should be there, or single domain proteins C,A, or T should be within 20kb of each other or a full NRPS
+nrpsprots = []
# Runs only when cluster type 1 or 5 was requested.
+if 1 in geneclustertypes or 5 in geneclustertypes:
# Condensation and AMP-binding use cut-off 20; A-OX (adenylation domain with
# integrated oxidase) uses cut-off 50.
+cond = parsehmmoutput(20,hmmoutputfolder + "Condensation.txt")
+amp = parsehmmoutput(20,hmmoutputfolder + "AMP-binding.txt")
+ampox = parsehmmoutput(50,hmmoutputfolder + "A-OX.txt")
+ampoxprots = ampox[0]
+ampoxscores = ampox[1]
# NOTE(review): `amp` is the whole parse result (indexed below as amp[0] and
# amp[1]), so `i not in amp` / `amp.append(i)` act on that outer container and
# not on the protein list amp[0]. As written, A-OX hits do not appear to be
# merged into aprots -- confirm against parsehmmoutput's return value.
+for i in ampox[0]:
+if i not in amp:
+amp.append(i)
+cprots = cond[0]
+cscores = cond[1]
+aprots = amp[0]
+ascores = amp[1]
+nrpsprots = []
# Proteins with both a condensation hit and an adenylation hit are NRPS
# proteins; both domain labels and scores are recorded.
+for i in cprots:
+if i in aprots:
+nrpsprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Condensation domain",cscores[cprots.index(i)]])
+if i in aprots:
+detdomlist.append(["Adenylation domain",ascores[aprots.index(i)]])
+elif i in ampoxprots:
+detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in aprots:
+detecteddomainsdict[i] = [["Condensation domain",cscores[cprots.index(i)]],["Adenylation domain",ascores[aprots.index(i)]]]
+elif i in ampoxprots:
+detecteddomainsdict[i] = [["Condensation domain",cscores[cprots.index(i)]],["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]]
# Type I PKS proteins that also carry an adenylation hit are added to the NRPS
# list as well, without a check for an existing entry.
+for i in t1pksprots:
+if i in aprots:
+nrpsprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in aprots:
+detdomlist.append(["Adenylation domain",ascores[aprots.index(i)]])
+elif i in ampoxprots:
+detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in aprots:
+detecteddomainsdict[i] = [["Adenylation domain",ascores[aprots.index(i)]]]
+elif i in ampoxprots:
+detecteddomainsdict[i] = [["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]]
# --- Collect stand-alone A, C and PP-binding proteins and their positions ---
+single_aprots = []
+single_cprots = []
+single_pptprots = []
# PP-binding hits with cut-off 20; only the protein list (index 0) is kept.
+pptprots = parsehmmoutput(20,hmmoutputfolder + "PP-binding.txt")[0]
# "single" = has the respective domain hit but is not in nrpsprots.
+for i in aprots:
+if i not in nrpsprots:
+single_aprots.append(i)
+for i in cprots:
+if i not in nrpsprots:
+single_cprots.append(i)
+for i in pptprots:
+if i not in nrpsprots:
+single_pptprots.append(i)
# Elements 2 and 3 of the protein data: genelist is used for membership tests,
# genedict[name][0] and [1] are read as the two gene coordinates.
+genelist = proteins[2]
+genedict = proteins[3]
# Each *_positions dict maps a protein name to the integer midpoint of its
# gene. Entries are created only for proteins found in genelist.
+single_aprots_positions = {}
+single_cprots_positions = {}
+single_pptprots_positions = {}
+nrpsprots_positions = {}
# min/max make the midpoint independent of the order of the two coordinates.
+for j in single_aprots:
+if j in genelist:
+protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
+protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
+single_aprots_positions[j] = int((protend_abs + protstart_abs) / 2)
+for j in single_cprots:
+if j in genelist:
+protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
+protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
+single_cprots_positions[j] = int((protend_abs + protstart_abs) / 2)
+for j in single_pptprots:
+if j in genelist:
+protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
+protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
+single_pptprots_positions[j] = int((protend_abs + protstart_abs) / 2)
+for j in nrpsprots:
+if j in genelist:
+protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
+protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
+nrpsprots_positions[j] = int((protend_abs + protstart_abs) / 2)
+nrpsprots2 = []
+for i in nrpsprots:
+nrpsprots2.append(i)
+for j in single_aprots:
+include = "n"
+pos = single_aprots_positions[j]
+for i in single_cprots:
+pos2 = single_cprots_positions[i]
+if abs(pos - pos2) < 20000:
+include = "y"
+for i in nrpsprots2:
+pos2 = nrpsprots_positions[i]
+if abs(pos - pos2) < 20000:
+include = "y"
+if include == "y":
+nrpsprots.append(j)
+if detecteddomainsdict.has_key(j):
+detdomlist = detecteddomainsdict[j]
+if j in aprots:
+detdomlist.append(["Adenylation domain",ascores[aprots.index(j)]])
+elif j in ampoxprots:
+detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(j)]])
+detecteddomainsdict[j] = detdomlist
+else:
+if j in aprots:
+detecteddomainsdict[j] = [["Adenylation domain",ascores[aprots.index(j)]]]
+elif j in ampoxprots:
+detecteddomainsdict[j] = [["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(j)]]]
+for j in single_cprots:
+include = "n"
+pos = single_cprots_positions[j]
+for i in single_aprots:
+pos2 = single_aprots_positions[i]
+if abs(pos - pos2) < 20000:
+include = "y"
+for i in nrpsprots2:
+pos2 = nrpsprots_positions[i]
+if abs(pos - pos2) < 20000:
+include = "y"
+if include == "y":
+nrpsprots.append(j)
+if detecteddomainsdict.has_key(j):
+detdomlist = detecteddomainsdict[j]
+detdomlist.append(["Condensation domain",cscores[cprots.index(j)]])
+detecteddomainsdict[j] = detdomlist
+else:
+detecteddomainsdict[j] = [["Condensation domain",cscores[cprots.index(j)]]]
+#Extract terpene-related proteins (Terpene_synth_C plus eight further models), each HMM output parsed with its own cut-off; active only when 1 or 6 is in geneclustertypes
+terpeneprots = []  # stays empty when neither cluster type 1 nor 6 is requested
+if 1 in geneclustertypes or 6 in geneclustertypes:
+terpene = parsehmmoutput(23,hmmoutputfolder + "Terpene_synth_C.txt")  # first argument is presumably the score cut-off -- TODO confirm in parsehmmoutput
+terpeneprots = terpene[0]  # element 0: protein names; the later models append to this same list object
+terpenescores = terpene[1]  # element 1: scores, looked up below by the protein's index in element 0
+for i in terpeneprots:
+if detecteddomainsdict.has_key(i):  # extend an existing domain list, otherwise start a new one
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Terpene synthase, C-terminus",terpenescores[terpeneprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Terpene synthase, C-terminus",terpenescores[terpeneprots.index(i)]]]
+if 1 in geneclustertypes or 6 in geneclustertypes:  # the same guard is repeated for every further model
+physqualdata = parsehmmoutput(20,hmmoutputfolder + "phytoene_synt.txt")
+physqualprots = physqualdata[0]
+physqualscores = physqualdata[1]
+for i in physqualprots:
+if i not in terpeneprots:  # keeps terpeneprots duplicate-free; NOTE(review): indentation is lost in this listing, so whether the annotation below is nested under this test cannot be told from here
+terpeneprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Phytoene/squalene synthase",physqualscores[physqualprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Phytoene/squalene synthase",physqualscores[physqualprots.index(i)]]]
+if 1 in geneclustertypes or 6 in geneclustertypes:
+lycopenedata = parsehmmoutput(80,hmmoutputfolder + "Lycopene_cycl.txt")
+lycopeneprots = lycopenedata[0]
+lycopenescores = lycopenedata[1]
+for i in lycopeneprots:
+if i not in terpeneprots:
+terpeneprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Lycopene cyclase",lycopenescores[lycopeneprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Lycopene cyclase",lycopenescores[lycopeneprots.index(i)]]]
+if 1 in geneclustertypes or 6 in geneclustertypes:
+terpene_cyclasesdata = parsehmmoutput(50,hmmoutputfolder + "terpene_cyclase.txt")
+terpene_cyclases = terpene_cyclasesdata[0]
+terpene_cyclases_scores = terpene_cyclasesdata[1]
+for i in terpene_cyclases:
+if i not in terpeneprots:
+terpeneprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Terpene cyclase",terpene_cyclases_scores[terpene_cyclases.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Terpene cyclase",terpene_cyclases_scores[terpene_cyclases.index(i)]]]
+if 1 in geneclustertypes or 6 in geneclustertypes:
+NapT7 = parsehmmoutput(250,hmmoutputfolder + "NapT7.txt")
+NapT7prots = NapT7[0]
+NapT7scores = NapT7[1]
+for i in NapT7prots:
+if i not in terpeneprots:
+terpeneprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["NapT7",NapT7scores[NapT7prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["NapT7",NapT7scores[NapT7prots.index(i)]]]
+if 1 in geneclustertypes or 6 in geneclustertypes:
+fung_ggpps = parsehmmoutput(420,hmmoutputfolder + "fung_ggpps.txt")
+fung_ggppsprots = fung_ggpps[0]
+fung_ggppsscores = fung_ggpps[1]
+for i in fung_ggppsprots:
+if i not in terpeneprots:
+terpeneprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Fungal geranylgeranyl pyrophosphate synthase, model 1",fung_ggppsscores[fung_ggppsprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Fungal geranylgeranyl pyrophosphate synthase, model 1",fung_ggppsscores[fung_ggppsprots.index(i)]]]
+if 1 in geneclustertypes or 6 in geneclustertypes:
+fung_ggpps2 = parsehmmoutput(312,hmmoutputfolder + "fung_ggpps2.txt")
+fung_ggpps2prots = fung_ggpps2[0]
+fung_ggpps2scores = fung_ggpps2[1]
+for i in fung_ggpps2prots:
+if i not in terpeneprots:
+terpeneprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]]]
+if 1 in geneclustertypes or 6 in geneclustertypes:
+dmat = parsehmmoutput(200,hmmoutputfolder + "dmat.txt")
+dmatprots = dmat[0]
+dmatscores = dmat[1]
+for i in dmatprots:
+if i not in terpeneprots:
+terpeneprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]]]
+if 1 in geneclustertypes or 6 in geneclustertypes:
+trichodiene_synth = parsehmmoutput(150,hmmoutputfolder + "trichodiene_synth.txt")
+trichodiene_synthprots = trichodiene_synth[0]
+trichodiene_synthscores = trichodiene_synth[1]
+for i in trichodiene_synthprots:
+if i not in terpeneprots:
+terpeneprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]]]
+#Extract lantibiotic proteins, LanC cut-off: 80, Lant_dehN & Lant_dehC combination cut-off: 20 each; the 14 further models (lanti1-lanti14) use cut-off 20, except TIGR03731 (18)
+lantprots = []  # stays empty when neither cluster type 1 nor 7 is requested
+if 1 in geneclustertypes or 7 in geneclustertypes:
+lantc = parsehmmoutput(80,hmmoutputfolder + "LANC_like.txt")  # each result is used as [0] = protein names, [1] = scores at the same indices
+lancprots = lantc[0]
+lancscores = lantc[1]
+landehn = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_N.txt")
+landehnprots = landehn[0]
+landehnscores = landehn[1]
+landehc = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_C.txt")
+landehcprots = landehc[0]
+landehcscores = landehc[1]
+lanti1 = parsehmmoutput(20,hmmoutputfolder + "Antimicrobial18.txt")
+lanti1prots = lanti1[0]
+lanti1scores = lanti1[1]
+lanti2 = parsehmmoutput(20,hmmoutputfolder + "Gallidermin.txt")
+lanti2prots = lanti2[0]
+lanti2scores = lanti2[1]
+lanti3 = parsehmmoutput(20,hmmoutputfolder + "L_biotic_typeA.txt")
+lanti3prots = lanti3[0]
+lanti3scores = lanti3[1]
+lanti4 = parsehmmoutput(20,hmmoutputfolder + "LE-DUF.txt")
+lanti4prots = lanti4[0]
+lanti4scores = lanti4[1]
+lanti5 = parsehmmoutput(20,hmmoutputfolder + "LE-LAC481.txt")
+lanti5prots = lanti5[0]
+lanti5scores = lanti5[1]
+lanti6 = parsehmmoutput(20,hmmoutputfolder + "LE-LanBC.txt")
+lanti6prots = lanti6[0]
+lanti6scores = lanti6[1]
+lanti7 = parsehmmoutput(20,hmmoutputfolder + "LE-MER+2PEP.txt")
+lanti7prots = lanti7[0]
+lanti7scores = lanti7[1]
+lanti8 = parsehmmoutput(20,hmmoutputfolder + "MA-2PEPA.txt")
+lanti8prots = lanti8[0]
+lanti8scores = lanti8[1]
+lanti9 = parsehmmoutput(20,hmmoutputfolder + "MA-DUF.txt")
+lanti9prots = lanti9[0]
+lanti9scores = lanti9[1]
+lanti10 = parsehmmoutput(20,hmmoutputfolder + "MA-EPI.txt")
+lanti10prots = lanti10[0]
+lanti10scores = lanti10[1]
+lanti11 = parsehmmoutput(20,hmmoutputfolder + "MA-LAC481.txt")
+lanti11prots = lanti11[0]
+lanti11scores = lanti11[1]
+lanti12 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS+EPI.txt")
+lanti12prots = lanti12[0]
+lanti12scores = lanti12[1]
+lanti13 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS.txt")
+lanti13prots = lanti13[0]
+lanti13scores = lanti13[1]
+lanti14 = parsehmmoutput(18,hmmoutputfolder + "TIGR03731.txt")
+lanti14prots = lanti14[0]
+lanti14scores = lanti14[1]
+lantiprots = lanti1prots + lanti2prots + lanti3prots + lanti4prots + lanti5prots + lanti6prots + lanti7prots + lanti8prots + lanti9prots + lanti10prots + lanti11prots + lanti12prots + lanti13prots + lanti14prots
+lantiprots2 = []  # order-preserving, duplicate-free copy of lantiprots
+for i in lantiprots:
+if i not in lantiprots2:
+lantiprots2.append(i)
+lantiprots = lantiprots2
+for i in lancprots:  # every LanC hit is added to lantprots
+lantprots.append(i)
+if detecteddomainsdict.has_key(i):  # extend an existing domain list, otherwise start a new one
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]]]
+for i in landehnprots:
+if i in landehcprots and i not in lantprots:  # requires both dehydratase halves and skips proteins already in lantprots
+lantprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]])
+detdomlist.append(["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]],["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]]]
+for i in lantiprots:  # the same has_key/else stanza follows once per lanti model (1-14)
+if i not in lantprots:  # NOTE(review): indentation is lost in this listing, so the extent of this test cannot be told from here
+lantprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti1prots:
+detdomlist.append(["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti1prots:
+detecteddomainsdict[i] = [["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti2prots:
+detdomlist.append(["Gallidermin domain",lanti2scores[lanti2prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti2prots:
+detecteddomainsdict[i] = [["Gallidermin domain",lanti2scores[lanti2prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti3prots:
+detdomlist.append(["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti3prots:
+detecteddomainsdict[i] = [["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti4prots:
+detdomlist.append(["LE-DUF domain",lanti4scores[lanti4prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti4prots:
+detecteddomainsdict[i] = [["LE-DUF domain",lanti4scores[lanti4prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti5prots:
+detdomlist.append(["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti5prots:
+detecteddomainsdict[i] = [["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti6prots:
+detdomlist.append(["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti6prots:
+detecteddomainsdict[i] = [["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti7prots:
+detdomlist.append(["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti7prots:
+detecteddomainsdict[i] = [["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti8prots:
+detdomlist.append(["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti8prots:
+detecteddomainsdict[i] = [["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti9prots:
+detdomlist.append(["MA-DUF domain",lanti9scores[lanti9prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti9prots:
+detecteddomainsdict[i] = [["MA-DUF domain",lanti9scores[lanti9prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti10prots:
+detdomlist.append(["MA-EPI domain",lanti10scores[lanti10prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti10prots:
+detecteddomainsdict[i] = [["MA-EPI domain",lanti10scores[lanti10prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti11prots:
+detdomlist.append(["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti11prots:
+detecteddomainsdict[i] = [["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti12prots:
+detdomlist.append(["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti12prots:
+detecteddomainsdict[i] = [["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti13prots:
+detdomlist.append(["MA-NIS domain",lanti13scores[lanti13prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti13prots:
+detecteddomainsdict[i] = [["MA-NIS domain",lanti13scores[lanti13prots.index(i)]]]
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+if i in lanti14prots:
+detdomlist.append(["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+if i in lanti14prots:
+detecteddomainsdict[i] = [["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]]]
# Extract bacteriocin proteins, various cut-offs (the first argument of each
# parsehmmoutput call below). The section only runs when cluster type 1 or 8
# is present in geneclustertypes; otherwise bcinprots stays empty.
#
# Changes compared with the previous revision of this section:
#   * every HMM output file is parsed once instead of twice;
#   * proteins are de-duplicated *before* their domains are recorded, so a
#     protein that hits several models no longer gets each domain entry
#     repeated once per hit (the lantibiotic section already de-duplicates
#     first);
#   * the 26 copy-pasted has_key/else stanzas are driven from one table.
bcinprots = []
if 1 in geneclustertypes or 8 in geneclustertypes:
    # Only elements 0 (protein names) and 1 (scores, looked up by the
    # protein's index in element 0) of each result are used. The individual
    # bcinNprots / bcinNscores names are kept because code outside this
    # section may still refer to them.
    bcin1prots, bcin1scores = parsehmmoutput(50, hmmoutputfolder + "strepbact.txt")[:2]
    bcin2prots, bcin2scores = parsehmmoutput(90, hmmoutputfolder + "Antimicrobial14.txt")[:2]
    bcin3prots, bcin3scores = parsehmmoutput(23, hmmoutputfolder + "Bacteriocin_IId.txt")[:2]
    bcin4prots, bcin4scores = parsehmmoutput(92, hmmoutputfolder + "BacteriocIIc_cy.txt")[:2]
    bcin5prots, bcin5scores = parsehmmoutput(40, hmmoutputfolder + "Bacteriocin_II.txt")[:2]
    bcin6prots, bcin6scores = parsehmmoutput(24, hmmoutputfolder + "Lactococcin.txt")[:2]
    bcin7prots, bcin7scores = parsehmmoutput(31, hmmoutputfolder + "Antimicrobial17.txt")[:2]
    bcin8prots, bcin8scores = parsehmmoutput(25, hmmoutputfolder + "Lactococcin_972.txt")[:2]
    bcin9prots, bcin9scores = parsehmmoutput(27, hmmoutputfolder + "Bacteriocin_IIc.txt")[:2]
    bcin10prots, bcin10scores = parsehmmoutput(78, hmmoutputfolder + "LcnG-beta.txt")[:2]
    bcin11prots, bcin11scores = parsehmmoutput(56, hmmoutputfolder + "Bacteriocin_IIi.txt")[:2]
    bcin12prots, bcin12scores = parsehmmoutput(98, hmmoutputfolder + "Subtilosin_A.txt")[:2]
    bcin13prots, bcin13scores = parsehmmoutput(27, hmmoutputfolder + "Cloacin.txt")[:2]
    bcin14prots, bcin14scores = parsehmmoutput(25, hmmoutputfolder + "Linocin_M18.txt")[:2]
    bcin15prots, bcin15scores = parsehmmoutput(150, hmmoutputfolder + "TIGR03603.txt")[:2]
    bcin16prots, bcin16scores = parsehmmoutput(440, hmmoutputfolder + "TIGR03604.txt")[:2]
    bcin17prots, bcin17scores = parsehmmoutput(200, hmmoutputfolder + "TIGR03605.txt")[:2]
    bcin18prots, bcin18scores = parsehmmoutput(18, hmmoutputfolder + "TIGR03651.txt")[:2]
    bcin19prots, bcin19scores = parsehmmoutput(35, hmmoutputfolder + "TIGR03678.txt")[:2]
    bcin20prots, bcin20scores = parsehmmoutput(400, hmmoutputfolder + "TIGR03693.txt")[:2]
    bcin21prots, bcin21scores = parsehmmoutput(16, hmmoutputfolder + "TIGR03798.txt")[:2]
    bcin22prots, bcin22scores = parsehmmoutput(150, hmmoutputfolder + "TIGR03882.txt")[:2]
    bcin23prots, bcin23scores = parsehmmoutput(50, hmmoutputfolder + "TIGR03601.txt")[:2]
    bcin24prots, bcin24scores = parsehmmoutput(50, hmmoutputfolder + "TIGR03602.txt")[:2]
    bcin25prots, bcin25scores = parsehmmoutput(20, hmmoutputfolder + "mvnA.txt")[:2]
    bcin26prots, bcin26scores = parsehmmoutput(20, hmmoutputfolder + "thiostrepton.txt")[:2]

    # (label stored in detecteddomainsdict, proteins, scores), in the order the
    # models were originally evaluated; that order also fixes the order of the
    # entries recorded per protein.
    # NOTE(review): "TGIR03604" / "TGIR03605" look like typos for "TIGR..."
    # (the files are TIGR03604.txt / TIGR03605.txt). They are left untouched
    # because the labels are emitted at runtime.
    bcin_models = [
        ("Putative Streptomyces bacteriocin", bcin1prots, bcin1scores),
        ("Antimicrobial14 domain", bcin2prots, bcin2scores),
        ("Bacteriocin_IId domain", bcin3prots, bcin3scores),
        ("BacteriocIIc_cy domain", bcin4prots, bcin4scores),
        ("Bacteriocin_II domain", bcin5prots, bcin5scores),
        ("Lactococcin", bcin6prots, bcin6scores),
        ("Antimicrobial17 domain", bcin7prots, bcin7scores),
        ("Lactococcin_972 domain", bcin8prots, bcin8scores),
        ("Bacteriocin_IIc domain", bcin9prots, bcin9scores),
        ("LcnG-beta domain", bcin10prots, bcin10scores),
        ("Bacteriocin_IIi domain", bcin11prots, bcin11scores),
        ("Subtilosin_A domain", bcin12prots, bcin12scores),
        ("Cloacin domain", bcin13prots, bcin13scores),
        ("Linocin_M18 domain", bcin14prots, bcin14scores),
        ("TIGR03603: bacteriocin biosynthesis cyclodehydratase", bcin15prots, bcin15scores),
        ("TGIR03604: bacteriocin biosynthesis docking scaffold", bcin16prots, bcin16scores),
        ("TGIR03605: SagB-type dehydrogenase", bcin17prots, bcin17scores),
        ("TIGR03651: bacteriocin, circularin A/uberolysin family", bcin18prots, bcin18scores),
        ("TIGR03678: bacteriocin, microcyclamide/patellamide family", bcin19prots, bcin19scores),
        ("TIGR03693: thiazole-containing bacteriocin maturation protein", bcin20prots, bcin20scores),
        ("TIGR03798: bacteriocin propeptide", bcin21prots, bcin21scores),
        ("TIGR03882: bacteriocin biosynthesis cyclodehydratase", bcin22prots, bcin22scores),
        ("TIGR03601: bacteriocin, BA_2677 family", bcin23prots, bcin23scores),
        ("TIGR03602: bacteriocin protoxin, streptolysin S family", bcin24prots, bcin24scores),
        ("Bacteriocin, microviridin family", bcin25prots, bcin25scores),
        ("Thiopeptide, thiostrepton-like", bcin26prots, bcin26scores),
    ]

    # Collect every hit protein once, in first-seen order (model 1 first),
    # which is the order the concatenated list used to produce.
    bcinprots2 = []
    seen_bcinprots = set()
    for _label, model_prots, _scores in bcin_models:
        for prot in model_prots:
            if prot not in seen_bcinprots:
                seen_bcinprots.add(prot)
                bcinprots2.append(prot)

    # Record, per protein, one [label, score] entry for every model it hits.
    # As before, the score is the one stored at the protein's first position
    # in that model's protein list.
    for prot in bcinprots2:
        for label, model_prots, model_scores in bcin_models:
            if prot in model_prots:
                detecteddomainsdict.setdefault(prot, []).append(
                    [label, model_scores[model_prots.index(prot)]])
    bcinprots = bcinprots2
+#Extract beta-lactam synthetase proteins, cut-offs: BLS 250, CAS 250, tabtoxin 500
+lactamprots = []  # stays empty when neither cluster type 1 nor 9 is requested
+if 1 in geneclustertypes or 9 in geneclustertypes:
+bls = parsehmmoutput(250,hmmoutputfolder + "BLS.txt")  # result is used as [0] = protein names, [1] = scores at the same indices
+blsprots = bls[0]
+blsscores = bls[1]
+for i in bls[0]:  # bls[0] is the same list as blsprots
+lactamprots.append(i)
+if detecteddomainsdict.has_key(i):  # extend an existing domain list, otherwise start a new one
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Beta-lactam synthase",blsscores[blsprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Beta-lactam synthase",blsscores[blsprots.index(i)]]]
+cas = parsehmmoutput(250,hmmoutputfolder + "CAS.txt")
+casprots = cas[0]
+casscores = cas[1]
+for i in cas[0]:
+if i not in lactamprots:  # avoids listing a protein twice; NOTE(review): indentation is lost in this listing, so whether the annotation below is nested under this test cannot be told from here
+lactamprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Clavulanic acid synthase-like",casscores[casprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Clavulanic acid synthase-like",casscores[casprots.index(i)]]]
+tabtoxin = parsehmmoutput(500,hmmoutputfolder + "tabtoxin.txt")
+tabtoxinprots = tabtoxin[0]
+tabtoxinscores = tabtoxin[1]
+for i in tabtoxin[0]:
+if i not in lactamprots:
+lactamprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["Tabtoxin synthase-like",tabtoxinscores[tabtoxinprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["Tabtoxin synthase-like",tabtoxinscores[tabtoxinprots.index(i)]]]
+#Extract aminoglycoside / aminocyclitol biosynthesis proteins; gene clusters taken from Flatt & Mahmud (2007)
+amglyccyclprots = []
+if 1 in geneclustertypes or 10 in geneclustertypes:
+strH = parsehmmoutput(200,hmmoutputfolder + "strH_like.txt")
+strhprots = strH[0]
+strhscores = strH[1]
+for i in strH[0]:
+amglyccyclprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["StrH-like glycosyltransferase",strhscores[strhprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["StrH-like glycosyltransferase",strhscores[strhprots.index(i)]]]
+strK1 = parsehmmoutput(800,hmmoutputfolder + "strK_like1.txt")
+strk1prots = strK1[0]
+strk1scores = strK1[1]
+for i in strK1[0]:
+amglyccyclprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["StrK-like phosphatase",strk1scores[strk1prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["StrK-like phosphatase",strk1scores[strk1prots.index(i)]]]
+strK2 = parsehmmoutput(650,hmmoutputfolder + "strK_like2.txt")
+strk2prots = strK2[0]
+strk2scores = strK2[1]
+for i in strK2[0]:
+amglyccyclprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["StrK-like phosphatase, model 2",strk2scores[strk2prots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["StrK-like phosphatase, model 2",strk2scores[strk2prots.index(i)]]]
+neoL = parsehmmoutput(50,hmmoutputfolder + "neoL_like.txt")
+neolprots = neoL[0]
+neolscores = neoL[1]
+for i in neoL[0]:
+amglyccyclprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["NeoL-like deacetylase",neolscores[neolprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["NeoL-like deacetylase",neolscores[neolprots.index(i)]]]
+DOIS = parsehmmoutput(500,hmmoutputfolder + "DOIS.txt")
+doisprots = DOIS[0]
+doisscores = DOIS[1]
+for i in DOIS[0]:
+amglyccyclprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["2-deoxy-scyllo-inosose synthase",doisscores[doisprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["2-deoxy-scyllo-inosose synthase",doisscores[doisprots.index(i)]]]
+valA = parsehmmoutput(600,hmmoutputfolder + "valA_like.txt")
+valaprots = valA[0]
+valascores = valA[1]
+for i in valA[0]:
+amglyccyclprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["2-epi-5-epi-valiolone synthase, ValA-like",valascores[valaprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["2-epi-5-epi-valiolone synthase, ValA-like",valascores[valaprots.index(i)]]]
+spcFG = parsehmmoutput(200,hmmoutputfolder + "spcFG_like.txt")
+spcfgprots = spcFG[0]
+spcfgscores = spcFG[1]
+for i in spcFG[0]:
+amglyccyclprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["SpcF/SpcG-like glycosyltransferase",spcfgscores[spcfgprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["SpcF/SpcG-like glycosyltransferase",spcfgscores[spcfgprots.index(i)]]]
+spcDK_glyc = parsehmmoutput(600,hmmoutputfolder + "spcDK_like_glyc.txt")
+spcdkglycprots = spcDK_glyc[0]
+spcdkglycscores = spcDK_glyc[1]
+for i in spcDK_glyc[0]:
+amglyccyclprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["SpcD/SpcK-like thymidylyltransferase",spcdkglycscores[spcdkglycprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["SpcD/SpcK-like thymidylyltransferase",spcdkglycscores[spcdkglycprots.index(i)]]]
+salQ = parsehmmoutput(480,hmmoutputfolder + "salQ.txt")
+salqprots = salQ[0]
+salqscores = salQ[1]
+for i in salqprots:
+amglyccyclprots.append(i)
+if detecteddomainsdict.has_key(i):
+detdomlist = detecteddomainsdict[i]
+detdomlist.append(["2-epi-5-epi-valiolone synthase, SalQ-like",salqscores[salqprots.index(i)]])
+detecteddomainsdict[i] = detdomlist
+else:
+detecteddomainsdict[i] = [["2-epi-5-epi-valiolone synthase, SalQ-like",salqscores[salqprots.index(i)]]]
#Extract aminocoumarin biosynthesis clusters
#aminocoumarinprots collects every protein hit by one of the models below (repeats are kept);
#it stays empty unless 1 or 11 is in geneclustertypes
aminocoumarinprots = []
if 1 in geneclustertypes or 11 in geneclustertypes:
    #Element 0 of each result is used as the hit protein list, element 1 as the parallel score list
    novK = parsehmmoutput(200, hmmoutputfolder + "novK.txt")
    novkprots, novkscores = novK[0], novK[1]
    novJ = parsehmmoutput(350, hmmoutputfolder + "novJ.txt")
    novjprots, novjscores = novJ[0], novJ[1]
    novI = parsehmmoutput(600, hmmoutputfolder + "novI.txt")
    noviprots, noviscores = novI[0], novI[1]
    novH = parsehmmoutput(750, hmmoutputfolder + "novH.txt")
    novhprots, novhscores = novH[0], novH[1]
    spcDK_like_cou = parsehmmoutput(600, hmmoutputfolder + "spcDK_like_cou.txt")
    spcDK_like_cou_prots, spcDK_like_cou_scores = spcDK_like_cou[0], spcDK_like_cou[1]
    #Register the hits model by model, in the same order as the files were parsed
    for hitprots, hitscores, domainname in (
        (novkprots, novkscores, "NovK-like reductase"),
        (novjprots, novjscores, "NovJ-like reductase"),
        (noviprots, noviscores, "NovI-like cytochrome P450"),
        (novhprots, novhscores, "NovH-like protein"),
        (spcDK_like_cou_prots, spcDK_like_cou_scores, "SpcD/SpcK-like thymidylyltransferase, aminocoumarins group"),
    ):
        for prot in hitprots:
            aminocoumarinprots.append(prot)
            #list.index returns the first occurrence, so a protein listed twice reuses its first score
            domainhit = [domainname, hitscores[hitprots.index(prot)]]
            detecteddomainsdict.setdefault(prot, []).append(domainhit)
#Extract siderophores biosynthesis proteins, IucA/C and AlcB
#(only the IucA_IucC result file is parsed in this section)
siderophoreprots = []
if 1 in geneclustertypes or 12 in geneclustertypes:
    siderophore = parsehmmoutput(30, hmmoutputfolder + "IucA_IucC.txt")
    #Element 0 is used as the hit protein list, element 1 as the parallel score list
    siderophoreprots, siderophorescores = siderophore[0], siderophore[1]
    for prot in siderophoreprots:
        domainhit = ["IucA-IucC domain", siderophorescores[siderophoreprots.index(prot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(prot, []).append(domainhit)
#Extract ectoine biosynthesis proteins
#ectprots stays empty unless 1 or 13 is in geneclustertypes
ectprots = []
if 1 in geneclustertypes or 13 in geneclustertypes:
    ect = parsehmmoutput(35, hmmoutputfolder + "ectoine_synt.txt")
    #Element 0 is used as the hit protein list, element 1 as the parallel score list
    ectprots, ectscores = ect[0], ect[1]
    for prot in ectprots:
        domainhit = ["Ectoine synthase", ectscores[ectprots.index(prot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(prot, []).append(domainhit)
#Extract butyrolactone biosynthesis proteins
#butyrprots stays empty unless 1 or 14 is in geneclustertypes
butyrprots = []
if 1 in geneclustertypes or 14 in geneclustertypes:
    butyr = parsehmmoutput(25, hmmoutputfolder + "AfsA.txt")
    #Element 0 is used as the hit protein list, element 1 as the parallel score list
    butyrprots, butyrscores = butyr[0], butyr[1]
    for prot in butyrprots:
        domainhit = ["AfsA butyrolactone synthesis domain", butyrscores[butyrprots.index(prot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(prot, []).append(domainhit)
#Extract indole biosynthesis proteins
#indoleprots stays empty unless 1 or 15 is in geneclustertypes
indoleprots = []
if 1 in geneclustertypes or 15 in geneclustertypes:
    indole = parsehmmoutput(100, hmmoutputfolder + "indsynth.txt")
    #Element 0 is used as the hit protein list, element 1 as the parallel score list
    indoleprots, indolescores = indole[0], indole[1]
    for prot in indoleprots:
        domainhit = ["StaD-like chromopyrrolic acid synthase domain", indolescores[indoleprots.index(prot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(prot, []).append(domainhit)
#Extract nucleoside antibiotic biosynthesis proteins
#nucleoprots collects each protein hit by one of the models below once;
#it stays empty unless 1 or 16 is in geneclustertypes
nucleoprots = []
if 1 in geneclustertypes or 16 in geneclustertypes:
    #Element 0 of each result is used as the hit protein list, element 1 as the parallel score list
    lipm = parsehmmoutput(50, hmmoutputfolder + "LipM.txt")
    lipmprots, lipmscores = lipm[0], lipm[1]
    lipu = parsehmmoutput(30, hmmoutputfolder + "LipU.txt")
    lipuprots, lipuscores = lipu[0], lipu[1]
    lipv = parsehmmoutput(375, hmmoutputfolder + "LipV.txt")
    lipvprots, lipvscores = lipv[0], lipv[1]
    toyb = parsehmmoutput(175, hmmoutputfolder + "ToyB.txt")
    toybprots, toybscores = toyb[0], toyb[1]
    tund = parsehmmoutput(200, hmmoutputfolder + "TunD.txt")
    tundprots, tundscores = tund[0], tund[1]
    pur6 = parsehmmoutput(200, hmmoutputfolder + "pur6.txt")
    pur6prots, pur6scores = pur6[0], pur6[1]
    pur10 = parsehmmoutput(600, hmmoutputfolder + "pur10.txt")
    pur10prots, pur10scores = pur10[0], pur10[1]
    nikj = parsehmmoutput(200, hmmoutputfolder + "nikJ.txt")
    nikjprots, nikjscores = nikj[0], nikj[1]
    niko = parsehmmoutput(400, hmmoutputfolder + "nikO.txt")
    nikoprots, nikoscores = niko[0], niko[1]
    #Register the hits model by model
    for hitprots, hitscores, domainname in (
        (lipmprots, lipmscores, "LipM-like nucleotidyltransferase"),
        (lipuprots, lipuscores, "LipU-like protein"),
        (lipvprots, lipvscores, "LipV-like dehydrogenase"),
        (toybprots, toybscores, "ToyB-like synthase"),
        (tundprots, tundscores, "TunD-like putative N-acetylglucosamine transferase"),
        (pur6prots, pur6scores, "Pur6-like synthetase"),
        (pur10prots, pur10scores, "Pur10-like oxidoreductase"),
        (nikjprots, nikjscores, "NikJ-like protein"),
        (nikoprots, nikoscores, "NikO-like enolpyruvyl transferase"),
    ):
        for prot in hitprots:
            if prot not in nucleoprots:
                nucleoprots.append(prot)
            #The domain hit is recorded even when the protein was already in nucleoprots
            domainhit = [domainname, hitscores[hitprots.index(prot)]]
            detecteddomainsdict.setdefault(prot, []).append(domainhit)
#Extract phosphoglycolipid biosynthesis proteins
#phosphoprots stays empty unless 1 or 17 is in geneclustertypes
phosphoprots = []
if 1 in geneclustertypes or 17 in geneclustertypes:
    phosphogl = parsehmmoutput(65, hmmoutputfolder + "MoeO5.txt")
    #Element 0 is used as the hit protein list, element 1 as the parallel score list
    phosphoprots, phosphoscores = phosphogl[0], phosphogl[1]
    for prot in phosphoprots:
        domainhit = ["MoeO5-like prenyl-3-phosphoglycerate synthase", phosphoscores[phosphoprots.index(prot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(prot, []).append(domainhit)
#Extract melanin biosynthesis proteins
#melaninprots stays empty unless 1 or 18 is in geneclustertypes
melaninprots = []
if 1 in geneclustertypes or 18 in geneclustertypes:
    melanin = parsehmmoutput(40, hmmoutputfolder + "melC.txt")
    #Element 0 is used as the hit protein list, element 1 as the parallel score list
    melaninprots, melaninscores = melanin[0], melanin[1]
    for prot in melaninprots:
        domainhit = ["MelC-like melanin synthase", melaninscores[melaninprots.index(prot)]]
        #Append to the protein's existing domain list, or start a new one
        detecteddomainsdict.setdefault(prot, []).append(domainhit)
#Extract other putative secondary metabolite biosynthesis proteins
#otherprots: proteins that mark a cluster of type "other"
#amp_t_prots: proteins with an adenylation domain plus a PP-binding or NAD_binding_4 domain
#Both stay empty unless 1 or 19 is in geneclustertypes
otherprots = []
amp_t_prots = []
if 1 in geneclustertypes or 19 in geneclustertypes:
    #Element 0 of each parsehmmoutput result is used as the hit protein list, element 1 as the parallel score list
    pptb = parsehmmoutput(20, hmmoutputfolder + "PP-binding.txt")
    pptbprots = pptb[0]
    pptbscores = pptb[1]
    cond = parsehmmoutput(20, hmmoutputfolder + "Condensation.txt")
    amp = parsehmmoutput(20, hmmoutputfolder + "AMP-binding.txt")
    ampprots = amp[0]
    ampscores = amp[1]
    ampox = parsehmmoutput(50, hmmoutputfolder + "A-OX.txt")
    ampoxprots = ampox[0]
    ampoxscores = ampox[1]
    nad4 = parsehmmoutput(40, hmmoutputfolder + "NAD_binding_4.txt")
    nad4prots = nad4[0]
    nad4scores = nad4[1]
    cprots = cond[0]
    #aprots = proteins with either kind of adenylation domain. It must be a COPY of the AMP-binding
    #hit list: extending the list itself would also lengthen ampprots, so that a protein hit only by
    #A-OX would be looked up in the shorter ampscores list (IndexError) instead of in ampoxscores
    aprots = list(ampprots)
    for i in ampoxprots:
        if i not in aprots:
            aprots.append(i)
    #Proteins with both a condensation and an adenylation domain; these are excluded below
    nrpsprots2 = [i for i in cprots if i in aprots]
    tprots = pptbprots
    #Adenylation domain combined with a PP-binding domain (no condensation domain) or with a
    #NAD_binding_4 domain; aminocoumarin signature proteins are skipped in both cases
    for partnerprots, partnerscores, partnername, excludedprots in (
        (tprots, pptbscores, "PP-binding domain", nrpsprots2),
        (nad4prots, nad4scores, "NAD-binding domain 4", []),
    ):
        for i in partnerprots:
            if i not in aprots or i in excludedprots or i in aminocoumarinprots:
                continue
            otherprots.append(i)
            amp_t_prots.append(i)
            domainhits = [[partnername, partnerscores[partnerprots.index(i)]]]
            if i in ampprots:
                domainhits.append(["Adenylation domain", ampscores[ampprots.index(i)]])
            elif i in ampoxprots:
                domainhits.append(["Adenylation domain with integrated oxidase", ampoxscores[ampoxprots.index(i)]])
            #Append to the protein's existing domain list, or start a new one
            detecteddomainsdict.setdefault(i, []).extend(domainhits)
    #Single-model signature proteins of various other cluster types
    lmbu = parsehmmoutput(50, hmmoutputfolder + "LmbU.txt")
    lmbuprots, lmbuscores = lmbu[0], lmbu[1]
    goadsporin = parsehmmoutput(500, hmmoutputfolder + "goadsporin_like.txt")
    goadsporinprots, goadsporinscores = goadsporin[0], goadsporin[1]
    neocarzinostat = parsehmmoutput(28, hmmoutputfolder + "Neocarzinostat.txt")
    neocarzinostatprots, neocarzinostatscores = neocarzinostat[0], neocarzinostat[1]
    cyanobactin = parsehmmoutput(80, hmmoutputfolder + "cyanobactin_synth.txt")
    cyanobactinprots, cyanobactinscores = cyanobactin[0], cyanobactin[1]
    cycdipeptide = parsehmmoutput(110, hmmoutputfolder + "cycdipepsynth.txt")
    cycdipeptideprots, cycdipeptidescores = cycdipeptide[0], cycdipeptide[1]
    fom1 = parsehmmoutput(750, hmmoutputfolder + "fom1.txt")
    fom1prots, fom1scores = fom1[0], fom1[1]
    bcpb = parsehmmoutput(400, hmmoutputfolder + "bcpB.txt")
    bcpbprots, bcpbscores = bcpb[0], bcpb[1]
    frbd = parsehmmoutput(350, hmmoutputfolder + "frbD.txt")
    frbdprots, frbdscores = frbd[0], frbd[1]
    mite = parsehmmoutput(400, hmmoutputfolder + "mitE.txt")
    miteprots, mitescores = mite[0], mite[1]
    vlmb = parsehmmoutput(250, hmmoutputfolder + "vlmB.txt")
    vlmbprots, vlmbscores = vlmb[0], vlmb[1]
    prnb = parsehmmoutput(200, hmmoutputfolder + "prnB.txt")
    prnbprots, prnbscores = prnb[0], prnb[1]
    for hitprots, hitscores, domainname in (
        (lmbuprots, lmbuscores, "LmbU-like protein"),
        (goadsporinprots, goadsporinscores, "Goadsporin-like protein"),
        (neocarzinostatprots, neocarzinostatscores, "Neocarzinostatin-like protein"),
        (cyanobactinprots, cyanobactinscores, "Cyanobactin protease"),
        (cycdipeptideprots, cycdipeptidescores, "Cyclodipeptide synthase"),
        (fom1prots, fom1scores, "Fom1-like phosphomutase"),
        (bcpbprots, bcpbscores, "BcpB-like phosphomutase"),
        (frbdprots, frbdscores, "FrbD-like phosphomutase"),
        (miteprots, mitescores, "MitE-like CoA-ligase"),
        (vlmbprots, vlmbscores, "Valanimycin biosynthesis VlmB domain"),
        (prnbprots, prnbscores, "Pyrrolnitrin biosynthesis PrnB domain"),
    ):
        for i in hitprots:
            if i not in otherprots:
                otherprots.append(i)
            #list.index returns the first occurrence, so a protein listed twice reuses its first score
            domainhit = [domainname, hitscores[hitprots.index(i)]]
            detecteddomainsdict.setdefault(i, []).append(domainhit)
#Empty the core protein lists of cluster types that are not in geneclustertypes;
#a 1 in geneclustertypes keeps every list untouched
if not (1 in geneclustertypes or 5 in geneclustertypes):
    nrpsprots = []
if not (1 in geneclustertypes or 4 in geneclustertypes):
    t3pksprots = []
if not (1 in geneclustertypes or 3 in geneclustertypes):
    t2pksprots = []
if not (1 in geneclustertypes or 2 in geneclustertypes):
    #Type 2 covers the three PKS lists below together
    t1pksprots = []
    t4pksprots = []
    transatpksprots = []
#Assemble all core sec met proteins
#allsecmetprots becomes the sorted list of unique proteins from all per-type lists
allsecmetprots = []
for coreprotlist in (
    t1pksprots, transatpksprots, t2pksprots, t3pksprots, t4pksprots,
    nrpsprots, terpeneprots, lantprots, bcinprots, lactamprots,
    amglyccyclprots, siderophoreprots, ectprots, butyrprots, indoleprots,
    nucleoprots, phosphoprots, melaninprots, aminocoumarinprots, otherprots,
):
    for prot in coreprotlist:
        if prot not in allsecmetprots:
            allsecmetprots.append(prot)
allsecmetprots.sort()
#Without any signature protein there is nothing left to analyse: log it, report it and stop with exit status 1
if not allsecmetprots:
    logfile.write("No secondary metabolite biosynthesis gene clusters detected in this nucleotide file.\n")
    logfile.close()
    print >> sys.stderr, "No secondary metabolite biosynthesis gene clusters detected in this nucleotide file."
    sys.exit(1)
elapsed = (time.time() - starttime)
#Extract approximate gene clusters based on hmmsearch results, create list of core PKS / NRPS genes for further analysis (use less strict parameters for this than in gene cluster detection to include all PKS/NRPS domains)
#Create nucleotide fasta files with sec met gene clusters
logfile.write("Extracting gene clusters from gbk/embl file using detected signature genes...\n")
fastafile = open(genomename + "/clusterblast/geneclusterprots.fasta", "w")
txtfile = open(genomename + "/clusterblast/geneclusters.txt", "w")
#Spreadsheet with one header row in bold; one row per gene cluster is written further down
wb = Workbook()
font1 = Font()
style1 = XFStyle()
style1.font = font1
font1.bold = True
ws0 = wb.add_sheet('0')
ws0.write(0, 0, "Input accession number", style1)
ws0.write(0, 1, "Input name", style1)
ws0.write(0, 2, "Gene cluster type", style1)
ws0.write(0, 3, "Gene cluster genes", style1)
if clusterblast == "y":
    ws0.write(0, 4, "Compound with gene cluster of highest homology", style1)
protcodes = allsecmetprots
nuccode = genomename
#NOTE(review): gbkfile is not closed in this section - confirm that it is closed further down
gbkfile = open(infile, "r")
output = gbkfile.read()
output = output.replace("\r", "\n")
#Extract description of nucleotide from gbk/embl file
#Start from the fallback name, so that nucname is also defined when infile matches
#neither the GenBank nor the EMBL patterns below (it used to stay unbound in that case)
nucname = "input_nucleotide"
#The file type is recognised by substring match anywhere in the path, not by the file suffix only
if ".gbk" in infile or ".GBK" in infile or ".gb" in infile or ".GB" in infile or ".genbank" in infile or ".GENBANK" in infile:
    try:
        #Text between "DEFINITION  " and the first "ACCESSION   ", joined to one line
        nucname1 = output.split("ACCESSION   ")[0]
        nucname2 = nucname1.split("DEFINITION  ")[1]
        nucname3 = nucname2.replace("\n", "")
        #Collapse the runs of spaces left over from the continuation lines
        while "  " in nucname3:
            nucname3 = nucname3.replace("  ", " ")
        nucname = nucname3
    except (KeyError, IOError, IndexError):
        nucname = "input_nucleotide"
elif ".embl" in infile or ".EMBL" in infile or ".emb" in infile or ".EMB" in infile:
    try:
        #Remainder of the line that follows the first "DE   " tag
        nucname1 = output.split("DE   ")[1]
        nucname2 = nucname1.split("\n")[0]
        nucname3 = nucname2.replace("\n", "")
        while "  " in nucname3:
            nucname3 = nucname3.replace("  ", " ")
        nucname = nucname3
    except (KeyError, IOError, IndexError):
        nucname = "input_nucleotide"
protstartlocations = []
protendlocations = []
genelist = proteins[2]
genedict = proteins[3]
#Save all locations of query proteins on the nucleotide in a list
#(the smaller coordinate of a gene counts as its start, the larger as its end)
for j in protcodes:
    if j not in genelist:
        continue
    coordinates = (int(genedict[j][0]), int(genedict[j][1]))
    protstartlocations.append(min(coordinates))
    protendlocations.append(max(coordinates))
#Identify clusters of genes based on protein locations on the nucleotide
clusterstarts = []
clusterends = []
#Starts and ends are sorted independently of each other and then compared by position
protstartlocations.sort()
protendlocations.sort()
nrlocations = len(protstartlocations)
for a, i in enumerate(protstartlocations):
    if a == 0:
        #The first location always opens the first cluster
        clusterstarts.append(str(i))
        if nrlocations == 1:
            clusterends.append(protendlocations[a])
    else:
        previousend = protendlocations[a - 1]
        islast = (a == nrlocations - 1)
        #NOTE(review): a gap of exactly 20000 starts a new cluster at the last location only;
        #at earlier locations the gap has to be larger than 20000
        if islast:
            newcluster = not (i < previousend + 20000)
        else:
            newcluster = i > previousend + 20000
        if newcluster:
            #Close the running cluster at the previous end and open a new one here
            clusterends.append(str(previousend))
            clusterstarts.append(str(i))
        if islast:
            #The last location always closes the cluster that is open at that point
            clusterends.append(str(protendlocations[a]))
    lastendlocation = i
#Extend clusters with 20kb on each side of the identified core genes
#(starts are clamped at 0; ends are not clamped here)
clusterstarts = [max(int(position) - 20000, 0) for position in clusterstarts]
clusterends = [int(position) + 20000 for position in clusterends]
+#For each genbank secondary metabolite gene cluster: extract all proteins and write to fasta,
+a = 0
+clusterinfo = {}
+geneclusters = []
+geneclustergenes = []
+allcoregenes = []
+for i in clusterstarts:
+cstart = int(i)
+cend = int(clusterends[a])
+a += 1
+clusternr = a
+geneclusters.append(clusternr)
+coregenes = []
+clustergenes = []
+#For each gene in nucleotide, check if it is inside this cluster; if, so append info to list of clustergenes
+if a == 1:
+for i in genelist:
+geneinfo = genedict[i][:-1]
+geneinfo.append(i)
+genedict[i] = geneinfo
+for i in genelist:
+geneinfo = genedict[i]
+genestart = int(geneinfo[0])
+geneend = int(geneinfo[1])
+if (genestart > cstart and genestart < cend) or (geneend > cstart and geneend < cend):
+clustergenes.append(geneinfo)
+#Determine type of cluster
+type = "other"
+z = 0
+for k in clustergenes:
+i = k[4]
+if i in t1pksprots:
+if z == 0:
+type = "t1pks"
+elif "t1pks" not in type:
+type = type + "-t1pks"
+z = 1
+if i in transatpksprots:
+if z == 0:
+type = "transatpks"
+elif "transatpks" not in type:
+type = type + "-transatpks"
+z = 1
+if i in t2pksprots:
+if z == 0:
+type = "t2pks"
+elif "t2pks" not in type:
+type = type + "-t2pks"
+z = 1
+if i in t3pksprots:
+if z == 0:
+type = "t3pks"
+elif "t3pks" not in type:
+type = type + "-t3pks"
+z = 1
+if i in t4pksprots:
+if z == 0:
+type = "t1pks"
+elif "t1pks" not in type:
+type = type + "-t1pks"
+z = 1
+if i in nrpsprots:
+if z == 0:
+type = "nrps"
+elif "nrps" not in type:
+type = type + "-nrps"
+z = 1
+if i in terpeneprots:
+if z == 0:
+type= "terpene"
+elif "terpene" not in type:
+type = type + "-terpene"
+z = 1
+if i in lantprots:
+if z == 0:
+type= "lant"
+elif "lant" not in type:
+type = type + "-lant"
+z = 1
+if i in bcinprots:
+if z == 0:
+type= "bcin"
+elif "bcin" not in type:
+type = type + "-bcin"
+z = 1
+if i in lactamprots:
+if z == 0:
+type = "blactam"
+elif "blactam" not in type:
+type = type + "-blactam"
+z = 1
+if i in amglyccyclprots:
+if z == 0:
+type = "amglyccycl"
+elif "amglyccycl" not in type:
+type = type + "-amglyccycl"
+z = 1
+if i in siderophoreprots:
+if z == 0:
+type = "siderophore"
+elif "siderophore" not in type:
+type = type + "-siderophore"
+z = 1
+if i in ectprots:
+if z == 0:
+type = "ectoine"
+elif "ectoine" not in type:
+type = type + "-ectoine"
+z = 1
+if i in indoleprots:
+if z == 0:
+type = "indole"
+elif "indole" not in type:
+type = type + "-indole"
+z = 1
+if i in nucleoprots:
+if z == 0:
+type = "nucleoside"
+elif "nucleoside" not in type:
+type = type + "-nucleoside"
+z = 1
+if i in phosphoprots:
+if z == 0:
+type = "phosphoglycolipid"
+elif "phosphoglycolipid" not in type:
+type = type + "-phosphoglycolipid"
+z = 1
+if i in butyrprots:
+if z == 0:
+type = "butyrolactone"
+elif "butyrolactone" not in type:
+type = type + "-butyrolactone"
+z = 1
+if i in melaninprots:
+if z == 0:
+type = "melanin"
+elif "melanin" not in type:
+type = type + "-melanin"
+z = 1
+if i in aminocoumarinprots:
+if z == 0:
+type = "aminocoumarin"
+elif "aminocoumarin" not in type:
+type = type + "-aminocoumarin"
+z = 1
+if "other-" in type[:6]:
+type = type[6:]
+#Shorten gene cluster if type is among typically short gene cluster types
+if cend > dnaseqlength:
+cend = dnaseqlength
+if type == "t3pks" or type == "t2pks":
+if cstart != 0:
+cstart = cstart + 5000
+if cend != dnaseqlength:
+cend = cend - 5000
+clustergenes2 = []
+for i in clustergenes:
+start = int(i[0])
+end = int(i[1])
+if (start > cstart and start < cend) or (end > cstart and end < cend):
+clustergenes2.append(i)
+clustergenes = clustergenes2
+if type == "bcin" or type == "siderophore" or type == "lant" or type == "terpene":
+if cstart != 0:
+cstart = cstart + 10000
+if cend != dnaseqlength:
+cend = cend - 10000
+clustergenes2 = []
+for i in clustergenes:
+start = int(i[0])
+end = int(i[1])
+if (start > cstart and start < cend) or (end > cstart and end < cend):
+clustergenes2.append(i)
+clustergenes = clustergenes2
+if type == "butyrolactone" or type == "melanin" or type == "ectoine":
+if cstart != 0:
+cstart = cstart + 17000
+if cend != dnaseqlength:
+cend = cend - 17000
+clustergenes2 = []
+for i in clustergenes:
+start = int(i[0])
+end = int(i[1])
+if (start > cstart and start < cend) or (end > cstart and end < cend):
+clustergenes2.append(i)
+clustergenes = clustergenes2
+#For all clustergenes, write info to fasta
+for i in clustergenes:
+start = str(i[0])
+end = str(i[1])
+strand = i[2]
+seq = seqdict[i[4]]
+ann = i[3].replace(" ","_")
+accession = i[4]
+name = nuccode + "|c" + str(a) + "|" + start + "-" + end + "|" + strand + "|" + accession + "|" + ann
+fastafile.write(">" + name + "\n" + seq + "\n")
+if accession not in geneclustergenes:
+geneclustergenes.append(accession)
# Write one tab-separated row for this cluster to txtfile: nuccode, nucname,
# "c" + a, type, the ";"-terminated identifiers i[4] of all cluster genes and
# the ";"-terminated accessiondict values of those identifiers. The same
# information goes into columns 0-3 of row a of worksheet ws0.
# NOTE(review): the bare "except:" clauses catch every exception raised by
# ws0.write, not only an oversized-cell error - consider narrowing them.
+#Write gene cluster info to separate txt file
+txtfile.write(nuccode + "\t" + nucname + "\t" + "c" + str(a) + "\t" + type + "\t")
+ws0.write(a,0,genomic_accnr)
+try:
+ws0.write(a,1,nucname)
+except:
+ws0.write(a,1,"Name to long to be contained in Excel cell; see txt file in downloadable zip archive.")
+ws0.write(a,2,type)
+xlsgenesfield = ""
+for i in clustergenes:
+txtfile.write(i[4] + ";")
+xlsgenesfield = xlsgenesfield + i[4] + ";"
+txtfile.write("\t")
+for i in clustergenes:
+txtfile.write(accessiondict[i[4]] + ";")
# Drop the trailing ";" before the gene list is written to the worksheet.
+xlsgenesfield = xlsgenesfield[:-1]
+try:
+ws0.write(a,3,xlsgenesfield)
+except:
+ws0.write(a,3,"Too many genes to be contained in Excel cell; see txt file in downloadable zip archive.")
+txtfile.write("\n")
# Append every cluster gene whose identifier (i[4]) is in allsecmetprots to
# coregenes and allcoregenes, then store the cluster summary
# [type, cstart, cend, coregenes, clustergenes] in clusterinfo[clusternr].
+#Write gene cluster info to clusterinfo dictionary
+for i in clustergenes:
+if i[4] in allsecmetprots:
+coregenes.append(i[4])
+allcoregenes.append(i[4])
+clusterinfo[clusternr] = [type,cstart,cend,coregenes,clustergenes]
# Only fastafile and txtfile are closed in the lines below; no call on the
# xls workbook is visible here.
+#Close xls, fasta and txt files
+fastafile.close()
+txtfile.close()
# Prepare the PKS/NRPS core-gene analysis:
#  - pksnrpsgeneclusters: clusters whose type string (clusterinfo[i][0])
#    contains "t1pks", "t4pks", "transatpks" or "nrps";
#  - pksnrpscoregenes: all entries of t1pksprots, transatpksprots, t4pksprots,
#    nrpsprots and amp_t_prots, re-ordered by gene start (genedict[i][0]).
#    Because the ordering goes through a dict keyed by gene, a gene that
#    occurs in several of the source lists ends up in the result once.
# The empty lists and dicts below are filled by the later prediction steps.
+#Analysis of core PKS/NRPS genes (separate py), detect subgroups and predict specificities and final products
+#Make list of PKS / NRPS gene clusters to be analysed
+#print "Analysing core PKS/NRPS genes..."
+logfile.write("Analysing core PKS/NRPS genes...\n")
+pksnrpsgeneclusters = []
+pksnrpscoregenes = []
+for i in geneclusters:
+if "t1pks" in clusterinfo[i][0] or "t4pks" in clusterinfo[i][0] or "transatpks" in clusterinfo[i][0] or "nrps" in clusterinfo[i][0]:
+pksnrpsgeneclusters.append(i)
+for i in t1pksprots:
+pksnrpscoregenes.append(i)
+for i in transatpksprots:
+pksnrpscoregenes.append(i)
+for i in t4pksprots:
+pksnrpscoregenes.append(i)
+for i in nrpsprots:
+pksnrpscoregenes.append(i)
+for i in amp_t_prots:
+pksnrpscoregenes.append(i)
+pksnrpsgenestartdict = {}
+for i in pksnrpscoregenes:
+start = int(genedict[i][0])
+pksnrpsgenestartdict[i] = start
+pksnrpscoregenes = sortdictkeysbyvalues(pksnrpsgenestartdict)
+nrpsnames = []
+nrpsseqs = []
+pksnrpsnames = []
+pksnrpsseqs = []
+pksnames = []
+pksseqs = []
+calnames = []
+calseqs = []
+krnames = []
+krseqs = []
+nrpspkstypedict = {}
+domaindict = {}
# When there is at least one PKS/NRPS core gene: write their sequences to
# <genomename>/nrpspks_proteins.fasta and run the hmmscan executable three
# times on that file (abmotifs.hmm, nrpspksdomains.hmm, ksdomains.hmm). Each
# text output is parsed with hmmscanparse, using the model lengths returned by
# hmmlengths, into motifdict, domaindict and ksdomaindict respectively.
# NOTE(review): the command lines are built by string concatenation and run
# through os.system; the return codes are not checked, so a failed hmmscan
# run is only noticed when its output is parsed.
+if len(pksnrpscoregenes) > 0:
+#Write PKS / NRPS core genes to FASTA file
+for i in pksnrpscoregenes:
+name = i
+seq = seqdict[i]
+pksnrpsnames.append(name)
+pksnrpsseqs.append(seq)
+writefasta(pksnrpsnames,pksnrpsseqs,genomename + "/nrpspks_proteins.fasta")
+#Analyse for abMotifs
+hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 0.1 -o " + genomename + "/nrpspks/abmotifshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/abmotifshmm.txt "+ hmms_path +"abmotifs.hmm " + genomename + "/nrpspks_proteins.fasta"
+os.system(hmmsearch)
+mhmmlengthsdict = hmmlengths(hmms_path+"abmotifs.hmm")
+motifdict = hmmscanparse(genomename + "/nrpspks/abmotifshmm_output.txt",mhmmlengthsdict)
+#Analyse for C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains
+hmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/nrpspkshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/nrpspkshmm.txt "+ hmms_path +"nrpspksdomains.hmm " + genomename + "/nrpspks_proteins.fasta"
+os.system(hmmsearch)
+hmmlengthsdict = hmmlengths(hmms_path+"nrpspksdomains.hmm")
+domaindict = hmmscanparse(genomename + "/nrpspks/nrpspkshmm_output.txt",hmmlengthsdict)
# The report file opened here is written and closed further down.
+nrpspksdomainsfile = open(genomename + "/nrpspks/nrpspksdomains.txt","w")
+#Analyse KS domains & PKS/NRPS protein domain composition to detect NRPS/PKS types
+kshmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/kshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/kshmm.txt " + hmms_path + "ksdomains.hmm " + genomename + "/nrpspks_proteins.fasta"
+os.system(kshmmsearch)
+kshmmlengthsdict = hmmlengths(hmms_path+"ksdomains.hmm")
+ksdomaindict = hmmscanparse(genomename + "/nrpspks/kshmm_output.txt",kshmmlengthsdict)
+for k in pksnrpscoregenes:
+#structure of domaindict: domaindict[genename] = [[name,start,end,evalue,score],[name,start,end,evalue,score], etc.]
+domainlist = []
+nrKSdomains = 0
+for i in domaindict[k]:
+domainlist.append(i[0])
+if i[0] == "PKS_KS":
+nrKSdomains += 1
+modKSscore = 0
+traKSscore = 0
+eneKSscore = 0
+iterKSscore = 0
+for i in ksdomaindict[k]:
+if i[0] == "Trans-AT-KS":
+traKSscore += 1
+if i[0] == "Modular-KS":
+modKSscore += 1
+if i[0] == "Enediyne-KS":
+eneKSscore += 1
+if i[0] == "Iterative-KS":
+iterKSscore += 1
+for i in domaindict[k]:
+if "Cglyc" in domainlist and "Epimerization" in domainlist and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
+type = "Glycopeptide NRPS"
+elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist) and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
+type = "NRPS"
+elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist) or "AMP-binding" in domainlist and ("PKS_KS" in domainlist or "PKS_AT" in domainlist):
+type = "Hybrid PKS-NRPS"
+elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" not in domainlist and "Trans-AT_docking" in domainlist and traKSscore > modKSscore and traKSscore > iterKSscore and traKSscore > eneKSscore:
+type = "Type I Trans-AT PKS"
+elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and iterKSscore > modKSscore and iterKSscore > traKSscore and iterKSscore > eneKSscore and nrKSdomains < 3:
+type = "Type I Iterative PKS"
+elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and eneKSscore > modKSscore and eneKSscore > traKSscore and eneKSscore > iterKSscore and nrKSdomains < 3:
+type = "Type I Enediyne PKS"
+elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and ((modKSscore > eneKSscore and modKSscore > traKSscore and modKSscore > iterKSscore) or nrKSdomains > 3):
+type = "Type I Modular PKS"
+elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist:
+type = "PKS-like protein"
+elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist or "AMP-binding" in domainlist) and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
+type = "NRPS-like protein"
+else:
+type = "PKS/NRPS-like protein"
+nrpspkstypedict[k] = type
# Write one section per core gene to nrpspksdomainsfile: the gene name, its
# type label from nrpspkstypedict, a column header, one tab-separated line
# (name, start, end, e-value, score) per hit in domaindict[k], a
# "** Motifs: **" marker and one line per hit in motifdict[k] in the same
# layout. The file is closed afterwards.
+#Write data to output file
+for k in pksnrpscoregenes:
+j = domaindict[k]
+l = motifdict[k]
+nrpspksdomainsfile.write(">> " + k + "\n")
+nrpspksdomainsfile.write(">> " + nrpspkstypedict[k] + "\n")
+nrpspksdomainsfile.write("name\tstart\tend\te-value\tscore\n")
+for i in j:
+#nrpspksdomainsfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n")
+nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (i[0], i[1], i[2], i[3], i[4]) )
+nrpspksdomainsfile.write("** Motifs: **\n")
+for i in l:
+#nrpspksdomainsfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n")
+nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (i[0], i[1], i[2], i[3], i[4]) )
+nrpspksdomainsfile.write("\n\n")
+nrpspksdomainsfile.close()
# Seconds elapsed since starttime; only referenced by the disabled print below.
+elapsed = (time.time() - starttime)
+#print "5163Time since start: " + str(elapsed)
# Extract every "AMP-binding" / "A-OX" hit of each core gene, name it
# <gene>_A<n> (n counts the hits within the gene) and, when at least one was
# found, run the NRPSPredictor2 code predictor and SVM on them from inside
# the NRPSPredictor2/ directory; nrpscodes.txt is then copied to
# nrpspredictoroutputfolder.
# NOTE(review): the command strings are only assigned for sys.platform
# 'win32' and 'linux2'; on any other value they are undefined when used.
# NOTE(review): the object returned by os.popen is neither read nor stored,
# and the exit status of the SVM run is not checked.
+#Predict NRPS A domain specificities with NRPSPredictor and Minowa et al. method
+#print "Predicting NRPS A domain substrate specificities by NRPSPredictor"
+logfile.write("Predicting NRPS A domain substrate specificities by NRPSPredictor\n")
+#NRPSPredictor: extract the AMP-binding hit with its end coordinate extended by 120 (i.e. residues following the domain), extract 8 Angstrom residues and insert this into NRPSPredictor
+for k in pksnrpscoregenes:
+j = domaindict[k]
+nr = 0
+for i in j:
+if i[0] == "AMP-binding" or i[0] == "A-OX":
+nr += 1
+start = int(i[1])
+end = int(i[2]) + 120
+seq = seqdict[k][start:end]
+name = k + "_A" + str(nr)
+nrpsnames.append(name)
+nrpsseqs.append(seq)
+if len(nrpsnames) > 0:
+writefasta(nrpsnames,nrpsseqs,"NRPSPredictor2/nrpsseqs.fasta")
+#nrpspredcommand = "perl nrpsSpecPredictor.pl nrpsseqs.fasta ../" + nrpspredictoroutputfolder + " ." #OLD NRPSPREDICTOR1 command
+os.chdir("NRPSPredictor2/")
+#Get NRPSPredictor2 code predictions, output sig file for input for NRPSPredictor2 SVMs
+if sys.platform == ('win32'):
+nrpspred2codecommand = 'nrpscodepred nrpsseqs.fasta input.sig nrpscodes.txt  > nul'
+if sys.platform == ('linux2'):
+nrpspred2codecommand = 'python nrpscodepred.py nrpsseqs.fasta input.sig nrpscodes.txt > /dev/null'
+os.system(nrpspred2codecommand)
+#Run NRPSPredictor2 SVM
+currentdir = os.getcwd()
+if sys.platform == ('win32'):
+nrpspred2command = 'java -Ddatadir="' + currentdir + '\\data" -cp build/NRPSpredictor2.jar;lib/java-getopt-1.0.13.jar;lib/Utilities.jar;lib/libsvm.jar org.roettig.NRPSpredictor2.NRPSpredictor2 -i input.sig -r ..\\' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1'
+if sys.platform == ('linux2'):
+nrpspred2command = './NRPSpredictor2.sh -i input.sig -r ../' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1'
+os.popen(nrpspred2command)
+#Copy NRPSPredictor results
+if sys.platform == ('win32'):
+copycommand = 'copy/y nrpscodes.txt ..\\' + nrpspredictoroutputfolder.replace("/","\\") + ' > nul'
+if sys.platform == ('linux2'):
+copycommand = 'cp nrpscodes.txt ../' + nrpspredictoroutputfolder + " > /dev/null"
+os.system(copycommand)
+os.chdir("..")
+elapsed = (time.time() - starttime)
+#print "5206Time since start: " + str(elapsed)
+# folgendes bis zum naechsten time braucht 500s, liegt wohl haupsaechlich an schlechtem minowa_A code
+#Minowa method: extract AMP-binding domain, and run Minowa_A
+if len(nrpsnames) > 0:
+#print "Predicting NRPS A domain substrate specificities by Minowa et al. method\n"
+logfile.write("Predicting NRPS A domain substrate specificities by Minowa et al. method")
+nrpsnames2 = []
+nrpsseqs2 = []
+for k in pksnrpscoregenes:
+j = domaindict[k]
+nr = 0
+for i in j:
+if i[0] in ["AMP-binding", "A-OX"]:
+nr += 1
+start = int(i[1])
+end = int(i[2])
+seq = seqdict[k][start:end]
+name = k + "_A" + str(nr)
+nrpsnames2.append(name)
+nrpsseqs2.append(seq)
+writefasta(nrpsnames2,nrpsseqs2,minowanrpsoutputfolder + "nrpsseqs.fasta")
+if sys.platform == ('win32'):
+minowanrpscommand = "minowa_A ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt"
+if sys.platform == ('linux2'):
+minowanrpscommand = "python minowa_A.py ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt"
+os.chdir("Minowa/")
+os.system(minowanrpscommand)
+os.chdir("..")
+elapsed = (time.time() - starttime)
+#print "5235Time since start: " + str(elapsed)
# Extract every "PKS_AT" hit of each core gene as <gene>_AT<n>. When at least
# one was found, the sequences are written to pksseqs.fasta in both
# pkssignatureoutputfolder and minowapksoutputfolder; PKS_analysis is run
# from pkssignatures/ and minowa_AT from Minowa/, each writing a
# pkspredoutput.txt into its own output folder.
# NOTE(review): the command strings are only assigned for sys.platform
# 'win32' and 'linux2'; on any other value they are undefined when used.
+#Predict PKS AT domain specificities with Minowa et al. method and PKS code (NP searcher / ClustScan / own?)
+for k in pksnrpscoregenes:
+j = domaindict[k]
+nr = 0
+for i in j:
+if i[0] == "PKS_AT":
+nr += 1
+start = int(i[1])
+end = int(i[2])
+seq = seqdict[k][start:end]
+name = k + "_AT" + str(nr)
+pksnames.append(name)
+pksseqs.append(seq)
+if len(pksnames) > 0:
+writefasta(pksnames,pksseqs,pkssignatureoutputfolder + "pksseqs.fasta")
+writefasta(pksnames,pksseqs,minowapksoutputfolder + "pksseqs.fasta")
+#Run PKS signature analysis
+elapsed = (time.time() - starttime)
+#print "5254Time since start: " + str(elapsed)
+print "Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences"
+logfile.write("Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences\n")
+if sys.platform == ('win32'):
+pkspredcommand = "PKS_analysis ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt"
+if sys.platform == ('linux2'):
+pkspredcommand = "python PKS_analysis.py ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt"
+os.chdir("pkssignatures/")
+os.system(pkspredcommand)
+os.chdir("..")
+#Minowa method: run Minowa_AT
+elapsed = (time.time() - starttime)
+#print "5266Time since start: " + str(elapsed)
+print "Predicting PKS AT domain substrate specificities by Minowa et al. method"
+logfile.write("Predicting PKS AT domain substrate specificities by Minowa et al. method\n")
+if sys.platform == ('win32'):
+minowapkscommand = "minowa_AT ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt"
+if sys.platform == ('linux2'):
+minowapkscommand = "python minowa_AT.py ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt"
+os.chdir("Minowa/")
+os.system(minowapkscommand)
+os.chdir("..")
# Extract every "CAL_domain" hit of each core gene as <gene>_CAL<n>. When at
# least one was found, the sequences are written to calseqs.fasta in
# minowacaloutputfolder and minowa_CAL is run from inside Minowa/, writing
# calpredoutput.txt into the same output folder.
# NOTE(review): the command string is only assigned for sys.platform 'win32'
# and 'linux2'; on any other value it is undefined when used.
+#Predict PKS CAL domain specificities with Minowa et al. method
+elapsed = (time.time() - starttime)
+#print "5279Time since start: " + str(elapsed)
+print "Predicting CAL domain substrate specificities by Minowa et al. method"
+logfile.write("Predicting CAL domain substrate specificities by Minowa et al. method\n")
+for k in pksnrpscoregenes:
+j = domaindict[k]
+nr = 0
+for i in j:
+if i[0] == "CAL_domain":
+nr += 1
+start = int(i[1])
+end = int(i[2])
+seq = seqdict[k][start:end]
+name = k + "_CAL" + str(nr)
+calnames.append(name)
+calseqs.append(seq)
+if len(calnames) > 0:
+writefasta(calnames,calseqs,minowacaloutputfolder + "calseqs.fasta")
+if sys.platform == ('win32'):
+minowacalcommand = "minowa_CAL ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt"
+if sys.platform == ('linux2'):
+minowacalcommand = "python minowa_CAL.py ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt"
+os.chdir("Minowa/")
+os.system(minowacalcommand)
+os.chdir("..")
+elapsed = (time.time() - starttime)
+#print "5305Time since start: " + str(elapsed)
# Extract every "PKS_KR" hit of each core gene as <gene>_KR<n>. When at least
# one was found, the sequences are written to krseqs.fasta in
# kranalysisoutputfolder and kr_analysis is run from inside kr_analysis/,
# writing krpredoutput.txt into the same output folder.
# NOTE(review): the command string is only assigned for sys.platform 'win32'
# and 'linux2'; on any other value it is undefined when used.
+#Predict PKS KR domain stereochemistry using pattern as published in ClustScan
+print "Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al."
+logfile.write("Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al.\n")
+for k in pksnrpscoregenes:
+j = domaindict[k]
+nr = 0
+for i in j:
+if i[0] == "PKS_KR":
+nr += 1
+start = int(i[1])
+end = int(i[2])
+seq = seqdict[k][start:end]
+name = k + "_KR" + str(nr)
+krnames.append(name)
+krseqs.append(seq)
+if len(krnames) > 0:
+writefasta(krnames,krseqs,kranalysisoutputfolder + "krseqs.fasta")
+if sys.platform == ('win32'):
+kranalysiscommand = "kr_analysis ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt"
+if sys.platform == ('linux2'):
+kranalysiscommand = "python kr_analysis.py ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt"
+os.chdir("kr_analysis/")
+os.system(kranalysiscommand)
+os.chdir("..")
# Parse the three A-domain prediction outputs into dicts keyed by domain name:
#  - nrpspredoutput.txt (Minowa): records are separated by a line holding two
#    backslashes; line 0 of a record is the domain name, the first tab field
#    of line 2 is the top hit (translated through substratetransdict2 when
#    listed there, then lower-cased) and lines 1-4 go into the HTML details;
#  - nrpspredictor2.out: first and last split element are skipped; column 0
#    is the domain name and column 6 is stored as the SVM prediction;
#  - nrpscodes.txt: column 0 is the domain name, column 1 the prediction.
# NOTE(review): the file objects are replaced by their contents without an
# explicit close(); a record with fewer than five lines raises IndexError.
+#Read and parse all substrate specificity prediction output files
+minowa_nrps_preds = {}
+minowa_nrps_preds_details = {}
+nrps_svm_preds = {}
+nrps_svm_preds_details = {}
+nrps_code_preds = {}
+nrps_code_preds_details = {}
+substratetransdict2 = {'pipecolate':'pip','fOHOrn':'orn','beta-Lys':'blys','5NhOrn':'orn','OHOrn':'orn','Aad':'Aaa','bOHTyr':'bht'}
+if len(nrpsnames) > 0:
+minowa_a_file = open(minowanrpsoutputfolder + "nrpspredoutput.txt","r")
+minowa_a_file = minowa_a_file.read()
+minowa_a_file = minowa_a_file.replace("\r","\n")
+parts = minowa_a_file.split("\\\\\n")[1:]
+for i in parts:
+partlines = i.split("\n")
+acc = partlines[0]
+tophit = partlines[2].split("\t")[0]
+if tophit in substratetransdict2.keys():
+tophit = substratetransdict2[tophit]
+minowa_nrps_preds[acc] = tophit.lower()
+minowa_nrps_preds_details[acc] = "<b>Minowa HMM method A-domain<br>Substrate specificity prediction top hits:</b><br>\n" + partlines[1] + "<br>\n" + partlines[2] + "<br>\n" + partlines[3] + "<br>\n" + partlines[4] + "<br><br>\n\n"
+nrpspredictorfile1 = open(nrpspredictoroutputfolder + "nrpspredictor2.out","r")
+nrpspredictorfile2 = open(nrpspredictoroutputfolder + "nrpscodes.txt","r")
+nrpspredictorfile1 = nrpspredictorfile1.read()
+nrpspredictorfile1 = nrpspredictorfile1.replace("\r","\n")
+lines = nrpspredictorfile1.split("\n")[1:-1]
+for k in lines:
+tabs = k.split("\t")
+nrps_svm_preds[tabs[0]] = tabs[6]
+nrps_svm_preds_details[tabs[0]] = "<b> NRPSPredictor2 SVM prediction details:</b><br>\n8 Angstrom 34 AA code:<br>\n" + tabs[1] + "<br>\nPredicted physicochemical class:<br>\n" + tabs[3] + "<br>\nLarge clusters prediction:<br>\n" + tabs[4] + "<br>\nSmall clusters prediction:<br>\n" + tabs[5] + "<br>\nSingle AA prediction:<br>\n" + tabs[6] + "<br><br>\n\n"
+nrpspredictorfile2 = nrpspredictorfile2.read()
+nrpspredictorfile2 = nrpspredictorfile2.replace("\r","\n")
+lines = nrpspredictorfile2.split("\n")[:-1]
+for k in lines:
+tabs = k.split("\t")
+nrps_code_preds[tabs[0]] = tabs[1]
+nrps_code_preds_details[tabs[0]] = "<b> NRPSPredictor2 Stachelhaus code prediction:</b><br>\n" + tabs[1] + "<br><br>\n\n"
# Parse the two AT-domain prediction outputs into dicts keyed by domain name:
#  - Minowa pkspredoutput.txt: same record layout as the A-domain output; the
#    top hit is translated through substratetransdict, falling back to "pk"
#    when it is not listed;
#  - signature pkspredoutput.txt: records are separated by "//" lines; the
#    first tab field of line 0 is the domain name. With more than two
#    non-empty lines the top hit is the part after "__" in the first field of
#    line 1 and up to three hits go into the HTML details; otherwise the
#    prediction is "N/A".
# NOTE(review): the dict literal lists 'Ethylmalonyl-CoA' twice ('emal' and
# 'Ethyl_mal'); the later value 'Ethyl_mal' is the one that takes effect.
# NOTE(review): the "No AT-domain ..." detail string opens <b> without
# closing it.
+minowa_pks_preds_details = {}
+minowa_pks_preds = {}
+pks_code_preds ={}
+pks_code_preds_details ={}
+substratetransdict = {'Malonyl-CoA':'mal','Methylmalonyl-CoA':'mmal','Methoxymalonyl-CoA':'mxmal','Ethylmalonyl-CoA':'emal','Isobutyryl-CoA':'isobut','2-Methylbutyryl-CoA':'2metbut','trans-1,2-CPDA':'trans-1,2-CPDA','Acetyl-CoA':'Acetyl-CoA','Benzoyl-_CoA':'benz','Propionyl-CoA':'prop','3-Methylbutyryl-CoA':'3metbut','Ethylmalonyl-CoA':'Ethyl_mal','CE-Malonyl-CoA':'cemal','2-Rhyd-Malonyl-CoA':'2Rhydmal','CHC-CoA':'CHC-CoA','inactive':'inactive'}
+if len(pksnames) > 0:
+minowa_at_file = open(minowapksoutputfolder + "pkspredoutput.txt","r")
+minowa_at_file = minowa_at_file.read()
+minowa_at_file = minowa_at_file.replace("\r","\n")
+parts = minowa_at_file.split("\\\\\n")[1:]
+for i in parts:
+partlines = i.split("\n")
+acc = partlines[0]
+if substratetransdict.has_key(partlines[2].split("\t")[0]):
+tophit = substratetransdict[partlines[2].split("\t")[0]]
+else:
+tophit = "pk"
+minowa_pks_preds[acc] = tophit
+minowa_pks_preds_details[acc] = "<b>Minowa HMM method AT-domain<br>Substrate specificity prediction top hits:</b><br>\n" + partlines[1] + "<br>\n" + partlines[2] + "<br>\n" + partlines[3] + "<br>\n" + partlines[4] + "<br><br>\n\n"
+pkssignaturefile = open(pkssignatureoutputfolder + "pkspredoutput.txt","r")
+pkssignaturefile = pkssignaturefile.read()
+pkssignaturefile = pkssignaturefile.replace("\r","\n")
+parts = pkssignaturefile.split("//\n")[1:]
+for i in parts:
+partlines = i.split("\n")
# Drop empty lines from the record.
+partlines2 = []
+for j in partlines:
+if j != "":
+partlines2.append(j)
+partlines = partlines2
+acc = partlines[0].split("\t")[0]
+if len(partlines) > 2:
+tophit = (partlines[1].split("\t")[0]).split("__")[1]
+pks_code_preds[acc] = tophit
+codes = []
+prots = []
+scores = []
# This inner loop reuses the name i of the enclosing "for i in parts" loop.
+for i in partlines[1:4]:
+codes.append(i.split("\t")[0])
+prot = i.split("\t")[1]
+prot = prot.replace("_AT"," (AT")
+prot = prot.replace("__","): ")
+prots.append(prot)
+scores.append(i.split("\t")[2])
+if len(prots) >= 3:
+pks_code_preds_details[acc] = "<b>PKS Active Site Signature method<br>AT-domain substrate specificity prediction top hits:</b><br>\nCode:" + partlines[0].split("\t")[1] + "<br>\n" + codes[0] + " - " + prots[0] + " : (" + scores[0] + "% identity)<br>\n" + codes[1] + " - " + prots[1] + " : (" + scores[1] + "% identity)<br>\n" + codes[2] + " - " + prots[2] + " : (" + scores[2] + "% identity)<br><br>\n\n"
+elif len(prots) == 2:
+pks_code_preds_details[acc] = "<b>PKS Active Site Signature method<br>AT-domain substrate specificity prediction top hits:</b><br>\nCode:" + partlines[0].split("\t")[1] + "<br>\n" + codes[0] + " - " + prots[0] + " : (" + scores[0] + "% identity)<br>\n" + codes[1] + " - " + prots[1] + " : (" + scores[1] + "% identity)<br><br>\n\n"
+elif len(prots) == 1:
+pks_code_preds_details[acc] = "<b>PKS Active Site Signature method<br>AT-domain substrate specificity prediction top hits:</b><br>\nCode:" + partlines[0].split("\t")[1] + "<br>\n" + codes[0] + " - " + prots[0] + " : (" + scores[0] + "% identity)<br><br>\n\n"
+else:
+pks_code_preds[acc] = "N/A"
+pks_code_preds_details[acc] = "<b>PKS Active Site Signature method<br>No AT-domain substrate specificity prediction hits above 40% identity.<br>\n\n"
# Parse the CAL-domain and KR-domain prediction outputs:
#  - calpredoutput.txt uses the same record layout as the other Minowa
#    outputs (domain name on line 0, top hit in the first tab field of
#    line 2, lines 1-4 used for the HTML details);
#  - krpredoutput.txt is tab-separated: column 0 is the domain name, column 1
#    is stored in kr_activity_preds and column 2 in kr_stereo_preds.
+minowa_cal_preds = {}
+minowa_cal_preds_details = {}
+if len(calnames) > 0:
+minowa_cal_file = open(minowacaloutputfolder + "calpredoutput.txt","r")
+minowa_cal_file = minowa_cal_file.read()
+minowa_cal_file = minowa_cal_file.replace("\r","\n")
+parts = minowa_cal_file.split("\\\\\n")[1:]
+for i in parts:
+partlines = i.split("\n")
+acc = partlines[0]
+tophit = partlines[2].split("\t")[0]
+minowa_cal_preds[acc] = tophit
+minowa_cal_preds_details[acc] = "<b>Minowa HMM method<br>CAL-domain substrate specificity prediction top hits:</b><br>\n" + partlines[1] + "<br>\n" + partlines[2] + "<br>\n" + partlines[3] + "<br>\n" + partlines[4] + "<br><br>\n\n"
+kr_activity_preds = {}
+kr_stereo_preds = {}
+if len(krnames) > 0:
+krfile = open(kranalysisoutputfolder + "krpredoutput.txt","r")
+krfile = krfile.read()
+krfile = krfile.replace("\r","\n")
+krlines = krfile.split("\n")[:-1]
+for i in krlines:
+tabs = i.split("\t")
+kr_activity_preds[tabs[0]] = tabs[1]
+kr_stereo_preds[tabs[0]] = tabs[2]
# Build consensuspreds, keyed by domain name (<gene>_AT<n>, <gene>_A<n>,
# <gene>_CAL<n>), by walking the domain hits of every core gene in order:
#  - PKS_AT: the Minowa and signature predictions are compared; a value that
#    occurs more than once is the consensus if it is listed in
#    available_smiles_parts, otherwise the entry is "pk";
#  - AMP-binding / A-OX: the same rule over the Minowa, SVM and code
#    predictions, with "nrp" as the fallback;
#  - CAL_domain: the Minowa prediction if it is listed in
#    available_smiles_parts, otherwise "pk".
# The per-gene counters nrat / nra / nrcal rebuild the domain names used as
# keys when the sequences were extracted.
+#Combine substrate specificity predictions into consensus prediction
+consensuspreds = {}
+#available_smiles_parts = ['ALA','ARG','ASN','ASP','CYS','GLN','GLU','GLY','HIS','ILE','LEU','LYS','PHE','PRO','SER','THR','TRP','TYR','VAL','MET','ORN','ala','arg','asn','asp','cys','gln','glu','gly','his','ile','leu','lys','phe','pro','ser','thr','trp','tyr','val','met','orn','Ala','Arg','Asn','Asp','Cys','Gln','Glu','Gly','His','Ile','Leu','Lys','Phe','Pro','Ser','Thr','Trp','Tyr','Val','Met','Orn','MPRO','23DHB','34DHB','2HIVA','PGLY','DAB','BALA','AEO','4MHA','PICO','AAA','DHA','SCY','PIP','BMT','ADDS','mpro','23dhb','34dhb','2hiva','pgly','dab','bala','aeo','4mha','pico','aaa','dha','scy','pip','bmt','adds','Mpro','23Dhb','34Dhb','2Hiva','Pgly','Dab','Bala','Aeo','4Mha','Pico','Aaa','Dha','Scy','Pip','Bmt','Adds','mal','mmal','omal','emal','nrp','pk']
+available_smiles_parts = ['GLY','ALA','VAL','LEU','ILE','MET','PRO','PHE','TRP','SER','THR','ASN','GLN','TYR','CYS','LYS','ARG','HIS','ASP','GLU','MPRO','ORN','PGLY','DAB','BALA','AEO','DHA','PIP','BMT','gly','ala','val','leu','ile','met','pro','phe','trp','ser','thr','asn','gln','tyr','cys','lys','arg','his','asp','glu','aaa','mpro','dhb','2hiva','orn','pgly','dab','bala','aeo','4mha','pico','phg','dha','scy','pip','bmt','adds','aad','abu','hiv','dhpg','bht','3-me-glu','4pPro','ala-b','ala-d','dht','Sal','tcl','lys-b','hpg','hyv-d','iva','vol','mal','mmal','mxmal','emal','nrp','pk','Gly','Ala','Val','Leu','Ile','Met','Pro','Phe','Trp','Ser','Thr','Asn','Gln','Tyr','Cys','Lys','Arg','His','Asp','Glu','Mpro','23Dhb','34Dhb','2Hiva','Orn','Pgly','Dab','Bala','Aeo','4Mha','Pico','Aaa','Dha','Scy','Pip','Bmt','Adds','DHpg','DHB','nrp','pk']
+for i in pksnrpscoregenes:
+nra = 0
+nrat = 0
+nrcal = 0
+j = domaindict[i]
+for k in j:
+if k[0] == "PKS_AT":
+nrat += 1
+preds = []
+preds.append(minowa_pks_preds[i + "_AT" + str(nrat)])
+preds.append(pks_code_preds[i + "_AT" + str(nrat)])
# cpred records whether any prediction occurred more than once.
+cpred = "n"
+for l in preds:
+if preds.count(l) > 1:
+if l in available_smiles_parts:
+consensuspreds[i + "_AT" + str(nrat)] = l
+else:
+consensuspreds[i + "_AT" + str(nrat)] = "pk"
+cpred = "y"
+if cpred == "n":
+consensuspreds[i + "_AT" + str(nrat)] = "pk"
+if k[0] == "AMP-binding" or k[0] == "A-OX":
+nra +=1
+preds = []
+preds.append(minowa_nrps_preds[i + "_A" + str(nra)])
+preds.append(nrps_svm_preds[i + "_A" + str(nra)])
+preds.append(nrps_code_preds[i + "_A" + str(nra)])
+cpred = "n"
+for l in preds:
+if preds.count(l) > 1:
+if l in available_smiles_parts:
+consensuspreds[i + "_A" + str(nra)] = l
+else:
+consensuspreds[i + "_A" + str(nra)] = "nrp"
+cpred = "y"
+if cpred == "n":
+consensuspreds[i + "_A" + str(nra)] = "nrp"
+if k[0] == "CAL_domain":
+nrcal += 1
+if minowa_cal_preds[i + "_CAL" + str(nrcal)] in available_smiles_parts:
+consensuspreds[i + "_CAL" + str(nrcal)] = minowa_cal_preds[i + "_CAL" + str(nrcal)]
+else:
+consensuspreds[i + "_CAL" + str(nrcal)] = "pk"
+#Write all prediction details to HTML files for each gene to be used as pop-up window
+domainnamesdict = {}
+for i in pksnrpscoregenes:
+j = domaindict[i]
+domainnames = []
+for k in j:
+domainnames.append(k[0])
+domainnamesdict[i] = domainnames
+for i in pksnrpscoregenes:
+if "PKS_AT" in domainnamesdict[i] or "AMP-binding" in domainnamesdict[i] or "A-OX" in domainnamesdict[i] or "CAL_domain" in domainnamesdict[i]:
+j = domaindict[i]
+nrat = 0
+nra = 0
+nrcal = 0
+nrkr = 0
+for k in j:
+if k[0] == "PKS_AT":
+nrat += 1
+domainname = i + "_AT" + str(nrat)
+htmloutfile = open(substrspecsfolder + domainname + ".html","w")
+htmloutfile.write('<html>\n<head>\n<title>Prediction details</title>\n<STYLE type="text/css">\nbody{\n  text-align:left;\n  background-color:white;\n  font-family: Tahoma, sans-serif;\n  font-size: 0.8em;\n  color: #810E15;\n}\n</STYLE>\n</head>\n<body>')
+htmloutfile.write(minowa_pks_preds_details[domainname])
+htmloutfile.write(pks_code_preds_details[domainname])
+htmloutfile.write("<b><i>Consensus Predictions: " + consensuspreds[domainname] + "</b></i>")
+htmloutfile.write('\n</body>\n</html>')
+htmloutfile.close()
+if k[0] == "AMP-binding" or k[0] == "A-OX":
+nra += 1
+domainname = i + "_A" + str(nra)
+htmloutfile = open(substrspecsfolder + domainname + ".html","w")
+htmloutfile.write('<html>\n<head>\n<title>Prediction details</title>\n<STYLE type="text/css">\nbody{\n  text-align:left;\n  background-color:white;\n  font-family: Tahoma, sans-serif;\n  font-size: 0.8em;\n  color: #810E15;\n}\n</STYLE>\n</head>\n<body>')
+htmloutfile.write(nrps_svm_preds_details[domainname])
+htmloutfile.write(nrps_code_preds_details[domainname])
+htmloutfile.write(minowa_nrps_preds_details[domainname])
+htmloutfile.write("<b><i>Consensus Prediction: '" + consensuspreds[domainname] + "'</b></i>")
+htmloutfile.write('\n</body>\n</html>')
+htmloutfile.close()
+if k[0] == "CAL_domain":
+nrcal += 1
+domainname = i + "_CAL" + str(nrcal)
+htmloutfile = open(substrspecsfolder + domainname + ".html","w")
+htmloutfile.write('<html>\n<head>\n<title>Prediction details</title>\n<STYLE type="text/css">\nbody{\n  text-align:left;\n  background-color:white;\n  font-family: Tahoma, sans-serif;\n  font-size: 0.8em;\n  color: #810E15;\n}\n</STYLE>\n</head>\n<body>')
+htmloutfile.write(minowa_cal_preds_details[domainname])
+htmloutfile.write('\n</body>\n</html>')
+htmloutfile.close()
+elapsed = (time.time() - starttime)
+#print "5541Time since start: " + str(elapsed)
# Start of the per-cluster gene-order prediction. For every gene cluster the
# core genes (clusterinfo[i][3]) that are also PKS/NRPS core genes are
# collected; when there are any, the cluster is added to nrpspksclusters and
# its genes are sorted into PKS, NRPS and hybrid groups using the type label
# in nrpspkstypedict. "PKS/NRPS-like protein" genes are assigned from their
# domain names instead.
# NOTE(review): domainnamesdict[j] is a list of names such as
# "Condensation_LCL", so the tests for the exact entry "Condensation" can
# only match a domain named exactly that - confirm that this is intended.
+#Predict biosynthetic gene order in gene cluster using starter domains, thioesterase domains, gene order and docking domains
+compound_pred_dict = {}
+dockingdomainanalysis = []
+nrpspksclusters = []
+a = 1
+for i in geneclusters:
# The cluster id is kept in genecluster.
+genecluster = i
+clustercoregenes = clusterinfo[i][3]
+clusterpksnrpsgenes = []
+for j in clustercoregenes:
+if j in pksnrpscoregenes:
+clusterpksnrpsgenes.append(j)
+if len(clusterpksnrpsgenes) > 0:
+nrpspksclusters.append(genecluster)
+pksgenes = 0
+clusterpksgenes = []
+nrpsgenes = 0
+clusternrpsgenes = []
+hybridgenes = 0
+clusterhybridgenes = []
+for j in clusterpksnrpsgenes:
+k = nrpspkstypedict[j]
+if "PKS" in k and "NRPS" not in k:
+pksgenes += 1
+clusterpksgenes.append(j)
+elif "PKS" not in k and "NRPS" in k:
+nrpsgenes += 1
+clusternrpsgenes.append(j)
+elif "PKS/NRPS" in k:
+if ("PKS_KS" in domainnamesdict[j] or "PKS_AT" in domainnamesdict[j]) and ("AMP-binding" not in domainnamesdict[j] and "A-OX" not in domainnamesdict[j] and "Condensation" not in domainnamesdict[j]):
+pksgenes += 1
+clusterpksgenes.append(j)
+elif ("PKS_KS" not in domainnamesdict[j] and  "PKS_AT" not in domainnamesdict[j]) and ("AMP-binding" in domainnamesdict[j] or "A-OX" in domainnamesdict[j] or "Condensation" in domainnamesdict[j]):
+nrpsgenes += 1
+clusternrpsgenes.append(j)
+elif "PKS" in k and "NRPS" in k:
+hybridgenes += 1
+clusterhybridgenes.append(j)
+#If more than three PKS genes, use dock_dom_analysis if possible to identify order
+dock_dom_analysis = "failed"
+if pksgenes > 3 and nrpsgenes == 0 and hybridgenes == 0:
+#print "Predicting PKS gene order by docking domain sequence analysis"
+logfile.write("Predicting PKS gene order by docking domain sequence analysis")
+dockhtmlfile = open(htmlfolder + "docking_analysis" + str(genecluster) + ".html","w")
+#Find first and last genes based on starter module and TE / TD
+startergene = ""
+endinggene = ""
+for k in clusterpksgenes:
+if "Thioesterase" in domainnamesdict[k] or "TD" in domainnamesdict[k]:
+if endinggene == "":
+endinggene = k
+else:
+endinggene = ""
+if len(domainnamesdict[k]) >=2 and  "PKS_AT" == domainnamesdict[k][0] and "ACP" == domainnamesdict[k][1]:
+if startergene == "":
+startergene = k
+else:
+startergene = ""
+if startergene == "":
+for k in clusterpksgenes:
+if len(domainnamesdict[k]) >=3 and "PKS_KS" == domainnamesdict[k][0] and "PKS_AT" == domainnamesdict[k][1] and "ACP" == domainnamesdict[k][2]:
+if startergene == "":
+startergene = k
+else:
+startergene = ""
+break
# For every PKS gene except the starter gene, the first 50 residues are
# written as a one-record FASTA file to docking_analysis/input.fasta and
# aligned against nterm.fasta with "muscle -profile"; extractpositions is then
# called with positions [2,15] and reference "EryAIII_5_6_ref", and its result
# is stored in ntermintresdict. The same is done for the last 100 residues of
# every PKS gene except the ending gene, against cterm.fasta with positions
# [55,64] and reference "EryAII_ref", filling ctermintresdict.
# NOTE(review): the two section comments below mention hmmsearch, but the
# commands run here call muscle.
+#Extract N-terminal 50 residues of each non-starting protein, scan for docking domains using hmmsearch, parse output to locate interacting residues
+ntermintresdict = {}
+ntermnames = []
+ntermseqs = []
+for k in clusterpksgenes:
+if k != startergene:
+ntermnames.append(k)
+seq = seqdict[k]
+ntermseqs.append(seq[:50])
+ntermfasta = "docking_analysis/input.fasta"
# z is the index into ntermnames / ntermseqs for the current iteration.
+z = 0
+for k in ntermnames:
+writefasta([ntermnames[z]],[ntermseqs[z]],ntermfasta)
+os.chdir("docking_analysis")
+os.system("muscle -profile -quiet -in1 nterm.fasta -in2 input.fasta -out muscle.fasta")
+intresidues = extractpositions("nterm.fasta","muscle.fasta",[2,15],"EryAIII_5_6_ref",ntermnames[z])
+ntermintresdict[ntermnames[z]] = intresidues
+os.chdir("..")
+z += 1
+#Extract C-terminal 100 residues of each non-ending protein, scan for docking domains using hmmsearch, parse output to locate interacting residues
+ctermintresdict = {}
+ctermnames = []
+ctermseqs = []
+for k in clusterpksgenes:
+if k != endinggene:
+ctermnames.append(k)
+seq = seqdict[k]
+ctermseqs.append(seq[-100:])
+ctermfasta = "docking_analysis/input.fasta"
# z is the index into ctermnames / ctermseqs for the current iteration.
+z = 0
+for k in ctermnames:
+writefasta([ctermnames[z]],[ctermseqs[z]],ctermfasta)
+os.chdir("docking_analysis")
+os.system("muscle -profile -quiet -in1 cterm.fasta -in2 input.fasta -out muscle.fasta")
+intresidues = extractpositions("cterm.fasta","muscle.fasta",[55,64],"EryAII_ref",ctermnames[z])
+ctermintresdict[ctermnames[z]] = intresidues
+os.chdir("..")
+z += 1
# Score every permutation of the PKS genes that are neither the starter nor
# the ending gene. For each pair of consecutive genes in a permutation, the
# two residues stored for the upstream gene in ctermintresdict are compared
# with the two stored for the downstream gene in ntermintresdict: a
# hydrophobic/hydrophobic or opposite-charge pair adds 1 to the score, a
# same-charge pair subtracts 1. Scores are collected in
# possible_orders_scoredict, keyed by the permutation tuple.
# NOTE(review): all permutations are materialised in a list, so memory and
# run time grow factorially with the number of genes to order.
+#If docking domains found in all, check for optimal order using interacting residues
+genes_to_order = []
# z is incremented in this loop but not read inside it.
+z = 0
+for k in clusterpksgenes:
+if k == startergene or k == endinggene:
+pass
+else:
+genes_to_order.append(k)
+z += 1
+possible_orders = list(itertools.permutations(genes_to_order,len(genes_to_order)))
+hydrophobic = ["A","V","I","L","F","W","Y","M"]
+positivecharge = ["H","K","R"]
+negativecharge = ["D","E"]
+other = ["C","G","P","S","T","N","Q","X","U"]
+possible_orders_scoredict = {}
+for k in possible_orders:
+score = 0
# Pairs [upstream gene, downstream gene] of neighbours in this permutation.
+interactions = []
+z = 0
+for l in k[:-1]:
+interactions.append([l,k[z + 1]])
+z += 1
+for l in interactions:
+res1a = ctermintresdict[l[0]][0]
+res1b = ntermintresdict[l[1]][0]
+res2a = ctermintresdict[l[0]][1]
+res2b = ntermintresdict[l[1]][1]
+if (res1a in hydrophobic and res1b in hydrophobic) or (res1a in positivecharge and res1b in negativecharge) or (res1a in negativecharge and res1b in positivecharge):
+score += 1
+if (res1a in positivecharge and res1b in positivecharge) or (res1a in negativecharge and res1b in negativecharge):
+score = score - 1
+if (res2a in hydrophobic and res2b in hydrophobic) or (res2a in positivecharge and res2b in negativecharge) or (res2a in negativecharge and res2b in positivecharge):
+score += 1
+if (res2a in positivecharge and res2b in positivecharge) or (res2a in negativecharge and res2b in negativecharge):
+score = score - 1
+possible_orders_scoredict[k] = score
+ranked_orders = sortdictkeysbyvaluesrev(possible_orders_scoredict)
+ranked_orders_part = []
+ranked_orders2 = []
+a = 0
+ranked_orders_len = len(ranked_orders) - 1
+for i in ranked_orders:
+if a == 0:
+score = possible_orders_scoredict[i]
+ranked_orders_part.append(i)
+elif a == ranked_orders_len:
+ranked_orders_part.append(i)
+ranked_orders2 = ranked_orders2 + ranked_orders_part
+else:
+if possible_orders_scoredict[i] == score:
+ranked_orders_part.append(i)
+else:
+ranked_orders_part.reverse()
+ranked_orders2 = ranked_orders2 + ranked_orders_part
+score = possible_orders_scoredict[i]
+ranked_orders_part = []
+ranked_orders_part.append(i)
+a += 1
+ranked_orders = ranked_orders2[:1000]
+geneorders = ranked_orders
# Build the complete gene orders (starter gene + ranked middle genes + ending
# gene, where a starter / ending gene is known) and remember the score of each
# ranked order; the score dictionary is keyed by the tuple of middle genes.
geneorders2 = []
order_scores = []
for middle_genes in geneorders:
    geneorder = []
    if startergene != "":
        geneorder.append(startergene)
    geneorder.extend(middle_genes)
    if endinggene != "":
        geneorder.append(endinggene)
    geneorders2.append(geneorder)
    order_scores.append(possible_orders_scoredict[tuple(middle_genes)])
geneorders = geneorders2

# Write the HTML report: one table row per gene order with its score.
if len(ranked_orders) == 1000:
    dockhtmlfile.write('<html>\n<head>\n<LINK href="style.css" rel="stylesheet" type="text/css">\n</head>\n<body>\nDocking domain analysis.  Score for 1000 highest scoring gene orders:<br><br><table border=1>\n')
else:
    dockhtmlfile.write('<html>\n<head>\n<LINK href="style.css" rel="stylesheet" type="text/css">\n</head>\n<body>\nDocking domain analysis. Scores for all possible gene orders:<br><br><table border=1>\n')
dockhtmlfile.write('<tr><td><b>Gene order</b></td><td><b>Score</b></td></tr>\n')
for full_order, order_score in zip(geneorders, order_scores):
    dockhtmlfile.write("<tr><td>" + ",".join(full_order) + "</td><td>" + str(order_score) + "</td></tr>\n")
dockhtmlfile.write('\n</table></body></html>')
dockhtmlfile.close()
# Terminate the log entry with a newline so the next entry starts on its own
# line, like the other log messages do.
logfile.write("Predicting PKS gene order by docking domain sequence analysis succeeded.\n")
dockingdomainanalysis.append(genecluster)
# Colinearity assumption (per the original note: used for NRPS genes, mixed
# NRPS/PKS genes, PKS genes without detected docking domains, or clusters with
# 1-3 PKS genes): take the genes in genomic order, oriented by majority strand.
# NOTE(review): no `else:` precedes this section in the source at hand, so as
# written it also replaces the geneorder left by the docking analysis - confirm.
direction = 0
for gene in clusterpksnrpsgenes:
    strand = strandsdict[gene]
    if strand == "+":
        direction += 1
    elif strand == "-":
        direction -= 1
# Majority of genes on the minus strand: read the cluster backwards.
if direction < 0:
    clusterpksnrpsgenes.reverse()
# A Thioesterase / TD domain on the first gene flips the order once more.
first_gene_domains = domainnamesdict[clusterpksnrpsgenes[0]]
if "Thioesterase" in first_gene_domains or "TD" in first_gene_domains:
    clusterpksnrpsgenes.reverse()
geneorder = clusterpksnrpsgenes
# Turn the gene order into a space-separated string of consensus substrate
# predictions. Within each gene, AT, A (AMP-binding / A-OX) and CAL domains are
# numbered separately in order of appearance; the consensus prediction is
# looked up under "<gene>_AT<n>", "<gene>_A<n>" or "<gene>_CAL<n>".
predicted_substrates = []
for gene in geneorder:
    nra = 0
    nrat = 0
    nrcal = 0
    for domain in domainnamesdict[gene]:
        if "PKS_AT" in domain:
            nrat += 1
            predicted_substrates.append(consensuspreds[gene + "_AT" + str(nrat)])
        if "AMP-binding" in domain or "A-OX" in domain:
            nra += 1
            predicted_substrates.append(consensuspreds[gene + "_A" + str(nra)])
        if "CAL_domain" in domain:
            nrcal += 1
            predicted_substrates.append(consensuspreds[gene + "_CAL" + str(nrcal)])
prediction = " ".join(predicted_substrates)
compound_pred_dict[genecluster] = prediction
a += 1
+#Combine predictions into a prediction of the final chemical structure and generate images
+os.chdir("NRPeditor")
+failedstructures = []
+for i in geneclusters:
+genecluster = i
+if compound_pred_dict.has_key(genecluster):
+residues = compound_pred_dict[genecluster]
+nrresidues = len(residues.split(" "))
+if nrresidues > 1:
+if sys.platform == ('win32'):
+structcommand = 'main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
+elif sys.platform == ('linux2'):
+structcommand = './main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
+for i in residues.split(" "):
+structcommand = structcommand + i + " "
+structcommand = structcommand + 'TE"'
+smilesinfo = os.popen(structcommand)
+smilesinfo = smilesinfo.read()
+smiles_string = (smilesinfo.split("core peptide: ")[1]).split("\ntermintype")[0]
+if sys.platform == ('linux2'):
+smiles_string.replace("[X]","[*:X]")
+smiles_string2 = ""
+a = 1
+for k in smiles_string:
+if k == "X":
+smiles_string2 = smiles_string2 + str(a)
+a += 1
+else:
+smiles_string2 = smiles_string2 + k
+smiles_string = smiles_string2
+smilesfile = open("genecluster" + str(genecluster) + ".smi","w")
+smilesfile.write(smiles_string)
+smilesfile.close()
+depictstatus = depict_smile(genecluster,structuresfolder)
+if depictstatus == "failed":
+failedstructures.append(genecluster)
+elif clusterinfo[genecluster][0] == "ectoine":
+smiles_string = "CC1=NCCC(N1)C(=O)O"
+smilesfile = open("genecluster" + str(genecluster) + ".smi","w")
+smilesfile.write(smiles_string)
+smilesfile.close()
+depictstatus = depict_smile(genecluster,structuresfolder)
+if depictstatus == "failed":
+failedstructures.append(genecluster)
+elif genecluster in failedstructures:
+del failedstructures[failedstructures.index(genecluster)]
+compound_pred_dict[genecluster] = "ectoine "
+os.chdir("..")
+elapsed = (time.time() - starttime)
+#print "5826 Time since start: " + str(elapsed)
+#ClusterBlast
+if clusterblast == "y":
# Load the ClusterBlast gene cluster table into `clusters`:
#   "<accession>_<cluster nr>" -> [protein ids, description, type, tags]
# The parsed table is cached as a pickle next to the text file and the cache is
# used whenever it exists.
# The pickles use the highest (binary) protocol, so they are written and read
# through binary-mode file handles; text mode corrupts them on win32.
# NOTE: unpickling runs whatever the cache file contains - the cache files must
# only be writable by trusted users.
logfile.write("ClusterBlast: Loading gene clusters database into memory...\n")
os.chdir(genomename + "/clusterblast")
clusters = {}
bin_path = os.path.join(antismash_path, "clusterblast/geneclusters.bin")
if os.path.exists(bin_path):
    with open(bin_path, "rb") as cache_file:
        clusters = cPickle.load(cache_file)
else:
    with open(os.path.join(antismash_path, "clusterblast/geneclusters.txt"), "r") as cluster_table:
        for line in cluster_table:
            tabs = line.strip().split("\t")
            accession = tabs[0]
            clusterdescription = tabs[1]
            clusternr = tabs[2]
            clustertype = tabs[3]
            clustername = accession + "_" + clusternr
            clustertags = tabs[4].split(";")
            clusterprots = tabs[5].split(";")
            clusters[clustername] = [clusterprots, clusterdescription, clustertype, clustertags]
    with open(bin_path, "wb") as cache_file:
        cPickle.dump(clusters, cache_file, -1)

# Load the per-protein information from the FASTA headers of the gene cluster
# protein database. Header fields are separated by "|":
#   0 + 1 -> cluster name, 2 -> location, 3 -> strand, 4 -> locus tag,
#   5 -> annotation, 6 -> protein id
logfile.write("ClusterBlast: Loading gene cluster database proteins into memory...\n")
proteingeneclusters = {}
proteinlocations = {}
proteinstrands = {}
proteinannotations = {}
proteintags = {}
bin_path = os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta.bin")
if os.path.exists(bin_path):
    with open(bin_path, "rb") as cache_file:
        (proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags) = cPickle.load(cache_file)
else:
    with open(os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta"), "r") as protein_fasta:
        for line in protein_fasta:
            line = line.replace('\n', '')
            if not line.startswith(">"):
                continue
            tabs = line.split("|")
            protein = tabs[6]
            locustag = tabs[4]
            # Locus tags that clash with a key of accessiondict get an "h_" prefix.
            # NOTE(review): accessiondict looks specific to the current input, yet the
            # prefixed tags end up in the shared cache written below - confirm.
            if locustag in accessiondict:
                locustag = "h_" + locustag
            proteintags[protein] = locustag
            proteingeneclusters[protein] = tabs[0] + "_" + tabs[1]
            proteinlocations[protein] = tabs[2]
            proteinstrands[protein] = tabs[3]
            proteinannotations[protein] = tabs[5]
    with open(bin_path, "wb") as cache_file:
        cPickle.dump([proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags], cache_file, -1)
# Find internal homologs: the proteins of each gene cluster are BLASTed against
# themselves, and proteins connected by hits are merged into groups. Proteins
# without hits form a group of their own. The result is stored per cluster as a
# list of lists in internalhomologygroupsdict.
logfile.write("Finding internal homologs in each gene cluster..\n")
internalhomologygroupsdict = {}
gui_mode = "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y"
for clusternumber in geneclusters:
    # Create the input FASTA file for the BLAST search.
    queryclusterprotslist = clusterinfo[clusternumber][4]
    queryclusterprots = [entry[4] for entry in queryclusterprotslist]
    queryclusterseqs = [seqdict[protein] for protein in queryclusterprots]
    queryclusternames = [fullnamedict[protein] for protein in queryclusterprots]
    writefasta(queryclusternames, queryclusterseqs, "internal_input.fasta")
    # Run the BLAST search; os.popen is used in GUI mode, os.system otherwise.
    makeblastdbcommand = "makeblastdb -in internal_input.fasta -out internal_input.fasta -dbtype prot"
    blastsearch = "blastp  -db internal_input.fasta -query internal_input.fasta -outfmt 6 -max_target_seqs 1000 -evalue 1e-05 -out internal_input.out"
    run_command = os.popen if gui_mode else os.system
    run_command(makeblastdbcommand)
    run_command(blastsearch)
    # Parse the BLAST output.
    blastoutput = open("internal_input.out", "r").read()
    minseqcoverage = 25
    minpercidentity = 30
    seqlengths = fastaseqlengths(proteins)
    iblastinfo = blastparse(blastoutput, minseqcoverage, minpercidentity, seqlengths, geneclustergenes)
    iblastdict = iblastinfo[0]
    iquerylist = iblastinfo[1]
    # Build the homology groups.
    groups = []
    for fullname in queryclusternames:
        jsplit = fullname.split("|")[4]
        if fullname not in iblastdict:
            groups.append([jsplit])
            continue
        # Reduce every hit name to its short form.
        group = []
        for hit in iblastdict[fullname][0]:
            if hit[:2] == "h_":
                group.append(hit[2:])
            elif hit.count("|") > 4:
                group.append(hit.split("|")[4])
            else:
                group.append(hit)
        if jsplit not in group:
            group.append(jsplit)
        # Absorb earlier groups that share a member with this one. The list is
        # modified while it is being iterated, exactly as in the original code.
        x = 0
        for earlier_group in groups:
            for member in group:
                if member in earlier_group:
                    del groups[x]
                    for protein in earlier_group:
                        if protein not in group:
                            group.append(protein)
                    break
            x += 1
        group.sort()
        groups.append(group)
    internalhomologygroupsdict[clusternumber] = groups
+#Run BLAST on gene cluster proteins of each cluster and parse output
+#print "5961 Running NCBI BLAST+ gene cluster searches.."
+logfile.write("Running NCBI BLAST+ gene cluster searches..\n")
+for i in geneclusters:
# Body of the per-cluster ClusterBlast loop, part 1: write the query proteins
# of this cluster to one FASTA file per CPU, run runblast() on each file in a
# separate process, merge the outputs and parse them.
clusternumber = i
# Create the input FASTA files for the BLAST search.
queryclusterprotslist = clusterinfo[clusternumber][4]
queryclusterprots = [entry[4] for entry in queryclusterprotslist]
queryclusterseqs = [seqdict[protein] for protein in queryclusterprots]
queryclusternames = [fullnamedict[protein] for protein in queryclusterprots]
# Every part holds equalpartsizes sequences; the last part takes the remainder.
equalpartsizes = int(len(queryclusternames) / nrcpus)
for part in range(nrcpus):
    if part == 0:
        part_slice = slice(0, equalpartsizes)
    elif part == (nrcpus - 1):
        part_slice = slice(part * equalpartsizes, None)
    else:
        part_slice = slice(part * equalpartsizes, (part + 1) * equalpartsizes)
    setnames = queryclusternames[part_slice]
    setseqs = queryclusterseqs[part_slice]
    writefasta(setnames, setseqs, "input" + str(part) + ".fasta")
# Start one BLAST process per input file and wait until all have finished.
processes = [Process(target=runblast, args=["input" + str(part) + ".fasta"]) for part in range(nrcpus)]
processnames = []
for process in processes:
    process.start()
time.sleep(10)
while any(process.is_alive() for process in processes):
    time.sleep(5)
for process in processes:
    process.join()
# Concatenate the partial outputs and keep a copy one directory up.
blastoutput = ""
for part in range(nrcpus):
    blastoutput = blastoutput + open("input" + str(part) + ".out", "r").read()
os.chdir("..")
blastoutputfile = open("./clusterblastoutput.txt", "w")
blastoutputfile.write(blastoutput)
blastoutputfile.close()
os.chdir("clusterblast")
logfile.write("   Blast search finished. Parsing results...\n")
minseqcoverage = 25
minpercidentity = 30
seqlengths = fastaseqlengths(proteins)
blastinfo = blastparse(blastoutput, minseqcoverage, minpercidentity, seqlengths, geneclustergenes)
blastdict = blastinfo[0]
# Keep only the queries that have hits.
querylist = [query for query in blastinfo[1] if query in blastdict]
hitclusters = blastinfo[2]
# Body of the per-cluster ClusterBlast loop, part 2: score every hit cluster.
# The sorting score is the sum of
#   - the number of query proteins with a hit in the cluster,
#   - a synteny score,
#   - a bonus of 3 when at least one core gene has a hit,
#   - 0.1 per core gene with a hit,
#   - the cumulative BLAST score divided by 1000000.
logfile.write("   Scoring Blast outputs on database of gene clusters...\n")
hitclusterdict = {}
hitclusterdata = {}
for i in hitclusters:
    hitclusterdatalist = []
    nrhits = float(0)
    nrcoregenehits = float(0)
    cumblastscore = float(0)
    hitpositions = []
    hitposcorelist = []
    for j in querylist:
        querynrhits = 0
        querycumblastscore = float(0)
        for k in blastdict[j][0]:
            hit = blastdict[j][1][k]
            if i != hit[0]:
                continue
            # [position of the query, position of the subject within its cluster]
            position = [querylist.index(j), clusters[i][0].index(hit[9])]
            if position in hitpositions:
                continue
            querynrhits += 1
            querycumblastscore = querycumblastscore + float(hit[6]) / 1000000
            hitclusterdatalist.append([j, k, hit[5], hit[6], hit[7], hit[8]])
            hitclusterdata[i] = hitclusterdatalist
            hitpositions.append(position)
        if querynrhits > 0:
            nrhits += 1
            # hitposcorelist runs parallel to hitpositions: 1 marks a core gene query.
            if j.split("|")[4] in allcoregenes:
                nrcoregenehits += 0.1
                hitposcorelist.extend([1] * querynrhits)
            else:
                hitposcorelist.extend([0] * querynrhits)
            cumblastscore = cumblastscore + float(querycumblastscore)
    query_givenscores_querydict = {}
    query_givenscores_hitdict = {}
    # Group the subject positions by query position.
    hitgroupsdict = {}
    for p in hitpositions:
        hitgroupsdict.setdefault(p[0], []).append(p[1])
    # Synteny score: only possible with more than one hit, and given only once
    # for every query gene and every hit gene.
    synteny_score = 0
    if nrhits > 1:
        for index, p in enumerate(hitpositions[:-1]):
            q = hitpositions[index + 1]
            # Tandem: this subject gene is also hit by the query of the next entry.
            tandem = p[1] in hitgroupsdict[q[0]]
            already_scored = query_givenscores_querydict.get(p[0], 0) != 0 or query_givenscores_hitdict.get(p[1], 0) != 0
            if tandem or already_scored:
                continue
            if (abs(p[0] - q[0]) < 2) and abs(p[0] - q[0]) == abs(p[1] - q[1]):
                synteny_score += 1
                # One extra point when either entry belongs to a core gene.
                if hitposcorelist[index] == 1 or hitposcorelist[index + 1] == 1:
                    synteny_score += 1
                query_givenscores_querydict[p[0]] = 1
                query_givenscores_hitdict[p[1]] = 1
            else:
                query_givenscores_querydict[p[0]] = 0
                query_givenscores_hitdict[p[1]] = 0
    # Bonus for gene clusters with at least one core gene hit.
    corebonus = 3 if nrcoregenehits > 0 else 0
    sortingscore = nrhits + synteny_score + corebonus + nrcoregenehits + cumblastscore
    hitclusterdict[i] = sortingscore
# Rank the hit clusters by descending sorting score.
rankedclusters = sortdictkeysbyvaluesrev(hitclusterdict)
rankedclustervalues = sortdictkeysbyvaluesrevv(hitclusterdict)
+#Output for each hit: table of genes and locations of input cluster, table of genes and locations of hit cluster, table of hits between the clusters
+#print "   Writing output file..."
+logfile.write("   Writing output file...\n")
+#os.chdir("..")
+#os.chdir(genomename)
+#os.chdir("clusterblast")
+out_file = open("cluster" + str(clusternumber) + ".txt","w")  # report file named after the query cluster number
+out_file.write("ClusterBlast scores for " + infile)
+out_file.write("\n\nTable of genes, locations, strands and annotations of query cluster:\n")
+#out_file.write("\n")
+#out_file.write("Table of genes, locations, strands and annotations of query cluster:")
+#out_file.write("\n")
+for i in queryclusterprots:
+out_file.write("%s\t%s\t%s\t%s\t%s\t\n" % (i, proteins[3][i][0], proteins[3][i][1], proteins[3][i][2], proteins[3][i][3]))  # one tab-separated row per query protein
+"""out_file.write(i)
+out_file.write("\t")
+out_file.write(proteins[3][i][0])
+out_file.write("\t")
+out_file.write(proteins[3][i][1])
+out_file.write("\t")
+out_file.write(proteins[3][i][2])
+out_file.write("\t")
+out_file.write(proteins[3][i][3])
+out_file.write("\t")
+out_file.write("\n")"""
+out_file.write("\n\nSignificant hits: \n")
+#out_file.write("\n")
+#out_file.write("Significant hits: ")
+#out_file.write("\n")
+z = 0
+for i in rankedclusters[:100]:  # summary list: at most the 100 best-ranked hit clusters
+#out_file.write(str(z+1) + ". " + i + "\t" + clusters[i][1])
+#out_file.write("\n")
+out_file.write("%s. %s\t%s\n" % ((z+1), i, clusters[i][1]) )
+z += 1
+out_file.write("\n\n")
+#out_file.write("\n")
+z = 0
+out_file.write("Details:")
+for i in rankedclusters[:100]:  # detail section for the same hit clusters
+value = str(rankedclustervalues[z])  # sorting score of this hit cluster as text
+nrhits = value.split(".",1)[0]  # text before the decimal point; reported below as the number of proteins with hits
+if nrhits > 0:  # NOTE(review): nrhits is a str here; under Python 2 a str compares greater than any int, so this looks always true
+cumblastscore = str(int(float(value.split(".")[1])))  # digits after the decimal point, read as an integer
+out_file.write("\n\n>>\n\n%s. %s\nSource: %s\nType: %s\nNumber of proteins with BLAST hits to this cluster: %s\nCumulative BLAST score: %s\n\nTable of genes, locations, strands and annotations of subject cluster:\n" % (z+1, i, clusters[i][1], clusters[i][2], nrhits, cumblastscore))
+clusterproteins = clusters[i][0]
+#print 'clusterproteins\n\n', clusterproteins
+"""out_file.write("\n\n")
+out_file.write(">>")
+out_file.write("\n")
+cumblastscore = str(int(float(value.split(".")[1])))
+out_file.write("\n")
+out_file.write(str(z+1) + ". " + i)
+out_file.write("\n")
+out_file.write("Source: " + clusters[i][1])
+out_file.write("\n")
+out_file.write("Type: " + clusters[i][2])
+out_file.write("\n")
+out_file.write("Number of proteins with BLAST hits to this cluster: " + nrhits)
+out_file.write("\n")
+out_file.write("Cumulative BLAST score: " + cumblastscore)
+out_file.write("\n")
+out_file.write("\n")
+out_file.write("Table of genes, locations, strands and annotations of subject cluster:")
+out_file.write("\n")
+clusterproteins = clusters[i][0]"""
+for j in clusterproteins:
+#print '##########asdfasdf######', j, '---'+proteinlocations.keys()[0]+ '---', proteinannotations.has_key(j), proteinstrands.has_key(j), proteinlocations.has_key(j)
+if proteinlocations.has_key(j) and proteinannotations.has_key(j) and proteinstrands.has_key(j):  # proteins missing from any of the three tables are left out of the report
+if proteintags[j] == "no_locus_tag":  # without a locus tag the protein id is written as first column
+out_file.write(j)
+else:
+out_file.write(proteintags[j])
+out_file.write( "\t%s\t%s\t%s\t%s\t%s\n" % (j, proteinlocations[j].split("-")[0], proteinlocations[j].split("-")[1], proteinstrands[j], proteinannotations[j]) )  # location "<a>-<b>" is split into two columns
+"""out_file.write("\t")
+out_file.write(j)
+out_file.write("\t")
+out_file.write(proteinlocations[j].split("-")[0])
+out_file.write("\t")
+out_file.write(proteinlocations[j].split("-")[1])
+out_file.write("\t")
+out_file.write(proteinstrands[j])
+out_file.write("\t")
+out_file.write(proteinannotations[j])
+out_file.write("\n")
+"""
+out_file.write("\nTable of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):\n")
+if i in hitclusterdata.keys():
+tabledata = hitclusterdata[i]
+for x in tabledata:
+w = 0
+for y in x:
+if w == 0:  # first column only: write the fifth "|"-separated field of the query name
+out_file.write( "%s\t" % y.split("|")[4] )
+#out_file.write("\t")
+w += 1
+else:
+out_file.write("%s\t" % y)
+#out_file.write("\t")
+out_file.write("\n")
+else:
+"data not found"  # bare string expression; it has no effect at runtime
+out_file.write("\n")
+out_file.write("\n")
+z += 1
+#os.chdir("..")
+#os.chdir("..")
+#os.chdir("clusterblast")
+os.chdir("..")
+out_file.close()
+elapsed = (time.time() - starttime)  # seconds since starttime
+#print "Time since start: " + str(elapsed)
# smCOG analysis.
# smcogtreedict maps a tag to "<tag>.png" for every PNG found in the smCOG
# output folder; it stays empty when smCOG analysis is switched off.
smcogtreedict = {}
if smcogs == "y":
    logfile.write("Performing smCOG analysis\n")
    # Scan the gene cluster proteins against the smCOG HMM library.
    hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 1E-6 -o ./smcogs/smcogshmm_output.txt --noali --tblout ./smcogs/smcogshmm.txt " + hmms_path + "smcogs.hmm ./clusterblast/geneclusterprots.fasta"
    os.system(hmmsearch)
    smcoghmmlengthsdict = hmmlengths(hmms_path + "smcogs.hmm")
    smcogdict = hmmscanparse("./smcogs/smcogshmm_output.txt", smcoghmmlengthsdict)
    # Re-key the results by the fifth "|"-separated field of the protein name.
    smcogdict2 = {}
    for fullname in smcogdict.keys():
        smcogdict2[fullname.split("|")[4]] = smcogdict[fullname]
    smcogdict = smcogdict2
    # Write the hits of every non-PKS/NRPS-core cluster gene to smcogs/smcogs.txt.
    os.chdir("smcogs")
    with open("smcogs.txt", "w") as smcogfile:
        for gene in geneclustergenes:
            if gene in pksnrpscoregenes:
                continue
            # NOTE(review): raises KeyError when a gene has no entry in smcogdict;
            # presumably hmmscanparse returns an entry for every protein - confirm.
            gene_hits = smcogdict[gene]
            smcogfile.write(">> " + gene + "\n")
            smcogfile.write("name\tstart\tend\te-value\tscore\n")
            smcogfile.write("** smCOG hits **\n")
            for hit in gene_hits:
                smcogfile.write(str(hit[0]) + "\t" + str(hit[1]) + "\t" + str(hit[2]) + "\t" + str(hit[3]) + "\t" + str(hit[4]) + "\n")
            smcogfile.write("\n\n")
    os.chdir("..")
    os.chdir("..")
    # smCOG phylogenetic tree construction, spread over nrcpus processes.
    # The log entry ends with a newline, like the other log messages.
    logfile.write("Calculating and drawing phylogenetic trees of cluster genes with smCOG members\n")
    os.chdir("smcogtree")
    smcoganalysisgenes = [gene for gene in geneclustergenes if gene not in pksnrpscoregenes]
    # Every set holds equalpartsizes genes; the last set takes the remainder.
    smcogsets = []
    equalpartsizes = int(len(smcoganalysisgenes) / nrcpus)
    for part in range(nrcpus):
        if part == 0:
            geneslist = smcoganalysisgenes[:equalpartsizes]
        elif part == (nrcpus - 1):
            geneslist = smcoganalysisgenes[(part * equalpartsizes):]
        else:
            geneslist = smcoganalysisgenes[(part * equalpartsizes):((part + 1) * equalpartsizes)]
        smcogsets.append(geneslist)
    processes = []
    processnames = []
    for setnr, geneset in enumerate(smcogsets):
        processes.append(Process(target=smcog_analysis, args=[geneset, setnr, accessiondict, seqdict, smcogdict, smcogsoutputfolder]))
    for process in processes:
        process.start()
    time.sleep(1)
    # Poll every five seconds until no process is alive any more.
    while any(process.is_alive() for process in processes):
        time.sleep(5)
    for process in processes:
        process.join()
    os.chdir("..")
    # Collect the generated PNG files from the smCOG output folder.
    currentpath = os.getcwd()
    os.chdir(smcogsoutputfolder)
    dircontents = getdircontents()
    for filename in dircontents:
        if ".png" in filename:
            tag = filename.split(".png")[0]
            smcogtreedict[tag] = tag + ".png"
    os.chdir(currentpath)
## Visualization
# Read in the PubMed / PubChem links and known compound names of the database
# gene clusters (only needed when ClusterBlast is switched on). The three
# dictionaries are keyed by the first column of pubmed_pubchem_links.txt; empty
# fields are left out.
if clusterblast == "y":
    # Step out of the genome folder when the working directory is inside it.
    if genomename in os.getcwd():
        os.chdir('..')
    pubmed_dict = {}
    pubchem_dict = {}
    known_compound_dict = {}
    # The parsed table is cached as a pickle. The pickle uses the highest
    # (binary) protocol, so it is written and read in binary mode; text mode
    # corrupts it on win32.
    # NOTE: unpickling runs whatever the cache file contains - the cache file
    # must only be writable by trusted users.
    bin_path = os.path.join(antismash_path, "pubmed_pubchem_links.bin")
    if os.path.exists(bin_path):
        with open(bin_path, "rb") as cache_file:
            (pubmed_dict, pubchem_dict, known_compound_dict) = cPickle.load(cache_file)
    else:
        # Same path construction as for the cache file above.
        with open(os.path.join(antismash_path, "pubmed_pubchem_links.txt"), "r") as links_file:
            for line in links_file:
                tabs = line.replace('\n', '').split("\t")
                # Skip blank or truncated lines instead of failing with an IndexError.
                if len(tabs) < 4:
                    continue
                acc = tabs[0]
                if tabs[1] != "":
                    pubmed_dict[acc] = tabs[1]
                if tabs[2] != "":
                    pubchem_dict[acc] = tabs[2]
                if tabs[3] != "":
                    known_compound_dict[acc] = tabs[3]
        with open(bin_path, "wb") as cache_file:
            cPickle.dump([pubmed_dict, pubchem_dict, known_compound_dict], cache_file, -1)
+#print "Writing visualization SVGs and XHTML"
+logfile.write("Writing visualization SVGs and XHTML\n")
+queryclusterdata = {}  # filled below as: running cluster counter -> [nrhitclusters, hitclusterdata]
+nrhitgeneclusters = {}  # query cluster -> number of the last hit cluster stored for display (0 when none)
+cblastclusternr = 1
+#print os.getcwd()
+if clusterblast == "y":
+for x in geneclusters:
+clusterblastfile = open(clusterblastoutputfolder + "cluster" + str(x) + ".txt","r")  # the per-cluster ClusterBlast report is read back as text
+#print clusterblastfile
+clusterblastfile = clusterblastfile.read()
+clusterblastfile = clusterblastfile.replace("\r","\n")
+toptenhitclusters = []
+#Identify top ten hits for visualization
+hitlines = ((clusterblastfile.split("Significant hits: \n")[1]).split("\nDetails:")[0]).split("\n")  # lines between the "Significant hits" and "Details" headers
+#print '\n\n#######hitlines\n', hitlines
+a = 0
+cb_accessiondict = {}  # hit number -> second space-separated token of the hit line's first column
+b = 1
+for i in hitlines:
+if " " in i:
+cb_accessiondict[b] = (i.split("\t")[0]).split(" ")[1]
+if genomic_accnr == "" or genomic_accnr not in i:  # lines mentioning the query's own accession are not counted
+b += 1
+if a < 10:
+if len(i) < 80:
+toptenhitclusters.append(i)
+elif len(i) >= 80:
+j = i[0:77] + "..."  # long lines are cut to 77 characters plus "..."
+toptenhitclusters.append(j)
+a += 1
+#print clusterblastfile
+details = (clusterblastfile.split("\nDetails:")[1]).split(">>")[1:]  # one text chunk per ">>" record of the Details section
+#print details
+nrhitclusters = len(toptenhitclusters)
+#Save query gene cluster data
+querylines = ((clusterblastfile.split("Table of genes, locations, strands and annotations of query cluster:\n")[1]).split("\n\n\nSignificant hits:")[0]).split("\n")
+queryclustergenes = []
+queryclustergenesdetails = {}
+for i in querylines:
+tabs = i.split("\t")
+queryclustergenes.append(tabs[0])
+queryclustergenesdetails[tabs[0]] = [tabs[1],tabs[2],tabs[3],tabs[4]]
+#For every gene cluster, store hit genes and details
+colorgroupsdict = {}
+hitclusterdata = {}
+hitclusternr = 1
+compound_found = "n"  # set to "y" once a known compound has been written for this query cluster
+nrhitgeneclusters[x] = 0
+for i in details:
+hitclustergenes = []
+hitclustergenesdetails = {}
+#Only calculate for first ten hit gene clusters
+if genomic_accnr == "" or genomic_accnr not in i:
+if hitclusternr <= 10:
+nrhitgeneclusters[x] = hitclusternr
+accession = cb_accessiondict[hitclusternr]
+hitclustergeneslines = ((i.split("Table of genes, locations, strands and annotations of subject cluster:\n")[1]).split("\n\nTable of Blast hits ")[0]).split("\n")
+#print '***********\n', i, '\n'
+#print hitclustergeneslines
+for j in hitclustergeneslines:
+tabs = j.split("\t")
+hitclustergenes.append(tabs[0])
+hitclustergenesdetails[tabs[0]] = [tabs[2],tabs[3],tabs[4],tabs[5],tabs[1]]  # columns 2-5 first, column 1 last
+blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n")  # rows of the Blast hits table, up to the first blank line
+querygeneswithhits = []
+coregeneswithhits = []
+blasthitdict = {}  # query gene -> list of subject genes
+blastdetailsdict = {}  # "<query>_|_|_<subject>" -> [column 5, column 3] of the hit row
+querygenes = []
+revblasthitdict = {}  # subject gene -> list of query genes
+hitgenes = []
+for k in blasthitslines:
+tabs = k.split("\t")
+if tabs[0] not in querygeneswithhits:
+querygeneswithhits.append(tabs[0])
+if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits:
+coregeneswithhits.append(tabs[0])
+if blasthitdict.has_key(tabs[0]):
+hits = blasthitdict[tabs[0]]
+hits.append(tabs[1])
+blasthitdict[tabs[0]] = hits
+if revblasthitdict.has_key(tabs[1]):
+revhits = revblasthitdict[tabs[1]]
+revhits.append(tabs[0])
+revblasthitdict[tabs[1]] = revhits
+else:
+revblasthitdict[tabs[1]] = [tabs[0]]
+blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
+if tabs[0] not in querygenes:
+querygenes.append(tabs[0])
+hitgenes.append(tabs[1])
+else:
+blasthitdict[tabs[0]] = [tabs[1]]
+if revblasthitdict.has_key(tabs[1]):
+revhits = revblasthitdict[tabs[1]]
+revhits.append(tabs[0])
+revblasthitdict[tabs[1]] = revhits
+else:
+revblasthitdict[tabs[1]] = [tabs[0]]
+blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
+if tabs[0] not in querygenes:
+querygenes.append(tabs[0])
+hitgenes.append(tabs[1])
+for k in known_compound_dict.keys():
+if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0:  # needs more than two query genes with hits, at least one of them a core gene
+ws0.write(x,4,known_compound_dict[k])  # ws0 is defined outside this section; the value goes to row x, column 4
+compound_found = "y"
+"""blasthitdict = {}
+blastdetailsdict = {}
+querygenes = []
+revblasthitdict = {}
+hitgenes = []
+for i in blasthitslines:
+tabs = i.split("\t")
+if blasthitdict.has_key(tabs[0]):
+hits = blasthitdict[tabs[0]]
+hits.append(tabs[1])
+blasthitdict[tabs[0]] = hits
+if revblasthitdict.has_key(tabs[1]):
+revhits = revblasthitdict[tabs[1]]
+revhits.append(tabs[0])
+revblasthitdict[tabs[1]] = revhits
+else:
+revblasthitdict[tabs[1]] = [tabs[0]]
+blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
+if tabs[0] not in querygenes:
+querygenes.append(tabs[0])
+hitgenes.append(tabs[1])
+else:
+blasthitdict[tabs[0]] = [tabs[1]]
+if revblasthitdict.has_key(tabs[1]):
+revhits = revblasthitdict[tabs[1]]
+revhits.append(tabs[0])
+revblasthitdict[tabs[1]] = revhits
+else:
+revblasthitdict[tabs[1]] = [tabs[0]]
+blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
+if tabs[0] not in querygenes:
+querygenes.append(tabs[0])
+hitgenes.append(tabs[1])
+"""
+#Make groups of genes for coloring
+colorgroups = []
+internalgroups = internalhomologygroupsdict[x]
+for i in internalgroups:  # NOTE(review): rebinds i, the name the `for i in details` loop above also uses - confirm this is harmless
+querygenes_and_hits = []
+for j in i:
+#Make list of query gene and its hits
+additionalhits = []
+#For each hit, check if it was also hit by another gene; if so, only add it to the group if this hit had the lowest blast score
+otherscores = []
+queryscore = 0
+if blasthitdict.has_key(j):
+for k in blasthitdict[j]:
+for l in blastdetailsdict.keys():
+if k in l and j in l:  # substring tests against the "<query>_|_|_<subject>" key
+queryscore = blastdetailsdict[l][1]
+elif k in l and j not in l:
+otherscores.append(blastdetailsdict[l][1])
+allscores = otherscores + [queryscore]
+if queryscore == max(allscores):  # NOTE(review): the stored scores are strings from split("\t"), so max() presumably compares them as text - confirm
+additionalhits.append(k)
+#Add additional hits to the querygenes_and_hits list that will form a colorgroup
+querygenes_and_hits = querygenes_and_hits + additionalhits
+if j not in querygenes_and_hits:
+querygenes_and_hits.append(j)
+if len(querygenes_and_hits) > 0:
+colorgroups.append(querygenes_and_hits)
+colorgroupsdict[hitclusternr] = colorgroups
+hitclusterdata[hitclusternr] = [colorgroupsdict,hitclustergenes,hitclustergenesdetails,queryclustergenes,queryclustergenesdetails,toptenhitclusters,accession]
+hitclusternr += 1
+elif hitclusternr > 10 and hitclusternr <= 50:  # hit clusters 11-50: only the known-compound lookup is done
+blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n")
+querygeneswithhits = []
+coregeneswithhits = []
+for k in blasthitslines:
+tabs = k.split("\t")
+if tabs[0] not in querygeneswithhits:
+querygeneswithhits.append( tabs[0] )
+if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits:
+coregeneswithhits.append(tabs[0])
+for k in known_compound_dict.keys():
+if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0:
+ws0.write(x,4,known_compound_dict[k])
+compound_found = "y"
+hitclusternr += 1
+queryclusterdata[cblastclusternr] = [nrhitclusters,hitclusterdata]
+cblastclusternr += 1
+wb.save(genomename + "/" + genomename + ".geneclusters.xls")  # wb is defined outside this section
# Gather and store data on each gene cluster.
# smCOG identifiers that are looked up for glycosyltransferases (gtrcoglist)
# and transporters (transportercoglist) further below.
gtrcoglist = ['SMCOG1045','SMCOG1062','SMCOG1102']
transportercoglist = ['SMCOG1000','SMCOG1005','SMCOG1011','SMCOG1020','SMCOG1029','SMCOG1033','SMCOG1035','SMCOG1044','SMCOG1065','SMCOG1067','SMCOG1069','SMCOG1074','SMCOG1085','SMCOG1096','SMCOG1106','SMCOG1118','SMCOG1131','SMCOG1166','SMCOG1169','SMCOG1184','SMCOG1202','SMCOG1205','SMCOG1214','SMCOG1234','SMCOG1243','SMCOG1245','SMCOG1252','SMCOG1254','SMCOG1288']
qgeneclusterdata = {}
if smcogs == "y":
    # Reduce every gene's smCOG hit list to the name of its first hit. A name
    # of the form "<id>:<description>" is split: the id is kept for the gene
    # and the description is stored under that id. Genes without hits are
    # dropped.
    smcogdescriptions = {}
    smcogdict2 = {}
    for gene in smcogdict.keys():
        gene_hits = smcogdict[gene]
        if len(gene_hits) == 0:
            continue
        first_hit = gene_hits[0]
        if len(first_hit) > 0 and ":" in first_hit[0]:
            name_fields = first_hit[0].split(":")
            smcogdict2[gene] = name_fields[0]
            smcogdescriptions[name_fields[0]] = name_fields[1]
        else:
            smcogdict2[gene] = first_hit[0]
    smcogdict = smcogdict2
+for genecluster in geneclusters:
+clustergenes = clusterinfo[genecluster][4]
+clustergenes2 = []
+#for i in clustergenes:
+#  clustergenes2.append(i[4])
+[clustergenes2.append(i[4]) for i in clustergenes]
+clustergenes = clustergenes2
+clusternr = 1
+clustertype = clusterinfo[genecluster][0]
+annotations = {}
+colors = []
+starts = []
+ends = []
+strands = []
+pksnrpsprots = []
+gtrs = []
+transporters = []
+for j in clustergenes:
+annotations[j] = proteins[3][j][3]
+starts.append(int(proteins[3][j][0]))
+ends.append(int(proteins[3][j][1]))
+strands.append(proteins[3][j][2])
+if j in allcoregenes:
+colors.append("#810E15")
+else:
+colors.append("grey")
+if j in pksnrpscoregenes:
+pksnrpsprots.append(j)
+if smcogs == "y":
+if smcogdict.has_key(j) and len(smcogdict[j]) > 0 :
+if smcogdict[j][0] in gtrcoglist:
+gtrs.append(j)
+if smcogdict[j][0] in transportercoglist:
+transporters.append(j)
+clustersize = max(ends) - min(starts)
+if clusterblast == "n":
+nrhitgeneclusters = {}
+for i in geneclusters:
+nrhitgeneclusters[i] = 0
+hitgeneclusters = range(1,(nrhitgeneclusters[genecluster] + 1))
+hitgeneclusterdata = {}
+hitgeneclusterdata[genecluster] = [hitgeneclusters]
+pksnrpsprotsnames = nrpspkstypedict
+pksnrpsdomains = {}
+domlist = []
+domsdetails = {}
+substrspecnrpspredictordict = {}
+substrspecminowadict = {}
+substrspecpkssigdict = {}
+substrspecconsensusdict = {}
+krpredictionsdict = {}
+for i in pksnrpsprots:
+domlist = []
+domsdetails = {}
+doms = domaindict[i]
+for j in doms:
+nr = 1
+while j[0] + str(nr) in domlist:
+nr += 1
+domname = j[0] + str(nr)
+domlist.append(domname)
+domsdetails[domname] = [j[1],j[2]]
+if "AMP-binding" in domname or "A-OX" in domname:
+domname2 = i + "_" + "A" + str(nr)
+substrspecminowadict[domname2] = minowa_nrps_preds[i + "_A" + str(nr)]
+substrspecnrpspredictordict[domname2] = [nrps_code_preds[i + "_A" + str(nr)],nrps_svm_preds[i + "_A" + str(nr)]]
+substrspecconsensusdict[domname2] = consensuspreds[i + "_A" + str(nr)]
+if "PKS_AT" in domname:
+domname2 = i + "_" + "AT" + str(nr)
+substrspecminowadict[domname2] = minowa_pks_preds[i + "_AT" + str(nr)]
+substrspecpkssigdict[domname2] = pks_code_preds[i + "_AT" + str(nr)]
+substrspecconsensusdict[domname2] = consensuspreds[i + "_AT" + str(nr)]
+if "CAL_domain" in domname:
+domname2 = i + "_" + "CAL" + str(nr)
+substrspecminowadict[domname2] = minowa_cal_preds[i + "_CAL" + str(nr)]
+substrspecconsensusdict[domname2] = consensuspreds[i + "_CAL" + str(nr)]
+if "CAL_domain" in domname:
+domname2 = i + "_" + "CAL" + str(nr)
+substrspecminowadict[domname2] = minowa_cal_preds[i + "_CAL" + str(nr)]
+substrspecconsensusdict[domname2] = consensuspreds[i + "_CAL" + str(nr)]
+if "PKS_KR" in domname:
+domname2 = i + "_" + "KR" + str(nr)
+krpredictionsdict[domname2] = [kr_activity_preds[i + "_KR" + str(nr)],kr_stereo_preds[i + "_KR" + str(nr)]]
+pksnrpsdomains[i] = [domlist,domsdetails]
+if compound_pred_dict.has_key(genecluster):
+structpred = compound_pred_dict[genecluster]
+else:
+structpred = "N/A"
+qgeneclusterdata[genecluster] = [clustertype,clustersize,clustergenes,annotations,starts,ends,strands,pksnrpsprots,pksnrpsprotsnames,pksnrpsdomains,substrspecnrpspredictordict,substrspecminowadict,substrspecpkssigdict,substrspecconsensusdict,gtrs,transporters,colors,hitgeneclusterdata,structpred,krpredictionsdict]
+#Create genecluster svg for each gene cluster: unpack the stored record, pass starts/ends/clustersize through relativepositions() and write svgfolder + "genecluster<nr>.svg"
+geneposdict = {}  # gene -> [start, end]; filled across all clusters and read again when the domain pop-ups are written
+for qclusternr in geneclusters:
+data = qgeneclusterdata[qclusternr]
+#Some of the below 23 lines may already be internal to script, scan to remove unnecessary data fetching
+clustertype = data[0]
+clustersize = data[1]
+genes = data[2]
+annotations = data[3]
+starts = data[4]
+ends = data[5]
+strands = data[6]
+pksnrpsprots = data[7]
+pksnrpsprotsnames = data[8]
+pksnrpsdomains = data[9]
+substrspecnrpspredictordict = data[10]
+substrspecminowadict = data[11]
+substrspecpkssigdict = data[12]
+substrspecconsensusdict = data[13]
+gtrs = data[14]
+transporters = data[15]
+colors = data[16]
+hitgeneclusterdata = data[17]
+structpred = data[18]
+krpredictionsdict = data[19]
+relpositions = relativepositions(starts,ends,clustersize)  # element 0 is used as relative starts, element 1 as relative ends
+rel_starts = relpositions[0]
+rel_ends = relpositions[1]
+y = 0  # running index into starts/ends, advanced once per gene
+for i in genes:
+geneposdict[i] = [starts[y],ends[y]]
+y += 1
+s = geneclustersvg(genes,rel_starts,rel_ends,strands,geneposdict,pksnrpsprots,pksnrpsdomains,qclusternr)
+outfile = open(svgfolder + "genecluster" + str(qclusternr) + ".svg","w")
+outfile.write(s.getXML())
+outfile.close()
+#Create ClusterBlast svg: for every query cluster write one SVG per hit cluster plus a combined "_all" SVG, and keep the position data returned by each clusterblastresults() call
+if clusterblast == "y":
+clusterblastpositiondata = {}  # "<query>_<hit>" or "<query>_all" -> second element of the clusterblastresults() result
+#Create alignment svg for each pair of hit&query
+for i in geneclusters:
+hitclusters = range(queryclusterdata[i][0] + 1)[1:]  # 1..n, n being the stored hit cluster count (range returns a list under Python 2)
+#Create svgs for pairwise gene cluster alignment
+colorschemedict,rgbcolorscheme = calculate_colorgroups(i,hitclusters,queryclusterdata,internalhomologygroupsdict)
+for k in hitclusters:
+cresults = clusterblastresults(i,[k],queryclusterdata,colorschemedict,rgbcolorscheme)  # element 0 is the SVG object, element 1 the position data
+s = cresults[0]
+clusterblastpositiondata[str(i) + "_"+str(k)] = cresults[1]
+outfile = open(svgfolder + "clusterblast" + str(i) + "_" + str(k) + ".svg","w")
+outfile.write(s.getXML())
+outfile.close()
+#Create svgs for multiple gene cluster alignment
+cresults = clusterblastresults(i,hitclusters,queryclusterdata,colorschemedict,rgbcolorscheme)  # same call with all hit clusters at once
+s = cresults[0]
+clusterblastpositiondata[str(i) + "_all"] = cresults[1]
+outfile = open(svgfolder + "clusterblast" + str(i) + "_all.svg","w")
+outfile.write(s.getXML())
+outfile.close()
+#Load the SEARCHGTR search form template and split it around the FASTASEQUENCE placeholder (no folder is created in this section)
+formtemplate = open("search_form.html","r")
+formtemplate = formtemplate.read()  # NOTE(review): rebinding the name drops the file object without an explicit close()
+formtemplate = formtemplate.replace("\r","\n")
+formtemplateparts = formtemplate.split("FASTASEQUENCE")
+#Create HTML file with gene cluster info in hidden div tags: empty.xhtml is cut at every "<SPLIT HERE>" marker and the pieces are written to display.xhtml with generated content in between
+htmlfile = open("empty.xhtml","r")
+html = htmlfile.read()  # NOTE(review): htmlfile is not closed anywhere in this section
+html = html.replace("\r","\n")
+htmlparts = html.split("<SPLIT HERE>")
+htmloutfile = open(genomename + "/display.xhtml","w")
+htmloutfile.write(htmlparts[0])
+#Add lines to load all svgs up front (one loadsvg call per cluster, one loadcblastsvg call per query/hit pair)
+for qclusternr in geneclusters:
+htmloutfile.write('  loadsvg(' + str(qclusternr) + ');\n')
+if clusterblast == "y":
+cblastclusters = [1,2,3,4,5,6,7,8,9,10]  # not referenced again in this section
+for qclusternr in geneclusters:
+nrhitclusters = queryclusterdata[qclusternr][0]
+for j in range(nrhitclusters):
+htmloutfile.write('  loadcblastsvg(' + str(qclusternr) + ',' + str(j+1) + ');\n')  # hit clusters are numbered from 1
+#For each gene cluster, add hidden div tags for gene names, add hidden div tags for NRPS/PKS domains, add hidden div tags for ClusterBLAST depictions
+htmloutfile.write(htmlparts[1])
+for qclusternr in geneclusters:
+data = qgeneclusterdata[qclusternr]
+pksnrpsprots = data[7]
+pksnrpsprotsnames = data[8]
+pksnrpsdomains = data[9]
+a = 0  # domain counter per cluster; produces the same b<cluster>_00<a>_div ids that the domain pop-ups get further down
+for i in pksnrpsprots:
+for j in pksnrpsdomains[i][0]:
+htmloutfile.write('  $("#b' + str(qclusternr) + '_00' + str(a) + '_div").hide();\n')
+a += 1
+htmloutfile.write(htmlparts[2])
+#Add top menu: one image link per gene cluster; gifdict maps a cluster type to the number handed to over2()/out2() in the mouse handlers
+gifdict = {"t1pks":"16","t2pks":"17","t3pks":"18","t4pks":"20","nrps":"10","amglyccycl":"1","bcin":"2","blactam":"3","butyrolactone":"4","ectoine":"5","terpene":"19","indole":"7","lant":"8","melanin":"9","nucleoside":"12","other":"13","phosphoglycolipid":"14","siderophore":"15"}
+htmloutfile.write('<img border="0" align="top" src="images/empty.png" name="img0_" />\n')
+menubutton_nr = 1  # 1-based count of buttons written so far
+nrclustercolumns = 1  # incremented at each <br/> below; later code adds nrclustercolumns * 28 to absolute top offsets
+for i in geneclusters:
+if qgeneclusterdata[i][0] in gifdict.keys():
+typenr = gifdict[qgeneclusterdata[i][0]]
+elif "-" in qgeneclusterdata[i][0]:  # types containing "-" (presumably hybrid clusters - confirm) share number 6
+typenr = "6"
+else:  # unknown types use the same number as the "other" entry
+typenr = "13"
+htmloutfile.write('<a href="javascript:displaycluster(' + str(i) + ')"><img align="top" border="0" src="images/img' + str(i) + '_1.png" name="img' + str(i) + '_" onmouseover="over(' + str(i) + '),over2(0,' + typenr + ')" onmouseout="out(' + str(i) + '),out2(0,' + typenr + ')"/></a>\n')
+if menubutton_nr == 22 or menubutton_nr == 49:  # line break after the 22nd and after the 49th button
+htmloutfile.write('<br/>')
+nrclustercolumns += 1
+menubutton_nr += 1
+#Add gene cluster description: start of the main per-cluster loop, which writes one <div id="genecluster<nr>"> page section for every entry of geneclusters
+htmloutfile.write(htmlparts[3])
+extrapixelsdict = {}  # cluster nr -> extra vertical pixels, filled inside the loop below
+for qclusternr in geneclusters:
+data = qgeneclusterdata[qclusternr]  # same 20-item layout that was stored in qgeneclusterdata
+clustertype = data[0]
+clustersize = data[1]
+genes = data[2]
+annotations = data[3]
+starts = data[4]
+ends = data[5]
+strands = data[6]
+pksnrpsprots = data[7]
+pksnrpsprotsnames = data[8]
+pksnrpsdomains = data[9]
+substrspecnrpspredictordict = data[10]
+substrspecminowadict = data[11]
+substrspecpkssigdict = data[12]
+substrspecconsensusdict = data[13]
+gtrs = data[14]
+transporters = data[15]
+colors = data[16]
+hitgeneclusterdata = data[17]
+structpred = data[18]
+krpredictionsdict = data[19]
+relpositions = relativepositions(starts,ends,clustersize)  # element 0 is used as relative starts, element 1 as relative ends
+rel_starts = relpositions[0]
+rel_ends = relpositions[1]
+#Create genes overview pop-up HTMLs: htmlfolder + "geneclustergenes<nr>.html" holding a table of all genes and a table of detected signature domains for the core genes
+genepopupoutfile = open(htmlfolder + "geneclustergenes" + str(qclusternr) + '.html',"w")
+genepopupoutfile.write('<html>\n<head>\n<LINK href="style.css" rel="stylesheet" type="text/css">\n</head>\n<body>\nOverview of gene cluster genes:<br><br><table border=1>\n')
+genepopupoutfile.write('<tr><td><b>Gene</b></td><td><b>Annotation</b></td><td><b>Start position</b></td><td><b>End position</b></td><td><b>Strand</b></td></tr>\n')
+for i in genes:  # genes.index(i) returns the first match, so the rows rely on gene names being unique within the cluster
+genepopupoutfile.write('<tr><td>' + i + '</td><td>' + annotations[i].replace("_"," ") + '</td><td>' + str(starts[genes.index(i)]) + '</td><td>' + str(ends[genes.index(i)]) + '</td><td>' + strands[genes.index(i)] +  '</td></tr>\n')
+genepopupoutfile.write('\n</table><br><br><br>Biosynthetic gene cluster signature gene domains detected: <br><br>\n')
+genepopupoutfile.write('<table border=1><tr><td><b>Gene</b></td><td><b>Detected domains</b></td><td><b>Bit scores</b></td>\n')  # NOTE(review): this row and the domain rows below are written without a closing </tr>
+for i in genes:
+if i in allcoregenes:  # only core genes are looked up in detecteddomainsdict
+detected_doms = detecteddomainsdict[i]
+for j in detected_doms:  # j[0] goes into the domain column, j[1] into the bit score column
+genepopupoutfile.write('<tr><td>' + i + '</td><td>' + str(j[0]) + '</td><td>' + str(j[1]) + '</td>\n')
+genepopupoutfile.write('\n</table><br><br><br>')
+genepopupoutfile.write('\n</body>\n</html>\n')
+genepopupoutfile.close()
+#Add gene cluster description on top: only cluster 1 is written visible, every other cluster div starts with display:none
+if qclusternr == 1:
+htmloutfile.write('<div id="genecluster'+ str(qclusternr) + '">')
+else:
+htmloutfile.write('\n\n<div id="genecluster'+ str(qclusternr) + '" style="display:none">')
+#Add menu bars 1 & 2 (every top offset is shifted by 28px per value of nrclustercolumns)
+htmloutfile.write('<div id="bartext1" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(113 + nrclustercolumns * 28) + 'px; left:30px;"><b>Gene cluster description</b></div>')
+htmloutfile.write('<div id="bartext2" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(263 + nrclustercolumns * 28) + 'px; left:30px;"><b>PKS/NRPS domain annotation</b></div>')
+htmloutfile.write('<div id="descrbar1" style="position:absolute; z-index:1; top:' + str(110 + nrclustercolumns * 28) + 'px;"><img src="images/bar.png" height="25" width="' + str(int(0.75 * screenwidth)) + '"/></div>\n')
+htmloutfile.write('<div class="help" id="help1" style="position:absolute; z-index:1; top:' + str(112 + nrclustercolumns * 28) + 'px; left:' + str(int(screenwidth * 0.75) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#panel1" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
+htmloutfile.write('<div id="descrbar2" style="position:absolute; z-index:1; top:' + str(260 + nrclustercolumns * 28) + 'px;"><img src="images/bar.png" height="25" width="' + str(int(0.75 * screenwidth)) + '"/></div>\n')
+htmloutfile.write('<div class="help" id="help2" style="position:absolute; z-index:1; top:' + str(262 + nrclustercolumns * 28) + 'px; left:' + str(int(screenwidth * 0.75) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#panel2" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
+if screenwidth < 1280:  # narrower screens get a smaller font (0.7em) and a 5px lower text position
+htmloutfile.write('<div class="clusterdescr" style="font-size:0.7em; position:absolute; top:' + str(125 + nrclustercolumns * 28) + 'px; left:' + str(12) + 'px;">\n')
+else:
+htmloutfile.write('<div class="clusterdescr" style="font-size:0.8em; position:absolute; top:' + str(120 + nrclustercolumns * 28) + 'px; left:' + str(12) + 'px;">\n')
+htmloutfile.write("<br/>Gene Cluster " + str(qclusternr) + ". Type = " + clustertype + ". Location: "+ str(starts[0]) + " - " + str(ends[-1]) + " nt. Click on genes for more information.")  # location runs from the first list entry's start to the last list entry's end
+if len(genomic_accnr) > 4:  # the NCBI link is only written for accession strings longer than 4 characters
+htmloutfile.write('&nbsp;&nbsp;<a href="http://www.ncbi.nlm.nih.gov/nuccore/' + genomic_accnr + '" target="_blank">GBK</a>')
+#Genes overview pop-up.
+if len(clustertype) > 20:  # long type names push the overview link onto its own line
+htmloutfile.write('<br/>')
+htmloutfile.write('&nbsp;&nbsp;&nbsp;&nbsp;<a href="html/geneclustergenes' + str(qclusternr) + '.html" onclick=\'window.open("html/geneclustergenes' + str(qclusternr) + '.html","popup","width=800,height=800,scrollbars=yes,resizable=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Genes and detection info overview</a>')
+htmloutfile.write("</div>\n\n")
+htmloutfile.write('<div id="display' + str(qclusternr) + '">\n')
+if nrclustercolumns > 1:  # one spacer image for each increment of nrclustercolumns beyond the first
+spacers = nrclustercolumns - 1
+for i in range(spacers):
+htmloutfile.write('<img src="images/spacer.png"/>\n')
+htmloutfile.write('</div>\n')
+#Add gene pop-ups: per gene a hidden a<cluster>_00<n>_div pop-up (annotation, smCOG, location, external links) and a matching a<cluster>_00<n>_divtext gene name label
+a = 0  # gene index within this cluster; part of the div ids and index into starts/ends/rel_starts/rel_ends
+for i in genes:
+htmloutfile.write('<div id="a' + str(qclusternr) + '_00' + str(a) + '_div" class="hidden popup" style="position:absolute; z-index:2; top:' + str(185 + nrclustercolumns * 28) + 'px; left:' + str(int(((rel_starts[a] + rel_ends[a])/2)*0.875)) + 'px;">\n')
+htmloutfile.write(annotations[i].replace("_"," ").replace("&","&amp;") + "\n")
+if smcogs == "y":
+if smcogdict.has_key(i):
+smcog = smcogdict[i]  # NOTE(review): smcogdescriptions is only filled for entries that contained ":" - a value stored by the other branch would raise KeyError on the next line; confirm
+htmloutfile.write("<br/>smCOG: " + smcog + " (" + smcogdescriptions[smcog].replace("_"," ").replace("&","&amp;") + ")\n")
+if smcog in gtrcoglist:  # write searchgtrfolder + "<gene>.html": template part 0, gene name and sequence, template part 1
+formfileloc = searchgtrfolder + i + ".html"
+formfile = open(formfileloc,"w")
+specificformtemplate = formtemplateparts[0].replace("GlycTr",i)
+formfile.write(specificformtemplate)
+formfile.write(i + "\n" + seqdict[i])
+formfile.write(formtemplateparts[1])
+formfile.close()
+htmloutfile.write("<br/><a href=\"searchgtr/" + i + ".html\" target=\"_blank\"> Run SEARCHGTr on this gene </a>\n")
+if smcog in transportercoglist:  # the sequence is appended to the query URL as-is
+link = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;program=blastp;sequence=sequence%0A" + seqdict[i]
+htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> TransportDB BLAST on this gene </a>\n")
+else:
+htmloutfile.write("<br/>smCOG: -\n")
+link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp&amp;QUERY=" + seqdict[i] + "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch"
+htmloutfile.write("<br/>Location: " + str(starts[a]) + "-" + str(ends[a]) + "\n")
+htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this gene </a><br/>\n")
+browse_start = starts[a] - 10000  # genomic context window: 10000 positions either side, clamped to [0, dnaseqlength] below
+browse_end = ends[a] + 10000
+if browse_start < 0:
+browse_start = 0
+if browse_end > dnaseqlength:
+browse_end = dnaseqlength
+if genomic_accnr != "none" and genomic_accnr != "":
+htmloutfile.write('<a href="http://www.ncbi.nlm.nih.gov/projects/sviewer/?Db=gene&amp;DbFrom=protein&amp;Cmd=Link&amp;noslider=1&amp;id=' + genomic_accnr + '&amp;from=' + str(browse_start) + '&amp;to=' + str(browse_end) + '" target=\"_blank\">View genomic context</a><br/>\n')
+if smcogs == "y":
+if smcogtreedict.has_key(i.rpartition(".")[0]):  # try the gene name without its last "."-separated suffix first, then the full name
+htmloutfile.write('<a href="smcogs/' + smcogtreedict[i.rpartition(".")[0]] + '" onclick=\'window.open("smcogs/' + smcogtreedict[i.rpartition(".")[0]] + '","popup","width=1280,height=1500,resizable=yes,scrollbars=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>View smCOG seed phylogenetic tree with this gene</a>\n')
+elif smcogtreedict.has_key(i):
+htmloutfile.write('<a href="smcogs/' + smcogtreedict[i] + '" onclick=\'window.open("smcogs/' + smcogtreedict[i] + '","popup","width=1280,height=1500,resizable=yes,scrollbars=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>View smCOG seed phylogenetic tree with this gene</a>\n')
+htmloutfile.write("</div>\n\n")
+htmloutfile.write('<div id="a' + str(qclusternr) + '_00' + str(a) + '_divtext" class="hidden genenames" style="position:absolute; top:' + str(162 + nrclustercolumns * 28) + 'px; left:' + str(float((rel_starts[a]+rel_ends[a])/2)*0.9375) + 'px;">\n')
+htmloutfile.write(i)
+htmloutfile.write("</div>\n\n")
+a += 1
+#Early calculation of nr of domains to be able to fit structure prediction information of large NRPSs/PKSs; the resulting extrapixels value is stored per cluster and added to the bar heights written next
+pksnrpsdomainnr = 0  # counts AT and CAL matches together
+krdomainnr = 0
+adomainnr = 0
+for i in pksnrpsprots:
+doms = pksnrpsdomains[i][0]
+first = "no"  # first and z are not read in this pass; the same loop further down uses them while writing predictions
+nra = 0
+nrat = 0
+nrkr = 0
+nrcal = 0
+for j in doms:
+if "AMP-binding" in j or "A-OX" in j:  # j is rebound to the short label, so the tests that follow see the new value
+j = "A"
+nra += 1
+adomainnr += 1
+z = nra
+if "KR" in j:
+j = "KR"
+nrkr += 1
+krdomainnr += 1
+z = nrkr
+if "AT" in j and "docking" not in j:
+j = "AT"
+nrat += 1
+pksnrpsdomainnr += 1
+z = nrat
+if "CAL" in j:
+j = "CAL"
+nrcal += 1
+pksnrpsdomainnr += 1
+z = nrcal
+pixels = adomainnr * 50  + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16) + 375  # same per-domain weights as the output-files box offsets further down
+extrapixels = pixels - (676 + len(pksnrpsprots) * 99)
+if extrapixels < 0:  # never shrink the layout, only extend it
+extrapixels = 0
+extrapixelsdict[qclusternr] = extrapixels
+#Add picture of predicted chemical structure: side panel with separator bars, the structure image, monomer and per-domain predictions, and the box of downloadable output files
+htmloutfile.write('<div id="verticalbar1" style="position:absolute; left:' + str(int(screenwidth * 0.75) + 12) + 'px; top:' + str(106 + nrclustercolumns * 28) + 'px;"><img src="images/linefill.png" height="' + str(1126 + len(pksnrpsprots) * 99 + extrapixels) + '" width="2"/></div>\n')
+htmloutfile.write('<div id="verticalbar2" style="position:absolute; left:' + str(int(screenwidth * 0.98)) + 'px; top:0px;"><img src="images/linefill.png" height="' + str(1288 + len(pksnrpsprots) * 99 + nrclustercolumns * 28 + extrapixels) + '" width="2"/></div>\n')
+htmloutfile.write('<div id="horizbar1" style="position:absolute; left:0px; top:' + str(92 + nrclustercolumns * 28) + 'px;"><img src="images/linefill.png" height="2" width="' + str(screenwidth * 0.98) + '"/></div>\n')  # NOTE(review): the three horizbar widths are written without int(), unlike the other widths in this section
+htmloutfile.write('<div id="horizbar2" style="position:absolute; left:0px; top:82px;"><img src="images/linefill.png" height="2" width="' + str(screenwidth * 0.98) + '"/></div>\n')
+htmloutfile.write('<div id="horizbar3" style="position:absolute; left:0px; top:' + str(1223 + len(pksnrpsprots) * 99 + nrclustercolumns * 28 + extrapixels) + 'px;"><img src="images/linefill.png" height="2" width="' + str(screenwidth * 0.98) + '"/></div>\n')
+if screenwidth < 1280:  # smaller title font and 1px lower position on narrow screens
+htmloutfile.write('<div id="bartext4" style="color:#FFFFFF; font-size:0.8em; position:absolute; z-index:2; top:' + str(114 + nrclustercolumns * 28) + 'px; left:' + str(int(screenwidth * 0.75) + 30) + 'px;"><b>Predicted core structure</b></div>\n')
+else:
+htmloutfile.write('<div id="bartext4" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(113 + nrclustercolumns * 28) + 'px; left:' + str(int(screenwidth * 0.75) + 30) + 'px;"><b>Predicted core structure</b></div>\n')
+htmloutfile.write('<div class="title" style="position:absolute; top:' + str(110 + nrclustercolumns * 28) + 'px; left:' + str(screenwidth * 0.75 + 20) + 'px;">\n')
+htmloutfile.write('<div id="descrbar4" style="right:25px; position:absolute; z-index:1; top:0px; left:0px;"><img src="images/bar.png" height="25" width="' + str(int(0.21 * screenwidth)) + '"/></div>\n')
+htmloutfile.write('<div class="help" id="help4" style="position:absolute; z-index:1; top:2px; left:' + str(int(screenwidth * 0.2) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#sidepanel1" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
+if qclusternr in failedstructures:  # placeholder image when the cluster is listed in failedstructures
+htmloutfile.write('<br/><br/><img src="images/nostructure_icon.png" border="1" width="' + str(int(screenwidth * 0.19)) + '" height="200" />\n')
+elif " " in structpred:  # a structure picture is only linked when the prediction string contains a space
+htmloutfile.write('<br/><br/><a href="structures/genecluster' + str(qclusternr) + '.png" onclick=\'window.open("structures/genecluster' + str(qclusternr) + '.png","popup","width=600,height=300,scrollbars=yes,resizable=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'><img src="structures/genecluster' + str(qclusternr) + '_icon.png" border="1" width="' + str(int(screenwidth * 0.19)) + '" height="200" /></a>\n')
+else:  # same placeholder as the failedstructures case
+htmloutfile.write('<br/><br/><img src="images/nostructure_icon.png" border="1" width="' + str(int(screenwidth * 0.19)) + '" height="200" />\n')
+htmloutfile.write('<div class="clusterdescr" style="font-size:0.8em;">\n')
+htmloutfile.write("Monomers prediction: " + structpred + "<br/>\n")
+if qclusternr in dockingdomainanalysis:
+htmloutfile.write('<a href="html/docking_analysis' + str(qclusternr) + '.html" onclick=\'window.open("html/docking_analysis' + str(qclusternr) + '.html","popup","width=600,height=1200,scrollbars=yes,resizable=yes,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Docking domain analysis results.</a><br/>\n')
+nrpsfound = "no"  # set to "yes" once any domain has an NRPSPredictor entry; controls the Norine link below
+pksnrpsdomainnr = 0  # the counters are reset and recounted exactly as in the early pass above
+adomainnr = 0
+krdomainnr = 0
+for i in pksnrpsprots:  # second pass over the domains: same counting as before, now also writing the prediction lines
+doms = pksnrpsdomains[i][0]
+first = "no"  # becomes "yes" once the protein name has been printed
+nra = 0
+nrat = 0
+nrkr = 0
+nrcal = 0
+for j in doms:
+if "AMP-binding" in j or "A-OX" in j:
+j = "A"
+nra += 1
+adomainnr += 1
+z = nra
+if "KR" in j:
+j = "KR"
+nrkr += 1
+krdomainnr += 1
+z = nrkr
+if "AT" in j and "docking" not in j:
+j = "AT"
+nrat += 1
+pksnrpsdomainnr += 1
+z = nrat
+if "CAL" in j:
+j = "CAL"
+nrcal += 1
+pksnrpsdomainnr += 1
+z = nrcal
+prediction = "no"
+domname = str(i) + "_" + str(j) + str(z)  # NOTE(review): for a domain matching none of the tests above, z still holds whatever value it had before
+if domname in substrspecnrpspredictordict.keys():
+nrpsfound = "yes"
+prediction = "yes"
+if substrspecnrpspredictordict[domname][0] == "nrp":  # a prediction of "nrp" is displayed as "?"
+if first == "no":  # print the protein name once, before its first prediction line (pattern repeated below)
+first = "yes"
+htmloutfile.write(i + ':<br/>')
+htmloutfile.write('<font size="1">&nbsp;&nbsp;NRPSPredictor code prediction, '+ str(j) + str(z) + ': ?</font><br/>\n')
+else:
+if first == "no":
+first = "yes"
+htmloutfile.write(i + ':<br/>')
+htmloutfile.write('<font size="1">&nbsp;&nbsp;NRPSPredictor code prediction, '+ str(j) + str(z) + ': ' + substrspecnrpspredictordict[domname][0] + '</font><br/>\n')
+if substrspecnrpspredictordict[domname][1] == "nrp":
+if first == "no":
+first = "yes"
+htmloutfile.write(i + ':<br/>')
+htmloutfile.write('<font size="1">&nbsp;&nbsp;NRPSPredictor SVM prediction, '+ str(j) + str(z) + ': ?</font><br/>\n')
+else:
+if first == "no":
+first = "yes"
+htmloutfile.write(i + ':<br/>')
+htmloutfile.write('<font size="1">&nbsp;&nbsp;NRPSPredictor SVM prediction, '+ str(j) + str(z) + ': ' + substrspecnrpspredictordict[domname][1] + '</font><br/>\n')
+if domname in substrspecminowadict.keys():
+prediction = "yes"
+if substrspecminowadict[domname] == "nrp" or substrspecminowadict[domname] == "pk":  # "nrp" and "pk" are both displayed as "?"
+if first == "no":
+first = "yes"
+htmloutfile.write(i + ':<br/>')
+htmloutfile.write('<font size="1">&nbsp;&nbsp;Minowa prediction, '+ str(j) + str(z) + ': ?</font><br/>\n')
+else:
+if first == "no":
+first = "yes"
+htmloutfile.write(i + ':<br/>')
+htmloutfile.write('<font size="1">&nbsp;&nbsp;Minowa prediction, '+ str(j) + str(z) + ': ' + substrspecminowadict[domname] + '</font><br/>\n')
+if domname in substrspecpkssigdict.keys():
+prediction = "yes"
+if substrspecpkssigdict[domname] == "pk":  # "pk" is displayed as "?"
+if first == "no":
+first = "yes"
+htmloutfile.write(i + ':<br/>')
+htmloutfile.write('<font size="1">&nbsp;&nbsp;PKS code prediction, '+ str(j) + str(z) + ': ?</font><br/>\n')
+else:
+if first == "no":
+first = "yes"
+htmloutfile.write(i + ':<br/>')
+htmloutfile.write('<font size="1">&nbsp;&nbsp;PKS code prediction, '+ str(j) + str(z) + ': ' + substrspecpkssigdict[domname] + '</font><br/>\n')
+if domname in krpredictionsdict.keys():  # KR entries do not set prediction, so on their own they add no details link
+if first == "no":
+first = "yes"
+htmloutfile.write(i + ':<br/>')
+htmloutfile.write('<font size="1">&nbsp;&nbsp;KR activity, '+ str(j) + str(z) + ': ' + krpredictionsdict[domname][0] + "</font><br/>\n")
+htmloutfile.write('<font size="1">&nbsp;&nbsp;KR stereochemistry, '+ str(j) + str(z) + ': ' + krpredictionsdict[domname][1] + "</font><br/>\n")
+#Add link to prediction details pop-up (substrspecs/<domname>.html), only for domains that had a substrate prediction entry
+if prediction == "yes":
+htmloutfile.write('<font size="1">&nbsp;&nbsp;&nbsp;&nbsp;<a href="substrspecs/' + domname + '.html" onclick=\'window.open("substrspecs/' + domname + '.html","popup","width=500,height=400,scrollbars=yes,resizable=no,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Prediction details</a></font><br/>\n')
+if nrpsfound == "yes":
+htmloutfile.write('<br/><a href="http://bioinfo.lifl.fr/norine/form2.jsp" target="_blank">Perform Norine peptide search</a>')
+htmloutfile.write('</div>')
+if screenwidth < 1280:  # narrow screens get the shorter title "File outputs"; the vertical offset grows with the domain counts from the loop above
+htmloutfile.write('<div id="bartext5" style="color:#FFFFFF; font-size:0.8em; position:absolute; z-index:2; top:' + str(624 + adomainnr * 50 + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16)) + 'px; left:10px;"><b>File outputs</b></div>\n')
+else:
+htmloutfile.write('<div id="bartext5" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:' + str(623 + adomainnr * 50  + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16)) + 'px; left:10px;"><b>Downloadable output files</b></div>\n')
+htmloutfile.write('<div id="descrbar5" style="right:25px; position:absolute; z-index:1; top:' + str(620 + adomainnr * 50  + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16)) + 'px; left:0px;"><img src="images/bar.png" height="25" width="' + str(int(0.21 * screenwidth)) + '"/></div>\n')
+htmloutfile.write('<div class="help" id="help5" style="position:absolute; z-index:1; top:' + str(622 + adomainnr * 50  + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16)) + 'px; left:' + str(int(screenwidth * 0.2) - 20) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#sidepanel2" target="_blank"><img border="0" src="images/help.png"/></a></div>\n')
+htmloutfile.write('<div class="text" id="outputinfo" style="font-size:0.8em; right:25px; position:absolute; z-index:1; top:' + str(655 + adomainnr * 50  + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16)) + 'px; left:0px;">')
+if fullhmm == "y" or fullblast == "y":  # these links use oldgenomename, the XLS link below uses genomename
+htmloutfile.write('<a href="' + oldgenomename + '.final.embl" target="_blank">Open EMBL summary file</a><br/><br/>')
+#htmloutfile.write('<a href="' + genomename + '.final.csv" target="_blank">Download CSV summary file</a><br/><br/>')
+if fullhmm == "y":
+htmloutfile.write('<a href="' + oldgenomename + '.cluster_prediction.png" onclick=\'window.open("' + oldgenomename + '.cluster_prediction.png","popup","width=1024,height=1400,scrollbars=0,resizable=0,toolbar=0,directories=0,location=0,menubar=0,status=0,left=0,top=0"); return false\'>Sec. met. enriched genome regions</a><br/><br/>')
+htmloutfile.write('<a href="' + genomename + '.geneclusters.xls" target="_blank">Open XLS overview table</a><br/><br/>')
+htmloutfile.write('</div>')
+htmloutfile.write("</div>\n\n")
+#Add descriptions of NRPS/PKS genes: one absolutely positioned "<protein> (<type in lower case>)" label per protein, 84px apart
+htmloutfile.write('<div class="title" style="position:absolute; top:' + str(180) + 'px; left:' + str(12) + 'px;">\n')  # NOTE(review): this title div is closed on the next line without any content
+htmloutfile.write("</div>\n\n")
+z = 1  # 1-based row counter used for the vertical offset
+for i in pksnrpsprots:
+htmloutfile.write('<div class="text" style="position:absolute; top:' + str(228 + 84 * z + nrclustercolumns * 28) + 'px; left:' + str(12) + 'px;">\n')
+htmloutfile.write(i + " (" + pksnrpsprotsnames[i].lower() + ")")
+htmloutfile.write("</div>\n\n")
+z += 1
+#Add NRPS/PKS domain pop-ups: per domain a hidden b<cluster>_00<n>_div with its location, any stored predictions and a BlastP link; horizontal position is scaled with aa2pixelratio
+longestprot = 0
+protlengthdict = {}
+for i in pksnrpsprots:
+protlength = (geneposdict[i][1] - geneposdict[i][0]) / 3  # gene span divided by 3; the positions were stored via int(), so this is integer division under Python 2
+protlengthdict[i] = protlength
+if protlength > longestprot:
+longestprot = protlength
+try:
+aa2pixelratio = longestprot * 0.75 / screenwidth
+except:  # NOTE(review): bare except; presumably guards against screenwidth being 0 - confirm
+aa2pixelratio = 0.1
+a = 0  # domain counter for this cluster; gives the same ids as the hide() calls written into the page script earlier
+z = 1  # 1-based protein row, 84px per row
+for i in pksnrpsprots:
+domainsdict = pksnrpsdomains[i][1]
+nra = 0
+nrat = 0
+nrkr = 0
+nrcal = 0
+for j in pksnrpsdomains[i][0]:
+startpos = domainsdict[j][0]
+endpos = domainsdict[j][1]
+htmloutfile.write('<div id="b' + str(qclusternr) + '_00' + str(a) + '_div" class="hidden popup" style="position:absolute; z-index:2; top:' + str(277 + 84 * z + nrclustercolumns * 28) + 'px; left:' + str( ( ( (endpos+startpos) / 2) / aa2pixelratio) * 0.9375 ) + 'px;">\n')
+htmloutfile.write("Domain " + j + " (" + i + ")")
+link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp&amp;QUERY=" + seqdict[i][startpos:endpos] + "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch"  # only the domain's slice of the protein sequence is sent
+htmloutfile.write("<br/>Location: " + str(startpos) + "-" + str(endpos) + " AA\n")
+domid = i + "_" + j  # overwritten below before it is read
+if "AMP-binding" in j or "A-OX" in j:  # map the full domain name to the short label and per-type number used as prediction key
+j = "A"
+nra += 1
+y = nra
+if "PKS_KR" in j:
+j = "KR"
+nrkr += 1
+y = nrkr
+if "PKS_AT" in j:
+j = "AT"
+nrat += 1
+y = nrat
+if "CAL_domain" in j:
+j = "CAL"
+nrcal += 1
+y = nrcal
+prediction = "no"
+domid = str(i) + "_" + str(j) + str(y)  # NOTE(review): for a domain matching none of the tests above, y still holds whatever value it had before
+if substrspecnrpspredictordict.has_key(domid) or substrspecminowadict.has_key(domid) or substrspecpkssigdict.has_key(domid):  # the consensus is shown whenever any of the three substrate predictors has an entry
+htmloutfile.write("<br/>Predicted substrate: " + substrspecconsensusdict[domid] + "\n")
+if substrspecnrpspredictordict.has_key(domid):
+htmloutfile.write("<br/>-NRPSPredictor code: " + substrspecnrpspredictordict[domid][0] + "\n")
+htmloutfile.write("<br/>-NRPSPredictor SVM: " + substrspecnrpspredictordict[domid][1] + "\n")
+if substrspecminowadict.has_key(domid):
+htmloutfile.write("<br/>-Minowa HMM: " + substrspecminowadict[domid] + "\n")
+if substrspecpkssigdict.has_key(domid):
+htmloutfile.write("<br/>-PKS code: " + substrspecpkssigdict[domid] + "\n")
+if krpredictionsdict.has_key(domid):
+htmloutfile.write("<br/>KR activity: " + krpredictionsdict[domid][0] + "\n")
+htmloutfile.write("<br/>KR stereochemistry: " + krpredictionsdict[domid][1] + "\n")
+htmloutfile.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this domain </a>\n")
+htmloutfile.write("</div>\n\n")
+a += 1
+z += 1
+htmloutfile.write('</div>\n')  # presumably closes the genecluster<nr> div opened at the start of this cluster - confirm
#NOTE(review): this listing lost its indentation; the nesting below is reconstructed
#from the logic of the code - confirm against the original file.

def _cblast_hit_description(raw_description):
    """Return the short label shown next to a ClusterBlast hit cluster.

    Escapes '&', turns tabs into spaces, drops the first two space-separated
    tokens and cuts the text at ', whole' / ', complete'.
    """
    cleaned = raw_description.replace("&","&amp;").replace("\t"," ")
    without_leading_tokens = cleaned.partition(" ")[2].partition(" ")[2]
    return without_leading_tokens.split(", whole")[0].split(", complete")[0]


def _write_gene_popups(out, id_prefix, genes, details, accessions, rel_starts, rel_ends, popup_top, label_top, popup_style_tail=""):
    """Write one hidden pop-up div and one hidden gene-name div per gene.

    out              -- open file object receiving the HTML
    id_prefix        -- div id up to (not including) the gene index
    genes            -- gene names, in drawing order
    details          -- dict: gene name -> sequence whose items 0/1 are the
                        location bounds and item 3 the annotation text
    accessions       -- accession per gene (parallel to genes), used as the
                        QUERY of the NCBI BlastP link
    rel_starts/ends  -- relative pixel positions, parallel to genes
    popup_top        -- 'top' value (px) of the pop-up div
    label_top        -- 'top' value (px) of the gene-name div
    popup_style_tail -- extra CSS appended to the pop-up div style
    """
    for idx, gene in enumerate(genes):
        div_id = id_prefix + str(idx)
        gene_details = details[gene]
        popup_left = int(float(rel_starts[idx])*0.875)
        label_left = int(float((float(rel_starts[idx])+float(rel_ends[idx]))/2)*0.9375)
        link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp&amp;QUERY=" + accessions[idx] + "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch"
        out.write('<div id="' + div_id + '_div" class="hidden popup" style="position:absolute; top:' + str(popup_top) + 'px; left:' + str(popup_left) + 'px;' + popup_style_tail + '">\n')
        out.write(gene_details[3].replace("_"," ").replace("&","&amp;") + "\n")
        out.write("<br/>Location: " + str(gene_details[0]) + "-" + str(gene_details[1]) + "\n")
        out.write("<br/><a href=\"" + link + "\" target=\"_blank\"> NCBI BlastP on this gene </a>\n")
        out.write("</div>\n\n")
        out.write('<div id="' + div_id + '_divtext" class="hidden genenames" style="position:absolute; top:' + str(label_top) + 'px; left:' + str(label_left) + 'px;">\n')
        out.write(gene)
        out.write("</div>\n\n")


if clusterblast == "y":
    #Write ClusterBlast divs with pictures and description pop-up tags
    htmloutfile.write('<div id="clusterblastview" class="clusterdescr">\n\n')
    #Add menu bar 3
    htmloutfile.write('<div id="bartext3" style="color:#FFFFFF; font-size:1em; position:absolute; z-index:2; top:3px; left:20px;"><b>Homologous gene clusters</b></div>')
    htmloutfile.write('<div id="descrbar3" style="position:absolute; z-index:1; top:0px;"><img src="images/bar.png" height="25" width="' + str(int(0.75*screenwidth)) + '"/></div>')
    htmloutfile.write('<div class="help" id="help3" style="position:absolute; z-index:1; top:2px; left:' + str(int(screenwidth * 0.75) - 30) + 'px;"><a href="http://antismash.secondarymetabolites.org/help.html#panel3" target="_blank"><img border="0" src="images/help.png"/></a></div>')
    #Query description is the same for every cluster; names of 80+ characters are cut to 77 plus "..."
    if len(nucname) < 80:
        qdescription = nucname
    else:
        qdescription = nucname[0:77] + "..."
    for qclusternr in geneclusters:
        nrhitclusters = queryclusterdata[qclusternr][0]
        hitclusterdata = queryclusterdata[qclusternr][1]
        #Only query cluster 1 is written without display:none
        if qclusternr == 1:
            qcluster_style = ''
        else:
            qcluster_style = ' style="display:none"'
        htmloutfile.write('<div id="qcluster' + str(qclusternr) + '"' + qcluster_style + '>\n<br/><br/>\n<div align="left">\n<form name="clusterform' + str(qclusternr) + '">\n<select name="selection' + str(qclusternr) + '" onchange="javascript:navigate(this);">\n')
        htmloutfile.write('<option value="">Select gene cluster alignment</option>\n')
        for i in range(nrhitclusters):
            htmloutfile.write('<option value="javascript:displaycblastresults(' + str(qclusternr) + ',' + str(i+1) + ')">' + hitclusterdata[i+1][5][i].replace("&","&amp;") + '</option>\n')
        htmloutfile.write('</select>\n</form>\n\n</div>')
        htmloutfile.write('<div style="position:absolute; top:33px; left:' + str(screenwidth*0.625) + 'px;"><img src="images/button.gif" name="button' + str(qclusternr) + '" onclick="javascript:displaybutton(' + str(qclusternr) + ');"/></div>')
        #One div per pairwise (query cluster, hit cluster) alignment
        for i in range(nrhitclusters):
            queryclustergenes = hitclusterdata[1][3]
            queryclustergenesdetails = hitclusterdata[1][4]
            hitclusternumber = i + 1
            cluster_acc = hitclusterdata[hitclusternumber][6]
            hitclustergenes = hitclusterdata[hitclusternumber][1]
            hitclustergenesdetails = hitclusterdata[hitclusternumber][2]
            relpositiondata = clusterblastpositiondata[str(qclusternr) + "_" + str(hitclusternumber)]
            qrel_starts = relpositiondata[0][0]
            qrel_ends = relpositiondata[0][1]
            hrel_starts = relpositiondata[1][hitclusternumber][0]
            hrel_ends = relpositiondata[1][hitclusternumber][1]
            strandsbalance = relpositiondata[2][hitclusternumber]
            if strandsbalance < 0:
                #NOTE(review): reverses the list held in hitclusterdata in place, so the
                #"all" view further down iterates the reversed order as well - confirm intended
                hitclustergenes.reverse()
            #Only the first hit of the first query cluster is written without display:none
            if qclusternr == 1 and hitclusternumber == 1:
                hitcluster_style = ''
            else:
                hitcluster_style = ' style="display:none"'
            htmloutfile.write('<div id="hitcluster' + str(qclusternr) + '_' + str(hitclusternumber) + '"' + hitcluster_style + '>\n')
            #Insert gene cluster descriptions
            cdescription = _cblast_hit_description(hitclusterdata[hitclusternumber][5][i])
            htmloutfile.write('<div id="descriptionquery" style="text-align:right; position:absolute; top:70px; right:50px; font-size:10px; font-style:italic">' + qdescription + '</div>\n')
            htmloutfile.write('<div id="description' + str(qclusternr) + '" style="text-align:right; position:absolute; top:137px; right:50px; font-size:10px; font-style:italic">' + cdescription + '</div>\n')
            #Insert pubmed/pubchem links
            htmloutfile.write('<div id="pub_pics" style="position:absolute; top:60px; left:' + str(int(screenwidth * 0.0)) + 'px; font-size:10px"> Hit cluster cross-links: \n')
            htmloutfile.write('&nbsp;&nbsp;<a href="http://www.ncbi.nlm.nih.gov/nuccore/' + cluster_acc.split(".")[0] + '" target="_blank"><img align="bottom" border="0" src="images/genbank.gif"/></a>\n')
            #A dictionary key matches when it occurs as a substring of the hit cluster accession
            for known_acc in pubmed_dict.keys():
                if known_acc in cluster_acc:
                    pubmedstring = pubmed_dict[known_acc]
                    htmloutfile.write('&nbsp;&nbsp;<a href="http://www.ncbi.nlm.nih.gov/pubmed/' + pubmedstring + '" target="_blank"><img align="bottom" border="0" src="images/pubmed.gif"/></a>\n')
            for known_acc in pubchem_dict.keys():
                if known_acc in cluster_acc:
                    pubchemstring = pubchem_dict[known_acc]
                    #A comma-separated value goes to the Entrez compound search, a value without comma to the compound summary page
                    if "," in pubchemstring:
                        htmloutfile.write('&nbsp;&nbsp;<a href="http://www.ncbi.nlm.nih.gov/sites/entrez?db=pccompound&amp;term=' + pubchemstring + '" target="_blank"><img align="bottom" border="0" src="images/struct.gif"/></a>\n')
                    else:
                        htmloutfile.write('&nbsp;&nbsp;<a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=' + pubchemstring + '" target="_blank"><img align="bottom" border="0" src="images/struct.gif"/></a>\n')
            htmloutfile.write('</div>\n\n')
            #Create gene pop-ups
            query_accessions = [accessiondict[gene] for gene in queryclustergenes]
            hit_accessions = [hitclustergenesdetails[gene][4] for gene in hitclustergenes]
            _write_gene_popups(htmloutfile, "q" + str(qclusternr) + "_" + str(hitclusternumber) + "_", queryclustergenes, queryclustergenesdetails, query_accessions, qrel_starts, qrel_ends, 113, 83)
            _write_gene_popups(htmloutfile, "h" + str(qclusternr) + "_" + str(hitclusternumber) + "_", hitclustergenes, hitclustergenesdetails, hit_accessions, hrel_starts, hrel_ends, 183, 153)
            htmloutfile.write('</div>\n')
        #Find new relative positions for display of all gene clusters in one picture
        relpositiondata = clusterblastpositiondata[str(qclusternr) + "_all"]
        qrel_starts = relpositiondata[0][0]
        qrel_ends = relpositiondata[0][1]
        htmloutfile.write('<div id="hitcluster' + str(qclusternr) + '_all" style="display:none">\n')
        htmloutfile.write('<div id="descriptionquery" style="text-align:right; position:absolute; top:60px; right:50px; font-size:10px; font-style:italic">' + qdescription + '</div>\n')
        for i in range(nrhitclusters):
            hitclusternumber = i + 1
            hrel_starts = relpositiondata[1][hitclusternumber][0]
            hrel_ends = relpositiondata[1][hitclusternumber][1]
            hitclustergenes = hitclusterdata[hitclusternumber][1]
            hitclustergenesdetails = hitclusterdata[hitclusternumber][2]
            cdescription = _cblast_hit_description(hitclusterdata[hitclusternumber][5][i])
            htmloutfile.write('<div id="description' + str(qclusternr) + '" style="text-align:right; position:absolute; top:' + str(60 + (57 * hitclusternumber)) + 'px; right:50px; font-size:10px; font-style:italic">' + cdescription + '</div>\n')
            if hitclusternumber == 1:
                #The query cluster row is written once, together with the first hit
                queryclustergenes = hitclusterdata[1][3]
                queryclustergenesdetails = hitclusterdata[1][4]
                #BlastP links use the accession (as in the pairwise view), not the gene name
                query_accessions = [accessiondict[gene] for gene in queryclustergenes]
                _write_gene_popups(htmloutfile, "all_" + str(qclusternr) + "_0_", queryclustergenes, queryclustergenesdetails, query_accessions, qrel_starts, qrel_ends, 100, 75, " z-index:2;")
            hit_accessions = [hitclustergenesdetails[gene][4] for gene in hitclustergenes]
            #Each hit row sits 57px (pop-ups) / 56.75px (gene names) below the previous one
            _write_gene_popups(htmloutfile, "all_" + str(qclusternr) + "_" + str(hitclusternumber) + "_", hitclustergenes, hitclustergenesdetails, hit_accessions, hrel_starts, hrel_ends, 100 + 57 * hitclusternumber, 75 + 56.75 * hitclusternumber, " z-index:2;")
        #Close the "all" div and the qcluster div
        htmloutfile.write('</div>\n')
        htmloutfile.write('</div>\n\n')
    #Close the clusterblastview div
    htmloutfile.write('</div>\n')
#Write one credits banner per gene cluster; only the banner of cluster 1 is visible initially
credits_html = '<div style="float:center; font-size:0.9em;">\n<div style="position:absolute; top:0px; left:30px;">\n<img src="images/ruglogo.gif" border="0"/>&nbsp;&nbsp;&nbsp;&nbsp;\n<img src="images/gbblogo.gif" border="0"/>&nbsp;&nbsp;&nbsp;&nbsp;\n<img src="images/tueblogo.gif" border="0"/>&nbsp;&nbsp;&nbsp;&nbsp;\n<img src="images/ucsflogo.gif" border="0"/>&nbsp;&nbsp;&nbsp;&nbsp;\n</div>\n<div style="position:absolute; top:0px; left:600px;">\nantiSMASH: Rapid identification, annotation and analysis of secondary metabolite biosynthesis gene clusters.\n<br/>Marnix H. Medema, Kai Blin, Peter Cimermancic, Victor de Jager, Piotr Zakrzewski, Michael A. Fischbach, Tilmann Weber, Rainer Breitling &amp; Eriko Takano\n<br/><i>Nucleic Acids Research</i> (2011), proposal submitted.\n</div>\n</div>\n</div>'
for i in geneclusters:
    data = qgeneclusterdata[i]
    extrapixels = extrapixelsdict[i]
    pksnrpsprots = data[7]
    #Vertical offset grows with the number of PKS/NRPS proteins and cluster columns
    bar_top = 1242 + int(len(pksnrpsprots) * 99) + nrclustercolumns * 28 + extrapixels
    bar_width = int(0.98 * screenwidth)
    if i == 1:
        display_css = ''
    else:
        display_css = 'display:none; '
    htmloutfile.write('<div id="creditsbar' + str(i) + '" class="banner" style="' + display_css + 'position:absolute; width:' + str(bar_width) + 'px; align:\'left\'; height:75; top:' + str(bar_top) + 'px; left:0px; color:#810E15; z-index:-1;">')
    htmloutfile.write(credits_html)
#Add final part of HTML file
htmloutfile.write(htmlparts[-1])
#Copy accessory files for HTML viewing
#Windows needs one non-recursive copy per directory level; every other platform is
#treated as POSIX (previously only 'linux2' was handled and anything else, e.g.
#'darwin', failed with a NameError on the undefined copy commands).
if sys.platform == ('win32'):
    copycommands = [
        "copy/y vis\\* " + genomename + " > nul",
        "copy/y vis\\html\\* " + genomename + "\\html > nul",
        "copy/y vis\\images\\* " + genomename + "\\images > nul",
    ]
else:
    copycommands = ["cp -r vis/* " + genomename + " > /dev/null"]
for copycommand in copycommands:
    os.system(copycommand)
#Generate EMBL output
#First section of embl_lines.txt: one line per cluster gene, holding the gene name
#followed by tab-terminated annotation fields.
#NOTE(review): this listing lost its indentation; the nesting below (substrate/KR
#predictions only for genes that have a domaindict entry) is reconstructed from the
#logic - confirm against the original file.
emblfile = open(genomename + "/embl_lines.txt","w")
for i in geneclustergenes:
    emblfile.write(i + "\t")
    if smcogs == "y" and i in smcogdict:
        emblfile.write("smCOG: " + smcogdict[i] + ":" + smcogdescriptions[smcogdict[i]] + "\t")
    if i in nrpspkstypedict:
        emblfile.write("NRPS/PKS type: " + nrpspkstypedict[i] + "\t")
    if i in domaindict:
        for j in domaindict[i]:
            emblfile.write(j[0] + " (" + str(j[1]) + "-" + str(j[2]) + "); E-value:" + str(j[3]) + "; Bit score: " + str(j[4]) + "\t")
        #(domain label, text after the running number, predictions dict), in output order
        substrate_predictions = [
            ("AT-domain ", " Minowa substrate specificity prediction: ", minowa_pks_preds),
            ("AT-domain ", " PKS code substrate specificity prediction: ", pks_code_preds),
            ("CAL-domain ", " Minowa substrate specificity prediction: ", minowa_cal_preds),
            ("A-domain ", " Minowa substrate specificity prediction: ", minowa_nrps_preds),
            ("A-domain ", " Stachelhaus code substrate specificity prediction: ", nrps_code_preds),
            ("A-domain ", " NRPSPredictor2 SVM substrate specificity prediction: ", nrps_svm_preds),
        ]
        for label, description, predictions in substrate_predictions:
            #Running number restarts at 1 for every prediction method
            nrdomain = 0
            for k in predictions.keys():
                #A prediction belongs to this gene when the gene name occurs in its key
                if i in k:
                    nrdomain += 1
                    emblfile.write(label + str(nrdomain) + description + predictions[k] + "\t")
        nrkr = 0
        for k in kr_activity_preds.keys():
            if i in k:
                nrkr += 1
                #Number KR domains with their own counter (the AT counter was written here before)
                emblfile.write("KR-domain " + str(nrkr) + " activity prediction: " + kr_activity_preds[k] + "\t")
                emblfile.write("KR-domain " + str(nrkr) + " predicted stereochemistry group: " + kr_stereo_preds[k] + "\t")
    if i in motifdict:
        for m in motifdict[i]:
            emblfile.write("Motif " + str(m[0]) + " (" + str(m[1]) + "-" + str(m[2]) + "). E-value: " + str(m[3]) + "; Bit score: " + str(m[4]) + "\t")
    emblfile.write("\n")
#Section separator
emblfile.write("\n\n>>\n\n")
#enter separate domain entries
#Second section of embl_lines.txt: one misc_feature line per domain and per motif.
#Positions inside a gene (j[1], j[2] / m[1], m[2]) are multiplied by 3 and offset from
#the gene start on the "+" strand, or from the gene end on the "-" strand.
#NOTE(review): this listing lost its indentation; nesting is reconstructed from the logic.
for i in geneclustergenes:
    strand = strandsdict[i]
    startpos = geneposdict[i][0]
    endpos = geneposdict[i][1]
    if i in domaindict:
        for j in domaindict[i]:
            if strand == "+":
                emblfile.write("misc_feature\t" + str(startpos + j[1] * 3) + ".." + str(startpos + j[2] * 3) + "\t" + str(j[0]) + " domain;\tE-value: " + str(j[3]) + "\tBit score: " + str(j[4]) + "\t/colour=2\n")
            elif strand == "-":
                #Same field layout as the "+" strand line (space before "domain;" and tab
                #before "Bit score:" were missing here)
                emblfile.write("misc_feature\tcomplement(" + str(endpos - j[2] * 3) + ".." + str(endpos - j[1] * 3) + ")\t" + str(j[0]) + " domain;\tE-value: " + str(j[3]) + "\tBit score: " + str(j[4]) + "\t/colour=2\n")
    if i in motifdict:
        for m in motifdict[i]:
            if strand == "+":
                emblfile.write("misc_feature\t" + str(startpos + m[1] * 3) + ".." + str(startpos + m[2] * 3) + "\t" + str(m[0]) + " motif;\tE-value: " + str(m[3]) + "\tBit score: " + str(m[4]) + "\t/colour=6\n")
            elif strand == "-":
                emblfile.write("misc_feature\tcomplement(" + str(endpos - m[2] * 3) + ".." + str(endpos - m[1] * 3) + ")\t" + str(m[0]) + " motif;\tE-value: " + str(m[3]) + "\tBit score: " + str(m[4]) + "\t/colour=6\n")
#Section separator
emblfile.write("\n\n>>\n\n")
#Third section of embl_lines.txt: one misc_feature line per gene cluster
for i in geneclusters:
    cend = clusterinfo[i][2]
    cstart = clusterinfo[i][1]
    #A start position of 0 is written as 1
    if cstart == 0:
        cstart = 1
    emblfile.write("".join(["misc_feature\t", str(cstart), "..", str(cend), "\t", clusterinfo[i][0], " gene cluster\t/colour=13\n"]))
emblfile.close()
#Close open html file
htmloutfile.close()
#Run whole-genome BLAST / HMM CLUSEAN modules & ClusterFinder
#Every platform other than win32 is treated as POSIX (previously only 'linux2' was
#handled: elsewhere copycommand was undefined and the pipeline was silently skipped).
on_windows = sys.platform == ('win32')
#Copy the input file into the output folder, then run the pipeline from inside it
if on_windows:
    copycommand = "copy " + infile + " " + genomename + ' > nul'
else:
    copycommand = "cp " + infile + " " + genomename
os.system(copycommand)
os.chdir(genomename)
#Build the runPipeline.py argument string; a database path is only passed for an enabled search
args = "--cpus %s " % nrcpus
if fullblast == "n":
    args += "--without-blast "
if fullhmm == "n":
    args += "--without-hmmer "
if fullhmm == "y":
    args += '--pfamdbpath %s ' % pfamdbpath
if fullblast == "y":
    args += '--blastdbpath %s ' % blastdbpath
logfile.write("Running CLUSEAN pipeline modules.\n")
if on_windows:
    os.system("python ..\\clusean\\scripts\\runPipeline.py %s" % args)
else:
    os.system(antismash_path + "clusean/scripts/runPipeline.py %s" % args)
os.chdir('..')
#Close log file
logfile.write("antiSMASH successfully finished in " + str(elapsed) + " seconds.\n")
logfile.close()

Mercurial > repos > bjoern-gruening > antismash

comparison antismash.py @ 0:6a37d0a4510a default tip