# HG changeset patch
# User martasampaio
# Date 1555772974 14400
# Node ID a455e43048cd028d0ae97f2213a564de662c68df
# Parent  5acc4fa8b62d7a97f8354430e07b7c4abedb95ad
Uploaded

diff -r 5acc4fa8b62d -r a455e43048cd README.rst
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,29 @@
+===============
+PhagePromoter
+===============
+
+Get promoter of phage genomes
+
+PhagePromoter is a python script that predicts promoter sequences in phage genomes, using a machine learning SVM model. This model was built from a train dataset with 19 features and 3200 examples (800 positives and 2400 negatives), each representing a 65 bp sequence of a phage genome. The positive cases represent the phage sequences that are already identified as promoters. 
+
+**Inputs:**
+
+* genome format: fasta vs genbank; 
+* genome file: acepts both genbank and fasta formats;
+* both strands (yes or no): allows the search in both DNA strands;
+* threshold: represents the probability of the test sequence be a promoter (float between 0 and 1)"
+* family: The family of the testing phage - Podoviridae, Siphoviridae or Myoviridae;
+* Bacteria: The host of the phage. The train dataset include the following hosts: Bacillus, EColi, Salmonella, Pseudomonas, Yersinia, Klebsiella, Pectobacterium, Morganella, Cronobacter, Staphylococcus, Streptococcus, Streptomyces, Lactococcus. If the testing phage has a different host, select the option 'other', and it is recommended the use of a higher threshold value for more accurate results.
+* phage type: The type of the phage, according to its lifecycle: virulent or temperate;
+
+**Outputs:**
+This tool outputs two files: a FASTA file and a table in HTML, with the locations, sequence, score and type (recognized by host or phage RNAP) of the predicted promoters.
+
+**Requirements:**
+
+* Biopython
+* Sklearn 
+* Numpy
+* Pandas 
+
+
diff -r 5acc4fa8b62d -r a455e43048cd auxiliar.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/auxiliar.py	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun May 27 17:37:09 2018
+
+@author: Marta
+"""
+
+
+#get the phage host from the file 'bacteria.xlsx'
+def get_bacteria(file):
+    import pandas as pd
+    df = pd.read_excel(file,header=0,index_col=0)
+    bacteria = {}
+    for ind,row in df.iterrows():
+        bac = row['Bacteria']
+        bacteria[ind] = bac
+    return bacteria
+
+#get the phage family from the file 'family.xlsx'
+def get_families(file):
+    import pandas as pd
+    df = pd.read_excel(file,header=0,index_col=0)
+    families = {}
+    for ind,row in df.iterrows():
+        fam = row['Family']
+        families[ind] = fam
+    return families
+
+#get phage lifecycle from the file 'lifecycle.xlsx'
+def get_lifecycle(file):
+    import pandas as pd
+    df = pd.read_excel(file,header=0,index_col=0)
+    types = {}
+    for ind,row in df.iterrows():
+        lc = row['lifecycle']
+        types[ind] = lc
+    return types
+
+#reads a file with a PSSM and return the max possible score of that PSSM
+def get_max_pssm(file_pssm):
+    from Bio.Alphabet import IUPAC
+    from Bio.motifs import matrix
+    m = []
+    fic = open(file_pssm,'r')
+    rf = fic.readline()
+    while rf:
+        new_l = []
+        l = rf.strip().split('\t')
+        for val in l:
+            x = float(val)
+            new_l.append(x)
+        m.append(new_l)
+        rf = fic.readline()
+    a = IUPAC.unambiguous_dna
+    dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]}
+    pssm = matrix.PositionSpecificScoringMatrix(a,dic)
+    return pssm.max
+
+#reads a file with a PSSM and returns a list of scores in all positions of the sequence
+#returns the score divided by the maximum possible value
+def get_scores(file_pssm, seq):
+    from Bio.Alphabet import IUPAC
+    from Bio.motifs import matrix
+    maxi = get_max_pssm(file_pssm)
+    m = []
+    fic = open(file_pssm,'r')
+    rf = fic.readline()
+    while rf:
+        new_l = []
+        l = rf.strip().split('\t')
+        for val in l:
+            x = float(val)
+            new_l.append(x)
+        m.append(new_l)
+        rf = fic.readline()
+    a = IUPAC.unambiguous_dna
+    dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]}
+    pssm = matrix.PositionSpecificScoringMatrix(a,dic)
+    scores = []
+    positions = []
+    a = IUPAC.unambiguous_dna
+    seq.alphabet = a
+    for pos, score in pssm.search(seq, both=False,threshold=-50):
+        scores.append(score/maxi)
+        positions.append(pos)
+    return scores,positions
+
+#returns the frequencia of A and T bases in a sequence    
+def freq_base(seq):
+    A = seq.count('A')
+    T = seq.count('T')
+    AT = A+T
+    return AT
+
+#returns the free energy value of that sequence
+def free_energy(seq):
+    dic1 = {'AA':-1.00, 
+        'TT':-1.00, 
+        'AT':-0.88, 
+        'TA':-0.58, 
+        'CA':-1.45,
+        'AC':-1.44, 
+        'GG':-1.84, 
+        'CC':-1.84, 
+        'GA':-1.30, 
+        'AG':-1.28, 
+        'TC':-1.30, 
+        'CT':-1.28, 
+        'TG':-1.45,
+        'GT':-1.44,
+        'GC':-2.24,
+        'CG':-2.17}
+    total = 0
+    i = 0
+    j = 1
+    while i < len(seq)-1:
+        dint = seq[i]+seq[j]
+        total += dic1[dint]
+        i += 1
+        j += 1
+    return total
\ No newline at end of file
diff -r 5acc4fa8b62d -r a455e43048cd model1600.sav
Binary file model1600.sav has changed
diff -r 5acc4fa8b62d -r a455e43048cd model2400.sav
Binary file model2400.sav has changed
diff -r 5acc4fa8b62d -r a455e43048cd phagepromoter.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phagepromoter.py	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,570 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jul 19 13:45:05 2018
+
+@author: Marta
+"""
+
+from Bio import SeqIO
+import numpy as np
+import pandas as pd
+from auxiliar import free_energy,freq_base
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import IUPAC
+from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle
+
+#division of the test genome in sequences of 65 bp
+def get_testseqs65(form,fic,both=False):
+    ALL = []
+    indexes = []
+    a = 0
+    rec = SeqIO.read(fic,form)
+    genome = rec.seq
+    i = 0
+    j = 65
+    while j < len(genome):
+        s = genome[i:j]
+        ALL.append([1,i,j,s])
+        i += 20
+        j += 20
+        a += 1
+        indexes.append(rec.name+":"+str(a))
+    if both:
+        comp = genome.reverse_complement()
+        size = len(rec.seq)
+        i = 0
+        j = 65
+        while j < len(comp):
+            s = comp[i:j]
+            ALL.append([-1,size-j,size-i,s])
+            i += 20
+            j += 20
+            a += 1
+            indexes.append(rec.name+":"+str(a))
+    df = pd.DataFrame(ALL, index=indexes, columns=['strand','iniprom','endprom','seq'])
+    return df
+
+#calculate the scores of all sequences (similar to get_posScores and get_negScores)
+def get_testScores(loc,test):
+    scores = []
+    posis = []
+    sizes = []
+    dic = {}
+    for ind,row in test.iterrows():
+        _,window = ind.split(':')
+        strand = row['strand']
+        ini = row['iniprom']
+        end = row['endprom']
+        seq = row['seq']
+        pos = [ini,end,strand]
+        dic[window] = pos
+        s = seq
+        score10_6,pos10_6 = get_scores(os.path.join(loc,'pssm10_6.txt'), s)
+        maxi10_6 = get_max_pssm(os.path.join(loc,'pssm10_6.txt'))
+        score10_8,pos10_8 = get_scores(os.path.join(loc,'pssm10_8.txt'), s)
+        maxi10_8 = get_max_pssm(os.path.join(loc,'pssm10_8.txt'))
+        scores23,pos23 = get_scores(os.path.join(loc,'pssm_23.txt'), s)
+        maxi23 = get_max_pssm(os.path.join(loc,'pssm_23.txt'))
+        scores21,pos21 = get_scores(os.path.join(loc,'pssm_21.txt'), s)
+        maxi21 = get_max_pssm(os.path.join(loc,'pssm_21.txt'))
+        scores27,pos27 = get_scores(os.path.join(loc,'pssm_27.txt'), s)
+        maxi27 = get_max_pssm(os.path.join(loc,'pssm_27.txt'))
+        scores32,pos32 = get_scores(os.path.join(loc,'pssm_32.txt'), s)
+        maxi32 = get_max_pssm(os.path.join(loc,'pssm_32.txt'))
+        score23 = max(scores23)
+        score21 = max(scores21)
+        score27 = max(scores27)
+        score32 = max(scores32)
+        maxiphage = max(score23,score21,score27,score32)
+        if maxiphage == score23: phage_max = score23*maxi23
+        elif maxiphage == score21: phage_max = score21*maxi21
+        elif maxiphage == score27: phage_max = score27*maxi27
+        elif maxiphage == score32: phage_max = score32*maxi32
+        score35_6,pos35_6 = get_scores(os.path.join(loc,'pssm35_6.txt'), s)
+        maxi35_6 = get_max_pssm(os.path.join(loc,'pssm35_6.txt'))
+        score35_9,pos35_9 = get_scores(os.path.join(loc,'pssm35_9.txt'), s)
+        maxi35_9 = get_max_pssm(os.path.join(loc,'pssm35_9.txt'))
+        score35_t4,pos35_t4 = get_scores(os.path.join(loc,'pssm35_t4.txt'), s)
+        maxi35_t4 = get_max_pssm(os.path.join(loc,'pssm35_t4.txt'))
+        score35_cbb,pos35_cbb = get_scores(os.path.join(loc,'pssm35_cbb.txt'), s)
+        maxi35_cbb = get_max_pssm(os.path.join(loc,'pssm35_cbb.txt'))
+        score35_lb,pos35_lb = get_scores(os.path.join(loc,'pssm35_lb.txt'),s)
+        maxi35_lb = get_max_pssm(os.path.join(loc,'pssm35_lb.txt'))
+        score35_mu, pos35_mu = get_scores(os.path.join(loc,'pssm35_mu.txt'),s)
+        maxi35_mu = get_max_pssm(os.path.join(loc,'pssm35_mu.txt'))
+        
+        dists6 = []
+        score6 = []
+        for p in pos10_6:
+            for a in range(14,22):
+                d = p-a-6
+                if d >= 0: 
+                    s10 = score10_6[p]
+                    s35_6 = score35_6[d]
+                    score6.append([s35_6,s10])
+                    dists6.append([d,p])
+        dists9 = []            
+        score9 = []
+        for p in pos10_6:
+            for a in range(11,14):
+                d = p-a-9
+                if d >= 0: 
+                    s10 = score10_6[p]
+                    s35_9 = score35_9[d]
+                    score9.append([s35_9,s10])
+                    dists9.append([d,p])
+        distst4 = []
+        scoret4 = []
+        distscbb = []
+        scorecbb = []
+        for p in pos10_6:
+            for a in range(16,18):
+                d = p-a-7
+                if d >= 0: 
+                    s10 = score10_6[p]
+                    s35_t4 = score35_t4[d]
+                    s35_cbb = score35_cbb[d]
+                    scoret4.append([s35_t4,s10])
+                    distst4.append([d,p])
+                    scorecbb.append([s35_cbb,s10])
+                    distscbb.append([d,p])
+                    
+        
+        distsmu = []
+        scoremu = []
+        for p in pos10_6:
+            d = p-16-14
+            if d >= 0: 
+                s10 = score10_6[p]
+                s35_mu = score35_mu[d]
+                scoremu.append([s35_mu,s10])
+                distsmu.append([d,p])
+        
+        distslb = []
+        scorelb = []
+        for p in pos10_6:
+            d = p-13-14
+            if d >= 0: 
+                s10 = score10_6[p]
+                s35_lb = score35_lb[d]
+                scorelb.append([s35_lb,s10])
+                distslb.append([d,p])
+    
+        soma6 = [sum(x) for x in score6]
+        soma9 = [sum(x) for x in score9]
+        somat4 = [sum(x) for x in scoret4]
+        somacbb = [sum(x) for x in scorecbb]
+        somamu = [sum(x) for x in scoremu]
+        somalb = [sum(x) for x in scorelb]
+        
+        maxi6 = max(soma6)
+        maxi9 = max(soma9)
+        maxit4 = max(somat4)
+        maxicbb = max(somacbb)
+        maximu = max(somamu)
+        maxilb = max(somalb)
+        maxi_elems = max(maxi6,maxi9,maxit4,maxicbb,maxilb,maximu)
+
+        if maxi_elems == maxilb:
+            indmax = somalb.index(maxilb)
+            sc35 = scorelb[indmax][0]
+            sc10 = scorelb[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = distslb[indmax]
+            size35 = 14
+            elems_maxi = sc35*maxi35_lb+sc10*maxi10_6
+        elif maxi_elems == maximu:
+            indmax = somamu.index(maximu)
+            sc35 = scoremu[indmax][0]
+            sc10 = scoremu[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = distsmu[indmax]
+            size35 = 14
+            elems_maxi = sc35*maxi35_mu+sc10*maxi10_6
+        elif maxi_elems == maxi9:
+            indmax = soma9.index(maxi9)
+            sc35 = score9[indmax][0]
+            sc10 = score9[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = dists9[indmax]
+            size35 = 9
+            elems_maxi = sc35*maxi35_9+sc10*maxi10_6
+        elif maxi_elems == maxit4:
+            indmax = somat4.index(maxit4)
+            sc35 = scoret4[indmax][0]
+            sc10 = scoret4[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = distst4[indmax]
+            size35 = 7
+            elems_maxi = sc35*maxi35_t4+sc10*maxi10_6
+        elif maxi_elems == maxicbb:
+            indmax = somacbb.index(maxicbb)
+            sc35 = scorecbb[indmax][0]
+            sc10 = scorecbb[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = distscbb[indmax]
+            size35 = 7
+            elems_maxi = sc35*maxi35_cbb+sc10*maxi10_6
+        else:
+            indmax = soma6.index(maxi6)
+            sc35 = score6[indmax][0]
+            sc10 = score6[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = dists6[indmax]
+            size35 = 6
+            elems_maxi = sc35*maxi35_6+sc10*maxi10_6
+        
+        if score23 == maxiphage:
+            phage_score = score23
+            posiphage = scores23.index(score23)
+            sizephage = 23
+        elif score21 == maxiphage:
+            phage_score = score21
+            posiphage = scores21.index(score21)
+            sizephage = 21
+        elif score27 == maxiphage:
+            phage_score = score27
+            posiphage = scores27.index(score27)
+            sizephage = 27
+        else:
+            phage_score = score32
+            posiphage = scores32.index(score32)
+            sizephage = 32
+            
+        if elems_maxi >= max(score10_8)*maxi10_8:
+            i = posel[1]
+            ext = s[i-3:i-1]
+            if ext == 'TG': tg = 1
+            else: tg = 0
+            if elems_maxi > phage_max: host = 1
+            else: 
+                host = 0
+                tg = 0
+            sc = max(score10_8)
+            end35 = posel[0]+size35
+            dist = posel[1]-end35
+            scores.append([host, score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
+            posis.append([posel[1],posel[0],posiphage])
+            sizes.append([6,size35,sizephage])
+        else:
+            host = 1
+            sc = max(score10_8)
+            i = score10_8.index(sc)
+            ext = s[i-3:i-1]
+            if ext == 'TG': tg = 1
+            else: tg = 0
+            if max(score10_8)*maxi10_8 > phage_max: host = 1
+            else: 
+                host = 0
+                tg = 0
+            end35 = posel[0]+size35
+            dist = posel[1]-end35
+            scores.append([host,score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
+            posis.append([i,posel[0],posiphage])
+            sizes.append([8,size35,sizephage])
+    score = pd.DataFrame(scores, index=test.index, columns=['host','score10','score10_8','score35','score_phage','tg','dist','seq'])
+    posis = pd.DataFrame(posis, index=test.index, columns=['pos10','pos35','posphage'])
+    sizes = pd.DataFrame(sizes, index=test.index, columns=['size10','size35','size_phage'])
+    df_all = pd.concat([score,posis,sizes],axis=1)
+    return df_all,dic
+
+def create_dftest(scores_test,dic_window,family,bacteria,lifecycle):
+    tudo = []
+    tudo2 = []
+    for ind,row in scores_test.iterrows():
+        _,window = ind.split(':')
+        posis = dic_window[window]
+        strand=posis[2]
+        if strand == 1: ini=posis[0]
+        else: ini=posis[1]
+        seqprom = row['seq']
+        score10 = row['score10']
+        score10_8 = row['score10_8']
+        score35 = row['score35']
+        scorephage = row['score_phage']
+        size10 = row['size10']
+        size35 = row['size35']
+        sizephage = row['size_phage']
+        ini10 = row['pos10']
+        tg = row['tg']
+        host = row['host']
+        ini35 = row['pos35']
+        dist = row['dist']
+        end10=ini10+size10
+        iniphage = row['posphage']
+        endphage = iniphage+sizephage
+        if strand == 1:
+            if host == 0: 
+                new_seq = seqprom[iniphage:endphage]
+                new_ini = ini+iniphage+1
+                new_end = ini+endphage
+            else:
+                if size10 == 6: 
+                    new_seq = seqprom[ini35:end10]
+                    new_ini = ini+ini35+1
+                    new_end = ini+end10
+                else: 
+                    new_seq = seqprom[ini10:end10]
+                    new_ini = ini+ini10+1
+                    new_end = ini+end10
+            new_pos = '('+str(new_ini)+'..'+str(new_end)+')'
+        else:
+            if host == 0: 
+                new_seq = seqprom[iniphage:endphage]
+                new_ini = ini-endphage+1
+                new_end = ini-iniphage
+            else:
+                if size10 == 6: 
+                    new_seq = seqprom[ini35:end10]
+                    new_ini = ini-end10+1
+                    new_end = ini-ini35
+                else: 
+                    new_seq = seqprom[ini10:end10]
+                    new_ini = ini-end10+1
+                    new_end = ini-ini10
+            new_pos = 'complement('+str(new_ini)+'..'+str(new_end)+')'
+        if size10 == 6:
+            size10_6 = 1
+        else: 
+            size10_6 = 0
+        if size35 == 6:
+            size35_6 = 1
+            size35_7 = 0
+            size35_9 = 0
+            size35_14 = 0
+        elif size35 == 7:
+            size35_6 = 0
+            size35_7 = 1
+            size35_9 = 0
+            size35_14 = 0
+        elif size35 == 9:
+            size35_6 = 0
+            size35_7 = 0
+            size35_9 = 1
+            size35_14 = 0
+        else:
+            size35_6 = 0
+            size35_7 = 0
+            size35_9 = 0
+            size35_14 = 1
+        if sizephage == 23:
+            sizephage_23 = 1
+            sizephage_21 = 0
+            sizephage_32 = 0
+        elif sizephage == 21:
+            sizephage_23 = 0
+            sizephage_21 = 1
+            sizephage_32 = 0
+        elif sizephage == 32:
+            sizephage_23 = 0
+            sizephage_21 = 0
+            sizephage_32 = 1
+        else:
+            sizephage_23 = 0
+            sizephage_21 = 0
+            sizephage_32 = 0
+        if family == 'Podoviridae': 
+            Podo = 1
+            Sipho = 0
+            Myo = 0
+        elif family == 'Siphoviridae':
+            Podo = 0
+            Sipho = 1
+            Myo = 0
+        elif family == 'Myoviridae':
+            Podo = 0
+            Sipho = 0
+            Myo = 1
+        else: 
+            Podo = 0
+            Sipho = 0
+            Myo = 0
+        if bacteria == 'Bacillus':
+            bac = [1,0,0,0,0]
+        elif bacteria == 'Escherichia coli':
+            bac = [0,1,0,0,0]
+        elif bacteria == 'Klebsiella':
+            bac = [0,0,1,0,0]
+        elif bacteria == 'Pectobacterium':
+            bac = [0,0,0,1,0]
+        elif bacteria == 'Cronobacter':
+            bac = [0,0,0,0,1]
+        else:
+            bac = [0,0,0,0,0]
+        if lifecycle == 'virulent':  tp = 0
+        else: tp = 1
+        fe = free_energy(str(seqprom))
+        AT = freq_base(str(seqprom))
+        linha = [score10, score10_8, score35, dist, scorephage, fe, AT, host,size10_6,
+                 size35_6, size35_7, size35_9, size35_14,
+                 sizephage_23, sizephage_21,
+                 sizephage_32, tg, Podo, Sipho, Myo,tp]
+        linha.extend(bac)
+        tudo.append(linha)
+        linha2 = [new_pos,str(new_seq), host, size10_6,
+                 score10, score10_8, size35_6, size35_7, size35_9,size35_14, 
+                 score35, dist, sizephage_23, sizephage_21,
+                 sizephage_32, scorephage, tg, Podo, Sipho, Myo,tp, fe, AT]
+        linha2.extend(bac)
+        tudo2.append(linha2)
+    df_test = pd.DataFrame(tudo, index=scores_test.index, 
+                          columns = ['score10', 'score10_8','score35', 'dist35_10',
+                                     'scorephage','fe', 'freqAT',
+                                     'host','size10',
+                                     'size35_6', 'size35_7','size35_9','size35_14',
+                                     'sizephage_23', 
+                                     'sizephage_21', 'sizephage_32',
+                                     'TG', 'Podo', 'Sipho', 'Myo', 'tp',
+                                     'Bacillus', 'EColi', 
+                                     'Pectobacterium','Klebsiella', 'Cronobacter'])
+    df_INFO = pd.DataFrame(tudo2, index=scores_test.index, 
+                          columns = ['Positions','Promoter Sequence','host','size10',
+                                     'score10', 'score10_8',
+                                     'size35_6', 'size35_7','size35_9','size35_14',
+                                     'score35', 'dist35_10','sizephage_23', 
+                                     'sizephage_21', 'sizephage_32',
+                                     'scorephage', 'TG', 'Podo', 'Sipho', 'Myo', 'tp','fe', 'freqAT',
+                                     'EColi', 'Salmonella', 
+                                     'Pectobacterium','Cronobacter', 'Streptococcus'])
+    return df_test,df_INFO
+
+
+def get_predictions(scaler_file,model_file,test,df_testinfo,threshold):
+    from sklearn.externals import joblib
+    scaler = joblib.load(scaler_file)
+    model = joblib.load(model_file)
+    feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7])
+    TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1)
+    scores = model.predict_proba(TEST_scaled)
+    pos_scores = np.empty((TEST_scaled.shape[0],0), float)
+    for x in scores: pos_scores = np.append(pos_scores,x[1])
+    try: positive_indexes = np.nonzero(pos_scores>float(threshold))[0] #escolher os positivos, podia ser escolher com score > x
+    except ValueError: return 'The threshold value is not a float'
+    else:
+        if len(positive_indexes) == 0: return None
+        else:
+            positive_windows = TEST_scaled.index[positive_indexes]
+            INFO = df_testinfo.loc[positive_windows,['Positions','Promoter Sequence']]
+            promoter_type = []
+            for x in df_testinfo.loc[positive_windows,'host'].tolist():
+                if x == 0: promoter_type.append('phage')
+                else: promoter_type.append('host')
+            INFO['Type'] = promoter_type
+            INFO['Scores'] = np.around(pos_scores[positive_indexes],decimals=3)
+            INFO.index = positive_windows
+            return INFO
+
+
+def get_finaldf(test,rec):
+    new_df = test.groupby(['Positions'])
+    groups = list(new_df.groups.keys())
+    for i in range(len(groups)-1):
+        for j in range(i, len(groups)):
+            if 'complement' in groups[i]: inii = int(groups[i][11:].split('..')[0])
+            else: inii = int(groups[i][1:].split('..')[0])
+            if 'complement' in groups[j]: inij = int(groups[j][11:].split('..')[0])
+            else: inij = int(groups[j][1:].split('..')[0])
+            if inij < inii:
+                temp = groups[i]
+                groups[i] = groups[j]
+                groups[j] = temp
+    new_inds = []
+    for g in groups:
+        inds = new_df.groups[g]
+        if len(inds) == 1: new_inds.append(inds[0])
+        else:
+            maxi = max(new_df.get_group(g)['Scores'])
+            i = new_df.groups[g][new_df.get_group(g)['Scores']==maxi][0]
+            new_inds.append(i)
+    
+    output = test.loc[new_inds,:]
+    strands = []
+    new_pos = []
+    old_pos = output['Positions'].tolist()
+    
+    from Bio.SeqFeature import SeqFeature, FeatureLocation
+    feats = rec.features
+    for ind, row in output.iterrows():
+        pos = row['Positions']
+        if 'complement' in pos: 
+            strands.append('-')
+            new_pos.append(pos[10:])
+            ini,end= pos[11:-1].split('..')
+            new_loc = FeatureLocation(int(ini),int(end),strand=-1)
+        else: 
+            strands.append('+')
+            new_pos.append(pos)
+            ini,end= pos[1:-1].split('..')
+            new_loc = FeatureLocation(int(ini),int(end),strand=1)
+        feat = SeqFeature(new_loc, type='regulatory',qualifiers={'regulatory_class':['promoter'], 'note=':['predicted by PhagePromoter']})
+        feats.append(feat)            
+            
+    output.insert(loc=0, column='Strand', value=strands)
+    output['Positions'] = new_pos
+    output.to_html('output.html',index=False, justify='center')
+    recs = []
+    i = 0
+    for ind,row in output.iterrows():
+        s = Seq(row['Promoter Sequence'])
+        posis = old_pos[i]
+        typ = row['Type']
+        score = row['Scores']
+        sq = SeqRecord(seq=s, id=ind, description=typ+' '+posis+' score='+str(score))
+        recs.append(sq)
+        i += 1
+    SeqIO.write(recs, 'output.fasta','fasta')
+    new_rec = rec
+    new_rec.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
+    new_feats = sorted(feats, key=lambda x: x.location.start)
+    new_rec.features = new_feats
+    SeqIO.write(new_rec,'output.gb','genbank')
+
+if __name__== "__main__":
+    
+    import sys
+    import os
+    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    
+    gen_format = sys.argv[1]
+    genome_file = sys.argv[2]
+    both = sys.argv[3]
+    threshold = sys.argv[4]
+    family = sys.argv[5]
+    host = sys.argv[6]
+    phage_type = sys.argv[7]
+    model = sys.argv[8]
+    '''
+    gen_format = 'genbank'
+    genome_file = 'test-data/NC_015264.gb'
+    both = False
+    threshold = '0.50'
+    family = 'Podoviridae'
+    host = 'Pseudomonas'
+    phage_type = 'virulent'
+    model = 'SVM2400'
+    #model = 'ANN1600'
+    '''
+    
+    rec = SeqIO.read(genome_file, gen_format)
+    test_windows = get_testseqs65(gen_format, genome_file,both)
+    try: score_test,dic_window = get_testScores(__location__,test_windows)
+    except IndexError: print('Error. Input sequence can only have A,C,G or T')
+    else:
+        df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)
+        if model == 'ANN1600':
+            scaler_file = os.path.join(__location__, 'scaler1600.sav')
+            model_file = os.path.join(__location__, 'model1600.sav')
+            preds =  get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)
+            if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
+            elif type(preds) == str: print(preds)
+            else: output = get_finaldf(preds,rec)
+        else:
+            scaler_file = os.path.join(__location__, 'scaler2400.sav')
+            model_file = os.path.join(__location__, 'model2400.sav')
+            new_df_test = df_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,13,14,16,17,19,20,22,24,25]]
+            preds =  get_predictions(scaler_file, model_file, new_df_test,df_testinfo,threshold)
+            if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
+            elif type(preds) == str: print(preds)
+            else: output = get_finaldf(preds,rec)
+        
diff -r 5acc4fa8b62d -r a455e43048cd phagepromoter.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phagepromoter.xml	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,120 @@
+<tool id="get_proms" name="PhagePromoter" version="0.1.0">
+    <description>
+Get promoters of phage genomes
+    </description>
+    <requirements>
+        <requirement type="package">biopython</requirement>
+        <requirement type="package">scikit-learn</requirement>
+        <requirement type="package">numpy</requirement>
+        <requirement type="package">pandas</requirement>
+    </requirements>
+    <command detect_errors="exit_code" interpreter="python3"><![CDATA[ 
+	phagepromoter.py "$input_type.genome_format" "$genome" "$both" "$threshold" "$family" "$bacteria"  "$lifecycle"
+ "$adv.model" ]]>
+    </command>
+    <inputs>
+	<conditional name="input_type">
+           <param type="select" name="genome_format" label='file format'>
+              <option value="genbank" selected="yes">genbank</option>
+              <option value="fasta">fasta</option>
+	   </param>
+	   <when value="genbank">
+              <param type="data" name="genome" format="genbank" label='genome'/>
+	   </when>
+	   <when value="fasta">
+              <param type="data" name="genome" format="fasta" label='genome'/>
+	   </when>
+        </conditional>
+        <param type="boolean" name="both" label='Search both strands' checked="false" truevalue="-both" falsevalue="" />
+	<param name="threshold" type="float" value="0.50" label="Threshold" help="Probabilty of being a promoter (float between 0 and 1)" />
+        <param type="select" name="family" label='Phage family'>
+	  <option value="Podoviridae" selected="yes">Podoviridae</option>
+	  <option value="Siphoviridae">Siphoviridae</option>
+	  <option value="Myoviridae">Myoviridae</option>
+	</param>
+        <param type="select" name="bacteria" label='Host bacteria Genus'>
+	  <option value="Escherichia coli" selected="yes">Escherichia coli</option>
+	  <option value="Salmonella">Salmonella</option>
+	  <option value="Pseudomonas">Pseudomonas</option>
+	  <option value="Yersinia">Yersinia</option>
+	  <option value="Morganella">Morganella</option>
+	  <option value="Cronobacter">Cronobacter</option>
+	  <option value="Staphylococcus">Staphylococcus</option>
+	  <option value="Streptococcus">Streptococcus</option>
+	  <option value="Lactococcus">Lactococcus</option>
+	  <option value="Streptomyces">Streptomyces</option>
+	  <option value="Klebsiella">Klebsiella</option>
+	  <option value="Bacillus">Bacillus</option>
+	  <option value="Pectobacterium">Pectobacterium</option>
+	  <option value="other">other</option>
+	</param>
+        <param type="select" name="lifecycle" label='Phage type'>
+	  <option value="virulent" selected="yes">virulent</option>
+	  <option value="temperate">temperate</option>
+	</param>
+	<section name = 'adv' title= 'Advanced Options' expanded = 'False'>
+		<param type = "select" name="model" label="Model">
+		   <option value="SVM2400" selected="yes">SVM2400</option>
+		   <option value="ANN1600">ANN1600</option>
+		</param>
+	</section>
+    </inputs>
+    <outputs>
+        <data name="output1" format="html" from_work_dir="output.html" />
+        <data name="output2" format="fasta" from_work_dir="output.fasta" />
+        <data name="output3" format="genbank" from_work_dir="output.gb" />		
+    </outputs>
+    <tests>
+        <test>
+	        <param name="genome_format" value="genbank"/>
+            <param name="genome" value="NC_015264.gb"/>
+            <param name="both" value="False"/>
+	        <param name="threshold" value="0.50"/>
+            <param name="family" value="Podoviridae"/>
+            <param name="bacteria" value="Pseudomonas"/>
+            <param name="lifecycle" value="virulent"/>
+			<param name="model" value="SVM2400"/>
+            <output name="output1" file="output.html"/>
+            <output name="output2" file="output.fasta"/>
+            <output name="output3" file="output.gb"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+===============
+PhagePromoter
+===============
+
+Get promoters of phage genomes
+
+PhagePromoter is a python script that predicts promoter sequences in phage genomes, using machine learning models. Two different datasets were used to developed two models: the ANN model was built using a dataset with 26 features and 2400 examples (800 positives and 1600 negatives) and the SVM model was created using a dataset with 19 features and 3200 examples (800 positives and 2400 negatives). 
+Each example represents a sequence of 65 base pairs of a phage genome. The positive examples correspond to phage sequences already identified as promoters.  
+
+**Inputs:**
+
+* genome format: fasta vs genbank (default); 
+* genome file: acepts both GenBank and FASTA formats;
+* both strands: yes or no (default). Allows the search only in the direct strand or in both DNA strands;
+* threshold: represents the probability of the test sequence being a promoter (a float between 0 and 1, default=0.50). For example, if threshold=0.90, the model will only return predicted sequences with more than 90% probability of being a promoter. The larger the genome, the higher the threshold should be. 
+* Family: The family of the testing phage - Podoviridae (default), Siphoviridae or Myoviridae;
+* Host: The host of the phage. The training dataset include the following hosts: Bacillus, Escherichia coli (default), Salmonella, Pseudomonas, Yersinia, Klebsiella, Pectobacterium, Morganella, Cronobacter, Staphylococcus, Streptococcus, Streptomyces, Lactococcus. If the testing phage has a different host, select the option 'other'.
+* Phage type: The type of the phage, according to its lifecycle: virulent or temperate;
+
+**Advanced options:**
+
+* Model: the user can choose which model to run: the SVM model (default) or the ANN model. The SVM model uses more negative data, so it will return less promoters but with a higher probability of being real promoters. However, it can fail to detect some of the real promoters. On the other hand, the ANN model will predict more promoters, so it can identify more real promoters, but it is expected to predict more false negatives. 
+
+**Outputs:**
+
+This tool outputs two files: a FASTA file and a table in HTML, with the locations, sequence, score and type (recognized by host or phage RNAP) of the predicted promoters.
+In addition, the tool will output a GenBank file with the predicted promoters as features. 
+
+**Requirements:**
+
+* Biopython
+* Sklearn 
+* Numpy
+* Pandas  
+
+    ]]> </help>
+</tool>
diff -r 5acc4fa8b62d -r a455e43048cd pssm10_6.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm10_6.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-3.24	1.93	-0.34	1.38	1.43	-3.05	
+-2.14	-4.24	-1.03	-1.44	-1.19	-4.05	
+-2.29	-4.46	-1.12	-1.44	-1.53	-3.59	
+1.79	-3.59	1.17	-0.61	-0.96	1.9	
diff -r 5acc4fa8b62d -r a455e43048cd pssm10_8.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm10_8.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+0.09	1.72	-4.49	1.92	1.95	1.95	-4.49	1.92	
+-1.68	-1.32	-4.49	-4.49	-4.49	-4.49	-3.49	-4.49	
+-0.79	-2.49	-4.49	-2.91	-4.49	-4.49	-2.49	-4.49	
+1.03	-2.91	1.95	-4.49	-4.49	-4.49	1.88	-2.91	
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_6.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_6.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-2.84	-2.12	-2.4	1.49	-0.74	1.16	
+-2.95	-4.65	-2.56	-0.26	1.13	-1.56	
+-4.33	-2.48	1.73	-3.33	-1.69	-1.65	
+1.88	1.83	-1.65	-1.95	-0.14	0.15	
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_9.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_9.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+0.93	0.79	-0.65	-1.87	-0.14	-1.46	-1.14	-0.14	1.13	
+-1.46	-1.87	-1.46	-2.46	1.35	-3.46	-2.46	-0.14	-0.65	
+-0.87	-1.87	-1.87	1.79	-1.87	-1.87	-3.46	-1.46	-1.87	
+0.24	0.79	1.45	-3.46	-1.87	1.71	1.71	0.86	-0.14	
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_cbb.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_cbb.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-2.46	-2.46	1.45	1.79	1.79	-2.46	0.13	
+-2.46	-2.46	-2.46	-2.46	-2.46	1.54	-2.46	
+-2.46	1.79	-0.14	-2.46	-2.46	-2.46	1.24	
+1.79	-2.46	-2.46	-2.46	-2.46	-0.46	-1.46	
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_lb.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_lb.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-0.81	-0.81	-0.81	-0.81	-0.81	0.19	-0.81	-0.81	0.78	0.19	-0.81	-0.81	-0.81	-0.81	
+-0.81	-0.81	-0.81	1.19	-0.81	-0.81	-0.81	0.19	-0.81	0.19	-0.81	-0.81	-0.81	1.19	
+-0.81	-0.81	1.19	-0.81	1.19	-0.81	0.78	-0.81	0.19	-0.81	-0.81	-0.81	1.19	-0.81	
+1.19	1.19	-0.81	-0.81	-0.81	0.78	0.19	0.78	-0.81	0.19	1.19	1.19	-0.81	-0.81	
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_mu.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_mu.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-1.17	-0.17	1.15	-1.17	1.15	1.15	-1.17	-0.17	0.42	-1.17	0.42	-1.17	0.83	-1.17	
+0.83	0.83	-1.17	-0.17	-1.17	-1.17	0.83	1.15	0.83	1.42	-1.17	0.42	-1.17	0.83	
+-0.17	-1.17	-0.17	-1.17	-1.17	-0.17	-1.17	-1.17	-1.17	-1.17	0.42	0.42	-0.17	-1.17	
+-0.17	-0.17	-1.17	1.15	-0.17	-1.17	0.42	-1.17	-1.17	-1.17	-0.17	-0.17	-0.17	0.42	
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_t4.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_t4.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-2.43	-3.43	-2.43	-2.43	1.86	-2.43	1.33	
+-3.43	-3.43	-3.43	-3.43	-3.43	1.86	-3.43	
+1.82	-3.43	-2.43	-3.43	-3.43	-3.43	-3.43	
+-2.43	1.9	1.82	1.86	-2.43	-3.43	0.38	
diff -r 5acc4fa8b62d -r a455e43048cd pssm_21.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm_21.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-2.0	-1.0	-0.42	-2.0	-2.0	-2.0	-2.0	-2.0	-2.0	-2.0	-2.0	1.7	-2.0	-1.0	-1.0	-2.0	-2.0	-2.0	-2.0	0.81	0.58	
+1.17	1.46	-2.0	-2.0	-1.0	-2.0	-1.0	-2.0	1.7	1.7	-0.42	-2.0	1.7	-2.0	0.81	1.7	1.7	-2.0	-2.0	0.81	0.81	
+-0.42	-2.0	1.46	1.7	1.58	-2.0	-2.0	1.7	-2.0	-2.0	1.46	-2.0	-2.0	-1.0	-0.42	-2.0	-2.0	-2.0	-2.0	-2.0	-1.0	
+-0.42	-1.0	-2.0	-2.0	-2.0	1.7	1.58	-2.0	-2.0	-2.0	-2.0	-2.0	-2.0	1.46	0.0	-2.0	-2.0	1.7	1.7	-2.0	-2.0	
diff -r 5acc4fa8b62d -r a455e43048cd pssm_23.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm_23.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+0.07	1.71	1.18	-0.18	1.65	0.6	-1.86	0.82	-2.86	-0.79	-5.18	1.88	-1.86	-4.18	1.69	0.97	1.26	-0.05	-0.79	0.07	1.11	0.28	1.25	
+-0.32	-2.86	-2.86	-4.18	-2.86	0.49	0.6	0.52	1.68	-2.37	1.88	-1.86	1.84	-1.09	-2.86	-3.18	-1.18	-5.18	-0.54	-1.86	-1.86	-1.48	-1.09	
+-3.59	-2.86	-3.59	-4.18	-3.18	-0.32	0.95	-0.86	-1.37	-1.59	-1.86	-5.18	-3.18	-5.18	-2.59	-3.59	-1.18	1.56	1.41	1.3	0.6	1.16	-0.18	
+1.05	-1.18	0.6	1.59	-0.72	-1.86	-1.86	-2.01	-1.86	1.53	-5.18	-5.18	-5.18	1.79	-1.09	0.89	-0.48	-4.18	-3.59	-2.18	-4.18	-2.37	-1.86	
diff -r 5acc4fa8b62d -r a455e43048cd pssm_27.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm_27.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-3.09	-2.09	1.67	-1.5	-1.09	-0.28	-3.09	-3.09	-2.09	-3.09	-3.09	-3.09	1.87	-3.09	-3.09	-3.09	-3.09	-3.09	-1.5	-3.09	-3.09	-2.09	-3.09	1.72	1.56	1.5	-1.5	
+1.82	-3.09	-3.09	1.67	1.56	1.3	-1.09	-0.5	1.16	1.67	1.67	-1.09	-3.09	1.87	-3.09	1.87	1.67	-0.77	-3.09	0.82	1.16	-0.28	0.91	-2.09	-2.09	-0.5	0.08	
+-3.09	1.82	-2.09	-2.09	-3.09	-0.77	0.08	1.16	0.5	-3.09	-0.77	-2.09	-3.09	-3.09	-2.09	-3.09	-1.09	1.67	1.77	1.0	0.61	-3.09	-3.09	-1.5	-3.09	-3.09	-0.5	
+-2.09	-3.09	-1.09	-2.09	-1.09	-3.09	1.23	-0.09	-3.09	-0.77	-3.09	1.67	-3.09	-3.09	1.82	-3.09	-2.09	-3.09	-3.09	-3.09	-3.09	1.5	0.91	-3.09	-0.5	-1.5	0.91	
diff -r 5acc4fa8b62d -r a455e43048cd pssm_32.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm_32.txt	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+1.65	-1.81	-1.81	-1.81	-1.81	1.65	-1.81	1.65	-1.81	-1.81	0.78	-1.81	-1.81	-1.81	-1.81	1.65	-1.81	-1.81	-1.81	-1.81	-1.81	-1.81	-1.81	-1.81	-1.81	1.65	-1.81	-1.81	-1.81	-1.81	1.65	1.65	
+-1.81	-1.81	1.65	1.65	-1.81	-1.81	-1.81	-1.81	-1.81	1.65	-1.81	-1.81	1.65	1.65	-1.81	-1.81	1.19	-1.81	-1.81	-1.81	-1.81	0.78	-1.81	1.65	-1.81	-1.81	-1.81	-1.81	-1.81	-1.81	-1.81	-1.81	
+-1.81	1.65	-1.81	-1.81	-1.81	-1.81	-1.81	-1.81	1.65	-1.81	0.78	-1.81	-1.81	-1.81	-0.81	-1.81	-1.81	1.51	1.65	1.65	1.65	-1.81	1.65	-1.81	-1.81	-1.81	-1.81	1.65	-1.81	1.65	-1.81	-1.81	
+-1.81	-1.81	-1.81	-1.81	1.65	-1.81	1.65	-1.81	-1.81	-1.81	-1.81	1.65	-1.81	-1.81	1.51	-1.81	0.19	-0.81	-1.81	-1.81	-1.81	0.78	-1.81	-1.81	1.65	-1.81	1.65	-1.81	1.65	-1.81	-1.81	-1.81	
diff -r 5acc4fa8b62d -r a455e43048cd scaler1600.sav
Binary file scaler1600.sav has changed
diff -r 5acc4fa8b62d -r a455e43048cd scaler2400.sav
Binary file scaler2400.sav has changed
diff -r 5acc4fa8b62d -r a455e43048cd tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,6 @@
+<tool_dependency>
+     <package name="biopython"></package>
+    <package name="numpy" ></package>
+    <package name="pandas"></package>
+    <package name="scikit-learn"></package>
+</tool_dependency>