# HG changeset patch
# User martasampaio
# Date 1555772974 14400
# Node ID a455e43048cd028d0ae97f2213a564de662c68df
# Parent 5acc4fa8b62d7a97f8354430e07b7c4abedb95ad
Uploaded
diff -r 5acc4fa8b62d -r a455e43048cd README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,29 @@
+===============
+PhagePromoter
+===============
+
+Get promoter of phage genomes
+
+PhagePromoter is a python script that predicts promoter sequences in phage genomes, using a machine learning SVM model. This model was built from a train dataset with 19 features and 3200 examples (800 positives and 2400 negatives), each representing a 65 bp sequence of a phage genome. The positive cases represent the phage sequences that are already identified as promoters.
+
+**Inputs:**
+
+* genome format: fasta vs genbank;
+* genome file: acepts both genbank and fasta formats;
+* both strands (yes or no): allows the search in both DNA strands;
+* threshold: represents the probability of the test sequence be a promoter (float between 0 and 1)"
+* family: The family of the testing phage - Podoviridae, Siphoviridae or Myoviridae;
+* Bacteria: The host of the phage. The train dataset include the following hosts: Bacillus, EColi, Salmonella, Pseudomonas, Yersinia, Klebsiella, Pectobacterium, Morganella, Cronobacter, Staphylococcus, Streptococcus, Streptomyces, Lactococcus. If the testing phage has a different host, select the option 'other', and it is recommended the use of a higher threshold value for more accurate results.
+* phage type: The type of the phage, according to its lifecycle: virulent or temperate;
+
+**Outputs:**
+This tool outputs two files: a FASTA file and a table in HTML, with the locations, sequence, score and type (recognized by host or phage RNAP) of the predicted promoters.
+
+**Requirements:**
+
+* Biopython
+* Sklearn
+* Numpy
+* Pandas
+
+
diff -r 5acc4fa8b62d -r a455e43048cd auxiliar.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/auxiliar.py Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun May 27 17:37:09 2018
+
+@author: Marta
+"""
+
+
+#get the phage host from the file 'bacteria.xlsx'
+def get_bacteria(file):
+ import pandas as pd
+ df = pd.read_excel(file,header=0,index_col=0)
+ bacteria = {}
+ for ind,row in df.iterrows():
+ bac = row['Bacteria']
+ bacteria[ind] = bac
+ return bacteria
+
+#get the phage family from the file 'family.xlsx'
+def get_families(file):
+ import pandas as pd
+ df = pd.read_excel(file,header=0,index_col=0)
+ families = {}
+ for ind,row in df.iterrows():
+ fam = row['Family']
+ families[ind] = fam
+ return families
+
+#get phage lifecycle from the file 'lifecycle.xlsx'
+def get_lifecycle(file):
+ import pandas as pd
+ df = pd.read_excel(file,header=0,index_col=0)
+ types = {}
+ for ind,row in df.iterrows():
+ lc = row['lifecycle']
+ types[ind] = lc
+ return types
+
+#reads a file with a PSSM and return the max possible score of that PSSM
+def get_max_pssm(file_pssm):
+ from Bio.Alphabet import IUPAC
+ from Bio.motifs import matrix
+ m = []
+ fic = open(file_pssm,'r')
+ rf = fic.readline()
+ while rf:
+ new_l = []
+ l = rf.strip().split('\t')
+ for val in l:
+ x = float(val)
+ new_l.append(x)
+ m.append(new_l)
+ rf = fic.readline()
+ a = IUPAC.unambiguous_dna
+ dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]}
+ pssm = matrix.PositionSpecificScoringMatrix(a,dic)
+ return pssm.max
+
+#reads a file with a PSSM and returns a list of scores in all positions of the sequence
+#returns the score divided by the maximum possible value
+def get_scores(file_pssm, seq):
+ from Bio.Alphabet import IUPAC
+ from Bio.motifs import matrix
+ maxi = get_max_pssm(file_pssm)
+ m = []
+ fic = open(file_pssm,'r')
+ rf = fic.readline()
+ while rf:
+ new_l = []
+ l = rf.strip().split('\t')
+ for val in l:
+ x = float(val)
+ new_l.append(x)
+ m.append(new_l)
+ rf = fic.readline()
+ a = IUPAC.unambiguous_dna
+ dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]}
+ pssm = matrix.PositionSpecificScoringMatrix(a,dic)
+ scores = []
+ positions = []
+ a = IUPAC.unambiguous_dna
+ seq.alphabet = a
+ for pos, score in pssm.search(seq, both=False,threshold=-50):
+ scores.append(score/maxi)
+ positions.append(pos)
+ return scores,positions
+
+#returns the frequencia of A and T bases in a sequence
+def freq_base(seq):
+ A = seq.count('A')
+ T = seq.count('T')
+ AT = A+T
+ return AT
+
+#returns the free energy value of that sequence
+def free_energy(seq):
+ dic1 = {'AA':-1.00,
+ 'TT':-1.00,
+ 'AT':-0.88,
+ 'TA':-0.58,
+ 'CA':-1.45,
+ 'AC':-1.44,
+ 'GG':-1.84,
+ 'CC':-1.84,
+ 'GA':-1.30,
+ 'AG':-1.28,
+ 'TC':-1.30,
+ 'CT':-1.28,
+ 'TG':-1.45,
+ 'GT':-1.44,
+ 'GC':-2.24,
+ 'CG':-2.17}
+ total = 0
+ i = 0
+ j = 1
+ while i < len(seq)-1:
+ dint = seq[i]+seq[j]
+ total += dic1[dint]
+ i += 1
+ j += 1
+ return total
\ No newline at end of file
diff -r 5acc4fa8b62d -r a455e43048cd model1600.sav
Binary file model1600.sav has changed
diff -r 5acc4fa8b62d -r a455e43048cd model2400.sav
Binary file model2400.sav has changed
diff -r 5acc4fa8b62d -r a455e43048cd phagepromoter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phagepromoter.py Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,570 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jul 19 13:45:05 2018
+
+@author: Marta
+"""
+
+from Bio import SeqIO
+import numpy as np
+import pandas as pd
+from auxiliar import free_energy,freq_base
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import IUPAC
+from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle
+
+#division of the test genome in sequences of 65 bp
+def get_testseqs65(form,fic,both=False):
+ ALL = []
+ indexes = []
+ a = 0
+ rec = SeqIO.read(fic,form)
+ genome = rec.seq
+ i = 0
+ j = 65
+ while j < len(genome):
+ s = genome[i:j]
+ ALL.append([1,i,j,s])
+ i += 20
+ j += 20
+ a += 1
+ indexes.append(rec.name+":"+str(a))
+ if both:
+ comp = genome.reverse_complement()
+ size = len(rec.seq)
+ i = 0
+ j = 65
+ while j < len(comp):
+ s = comp[i:j]
+ ALL.append([-1,size-j,size-i,s])
+ i += 20
+ j += 20
+ a += 1
+ indexes.append(rec.name+":"+str(a))
+ df = pd.DataFrame(ALL, index=indexes, columns=['strand','iniprom','endprom','seq'])
+ return df
+
+#calculate the scores of all sequences (similar to get_posScores and get_negScores)
+def get_testScores(loc,test):
+ scores = []
+ posis = []
+ sizes = []
+ dic = {}
+ for ind,row in test.iterrows():
+ _,window = ind.split(':')
+ strand = row['strand']
+ ini = row['iniprom']
+ end = row['endprom']
+ seq = row['seq']
+ pos = [ini,end,strand]
+ dic[window] = pos
+ s = seq
+ score10_6,pos10_6 = get_scores(os.path.join(loc,'pssm10_6.txt'), s)
+ maxi10_6 = get_max_pssm(os.path.join(loc,'pssm10_6.txt'))
+ score10_8,pos10_8 = get_scores(os.path.join(loc,'pssm10_8.txt'), s)
+ maxi10_8 = get_max_pssm(os.path.join(loc,'pssm10_8.txt'))
+ scores23,pos23 = get_scores(os.path.join(loc,'pssm_23.txt'), s)
+ maxi23 = get_max_pssm(os.path.join(loc,'pssm_23.txt'))
+ scores21,pos21 = get_scores(os.path.join(loc,'pssm_21.txt'), s)
+ maxi21 = get_max_pssm(os.path.join(loc,'pssm_21.txt'))
+ scores27,pos27 = get_scores(os.path.join(loc,'pssm_27.txt'), s)
+ maxi27 = get_max_pssm(os.path.join(loc,'pssm_27.txt'))
+ scores32,pos32 = get_scores(os.path.join(loc,'pssm_32.txt'), s)
+ maxi32 = get_max_pssm(os.path.join(loc,'pssm_32.txt'))
+ score23 = max(scores23)
+ score21 = max(scores21)
+ score27 = max(scores27)
+ score32 = max(scores32)
+ maxiphage = max(score23,score21,score27,score32)
+ if maxiphage == score23: phage_max = score23*maxi23
+ elif maxiphage == score21: phage_max = score21*maxi21
+ elif maxiphage == score27: phage_max = score27*maxi27
+ elif maxiphage == score32: phage_max = score32*maxi32
+ score35_6,pos35_6 = get_scores(os.path.join(loc,'pssm35_6.txt'), s)
+ maxi35_6 = get_max_pssm(os.path.join(loc,'pssm35_6.txt'))
+ score35_9,pos35_9 = get_scores(os.path.join(loc,'pssm35_9.txt'), s)
+ maxi35_9 = get_max_pssm(os.path.join(loc,'pssm35_9.txt'))
+ score35_t4,pos35_t4 = get_scores(os.path.join(loc,'pssm35_t4.txt'), s)
+ maxi35_t4 = get_max_pssm(os.path.join(loc,'pssm35_t4.txt'))
+ score35_cbb,pos35_cbb = get_scores(os.path.join(loc,'pssm35_cbb.txt'), s)
+ maxi35_cbb = get_max_pssm(os.path.join(loc,'pssm35_cbb.txt'))
+ score35_lb,pos35_lb = get_scores(os.path.join(loc,'pssm35_lb.txt'),s)
+ maxi35_lb = get_max_pssm(os.path.join(loc,'pssm35_lb.txt'))
+ score35_mu, pos35_mu = get_scores(os.path.join(loc,'pssm35_mu.txt'),s)
+ maxi35_mu = get_max_pssm(os.path.join(loc,'pssm35_mu.txt'))
+
+ dists6 = []
+ score6 = []
+ for p in pos10_6:
+ for a in range(14,22):
+ d = p-a-6
+ if d >= 0:
+ s10 = score10_6[p]
+ s35_6 = score35_6[d]
+ score6.append([s35_6,s10])
+ dists6.append([d,p])
+ dists9 = []
+ score9 = []
+ for p in pos10_6:
+ for a in range(11,14):
+ d = p-a-9
+ if d >= 0:
+ s10 = score10_6[p]
+ s35_9 = score35_9[d]
+ score9.append([s35_9,s10])
+ dists9.append([d,p])
+ distst4 = []
+ scoret4 = []
+ distscbb = []
+ scorecbb = []
+ for p in pos10_6:
+ for a in range(16,18):
+ d = p-a-7
+ if d >= 0:
+ s10 = score10_6[p]
+ s35_t4 = score35_t4[d]
+ s35_cbb = score35_cbb[d]
+ scoret4.append([s35_t4,s10])
+ distst4.append([d,p])
+ scorecbb.append([s35_cbb,s10])
+ distscbb.append([d,p])
+
+
+ distsmu = []
+ scoremu = []
+ for p in pos10_6:
+ d = p-16-14
+ if d >= 0:
+ s10 = score10_6[p]
+ s35_mu = score35_mu[d]
+ scoremu.append([s35_mu,s10])
+ distsmu.append([d,p])
+
+ distslb = []
+ scorelb = []
+ for p in pos10_6:
+ d = p-13-14
+ if d >= 0:
+ s10 = score10_6[p]
+ s35_lb = score35_lb[d]
+ scorelb.append([s35_lb,s10])
+ distslb.append([d,p])
+
+ soma6 = [sum(x) for x in score6]
+ soma9 = [sum(x) for x in score9]
+ somat4 = [sum(x) for x in scoret4]
+ somacbb = [sum(x) for x in scorecbb]
+ somamu = [sum(x) for x in scoremu]
+ somalb = [sum(x) for x in scorelb]
+
+ maxi6 = max(soma6)
+ maxi9 = max(soma9)
+ maxit4 = max(somat4)
+ maxicbb = max(somacbb)
+ maximu = max(somamu)
+ maxilb = max(somalb)
+ maxi_elems = max(maxi6,maxi9,maxit4,maxicbb,maxilb,maximu)
+
+ if maxi_elems == maxilb:
+ indmax = somalb.index(maxilb)
+ sc35 = scorelb[indmax][0]
+ sc10 = scorelb[indmax][1]
+ score_elems = [sc35,sc10]
+ posel = distslb[indmax]
+ size35 = 14
+ elems_maxi = sc35*maxi35_lb+sc10*maxi10_6
+ elif maxi_elems == maximu:
+ indmax = somamu.index(maximu)
+ sc35 = scoremu[indmax][0]
+ sc10 = scoremu[indmax][1]
+ score_elems = [sc35,sc10]
+ posel = distsmu[indmax]
+ size35 = 14
+ elems_maxi = sc35*maxi35_mu+sc10*maxi10_6
+ elif maxi_elems == maxi9:
+ indmax = soma9.index(maxi9)
+ sc35 = score9[indmax][0]
+ sc10 = score9[indmax][1]
+ score_elems = [sc35,sc10]
+ posel = dists9[indmax]
+ size35 = 9
+ elems_maxi = sc35*maxi35_9+sc10*maxi10_6
+ elif maxi_elems == maxit4:
+ indmax = somat4.index(maxit4)
+ sc35 = scoret4[indmax][0]
+ sc10 = scoret4[indmax][1]
+ score_elems = [sc35,sc10]
+ posel = distst4[indmax]
+ size35 = 7
+ elems_maxi = sc35*maxi35_t4+sc10*maxi10_6
+ elif maxi_elems == maxicbb:
+ indmax = somacbb.index(maxicbb)
+ sc35 = scorecbb[indmax][0]
+ sc10 = scorecbb[indmax][1]
+ score_elems = [sc35,sc10]
+ posel = distscbb[indmax]
+ size35 = 7
+ elems_maxi = sc35*maxi35_cbb+sc10*maxi10_6
+ else:
+ indmax = soma6.index(maxi6)
+ sc35 = score6[indmax][0]
+ sc10 = score6[indmax][1]
+ score_elems = [sc35,sc10]
+ posel = dists6[indmax]
+ size35 = 6
+ elems_maxi = sc35*maxi35_6+sc10*maxi10_6
+
+ if score23 == maxiphage:
+ phage_score = score23
+ posiphage = scores23.index(score23)
+ sizephage = 23
+ elif score21 == maxiphage:
+ phage_score = score21
+ posiphage = scores21.index(score21)
+ sizephage = 21
+ elif score27 == maxiphage:
+ phage_score = score27
+ posiphage = scores27.index(score27)
+ sizephage = 27
+ else:
+ phage_score = score32
+ posiphage = scores32.index(score32)
+ sizephage = 32
+
+ if elems_maxi >= max(score10_8)*maxi10_8:
+ i = posel[1]
+ ext = s[i-3:i-1]
+ if ext == 'TG': tg = 1
+ else: tg = 0
+ if elems_maxi > phage_max: host = 1
+ else:
+ host = 0
+ tg = 0
+ sc = max(score10_8)
+ end35 = posel[0]+size35
+ dist = posel[1]-end35
+ scores.append([host, score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
+ posis.append([posel[1],posel[0],posiphage])
+ sizes.append([6,size35,sizephage])
+ else:
+ host = 1
+ sc = max(score10_8)
+ i = score10_8.index(sc)
+ ext = s[i-3:i-1]
+ if ext == 'TG': tg = 1
+ else: tg = 0
+ if max(score10_8)*maxi10_8 > phage_max: host = 1
+ else:
+ host = 0
+ tg = 0
+ end35 = posel[0]+size35
+ dist = posel[1]-end35
+ scores.append([host,score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
+ posis.append([i,posel[0],posiphage])
+ sizes.append([8,size35,sizephage])
+ score = pd.DataFrame(scores, index=test.index, columns=['host','score10','score10_8','score35','score_phage','tg','dist','seq'])
+ posis = pd.DataFrame(posis, index=test.index, columns=['pos10','pos35','posphage'])
+ sizes = pd.DataFrame(sizes, index=test.index, columns=['size10','size35','size_phage'])
+ df_all = pd.concat([score,posis,sizes],axis=1)
+ return df_all,dic
+
+def create_dftest(scores_test,dic_window,family,bacteria,lifecycle):
+ tudo = []
+ tudo2 = []
+ for ind,row in scores_test.iterrows():
+ _,window = ind.split(':')
+ posis = dic_window[window]
+ strand=posis[2]
+ if strand == 1: ini=posis[0]
+ else: ini=posis[1]
+ seqprom = row['seq']
+ score10 = row['score10']
+ score10_8 = row['score10_8']
+ score35 = row['score35']
+ scorephage = row['score_phage']
+ size10 = row['size10']
+ size35 = row['size35']
+ sizephage = row['size_phage']
+ ini10 = row['pos10']
+ tg = row['tg']
+ host = row['host']
+ ini35 = row['pos35']
+ dist = row['dist']
+ end10=ini10+size10
+ iniphage = row['posphage']
+ endphage = iniphage+sizephage
+ if strand == 1:
+ if host == 0:
+ new_seq = seqprom[iniphage:endphage]
+ new_ini = ini+iniphage+1
+ new_end = ini+endphage
+ else:
+ if size10 == 6:
+ new_seq = seqprom[ini35:end10]
+ new_ini = ini+ini35+1
+ new_end = ini+end10
+ else:
+ new_seq = seqprom[ini10:end10]
+ new_ini = ini+ini10+1
+ new_end = ini+end10
+ new_pos = '('+str(new_ini)+'..'+str(new_end)+')'
+ else:
+ if host == 0:
+ new_seq = seqprom[iniphage:endphage]
+ new_ini = ini-endphage+1
+ new_end = ini-iniphage
+ else:
+ if size10 == 6:
+ new_seq = seqprom[ini35:end10]
+ new_ini = ini-end10+1
+ new_end = ini-ini35
+ else:
+ new_seq = seqprom[ini10:end10]
+ new_ini = ini-end10+1
+ new_end = ini-ini10
+ new_pos = 'complement('+str(new_ini)+'..'+str(new_end)+')'
+ if size10 == 6:
+ size10_6 = 1
+ else:
+ size10_6 = 0
+ if size35 == 6:
+ size35_6 = 1
+ size35_7 = 0
+ size35_9 = 0
+ size35_14 = 0
+ elif size35 == 7:
+ size35_6 = 0
+ size35_7 = 1
+ size35_9 = 0
+ size35_14 = 0
+ elif size35 == 9:
+ size35_6 = 0
+ size35_7 = 0
+ size35_9 = 1
+ size35_14 = 0
+ else:
+ size35_6 = 0
+ size35_7 = 0
+ size35_9 = 0
+ size35_14 = 1
+ if sizephage == 23:
+ sizephage_23 = 1
+ sizephage_21 = 0
+ sizephage_32 = 0
+ elif sizephage == 21:
+ sizephage_23 = 0
+ sizephage_21 = 1
+ sizephage_32 = 0
+ elif sizephage == 32:
+ sizephage_23 = 0
+ sizephage_21 = 0
+ sizephage_32 = 1
+ else:
+ sizephage_23 = 0
+ sizephage_21 = 0
+ sizephage_32 = 0
+ if family == 'Podoviridae':
+ Podo = 1
+ Sipho = 0
+ Myo = 0
+ elif family == 'Siphoviridae':
+ Podo = 0
+ Sipho = 1
+ Myo = 0
+ elif family == 'Myoviridae':
+ Podo = 0
+ Sipho = 0
+ Myo = 1
+ else:
+ Podo = 0
+ Sipho = 0
+ Myo = 0
+ if bacteria == 'Bacillus':
+ bac = [1,0,0,0,0]
+ elif bacteria == 'Escherichia coli':
+ bac = [0,1,0,0,0]
+ elif bacteria == 'Klebsiella':
+ bac = [0,0,1,0,0]
+ elif bacteria == 'Pectobacterium':
+ bac = [0,0,0,1,0]
+ elif bacteria == 'Cronobacter':
+ bac = [0,0,0,0,1]
+ else:
+ bac = [0,0,0,0,0]
+ if lifecycle == 'virulent': tp = 0
+ else: tp = 1
+ fe = free_energy(str(seqprom))
+ AT = freq_base(str(seqprom))
+ linha = [score10, score10_8, score35, dist, scorephage, fe, AT, host,size10_6,
+ size35_6, size35_7, size35_9, size35_14,
+ sizephage_23, sizephage_21,
+ sizephage_32, tg, Podo, Sipho, Myo,tp]
+ linha.extend(bac)
+ tudo.append(linha)
+ linha2 = [new_pos,str(new_seq), host, size10_6,
+ score10, score10_8, size35_6, size35_7, size35_9,size35_14,
+ score35, dist, sizephage_23, sizephage_21,
+ sizephage_32, scorephage, tg, Podo, Sipho, Myo,tp, fe, AT]
+ linha2.extend(bac)
+ tudo2.append(linha2)
+ df_test = pd.DataFrame(tudo, index=scores_test.index,
+ columns = ['score10', 'score10_8','score35', 'dist35_10',
+ 'scorephage','fe', 'freqAT',
+ 'host','size10',
+ 'size35_6', 'size35_7','size35_9','size35_14',
+ 'sizephage_23',
+ 'sizephage_21', 'sizephage_32',
+ 'TG', 'Podo', 'Sipho', 'Myo', 'tp',
+ 'Bacillus', 'EColi',
+ 'Pectobacterium','Klebsiella', 'Cronobacter'])
+ df_INFO = pd.DataFrame(tudo2, index=scores_test.index,
+ columns = ['Positions','Promoter Sequence','host','size10',
+ 'score10', 'score10_8',
+ 'size35_6', 'size35_7','size35_9','size35_14',
+ 'score35', 'dist35_10','sizephage_23',
+ 'sizephage_21', 'sizephage_32',
+ 'scorephage', 'TG', 'Podo', 'Sipho', 'Myo', 'tp','fe', 'freqAT',
+ 'EColi', 'Salmonella',
+ 'Pectobacterium','Cronobacter', 'Streptococcus'])
+ return df_test,df_INFO
+
+
+def get_predictions(scaler_file,model_file,test,df_testinfo,threshold):
+ from sklearn.externals import joblib
+ scaler = joblib.load(scaler_file)
+ model = joblib.load(model_file)
+ feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7])
+ TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1)
+ scores = model.predict_proba(TEST_scaled)
+ pos_scores = np.empty((TEST_scaled.shape[0],0), float)
+ for x in scores: pos_scores = np.append(pos_scores,x[1])
+ try: positive_indexes = np.nonzero(pos_scores>float(threshold))[0] #escolher os positivos, podia ser escolher com score > x
+ except ValueError: return 'The threshold value is not a float'
+ else:
+ if len(positive_indexes) == 0: return None
+ else:
+ positive_windows = TEST_scaled.index[positive_indexes]
+ INFO = df_testinfo.loc[positive_windows,['Positions','Promoter Sequence']]
+ promoter_type = []
+ for x in df_testinfo.loc[positive_windows,'host'].tolist():
+ if x == 0: promoter_type.append('phage')
+ else: promoter_type.append('host')
+ INFO['Type'] = promoter_type
+ INFO['Scores'] = np.around(pos_scores[positive_indexes],decimals=3)
+ INFO.index = positive_windows
+ return INFO
+
+
+def get_finaldf(test,rec):
+ new_df = test.groupby(['Positions'])
+ groups = list(new_df.groups.keys())
+ for i in range(len(groups)-1):
+ for j in range(i, len(groups)):
+ if 'complement' in groups[i]: inii = int(groups[i][11:].split('..')[0])
+ else: inii = int(groups[i][1:].split('..')[0])
+ if 'complement' in groups[j]: inij = int(groups[j][11:].split('..')[0])
+ else: inij = int(groups[j][1:].split('..')[0])
+ if inij < inii:
+ temp = groups[i]
+ groups[i] = groups[j]
+ groups[j] = temp
+ new_inds = []
+ for g in groups:
+ inds = new_df.groups[g]
+ if len(inds) == 1: new_inds.append(inds[0])
+ else:
+ maxi = max(new_df.get_group(g)['Scores'])
+ i = new_df.groups[g][new_df.get_group(g)['Scores']==maxi][0]
+ new_inds.append(i)
+
+ output = test.loc[new_inds,:]
+ strands = []
+ new_pos = []
+ old_pos = output['Positions'].tolist()
+
+ from Bio.SeqFeature import SeqFeature, FeatureLocation
+ feats = rec.features
+ for ind, row in output.iterrows():
+ pos = row['Positions']
+ if 'complement' in pos:
+ strands.append('-')
+ new_pos.append(pos[10:])
+ ini,end= pos[11:-1].split('..')
+ new_loc = FeatureLocation(int(ini),int(end),strand=-1)
+ else:
+ strands.append('+')
+ new_pos.append(pos)
+ ini,end= pos[1:-1].split('..')
+ new_loc = FeatureLocation(int(ini),int(end),strand=1)
+ feat = SeqFeature(new_loc, type='regulatory',qualifiers={'regulatory_class':['promoter'], 'note=':['predicted by PhagePromoter']})
+ feats.append(feat)
+
+ output.insert(loc=0, column='Strand', value=strands)
+ output['Positions'] = new_pos
+ output.to_html('output.html',index=False, justify='center')
+ recs = []
+ i = 0
+ for ind,row in output.iterrows():
+ s = Seq(row['Promoter Sequence'])
+ posis = old_pos[i]
+ typ = row['Type']
+ score = row['Scores']
+ sq = SeqRecord(seq=s, id=ind, description=typ+' '+posis+' score='+str(score))
+ recs.append(sq)
+ i += 1
+ SeqIO.write(recs, 'output.fasta','fasta')
+ new_rec = rec
+ new_rec.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
+ new_feats = sorted(feats, key=lambda x: x.location.start)
+ new_rec.features = new_feats
+ SeqIO.write(new_rec,'output.gb','genbank')
+
+if __name__== "__main__":
+
+ import sys
+ import os
+ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+ gen_format = sys.argv[1]
+ genome_file = sys.argv[2]
+ both = sys.argv[3]
+ threshold = sys.argv[4]
+ family = sys.argv[5]
+ host = sys.argv[6]
+ phage_type = sys.argv[7]
+ model = sys.argv[8]
+ '''
+ gen_format = 'genbank'
+ genome_file = 'test-data/NC_015264.gb'
+ both = False
+ threshold = '0.50'
+ family = 'Podoviridae'
+ host = 'Pseudomonas'
+ phage_type = 'virulent'
+ model = 'SVM2400'
+ #model = 'ANN1600'
+ '''
+
+ rec = SeqIO.read(genome_file, gen_format)
+ test_windows = get_testseqs65(gen_format, genome_file,both)
+ try: score_test,dic_window = get_testScores(__location__,test_windows)
+ except IndexError: print('Error. Input sequence can only have A,C,G or T')
+ else:
+ df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)
+ if model == 'ANN1600':
+ scaler_file = os.path.join(__location__, 'scaler1600.sav')
+ model_file = os.path.join(__location__, 'model1600.sav')
+ preds = get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)
+ if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
+ elif type(preds) == str: print(preds)
+ else: output = get_finaldf(preds,rec)
+ else:
+ scaler_file = os.path.join(__location__, 'scaler2400.sav')
+ model_file = os.path.join(__location__, 'model2400.sav')
+ new_df_test = df_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,13,14,16,17,19,20,22,24,25]]
+ preds = get_predictions(scaler_file, model_file, new_df_test,df_testinfo,threshold)
+ if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
+ elif type(preds) == str: print(preds)
+ else: output = get_finaldf(preds,rec)
+
diff -r 5acc4fa8b62d -r a455e43048cd phagepromoter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phagepromoter.xml Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,120 @@
+
+
+Get promoters of phage genomes
+
+
+ biopython
+ scikit-learn
+ numpy
+ pandas
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 5acc4fa8b62d -r a455e43048cd pssm10_6.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm10_6.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-3.24 1.93 -0.34 1.38 1.43 -3.05
+-2.14 -4.24 -1.03 -1.44 -1.19 -4.05
+-2.29 -4.46 -1.12 -1.44 -1.53 -3.59
+1.79 -3.59 1.17 -0.61 -0.96 1.9
diff -r 5acc4fa8b62d -r a455e43048cd pssm10_8.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm10_8.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+0.09 1.72 -4.49 1.92 1.95 1.95 -4.49 1.92
+-1.68 -1.32 -4.49 -4.49 -4.49 -4.49 -3.49 -4.49
+-0.79 -2.49 -4.49 -2.91 -4.49 -4.49 -2.49 -4.49
+1.03 -2.91 1.95 -4.49 -4.49 -4.49 1.88 -2.91
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_6.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_6.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-2.84 -2.12 -2.4 1.49 -0.74 1.16
+-2.95 -4.65 -2.56 -0.26 1.13 -1.56
+-4.33 -2.48 1.73 -3.33 -1.69 -1.65
+1.88 1.83 -1.65 -1.95 -0.14 0.15
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_9.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_9.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+0.93 0.79 -0.65 -1.87 -0.14 -1.46 -1.14 -0.14 1.13
+-1.46 -1.87 -1.46 -2.46 1.35 -3.46 -2.46 -0.14 -0.65
+-0.87 -1.87 -1.87 1.79 -1.87 -1.87 -3.46 -1.46 -1.87
+0.24 0.79 1.45 -3.46 -1.87 1.71 1.71 0.86 -0.14
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_cbb.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_cbb.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-2.46 -2.46 1.45 1.79 1.79 -2.46 0.13
+-2.46 -2.46 -2.46 -2.46 -2.46 1.54 -2.46
+-2.46 1.79 -0.14 -2.46 -2.46 -2.46 1.24
+1.79 -2.46 -2.46 -2.46 -2.46 -0.46 -1.46
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_lb.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_lb.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-0.81 -0.81 -0.81 -0.81 -0.81 0.19 -0.81 -0.81 0.78 0.19 -0.81 -0.81 -0.81 -0.81
+-0.81 -0.81 -0.81 1.19 -0.81 -0.81 -0.81 0.19 -0.81 0.19 -0.81 -0.81 -0.81 1.19
+-0.81 -0.81 1.19 -0.81 1.19 -0.81 0.78 -0.81 0.19 -0.81 -0.81 -0.81 1.19 -0.81
+1.19 1.19 -0.81 -0.81 -0.81 0.78 0.19 0.78 -0.81 0.19 1.19 1.19 -0.81 -0.81
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_mu.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_mu.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-1.17 -0.17 1.15 -1.17 1.15 1.15 -1.17 -0.17 0.42 -1.17 0.42 -1.17 0.83 -1.17
+0.83 0.83 -1.17 -0.17 -1.17 -1.17 0.83 1.15 0.83 1.42 -1.17 0.42 -1.17 0.83
+-0.17 -1.17 -0.17 -1.17 -1.17 -0.17 -1.17 -1.17 -1.17 -1.17 0.42 0.42 -0.17 -1.17
+-0.17 -0.17 -1.17 1.15 -0.17 -1.17 0.42 -1.17 -1.17 -1.17 -0.17 -0.17 -0.17 0.42
diff -r 5acc4fa8b62d -r a455e43048cd pssm35_t4.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm35_t4.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-2.43 -3.43 -2.43 -2.43 1.86 -2.43 1.33
+-3.43 -3.43 -3.43 -3.43 -3.43 1.86 -3.43
+1.82 -3.43 -2.43 -3.43 -3.43 -3.43 -3.43
+-2.43 1.9 1.82 1.86 -2.43 -3.43 0.38
diff -r 5acc4fa8b62d -r a455e43048cd pssm_21.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm_21.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-2.0 -1.0 -0.42 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 1.7 -2.0 -1.0 -1.0 -2.0 -2.0 -2.0 -2.0 0.81 0.58
+1.17 1.46 -2.0 -2.0 -1.0 -2.0 -1.0 -2.0 1.7 1.7 -0.42 -2.0 1.7 -2.0 0.81 1.7 1.7 -2.0 -2.0 0.81 0.81
+-0.42 -2.0 1.46 1.7 1.58 -2.0 -2.0 1.7 -2.0 -2.0 1.46 -2.0 -2.0 -1.0 -0.42 -2.0 -2.0 -2.0 -2.0 -2.0 -1.0
+-0.42 -1.0 -2.0 -2.0 -2.0 1.7 1.58 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 1.46 0.0 -2.0 -2.0 1.7 1.7 -2.0 -2.0
diff -r 5acc4fa8b62d -r a455e43048cd pssm_23.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm_23.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+0.07 1.71 1.18 -0.18 1.65 0.6 -1.86 0.82 -2.86 -0.79 -5.18 1.88 -1.86 -4.18 1.69 0.97 1.26 -0.05 -0.79 0.07 1.11 0.28 1.25
+-0.32 -2.86 -2.86 -4.18 -2.86 0.49 0.6 0.52 1.68 -2.37 1.88 -1.86 1.84 -1.09 -2.86 -3.18 -1.18 -5.18 -0.54 -1.86 -1.86 -1.48 -1.09
+-3.59 -2.86 -3.59 -4.18 -3.18 -0.32 0.95 -0.86 -1.37 -1.59 -1.86 -5.18 -3.18 -5.18 -2.59 -3.59 -1.18 1.56 1.41 1.3 0.6 1.16 -0.18
+1.05 -1.18 0.6 1.59 -0.72 -1.86 -1.86 -2.01 -1.86 1.53 -5.18 -5.18 -5.18 1.79 -1.09 0.89 -0.48 -4.18 -3.59 -2.18 -4.18 -2.37 -1.86
diff -r 5acc4fa8b62d -r a455e43048cd pssm_27.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm_27.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+-3.09 -2.09 1.67 -1.5 -1.09 -0.28 -3.09 -3.09 -2.09 -3.09 -3.09 -3.09 1.87 -3.09 -3.09 -3.09 -3.09 -3.09 -1.5 -3.09 -3.09 -2.09 -3.09 1.72 1.56 1.5 -1.5
+1.82 -3.09 -3.09 1.67 1.56 1.3 -1.09 -0.5 1.16 1.67 1.67 -1.09 -3.09 1.87 -3.09 1.87 1.67 -0.77 -3.09 0.82 1.16 -0.28 0.91 -2.09 -2.09 -0.5 0.08
+-3.09 1.82 -2.09 -2.09 -3.09 -0.77 0.08 1.16 0.5 -3.09 -0.77 -2.09 -3.09 -3.09 -2.09 -3.09 -1.09 1.67 1.77 1.0 0.61 -3.09 -3.09 -1.5 -3.09 -3.09 -0.5
+-2.09 -3.09 -1.09 -2.09 -1.09 -3.09 1.23 -0.09 -3.09 -0.77 -3.09 1.67 -3.09 -3.09 1.82 -3.09 -2.09 -3.09 -3.09 -3.09 -3.09 1.5 0.91 -3.09 -0.5 -1.5 0.91
diff -r 5acc4fa8b62d -r a455e43048cd pssm_32.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pssm_32.txt Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,4 @@
+1.65 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 1.65 -1.81 -1.81 0.78 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 1.65 1.65
+-1.81 -1.81 1.65 1.65 -1.81 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 -1.81 1.65 1.65 -1.81 -1.81 1.19 -1.81 -1.81 -1.81 -1.81 0.78 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81
+-1.81 1.65 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 0.78 -1.81 -1.81 -1.81 -0.81 -1.81 -1.81 1.51 1.65 1.65 1.65 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 1.65 -1.81 -1.81
+-1.81 -1.81 -1.81 -1.81 1.65 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 -1.81 1.51 -1.81 0.19 -0.81 -1.81 -1.81 -1.81 0.78 -1.81 -1.81 1.65 -1.81 1.65 -1.81 1.65 -1.81 -1.81 -1.81
diff -r 5acc4fa8b62d -r a455e43048cd scaler1600.sav
Binary file scaler1600.sav has changed
diff -r 5acc4fa8b62d -r a455e43048cd scaler2400.sav
Binary file scaler2400.sav has changed
diff -r 5acc4fa8b62d -r a455e43048cd tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Sat Apr 20 11:09:34 2019 -0400
@@ -0,0 +1,6 @@
+
+
+
+
+
+