# HG changeset patch # User martasampaio # Date 1555772974 14400 # Node ID a455e43048cd028d0ae97f2213a564de662c68df # Parent 5acc4fa8b62d7a97f8354430e07b7c4abedb95ad Uploaded diff -r 5acc4fa8b62d -r a455e43048cd README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,29 @@ +=============== +PhagePromoter +=============== + +Get promoter of phage genomes + +PhagePromoter is a python script that predicts promoter sequences in phage genomes, using a machine learning SVM model. This model was built from a train dataset with 19 features and 3200 examples (800 positives and 2400 negatives), each representing a 65 bp sequence of a phage genome. The positive cases represent the phage sequences that are already identified as promoters. + +**Inputs:** + +* genome format: fasta vs genbank; +* genome file: acepts both genbank and fasta formats; +* both strands (yes or no): allows the search in both DNA strands; +* threshold: represents the probability of the test sequence be a promoter (float between 0 and 1)" +* family: The family of the testing phage - Podoviridae, Siphoviridae or Myoviridae; +* Bacteria: The host of the phage. The train dataset include the following hosts: Bacillus, EColi, Salmonella, Pseudomonas, Yersinia, Klebsiella, Pectobacterium, Morganella, Cronobacter, Staphylococcus, Streptococcus, Streptomyces, Lactococcus. If the testing phage has a different host, select the option 'other', and it is recommended the use of a higher threshold value for more accurate results. +* phage type: The type of the phage, according to its lifecycle: virulent or temperate; + +**Outputs:** +This tool outputs two files: a FASTA file and a table in HTML, with the locations, sequence, score and type (recognized by host or phage RNAP) of the predicted promoters. + +**Requirements:** + +* Biopython +* Sklearn +* Numpy +* Pandas + + diff -r 5acc4fa8b62d -r a455e43048cd auxiliar.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/auxiliar.py Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun May 27 17:37:09 2018 + +@author: Marta +""" + + +#get the phage host from the file 'bacteria.xlsx' +def get_bacteria(file): + import pandas as pd + df = pd.read_excel(file,header=0,index_col=0) + bacteria = {} + for ind,row in df.iterrows(): + bac = row['Bacteria'] + bacteria[ind] = bac + return bacteria + +#get the phage family from the file 'family.xlsx' +def get_families(file): + import pandas as pd + df = pd.read_excel(file,header=0,index_col=0) + families = {} + for ind,row in df.iterrows(): + fam = row['Family'] + families[ind] = fam + return families + +#get phage lifecycle from the file 'lifecycle.xlsx' +def get_lifecycle(file): + import pandas as pd + df = pd.read_excel(file,header=0,index_col=0) + types = {} + for ind,row in df.iterrows(): + lc = row['lifecycle'] + types[ind] = lc + return types + +#reads a file with a PSSM and return the max possible score of that PSSM +def get_max_pssm(file_pssm): + from Bio.Alphabet import IUPAC + from Bio.motifs import matrix + m = [] + fic = open(file_pssm,'r') + rf = fic.readline() + while rf: + new_l = [] + l = rf.strip().split('\t') + for val in l: + x = float(val) + new_l.append(x) + m.append(new_l) + rf = fic.readline() + a = IUPAC.unambiguous_dna + dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]} + pssm = matrix.PositionSpecificScoringMatrix(a,dic) + return pssm.max + +#reads a file with a PSSM and returns a list of scores in all positions of the sequence +#returns the score divided by the maximum possible value +def get_scores(file_pssm, seq): + from Bio.Alphabet import IUPAC + from Bio.motifs import matrix + maxi = get_max_pssm(file_pssm) + m = [] + fic = open(file_pssm,'r') + rf = fic.readline() + while rf: + new_l = [] + l = rf.strip().split('\t') + for val in l: + x = float(val) + new_l.append(x) + m.append(new_l) + rf = fic.readline() + a = IUPAC.unambiguous_dna + dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]} + pssm = matrix.PositionSpecificScoringMatrix(a,dic) + scores = [] + positions = [] + a = IUPAC.unambiguous_dna + seq.alphabet = a + for pos, score in pssm.search(seq, both=False,threshold=-50): + scores.append(score/maxi) + positions.append(pos) + return scores,positions + +#returns the frequencia of A and T bases in a sequence +def freq_base(seq): + A = seq.count('A') + T = seq.count('T') + AT = A+T + return AT + +#returns the free energy value of that sequence +def free_energy(seq): + dic1 = {'AA':-1.00, + 'TT':-1.00, + 'AT':-0.88, + 'TA':-0.58, + 'CA':-1.45, + 'AC':-1.44, + 'GG':-1.84, + 'CC':-1.84, + 'GA':-1.30, + 'AG':-1.28, + 'TC':-1.30, + 'CT':-1.28, + 'TG':-1.45, + 'GT':-1.44, + 'GC':-2.24, + 'CG':-2.17} + total = 0 + i = 0 + j = 1 + while i < len(seq)-1: + dint = seq[i]+seq[j] + total += dic1[dint] + i += 1 + j += 1 + return total \ No newline at end of file diff -r 5acc4fa8b62d -r a455e43048cd model1600.sav Binary file model1600.sav has changed diff -r 5acc4fa8b62d -r a455e43048cd model2400.sav Binary file model2400.sav has changed diff -r 5acc4fa8b62d -r a455e43048cd phagepromoter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phagepromoter.py Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,570 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Jul 19 13:45:05 2018 + +@author: Marta +""" + +from Bio import SeqIO +import numpy as np +import pandas as pd +from auxiliar import free_energy,freq_base +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.Alphabet import IUPAC +from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle + +#division of the test genome in sequences of 65 bp +def get_testseqs65(form,fic,both=False): + ALL = [] + indexes = [] + a = 0 + rec = SeqIO.read(fic,form) + genome = rec.seq + i = 0 + j = 65 + while j < len(genome): + s = genome[i:j] + ALL.append([1,i,j,s]) + i += 20 + j += 20 + a += 1 + indexes.append(rec.name+":"+str(a)) + if both: + comp = genome.reverse_complement() + size = len(rec.seq) + i = 0 + j = 65 + while j < len(comp): + s = comp[i:j] + ALL.append([-1,size-j,size-i,s]) + i += 20 + j += 20 + a += 1 + indexes.append(rec.name+":"+str(a)) + df = pd.DataFrame(ALL, index=indexes, columns=['strand','iniprom','endprom','seq']) + return df + +#calculate the scores of all sequences (similar to get_posScores and get_negScores) +def get_testScores(loc,test): + scores = [] + posis = [] + sizes = [] + dic = {} + for ind,row in test.iterrows(): + _,window = ind.split(':') + strand = row['strand'] + ini = row['iniprom'] + end = row['endprom'] + seq = row['seq'] + pos = [ini,end,strand] + dic[window] = pos + s = seq + score10_6,pos10_6 = get_scores(os.path.join(loc,'pssm10_6.txt'), s) + maxi10_6 = get_max_pssm(os.path.join(loc,'pssm10_6.txt')) + score10_8,pos10_8 = get_scores(os.path.join(loc,'pssm10_8.txt'), s) + maxi10_8 = get_max_pssm(os.path.join(loc,'pssm10_8.txt')) + scores23,pos23 = get_scores(os.path.join(loc,'pssm_23.txt'), s) + maxi23 = get_max_pssm(os.path.join(loc,'pssm_23.txt')) + scores21,pos21 = get_scores(os.path.join(loc,'pssm_21.txt'), s) + maxi21 = get_max_pssm(os.path.join(loc,'pssm_21.txt')) + scores27,pos27 = get_scores(os.path.join(loc,'pssm_27.txt'), s) + maxi27 = get_max_pssm(os.path.join(loc,'pssm_27.txt')) + scores32,pos32 = get_scores(os.path.join(loc,'pssm_32.txt'), s) + maxi32 = get_max_pssm(os.path.join(loc,'pssm_32.txt')) + score23 = max(scores23) + score21 = max(scores21) + score27 = max(scores27) + score32 = max(scores32) + maxiphage = max(score23,score21,score27,score32) + if maxiphage == score23: phage_max = score23*maxi23 + elif maxiphage == score21: phage_max = score21*maxi21 + elif maxiphage == score27: phage_max = score27*maxi27 + elif maxiphage == score32: phage_max = score32*maxi32 + score35_6,pos35_6 = get_scores(os.path.join(loc,'pssm35_6.txt'), s) + maxi35_6 = get_max_pssm(os.path.join(loc,'pssm35_6.txt')) + score35_9,pos35_9 = get_scores(os.path.join(loc,'pssm35_9.txt'), s) + maxi35_9 = get_max_pssm(os.path.join(loc,'pssm35_9.txt')) + score35_t4,pos35_t4 = get_scores(os.path.join(loc,'pssm35_t4.txt'), s) + maxi35_t4 = get_max_pssm(os.path.join(loc,'pssm35_t4.txt')) + score35_cbb,pos35_cbb = get_scores(os.path.join(loc,'pssm35_cbb.txt'), s) + maxi35_cbb = get_max_pssm(os.path.join(loc,'pssm35_cbb.txt')) + score35_lb,pos35_lb = get_scores(os.path.join(loc,'pssm35_lb.txt'),s) + maxi35_lb = get_max_pssm(os.path.join(loc,'pssm35_lb.txt')) + score35_mu, pos35_mu = get_scores(os.path.join(loc,'pssm35_mu.txt'),s) + maxi35_mu = get_max_pssm(os.path.join(loc,'pssm35_mu.txt')) + + dists6 = [] + score6 = [] + for p in pos10_6: + for a in range(14,22): + d = p-a-6 + if d >= 0: + s10 = score10_6[p] + s35_6 = score35_6[d] + score6.append([s35_6,s10]) + dists6.append([d,p]) + dists9 = [] + score9 = [] + for p in pos10_6: + for a in range(11,14): + d = p-a-9 + if d >= 0: + s10 = score10_6[p] + s35_9 = score35_9[d] + score9.append([s35_9,s10]) + dists9.append([d,p]) + distst4 = [] + scoret4 = [] + distscbb = [] + scorecbb = [] + for p in pos10_6: + for a in range(16,18): + d = p-a-7 + if d >= 0: + s10 = score10_6[p] + s35_t4 = score35_t4[d] + s35_cbb = score35_cbb[d] + scoret4.append([s35_t4,s10]) + distst4.append([d,p]) + scorecbb.append([s35_cbb,s10]) + distscbb.append([d,p]) + + + distsmu = [] + scoremu = [] + for p in pos10_6: + d = p-16-14 + if d >= 0: + s10 = score10_6[p] + s35_mu = score35_mu[d] + scoremu.append([s35_mu,s10]) + distsmu.append([d,p]) + + distslb = [] + scorelb = [] + for p in pos10_6: + d = p-13-14 + if d >= 0: + s10 = score10_6[p] + s35_lb = score35_lb[d] + scorelb.append([s35_lb,s10]) + distslb.append([d,p]) + + soma6 = [sum(x) for x in score6] + soma9 = [sum(x) for x in score9] + somat4 = [sum(x) for x in scoret4] + somacbb = [sum(x) for x in scorecbb] + somamu = [sum(x) for x in scoremu] + somalb = [sum(x) for x in scorelb] + + maxi6 = max(soma6) + maxi9 = max(soma9) + maxit4 = max(somat4) + maxicbb = max(somacbb) + maximu = max(somamu) + maxilb = max(somalb) + maxi_elems = max(maxi6,maxi9,maxit4,maxicbb,maxilb,maximu) + + if maxi_elems == maxilb: + indmax = somalb.index(maxilb) + sc35 = scorelb[indmax][0] + sc10 = scorelb[indmax][1] + score_elems = [sc35,sc10] + posel = distslb[indmax] + size35 = 14 + elems_maxi = sc35*maxi35_lb+sc10*maxi10_6 + elif maxi_elems == maximu: + indmax = somamu.index(maximu) + sc35 = scoremu[indmax][0] + sc10 = scoremu[indmax][1] + score_elems = [sc35,sc10] + posel = distsmu[indmax] + size35 = 14 + elems_maxi = sc35*maxi35_mu+sc10*maxi10_6 + elif maxi_elems == maxi9: + indmax = soma9.index(maxi9) + sc35 = score9[indmax][0] + sc10 = score9[indmax][1] + score_elems = [sc35,sc10] + posel = dists9[indmax] + size35 = 9 + elems_maxi = sc35*maxi35_9+sc10*maxi10_6 + elif maxi_elems == maxit4: + indmax = somat4.index(maxit4) + sc35 = scoret4[indmax][0] + sc10 = scoret4[indmax][1] + score_elems = [sc35,sc10] + posel = distst4[indmax] + size35 = 7 + elems_maxi = sc35*maxi35_t4+sc10*maxi10_6 + elif maxi_elems == maxicbb: + indmax = somacbb.index(maxicbb) + sc35 = scorecbb[indmax][0] + sc10 = scorecbb[indmax][1] + score_elems = [sc35,sc10] + posel = distscbb[indmax] + size35 = 7 + elems_maxi = sc35*maxi35_cbb+sc10*maxi10_6 + else: + indmax = soma6.index(maxi6) + sc35 = score6[indmax][0] + sc10 = score6[indmax][1] + score_elems = [sc35,sc10] + posel = dists6[indmax] + size35 = 6 + elems_maxi = sc35*maxi35_6+sc10*maxi10_6 + + if score23 == maxiphage: + phage_score = score23 + posiphage = scores23.index(score23) + sizephage = 23 + elif score21 == maxiphage: + phage_score = score21 + posiphage = scores21.index(score21) + sizephage = 21 + elif score27 == maxiphage: + phage_score = score27 + posiphage = scores27.index(score27) + sizephage = 27 + else: + phage_score = score32 + posiphage = scores32.index(score32) + sizephage = 32 + + if elems_maxi >= max(score10_8)*maxi10_8: + i = posel[1] + ext = s[i-3:i-1] + if ext == 'TG': tg = 1 + else: tg = 0 + if elems_maxi > phage_max: host = 1 + else: + host = 0 + tg = 0 + sc = max(score10_8) + end35 = posel[0]+size35 + dist = posel[1]-end35 + scores.append([host, score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)]) + posis.append([posel[1],posel[0],posiphage]) + sizes.append([6,size35,sizephage]) + else: + host = 1 + sc = max(score10_8) + i = score10_8.index(sc) + ext = s[i-3:i-1] + if ext == 'TG': tg = 1 + else: tg = 0 + if max(score10_8)*maxi10_8 > phage_max: host = 1 + else: + host = 0 + tg = 0 + end35 = posel[0]+size35 + dist = posel[1]-end35 + scores.append([host,score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)]) + posis.append([i,posel[0],posiphage]) + sizes.append([8,size35,sizephage]) + score = pd.DataFrame(scores, index=test.index, columns=['host','score10','score10_8','score35','score_phage','tg','dist','seq']) + posis = pd.DataFrame(posis, index=test.index, columns=['pos10','pos35','posphage']) + sizes = pd.DataFrame(sizes, index=test.index, columns=['size10','size35','size_phage']) + df_all = pd.concat([score,posis,sizes],axis=1) + return df_all,dic + +def create_dftest(scores_test,dic_window,family,bacteria,lifecycle): + tudo = [] + tudo2 = [] + for ind,row in scores_test.iterrows(): + _,window = ind.split(':') + posis = dic_window[window] + strand=posis[2] + if strand == 1: ini=posis[0] + else: ini=posis[1] + seqprom = row['seq'] + score10 = row['score10'] + score10_8 = row['score10_8'] + score35 = row['score35'] + scorephage = row['score_phage'] + size10 = row['size10'] + size35 = row['size35'] + sizephage = row['size_phage'] + ini10 = row['pos10'] + tg = row['tg'] + host = row['host'] + ini35 = row['pos35'] + dist = row['dist'] + end10=ini10+size10 + iniphage = row['posphage'] + endphage = iniphage+sizephage + if strand == 1: + if host == 0: + new_seq = seqprom[iniphage:endphage] + new_ini = ini+iniphage+1 + new_end = ini+endphage + else: + if size10 == 6: + new_seq = seqprom[ini35:end10] + new_ini = ini+ini35+1 + new_end = ini+end10 + else: + new_seq = seqprom[ini10:end10] + new_ini = ini+ini10+1 + new_end = ini+end10 + new_pos = '('+str(new_ini)+'..'+str(new_end)+')' + else: + if host == 0: + new_seq = seqprom[iniphage:endphage] + new_ini = ini-endphage+1 + new_end = ini-iniphage + else: + if size10 == 6: + new_seq = seqprom[ini35:end10] + new_ini = ini-end10+1 + new_end = ini-ini35 + else: + new_seq = seqprom[ini10:end10] + new_ini = ini-end10+1 + new_end = ini-ini10 + new_pos = 'complement('+str(new_ini)+'..'+str(new_end)+')' + if size10 == 6: + size10_6 = 1 + else: + size10_6 = 0 + if size35 == 6: + size35_6 = 1 + size35_7 = 0 + size35_9 = 0 + size35_14 = 0 + elif size35 == 7: + size35_6 = 0 + size35_7 = 1 + size35_9 = 0 + size35_14 = 0 + elif size35 == 9: + size35_6 = 0 + size35_7 = 0 + size35_9 = 1 + size35_14 = 0 + else: + size35_6 = 0 + size35_7 = 0 + size35_9 = 0 + size35_14 = 1 + if sizephage == 23: + sizephage_23 = 1 + sizephage_21 = 0 + sizephage_32 = 0 + elif sizephage == 21: + sizephage_23 = 0 + sizephage_21 = 1 + sizephage_32 = 0 + elif sizephage == 32: + sizephage_23 = 0 + sizephage_21 = 0 + sizephage_32 = 1 + else: + sizephage_23 = 0 + sizephage_21 = 0 + sizephage_32 = 0 + if family == 'Podoviridae': + Podo = 1 + Sipho = 0 + Myo = 0 + elif family == 'Siphoviridae': + Podo = 0 + Sipho = 1 + Myo = 0 + elif family == 'Myoviridae': + Podo = 0 + Sipho = 0 + Myo = 1 + else: + Podo = 0 + Sipho = 0 + Myo = 0 + if bacteria == 'Bacillus': + bac = [1,0,0,0,0] + elif bacteria == 'Escherichia coli': + bac = [0,1,0,0,0] + elif bacteria == 'Klebsiella': + bac = [0,0,1,0,0] + elif bacteria == 'Pectobacterium': + bac = [0,0,0,1,0] + elif bacteria == 'Cronobacter': + bac = [0,0,0,0,1] + else: + bac = [0,0,0,0,0] + if lifecycle == 'virulent': tp = 0 + else: tp = 1 + fe = free_energy(str(seqprom)) + AT = freq_base(str(seqprom)) + linha = [score10, score10_8, score35, dist, scorephage, fe, AT, host,size10_6, + size35_6, size35_7, size35_9, size35_14, + sizephage_23, sizephage_21, + sizephage_32, tg, Podo, Sipho, Myo,tp] + linha.extend(bac) + tudo.append(linha) + linha2 = [new_pos,str(new_seq), host, size10_6, + score10, score10_8, size35_6, size35_7, size35_9,size35_14, + score35, dist, sizephage_23, sizephage_21, + sizephage_32, scorephage, tg, Podo, Sipho, Myo,tp, fe, AT] + linha2.extend(bac) + tudo2.append(linha2) + df_test = pd.DataFrame(tudo, index=scores_test.index, + columns = ['score10', 'score10_8','score35', 'dist35_10', + 'scorephage','fe', 'freqAT', + 'host','size10', + 'size35_6', 'size35_7','size35_9','size35_14', + 'sizephage_23', + 'sizephage_21', 'sizephage_32', + 'TG', 'Podo', 'Sipho', 'Myo', 'tp', + 'Bacillus', 'EColi', + 'Pectobacterium','Klebsiella', 'Cronobacter']) + df_INFO = pd.DataFrame(tudo2, index=scores_test.index, + columns = ['Positions','Promoter Sequence','host','size10', + 'score10', 'score10_8', + 'size35_6', 'size35_7','size35_9','size35_14', + 'score35', 'dist35_10','sizephage_23', + 'sizephage_21', 'sizephage_32', + 'scorephage', 'TG', 'Podo', 'Sipho', 'Myo', 'tp','fe', 'freqAT', + 'EColi', 'Salmonella', + 'Pectobacterium','Cronobacter', 'Streptococcus']) + return df_test,df_INFO + + +def get_predictions(scaler_file,model_file,test,df_testinfo,threshold): + from sklearn.externals import joblib + scaler = joblib.load(scaler_file) + model = joblib.load(model_file) + feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7]) + TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1) + scores = model.predict_proba(TEST_scaled) + pos_scores = np.empty((TEST_scaled.shape[0],0), float) + for x in scores: pos_scores = np.append(pos_scores,x[1]) + try: positive_indexes = np.nonzero(pos_scores>float(threshold))[0] #escolher os positivos, podia ser escolher com score > x + except ValueError: return 'The threshold value is not a float' + else: + if len(positive_indexes) == 0: return None + else: + positive_windows = TEST_scaled.index[positive_indexes] + INFO = df_testinfo.loc[positive_windows,['Positions','Promoter Sequence']] + promoter_type = [] + for x in df_testinfo.loc[positive_windows,'host'].tolist(): + if x == 0: promoter_type.append('phage') + else: promoter_type.append('host') + INFO['Type'] = promoter_type + INFO['Scores'] = np.around(pos_scores[positive_indexes],decimals=3) + INFO.index = positive_windows + return INFO + + +def get_finaldf(test,rec): + new_df = test.groupby(['Positions']) + groups = list(new_df.groups.keys()) + for i in range(len(groups)-1): + for j in range(i, len(groups)): + if 'complement' in groups[i]: inii = int(groups[i][11:].split('..')[0]) + else: inii = int(groups[i][1:].split('..')[0]) + if 'complement' in groups[j]: inij = int(groups[j][11:].split('..')[0]) + else: inij = int(groups[j][1:].split('..')[0]) + if inij < inii: + temp = groups[i] + groups[i] = groups[j] + groups[j] = temp + new_inds = [] + for g in groups: + inds = new_df.groups[g] + if len(inds) == 1: new_inds.append(inds[0]) + else: + maxi = max(new_df.get_group(g)['Scores']) + i = new_df.groups[g][new_df.get_group(g)['Scores']==maxi][0] + new_inds.append(i) + + output = test.loc[new_inds,:] + strands = [] + new_pos = [] + old_pos = output['Positions'].tolist() + + from Bio.SeqFeature import SeqFeature, FeatureLocation + feats = rec.features + for ind, row in output.iterrows(): + pos = row['Positions'] + if 'complement' in pos: + strands.append('-') + new_pos.append(pos[10:]) + ini,end= pos[11:-1].split('..') + new_loc = FeatureLocation(int(ini),int(end),strand=-1) + else: + strands.append('+') + new_pos.append(pos) + ini,end= pos[1:-1].split('..') + new_loc = FeatureLocation(int(ini),int(end),strand=1) + feat = SeqFeature(new_loc, type='regulatory',qualifiers={'regulatory_class':['promoter'], 'note=':['predicted by PhagePromoter']}) + feats.append(feat) + + output.insert(loc=0, column='Strand', value=strands) + output['Positions'] = new_pos + output.to_html('output.html',index=False, justify='center') + recs = [] + i = 0 + for ind,row in output.iterrows(): + s = Seq(row['Promoter Sequence']) + posis = old_pos[i] + typ = row['Type'] + score = row['Scores'] + sq = SeqRecord(seq=s, id=ind, description=typ+' '+posis+' score='+str(score)) + recs.append(sq) + i += 1 + SeqIO.write(recs, 'output.fasta','fasta') + new_rec = rec + new_rec.seq.alphabet = IUPAC.IUPACAmbiguousDNA() + new_feats = sorted(feats, key=lambda x: x.location.start) + new_rec.features = new_feats + SeqIO.write(new_rec,'output.gb','genbank') + +if __name__== "__main__": + + import sys + import os + __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + + gen_format = sys.argv[1] + genome_file = sys.argv[2] + both = sys.argv[3] + threshold = sys.argv[4] + family = sys.argv[5] + host = sys.argv[6] + phage_type = sys.argv[7] + model = sys.argv[8] + ''' + gen_format = 'genbank' + genome_file = 'test-data/NC_015264.gb' + both = False + threshold = '0.50' + family = 'Podoviridae' + host = 'Pseudomonas' + phage_type = 'virulent' + model = 'SVM2400' + #model = 'ANN1600' + ''' + + rec = SeqIO.read(genome_file, gen_format) + test_windows = get_testseqs65(gen_format, genome_file,both) + try: score_test,dic_window = get_testScores(__location__,test_windows) + except IndexError: print('Error. Input sequence can only have A,C,G or T') + else: + df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type) + if model == 'ANN1600': + scaler_file = os.path.join(__location__, 'scaler1600.sav') + model_file = os.path.join(__location__, 'model1600.sav') + preds = get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold) + if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold)) + elif type(preds) == str: print(preds) + else: output = get_finaldf(preds,rec) + else: + scaler_file = os.path.join(__location__, 'scaler2400.sav') + model_file = os.path.join(__location__, 'model2400.sav') + new_df_test = df_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,13,14,16,17,19,20,22,24,25]] + preds = get_predictions(scaler_file, model_file, new_df_test,df_testinfo,threshold) + if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold)) + elif type(preds) == str: print(preds) + else: output = get_finaldf(preds,rec) + diff -r 5acc4fa8b62d -r a455e43048cd phagepromoter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phagepromoter.xml Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,120 @@ + + +Get promoters of phage genomes + + + biopython + scikit-learn + numpy + pandas + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+
+ + + + + + + + + + + + + + + + + + + + + +
diff -r 5acc4fa8b62d -r a455e43048cd pssm10_6.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm10_6.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +-3.24 1.93 -0.34 1.38 1.43 -3.05 +-2.14 -4.24 -1.03 -1.44 -1.19 -4.05 +-2.29 -4.46 -1.12 -1.44 -1.53 -3.59 +1.79 -3.59 1.17 -0.61 -0.96 1.9 diff -r 5acc4fa8b62d -r a455e43048cd pssm10_8.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm10_8.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +0.09 1.72 -4.49 1.92 1.95 1.95 -4.49 1.92 +-1.68 -1.32 -4.49 -4.49 -4.49 -4.49 -3.49 -4.49 +-0.79 -2.49 -4.49 -2.91 -4.49 -4.49 -2.49 -4.49 +1.03 -2.91 1.95 -4.49 -4.49 -4.49 1.88 -2.91 diff -r 5acc4fa8b62d -r a455e43048cd pssm35_6.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm35_6.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +-2.84 -2.12 -2.4 1.49 -0.74 1.16 +-2.95 -4.65 -2.56 -0.26 1.13 -1.56 +-4.33 -2.48 1.73 -3.33 -1.69 -1.65 +1.88 1.83 -1.65 -1.95 -0.14 0.15 diff -r 5acc4fa8b62d -r a455e43048cd pssm35_9.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm35_9.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +0.93 0.79 -0.65 -1.87 -0.14 -1.46 -1.14 -0.14 1.13 +-1.46 -1.87 -1.46 -2.46 1.35 -3.46 -2.46 -0.14 -0.65 +-0.87 -1.87 -1.87 1.79 -1.87 -1.87 -3.46 -1.46 -1.87 +0.24 0.79 1.45 -3.46 -1.87 1.71 1.71 0.86 -0.14 diff -r 5acc4fa8b62d -r a455e43048cd pssm35_cbb.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm35_cbb.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +-2.46 -2.46 1.45 1.79 1.79 -2.46 0.13 +-2.46 -2.46 -2.46 -2.46 -2.46 1.54 -2.46 +-2.46 1.79 -0.14 -2.46 -2.46 -2.46 1.24 +1.79 -2.46 -2.46 -2.46 -2.46 -0.46 -1.46 diff -r 5acc4fa8b62d -r a455e43048cd pssm35_lb.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm35_lb.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +-0.81 -0.81 -0.81 -0.81 -0.81 0.19 -0.81 -0.81 0.78 0.19 -0.81 -0.81 -0.81 -0.81 +-0.81 -0.81 -0.81 1.19 -0.81 -0.81 -0.81 0.19 -0.81 0.19 -0.81 -0.81 -0.81 1.19 +-0.81 -0.81 1.19 -0.81 1.19 -0.81 0.78 -0.81 0.19 -0.81 -0.81 -0.81 1.19 -0.81 +1.19 1.19 -0.81 -0.81 -0.81 0.78 0.19 0.78 -0.81 0.19 1.19 1.19 -0.81 -0.81 diff -r 5acc4fa8b62d -r a455e43048cd pssm35_mu.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm35_mu.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +-1.17 -0.17 1.15 -1.17 1.15 1.15 -1.17 -0.17 0.42 -1.17 0.42 -1.17 0.83 -1.17 +0.83 0.83 -1.17 -0.17 -1.17 -1.17 0.83 1.15 0.83 1.42 -1.17 0.42 -1.17 0.83 +-0.17 -1.17 -0.17 -1.17 -1.17 -0.17 -1.17 -1.17 -1.17 -1.17 0.42 0.42 -0.17 -1.17 +-0.17 -0.17 -1.17 1.15 -0.17 -1.17 0.42 -1.17 -1.17 -1.17 -0.17 -0.17 -0.17 0.42 diff -r 5acc4fa8b62d -r a455e43048cd pssm35_t4.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm35_t4.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +-2.43 -3.43 -2.43 -2.43 1.86 -2.43 1.33 +-3.43 -3.43 -3.43 -3.43 -3.43 1.86 -3.43 +1.82 -3.43 -2.43 -3.43 -3.43 -3.43 -3.43 +-2.43 1.9 1.82 1.86 -2.43 -3.43 0.38 diff -r 5acc4fa8b62d -r a455e43048cd pssm_21.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm_21.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +-2.0 -1.0 -0.42 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 1.7 -2.0 -1.0 -1.0 -2.0 -2.0 -2.0 -2.0 0.81 0.58 +1.17 1.46 -2.0 -2.0 -1.0 -2.0 -1.0 -2.0 1.7 1.7 -0.42 -2.0 1.7 -2.0 0.81 1.7 1.7 -2.0 -2.0 0.81 0.81 +-0.42 -2.0 1.46 1.7 1.58 -2.0 -2.0 1.7 -2.0 -2.0 1.46 -2.0 -2.0 -1.0 -0.42 -2.0 -2.0 -2.0 -2.0 -2.0 -1.0 +-0.42 -1.0 -2.0 -2.0 -2.0 1.7 1.58 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 1.46 0.0 -2.0 -2.0 1.7 1.7 -2.0 -2.0 diff -r 5acc4fa8b62d -r a455e43048cd pssm_23.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm_23.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +0.07 1.71 1.18 -0.18 1.65 0.6 -1.86 0.82 -2.86 -0.79 -5.18 1.88 -1.86 -4.18 1.69 0.97 1.26 -0.05 -0.79 0.07 1.11 0.28 1.25 +-0.32 -2.86 -2.86 -4.18 -2.86 0.49 0.6 0.52 1.68 -2.37 1.88 -1.86 1.84 -1.09 -2.86 -3.18 -1.18 -5.18 -0.54 -1.86 -1.86 -1.48 -1.09 +-3.59 -2.86 -3.59 -4.18 -3.18 -0.32 0.95 -0.86 -1.37 -1.59 -1.86 -5.18 -3.18 -5.18 -2.59 -3.59 -1.18 1.56 1.41 1.3 0.6 1.16 -0.18 +1.05 -1.18 0.6 1.59 -0.72 -1.86 -1.86 -2.01 -1.86 1.53 -5.18 -5.18 -5.18 1.79 -1.09 0.89 -0.48 -4.18 -3.59 -2.18 -4.18 -2.37 -1.86 diff -r 5acc4fa8b62d -r a455e43048cd pssm_27.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm_27.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +-3.09 -2.09 1.67 -1.5 -1.09 -0.28 -3.09 -3.09 -2.09 -3.09 -3.09 -3.09 1.87 -3.09 -3.09 -3.09 -3.09 -3.09 -1.5 -3.09 -3.09 -2.09 -3.09 1.72 1.56 1.5 -1.5 +1.82 -3.09 -3.09 1.67 1.56 1.3 -1.09 -0.5 1.16 1.67 1.67 -1.09 -3.09 1.87 -3.09 1.87 1.67 -0.77 -3.09 0.82 1.16 -0.28 0.91 -2.09 -2.09 -0.5 0.08 +-3.09 1.82 -2.09 -2.09 -3.09 -0.77 0.08 1.16 0.5 -3.09 -0.77 -2.09 -3.09 -3.09 -2.09 -3.09 -1.09 1.67 1.77 1.0 0.61 -3.09 -3.09 -1.5 -3.09 -3.09 -0.5 +-2.09 -3.09 -1.09 -2.09 -1.09 -3.09 1.23 -0.09 -3.09 -0.77 -3.09 1.67 -3.09 -3.09 1.82 -3.09 -2.09 -3.09 -3.09 -3.09 -3.09 1.5 0.91 -3.09 -0.5 -1.5 0.91 diff -r 5acc4fa8b62d -r a455e43048cd pssm_32.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pssm_32.txt Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,4 @@ +1.65 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 1.65 -1.81 -1.81 0.78 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 1.65 1.65 +-1.81 -1.81 1.65 1.65 -1.81 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 -1.81 1.65 1.65 -1.81 -1.81 1.19 -1.81 -1.81 -1.81 -1.81 0.78 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 +-1.81 1.65 -1.81 -1.81 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 0.78 -1.81 -1.81 -1.81 -0.81 -1.81 -1.81 1.51 1.65 1.65 1.65 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 1.65 -1.81 -1.81 +-1.81 -1.81 -1.81 -1.81 1.65 -1.81 1.65 -1.81 -1.81 -1.81 -1.81 1.65 -1.81 -1.81 1.51 -1.81 0.19 -0.81 -1.81 -1.81 -1.81 0.78 -1.81 -1.81 1.65 -1.81 1.65 -1.81 1.65 -1.81 -1.81 -1.81 diff -r 5acc4fa8b62d -r a455e43048cd scaler1600.sav Binary file scaler1600.sav has changed diff -r 5acc4fa8b62d -r a455e43048cd scaler2400.sav Binary file scaler2400.sav has changed diff -r 5acc4fa8b62d -r a455e43048cd tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Sat Apr 20 11:09:34 2019 -0400 @@ -0,0 +1,6 @@ + + + + + +