changeset 0:404a98e6759c draft

Uploaded
author martasampaio
date Sat, 20 Apr 2019 10:55:45 -0400
parents
children 27b787998ef2
files auxiliar.py
diffstat 1 files changed, 121 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/auxiliar.py	Sat Apr 20 10:55:45 2019 -0400
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun May 27 17:37:09 2018
+
+@author: Marta
+"""
+
+
+#get the phage host from the file 'bacteria.xlsx'
+def get_bacteria(file):
+    """Map every row label of an Excel sheet to its 'Bacteria' value.
+
+    The sheet is read with its first row as the header and its first
+    column as the index; the index labels become the dictionary keys.
+
+    file: passed unchanged as the first argument of pandas.read_excel.
+    Returns a dict {index label: value in the 'Bacteria' column}.
+    """
+    import pandas as pd
+    sheet = pd.read_excel(file, header=0, index_col=0)
+    return {label: record['Bacteria'] for label, record in sheet.iterrows()}
+
+#get the phage family from the file 'family.xlsx'
+def get_families(file):
+    """Map every row label of an Excel sheet to its 'Family' value.
+
+    The sheet is read with its first row as the header and its first
+    column as the index; the index labels become the dictionary keys.
+
+    file: passed unchanged as the first argument of pandas.read_excel.
+    Returns a dict {index label: value in the 'Family' column}.
+    """
+    import pandas as pd
+    sheet = pd.read_excel(file, header=0, index_col=0)
+    return {label: record['Family'] for label, record in sheet.iterrows()}
+
+#get phage lifecycle from the file 'lifecycle.xlsx'
+def get_lifecycle(file):
+    """Map every row label of an Excel sheet to its 'lifecycle' value.
+
+    The sheet is read with its first row as the header and its first
+    column as the index; the index labels become the dictionary keys.
+
+    file: passed unchanged as the first argument of pandas.read_excel.
+    Returns a dict {index label: value in the 'lifecycle' column}.
+    """
+    import pandas as pd
+    sheet = pd.read_excel(file, header=0, index_col=0)
+    return {label: record['lifecycle'] for label, record in sheet.iterrows()}
+
+#reads a file with a PSSM and returns the max possible score of that PSSM
+def get_max_pssm(file_pssm):
+    """Return the maximum score attainable with the PSSM stored in a file.
+
+    The file holds tab-separated numeric rows; the first four non-blank
+    rows are used as the scores for A, C, G and T, in that order.
+
+    file_pssm: path of the PSSM file.
+    Returns the `max` attribute of the Biopython
+    PositionSpecificScoringMatrix built from those rows.
+    Raises ValueError if a field is not a number and IndexError if the
+    file has fewer than four non-blank rows.
+    """
+    # NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so this
+    # import needs an older Biopython release -- confirm the pinned version.
+    from Bio.Alphabet import IUPAC
+    from Bio.motifs import matrix
+    rows = []
+    # 'with' closes the handle even when a value fails to parse
+    with open(file_pssm, 'r') as handle:
+        for line in handle:
+            line = line.strip()
+            if not line:
+                # skip blank lines (e.g. a trailing empty line) instead of
+                # failing on float('')
+                continue
+            rows.append([float(val) for val in line.split('\t')])
+    alphabet = IUPAC.unambiguous_dna
+    values = {'A': rows[0], 'C': rows[1], 'G': rows[2], 'T': rows[3]}
+    pssm = matrix.PositionSpecificScoringMatrix(alphabet, values)
+    return pssm.max
+
+#reads a file with a PSSM and scores the sequence against it at every position
+#returns two lists: the scores, each divided by the maximum possible score, and their positions
+def get_scores(file_pssm, seq):
+    """Score a sequence against the PSSM stored in a file.
+
+    The file holds tab-separated numeric rows; the first four non-blank
+    rows are used as the scores for A, C, G and T, in that order.
+
+    file_pssm: path of the PSSM file.
+    seq: sequence object handed to the matrix's search(); its `alphabet`
+        attribute is overwritten, so it must accept attribute assignment.
+    Returns (scores, positions): two parallel lists with every hit reported
+    by search(both=False, threshold=-50); each score is divided by the
+    maximum score of the matrix.
+    Raises ValueError if a field is not a number and IndexError if the
+    file has fewer than four non-blank rows.
+    """
+    # NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so this
+    # import needs an older Biopython release -- confirm the pinned version.
+    from Bio.Alphabet import IUPAC
+    from Bio.motifs import matrix
+    rows = []
+    # 'with' closes the handle even when a value fails to parse
+    with open(file_pssm, 'r') as handle:
+        for line in handle:
+            line = line.strip()
+            if not line:
+                # skip blank lines (e.g. a trailing empty line) instead of
+                # failing on float('')
+                continue
+            rows.append([float(val) for val in line.split('\t')])
+    alphabet = IUPAC.unambiguous_dna
+    values = {'A': rows[0], 'C': rows[1], 'G': rows[2], 'T': rows[3]}
+    pssm = matrix.PositionSpecificScoringMatrix(alphabet, values)
+    # same value get_max_pssm(file_pssm) would return, taken from the matrix
+    # already built so the file is not read and parsed a second time
+    maxi = pssm.max
+    scores = []
+    positions = []
+    # side effect kept from the original: the caller's object is modified
+    seq.alphabet = alphabet
+    for pos, score in pssm.search(seq, both=False, threshold=-50):
+        scores.append(score / maxi)
+        positions.append(pos)
+    return scores, positions
+
+#returns the number of A and T bases in a sequence (a count, not a relative frequency)
+def freq_base(seq):
+    """Return how many 'A' and 'T' items the sequence contains, added together.
+
+    The result is a raw count (it is not divided by the sequence length)
+    and the match is case-sensitive: only upper-case 'A' and 'T' count.
+    """
+    return seq.count('A') + seq.count('T')
+
+#returns the free energy value of that sequence
+def free_energy(seq):
+    """Add up the table value of every overlapping dinucleotide in seq.
+
+    Each adjacent pair of bases (positions 0-1, 1-2, ...) is looked up in
+    a fixed table of per-dinucleotide values and the values are summed.
+    NOTE(review): the numbers look like nearest-neighbour stacking
+    energies; their source and units are not stated here -- confirm.
+
+    seq: sequence of upper-case A/C/G/T characters.
+    Returns the sum, or the integer 0 when seq has fewer than two bases.
+    Raises KeyError for any pair not in the table (lower case, 'N', ...).
+    """
+    pair_energy = {
+        'AA': -1.00, 'TT': -1.00,
+        'AT': -0.88, 'TA': -0.58,
+        'CA': -1.45, 'AC': -1.44,
+        'GG': -1.84, 'CC': -1.84,
+        'GA': -1.30, 'AG': -1.28,
+        'TC': -1.30, 'CT': -1.28,
+        'TG': -1.45, 'GT': -1.44,
+        'GC': -2.24, 'CG': -2.17,
+    }
+    energy = 0
+    # pair every base with its right-hand neighbour
+    for first, second in zip(seq, seq[1:]):
+        energy += pair_energy[first + second]
+    return energy
\ No newline at end of file