| 
0
 | 
     1 # -*- coding: utf-8 -*-
 | 
| 
 | 
     2 """
 | 
| 
 | 
     3 Created on Sun May 27 17:37:09 2018
 | 
| 
 | 
     4 
 | 
| 
 | 
     5 @author: Marta
 | 
| 
 | 
     6 """
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 #get the phage host from the file 'bacteria.xlsx'
 | 
| 
 | 
    10 def get_bacteria(file):
 | 
| 
 | 
    11     import pandas as pd
 | 
| 
 | 
    12     df = pd.read_excel(file,header=0,index_col=0)
 | 
| 
 | 
    13     bacteria = {}
 | 
| 
 | 
    14     for ind,row in df.iterrows():
 | 
| 
 | 
    15         bac = row['Bacteria']
 | 
| 
 | 
    16         bacteria[ind] = bac
 | 
| 
 | 
    17     return bacteria
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 #get the phage family from the file 'family.xlsx'
 | 
| 
 | 
    20 def get_families(file):
 | 
| 
 | 
    21     import pandas as pd
 | 
| 
 | 
    22     df = pd.read_excel(file,header=0,index_col=0)
 | 
| 
 | 
    23     families = {}
 | 
| 
 | 
    24     for ind,row in df.iterrows():
 | 
| 
 | 
    25         fam = row['Family']
 | 
| 
 | 
    26         families[ind] = fam
 | 
| 
 | 
    27     return families
 | 
| 
 | 
    28 
 | 
| 
 | 
    29 #get phage lifecycle from the file 'lifecycle.xlsx'
 | 
| 
 | 
    30 def get_lifecycle(file):
 | 
| 
 | 
    31     import pandas as pd
 | 
| 
 | 
    32     df = pd.read_excel(file,header=0,index_col=0)
 | 
| 
 | 
    33     types = {}
 | 
| 
 | 
    34     for ind,row in df.iterrows():
 | 
| 
 | 
    35         lc = row['lifecycle']
 | 
| 
 | 
    36         types[ind] = lc
 | 
| 
 | 
    37     return types
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 #reads a file with a PSSM and return the max possible score of that PSSM
 | 
| 
 | 
    40 def get_max_pssm(file_pssm):
 | 
| 
 | 
    41     from Bio.Alphabet import IUPAC
 | 
| 
 | 
    42     from Bio.motifs import matrix
 | 
| 
 | 
    43     m = []
 | 
| 
 | 
    44     fic = open(file_pssm,'r')
 | 
| 
 | 
    45     rf = fic.readline()
 | 
| 
 | 
    46     while rf:
 | 
| 
 | 
    47         new_l = []
 | 
| 
 | 
    48         l = rf.strip().split('\t')
 | 
| 
 | 
    49         for val in l:
 | 
| 
 | 
    50             x = float(val)
 | 
| 
 | 
    51             new_l.append(x)
 | 
| 
 | 
    52         m.append(new_l)
 | 
| 
 | 
    53         rf = fic.readline()
 | 
| 
 | 
    54     a = IUPAC.unambiguous_dna
 | 
| 
 | 
    55     dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]}
 | 
| 
 | 
    56     pssm = matrix.PositionSpecificScoringMatrix(a,dic)
 | 
| 
 | 
    57     return pssm.max
 | 
| 
 | 
    58 
 | 
| 
 | 
    59 #reads a file with a PSSM and returns a list of scores in all positions of the sequence
 | 
| 
 | 
    60 #returns the score divided by the maximum possible value
 | 
| 
 | 
    61 def get_scores(file_pssm, seq):
 | 
| 
 | 
    62     from Bio.Alphabet import IUPAC
 | 
| 
 | 
    63     from Bio.motifs import matrix
 | 
| 
 | 
    64     maxi = get_max_pssm(file_pssm)
 | 
| 
 | 
    65     m = []
 | 
| 
 | 
    66     fic = open(file_pssm,'r')
 | 
| 
 | 
    67     rf = fic.readline()
 | 
| 
 | 
    68     while rf:
 | 
| 
 | 
    69         new_l = []
 | 
| 
 | 
    70         l = rf.strip().split('\t')
 | 
| 
 | 
    71         for val in l:
 | 
| 
 | 
    72             x = float(val)
 | 
| 
 | 
    73             new_l.append(x)
 | 
| 
 | 
    74         m.append(new_l)
 | 
| 
 | 
    75         rf = fic.readline()
 | 
| 
 | 
    76     a = IUPAC.unambiguous_dna
 | 
| 
 | 
    77     dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]}
 | 
| 
 | 
    78     pssm = matrix.PositionSpecificScoringMatrix(a,dic)
 | 
| 
 | 
    79     scores = []
 | 
| 
 | 
    80     positions = []
 | 
| 
 | 
    81     a = IUPAC.unambiguous_dna
 | 
| 
 | 
    82     seq.alphabet = a
 | 
| 
 | 
    83     for pos, score in pssm.search(seq, both=False,threshold=-50):
 | 
| 
 | 
    84         scores.append(score/maxi)
 | 
| 
 | 
    85         positions.append(pos)
 | 
| 
 | 
    86     return scores,positions
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 #returns the frequencia of A and T bases in a sequence    
 | 
| 
 | 
    89 def freq_base(seq):
 | 
| 
 | 
    90     A = seq.count('A')
 | 
| 
 | 
    91     T = seq.count('T')
 | 
| 
 | 
    92     AT = A+T
 | 
| 
 | 
    93     return AT
 | 
| 
 | 
    94 
 | 
| 
 | 
    95 #returns the free energy value of that sequence
 | 
| 
 | 
    96 def free_energy(seq):
 | 
| 
 | 
    97     dic1 = {'AA':-1.00, 
 | 
| 
 | 
    98         'TT':-1.00, 
 | 
| 
 | 
    99         'AT':-0.88, 
 | 
| 
 | 
   100         'TA':-0.58, 
 | 
| 
 | 
   101         'CA':-1.45,
 | 
| 
 | 
   102         'AC':-1.44, 
 | 
| 
 | 
   103         'GG':-1.84, 
 | 
| 
 | 
   104         'CC':-1.84, 
 | 
| 
 | 
   105         'GA':-1.30, 
 | 
| 
 | 
   106         'AG':-1.28, 
 | 
| 
 | 
   107         'TC':-1.30, 
 | 
| 
 | 
   108         'CT':-1.28, 
 | 
| 
 | 
   109         'TG':-1.45,
 | 
| 
 | 
   110         'GT':-1.44,
 | 
| 
 | 
   111         'GC':-2.24,
 | 
| 
 | 
   112         'CG':-2.17}
 | 
| 
 | 
   113     total = 0
 | 
| 
 | 
   114     i = 0
 | 
| 
 | 
   115     j = 1
 | 
| 
 | 
   116     while i < len(seq)-1:
 | 
| 
 | 
   117         dint = seq[i]+seq[j]
 | 
| 
 | 
   118         total += dic1[dint]
 | 
| 
 | 
   119         i += 1
 | 
| 
 | 
   120         j += 1
 | 
| 
 | 
   121     return total |