35
|
1 import pickle
|
|
2 from Bio import SeqIO
|
|
3 import os
|
|
4 import pandas as pd
|
|
5 import numpy as np
|
|
6 from local_ctd import CalculateCTD
|
|
7 from local_AAComposition import CalculateDipeptideComposition
|
|
8 import sys
|
|
9 from Bio.SeqUtils.ProtParam import ProteinAnalysis
|
|
10
|
|
11
|
|
12 class PDPOPrediction:
|
|
13
|
|
14 def __init__(self, folder='location', mdl='', seq_file='fasta_file.fasta', ttable=11):
|
|
15 """
|
|
16 Initialize PhageDPO prediction.
|
|
17 :param folder: data path
|
|
18 :param mdl: ml model, in this case ANN or SVM
|
|
19 :param seq_file: fasta file
|
|
20 :param ttable: Translational table. By default, The Bacterial, Archaeal and Plant Plastid Code Table 11
|
|
21 """
|
|
22 self.records = []
|
|
23 self.data = {}
|
|
24 self.df_output = None
|
|
25 self.seqfile = seq_file
|
|
26 self.__location__ = os.path.realpath(os.path.join(os.getcwd(), folder))
|
|
27
|
|
28 with open(os.path.join(self.__location__, mdl), 'rb') as m:
|
|
29 self.model0 = pickle.load(m)
|
|
30 self.model = self.model0.named_steps['clf']
|
|
31 self.scaler = self.model0.named_steps['scl']
|
|
32 self.selectk = self.model0.named_steps['selector']
|
|
33 self.name = 'model'
|
|
34
|
|
35 for seq in SeqIO.parse(os.path.join(self.__location__, self.seqfile), 'fasta'):
|
|
36 record = []
|
|
37 DNA_seq = seq.seq
|
|
38 AA_seq = DNA_seq.translate(table=ttable)
|
|
39 descr_seq = seq.description.replace(' ', '')
|
|
40 self.data[descr_seq] = [DNA_seq._data, AA_seq._data]
|
|
41 record.append(seq.description)
|
|
42 record.append(DNA_seq._data)
|
|
43 record.append(AA_seq._data)
|
|
44 self.records.append(record)
|
|
45
|
|
46 columns = ['ID', 'DNAseq', 'AAseq']
|
|
47 self.df = pd.DataFrame(self.records, columns=columns)
|
|
48 #self.df = self.df.set_index('ID')
|
|
49 self.df.update(self.df.DNAseq[self.df.DNAseq.apply(type) == list].str[0])
|
|
50 self.df.update(self.df.AAseq[self.df.AAseq.apply(type) == list].str[0])
|
|
51
|
|
52 def Datastructure(self):
|
|
53 """
|
|
54 Create dataset with all features
|
|
55 """
|
|
56 def count_orf(orf_seq):
|
|
57 """
|
|
58 Function to count open reading frames
|
|
59 :param orf_seq: sequence to analyze
|
|
60 :return: dictionary with open reading frames
|
|
61 """
|
|
62 dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0}
|
|
63 for letter in range(len(orf_seq)):
|
|
64 for k in range(0, 4):
|
|
65 if str(orf_seq[letter]) in list(dic.keys())[k][-1]:
|
|
66 dic[list(dic.keys())[k]] += 1
|
|
67 dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / (
|
|
68 dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100
|
|
69 return dic
|
|
70
|
|
71 def count_aa(aa_seq):
|
|
72 """
|
|
73 Function to count amino acids
|
|
74 :param aa_seq: sequence to analyze
|
|
75 :return: dictionary with amino acid composition
|
|
76 """
|
|
77 dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0,
|
|
78 'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0}
|
|
79 for letter in range(len(aa_seq)):
|
|
80 if aa_seq[letter] in dic.keys():
|
|
81 dic[aa_seq[letter]] += 1
|
|
82 return dic
|
|
83
|
|
84 def sec_st_fr(aa_seq):
|
|
85 """
|
|
86 Function to analyze secondary structure. Helix, Turn and Sheet
|
|
87 :param aa_seq: sequence to analyze
|
|
88 :return: dictionary with composition of each secondary structure
|
|
89 """
|
|
90 st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0}
|
|
91 stu = ProteinAnalysis(aa_seq).secondary_structure_fraction()
|
|
92 st_dic['Helix'] = stu[0]
|
|
93 st_dic['Turn'] = stu[1]
|
|
94 st_dic['Sheet'] = stu[2]
|
|
95 return st_dic
|
|
96
|
|
97
|
|
98 self.df_output = self.df.copy()
|
|
99 self.df_output.drop(['DNAseq', 'AAseq'], axis=1, inplace=True)
|
|
100 dna_feat = {}
|
|
101 aa_len = {}
|
|
102 aroma_dic = {}
|
|
103 iso_dic = {}
|
|
104 aa_content = {}
|
|
105 st_dic_master = {}
|
|
106 CTD_dic = {}
|
|
107 dp = {}
|
|
108 self.df1 = self.df[['ID']].copy()
|
|
109 self.df.drop(['ID'], axis=1, inplace=True)
|
|
110 for i in range(len(self.df)):
|
|
111 i_name = self.df.index[i]
|
|
112 dna_feat[i] = count_orf(self.df.iloc[i]['DNAseq'])
|
|
113 aa_len[i] = len(self.df.iloc[i]['AAseq'])
|
|
114 aroma_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity()
|
|
115 iso_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point()
|
|
116 aa_content[i] = count_aa(self.df.iloc[i]['AAseq'])
|
|
117 st_dic_master[i] = sec_st_fr(self.df.iloc[i]['AAseq'])
|
|
118 CTD_dic[i] = CalculateCTD(self.df.iloc[i]['AAseq'])
|
|
119 dp[i] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq'])
|
|
120 for j in self.df.index:
|
|
121 self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values
|
|
122 self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value
|
|
123 self.df.loc[j, 'Aromaticity'] = aroma_dic[j]
|
|
124 self.df.loc[j, 'IsoelectricPoint'] = iso_dic[j]
|
|
125 self.df.loc[j, aa_content[j].keys()] = aa_content[j].values()
|
|
126 self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values()
|
|
127 self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values()
|
|
128 self.df.loc[j, dp[j].keys()] = dp[j].values()
|
|
129 self.df.drop(['DNAseq', 'AAseq'], axis=1, inplace=True)
|
|
130
|
|
131 def Prediction(self):
|
|
132 """
|
|
133 Predicts the percentage of each CDS being depolymerase.
|
|
134 :return: model prediction
|
|
135 """
|
|
136 scores = self.model0.predict_proba(self.df.iloc[:, :])
|
|
137 pos_scores = np.empty((self.df.shape[0], 0), float)
|
|
138 for x in scores:
|
|
139 pos_scores = np.append(pos_scores, round(x[1]*100))
|
|
140 self.df_output.reset_index(inplace=True)
|
|
141 self.df_output.rename(columns={'index': 'CDS'}, inplace=True)
|
|
142 self.df_output['CDS'] += 1
|
|
143 self.df_output['{} DPO Prediction (%)'.format(self.name)] = pos_scores
|
|
144 self.df_output.to_html('output.html', index=False, justify='center')
|
|
145
|
|
146
|
|
147 if __name__ == '__main__':
|
|
148 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
|
149
|
40
|
150 model = 'svm'
|
38
|
151 fasta_file = sys.argv[1]
|
35
|
152
|
|
153 PDPO = PDPOPrediction(__location__, model, fasta_file)
|
|
154 PDPO.Datastructure()
|
|
155 PDPO.Prediction() |