comparison DPOGALAXY.py @ 32:5a0afb1578ea draft

Uploaded
author jose_duarte
date Wed, 15 Feb 2023 09:57:01 +0000
parents ce0de724097a
children
comparison
equal deleted inserted replaced
31:3d94608aea7a 32:5a0afb1578ea
1 import pickle
2 from Bio import SeqIO
3 import os
4 import pandas as pd
5 import numpy as np
6 from local_ctd import CalculateCTD
7 from local_AAComposition import CalculateDipeptideComposition
8 import sys
9 from Bio.SeqUtils.ProtParam import ProteinAnalysis
10
1 11
2 class PDPOPrediction: 12 class PDPOPrediction:
3 def __init__(self, Folder = 'location', mdl='',seq_file = 'fasta_file.fasta',ttable=11): 13
4 import pickle 14 def __init__(self, folder='location', mdl='', seq_file='fasta_file.fasta', ttable=11):
5 import pandas as pd 15 """
6 from Bio import SeqIO 16 Initialize PhageDPO prediction.
7 import os 17 :param folder: data path
8 from pathlib import Path 18 :param mdl: ml model, in this case ANN or SVM
19 :param seq_file: fasta file
20 :param ttable: Translational table. By default, The Bacterial, Archaeal and Plant Plastid Code Table 11
21 """
22 self.records = []
9 self.data = {} 23 self.data = {}
10 self.df_output = None 24 self.df_output = None
11 self.seqfile = seq_file 25 self.seqfile = seq_file
12 self.__location__ = os.path.realpath(os.path.join(os.getcwd(), Folder)) 26 self.__location__ = os.path.realpath(os.path.join(os.getcwd(), folder))
13 27
14 with open(os.path.join(self.__location__,mdl), 'rb') as m: 28 with open(os.path.join(self.__location__, mdl), 'rb') as m:
15 self.model = pickle.load(m) 29 self.model = pickle.load(m)
16 if mdl == 'SVM4311': 30 if mdl == 'SVM4311':
17 with open(os.path.join(__location__,'d4311_SCALER'),'rb') as sl: 31 with open(os.path.join(__location__, 'd4311_SCALER'), 'rb') as sl:
18 self.scaler = pickle.load(sl) 32 self.scaler = pickle.load(sl)
19 self.name = mdl 33 self.name = mdl
20 elif mdl == 'ANN7185': 34 elif mdl == 'ANN7185':
21 with open(os.path.join(__location__,'d7185_SCALER'),'rb') as sc: 35 with open(os.path.join(__location__, 'd7185_SCALER'), 'rb') as sc:
22 self.scaler = pickle.load(sc) 36 self.scaler = pickle.load(sc)
23 self.name = mdl 37 self.name = mdl
24 38
25 for seq in SeqIO.parse(os.path.join(self.__location__,self.seqfile), 'fasta'): 39 for seq in SeqIO.parse(os.path.join(self.__location__, self.seqfile), 'fasta'):
40 record = []
26 DNA_seq = seq.seq 41 DNA_seq = seq.seq
27 AA_seq = DNA_seq.translate(table=ttable) 42 AA_seq = DNA_seq.translate(table=ttable)
28 descr_seq = seq.description.replace(' ','') 43 descr_seq = seq.description.replace(' ', '')
29 self.data[descr_seq]=[DNA_seq._data,AA_seq._data] 44 self.data[descr_seq] = [DNA_seq._data, AA_seq._data]
30 self.df = pd.DataFrame({'ID':list(self.data.keys()), 45 record.append(seq.description)
31 'DNAseq':[elem[0] for elem in self.data.values()], 46 record.append(DNA_seq._data)
32 'AAseq':[elem[1] for elem in self.data.values()]}) 47 record.append(AA_seq._data)
33 self.df = self.df.set_index('ID') 48 self.records.append(record)
49
50 columns = ['ID', 'DNAseq', 'AAseq']
51 self.df = pd.DataFrame(self.records, columns=columns)
52 #self.df = self.df.set_index('ID')
53 self.df.update(self.df.DNAseq[self.df.DNAseq.apply(type) == list].str[0])
54 self.df.update(self.df.AAseq[self.df.AAseq.apply(type) == list].str[0])
34 55
35 def Datastructure(self): 56 def Datastructure(self):
36 import pandas as pd 57 """
37 import pickle 58 Create dataset with all features
38 from Bio.SeqUtils.ProtParam import ProteinAnalysis 59 """
39 from local_ctd import CalculateCTD
40 from local_AAComposition import CalculateAAComposition, CalculateDipeptideComposition
41
42 def count_orf(orf_seq): 60 def count_orf(orf_seq):
61 """
62 Function to count open reading frames
63 :param orf_seq: sequence to analyze
64 :return: dictionary with open reading frames
65 """
43 dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0} 66 dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0}
44 for letter in range(len(orf_seq)): 67 for letter in range(len(orf_seq)):
45 for k in range(0, 4): 68 for k in range(0, 4):
46 if orf_seq[letter] in list(dic.keys())[k][-1]: 69 if str(orf_seq[letter]) in list(dic.keys())[k][-1]:
47 dic[list(dic.keys())[k]] += 1 70 dic[list(dic.keys())[k]] += 1
48 dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / ( 71 dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / (
49 dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100 72 dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100
50 return dic 73 return dic
51 74
52 def count_aa(aa_seq): 75 def count_aa(aa_seq):
76 """
77 Function to count amino acids
78 :param aa_seq: sequence to analyze
79 :return: dictionary with amino acid composition
80 """
53 dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0, 81 dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0,
54 'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0} 82 'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0}
55 for letter in range(len(aa_seq)): 83 for letter in range(len(aa_seq)):
56 if aa_seq[letter] in dic.keys(): 84 if aa_seq[letter] in dic.keys():
57 dic[aa_seq[letter]] += 1 85 dic[aa_seq[letter]] += 1
58 return dic 86 return dic
59 87
60 def sec_st_fr(aa_seq): 88 def sec_st_fr(aa_seq):
61 from Bio.SeqUtils.ProtParam import ProteinAnalysis 89 """
90 Function to analyze secondary structure. Helix, Turn and Sheet
91 :param aa_seq: sequence to analyze
92 :return: dictionary with composition of each secondary structure
93 """
62 st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0} 94 st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0}
63 stu = ProteinAnalysis(aa_seq).secondary_structure_fraction() 95 stu = ProteinAnalysis(aa_seq).secondary_structure_fraction()
64 st_dic['Helix'] = stu[0] 96 st_dic['Helix'] = stu[0]
65 st_dic['Turn'] = stu[1] 97 st_dic['Turn'] = stu[1]
66 st_dic['Sheet'] = stu[2] 98 st_dic['Sheet'] = stu[2]
94 "KS", "KT", "MQ", "MG", "MI", "FA", "FR", "FS", "FY", "PC", "PE", "PG", "PH", "PM", "PF", 126 "KS", "KT", "MQ", "MG", "MI", "FA", "FR", "FS", "FY", "PC", "PE", "PG", "PH", "PM", "PF",
95 "PT", "SA", "SD", "SC", "SQ", "SW", "TA", "TC", "TM", "WL", "WV", "YE", "YG", "YH", "YI", 127 "PT", "SA", "SD", "SC", "SQ", "SW", "TA", "TC", "TM", "WL", "WV", "YE", "YG", "YH", "YI",
96 "YL", "YK", "YM", "YS"]} 128 "YL", "YK", "YM", "YS"]}
97 129
98 self.df_output = self.df.copy() 130 self.df_output = self.df.copy()
99 self.df_output.drop(['DNAseq','AAseq'],axis=1,inplace=True) 131 self.df_output.drop(['DNAseq', 'AAseq'], axis=1, inplace=True)
100 dna_feat = {} 132 dna_feat = {}
101 aa_len = {} 133 aa_len = {}
102 aroma_dic = {} 134 aroma_dic = {}
103 iso_dic = {} 135 iso_dic = {}
104 aa_content = {} 136 aa_content = {}
105 st_dic_master = {} 137 st_dic_master = {}
106 CTD_dic = {} 138 CTD_dic = {}
107 dp = {} 139 dp = {}
140 self.df1 = self.df[['ID']].copy()
141 self.df.drop(['ID'], axis=1, inplace=True)
108 for i in range(len(self.df)): 142 for i in range(len(self.df)):
109 i_name = self.df.index[i] 143 i_name = self.df.index[i]
110 dna_feat[i_name] = count_orf(self.df.iloc[i]['DNAseq']) 144 dna_feat[i] = count_orf(self.df.iloc[i]['DNAseq'])
111 aa_len[i_name] = len(self.df.iloc[i]['AAseq']) 145 aa_len[i] = len(self.df.iloc[i]['AAseq'])
112 aroma_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity() 146 aroma_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity()
113 iso_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point() 147 iso_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point()
114 aa_content[i_name] = count_aa(self.df.iloc[i]['AAseq']) 148 aa_content[i] = count_aa(self.df.iloc[i]['AAseq'])
115 st_dic_master[i_name] = sec_st_fr(self.df.iloc[i]['AAseq']) 149 st_dic_master[i] = sec_st_fr(self.df.iloc[i]['AAseq'])
116 CTD_dic[i_name] = CalculateCTD(self.df.iloc[i]['AAseq']) 150 CTD_dic[i] = CalculateCTD(self.df.iloc[i]['AAseq'])
117 dp[i_name] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq']) 151 dp[i] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq'])
118 for j in self.df.index: 152 for j in self.df.index:
119 self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values 153 self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values
120 self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value 154 self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value
121 self.df.loc[j, 'Aromaticity'] = aroma_dic[j] 155 self.df.loc[j, 'Aromaticity'] = aroma_dic[j]
122 self.df.loc[j, 'IsoelectricPoint'] = iso_dic[j] 156 self.df.loc[j, 'IsoelectricPoint'] = iso_dic[j]
123 self.df.loc[j, aa_content[j].keys()] = aa_content[j].values() 157 self.df.loc[j, aa_content[j].keys()] = aa_content[j].values()
124 self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values() 158 self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values()
125 self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values() 159 self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values()
126 self.df.loc[j, dp[j].keys()] = dp[j].values() 160 self.df.loc[j, dp[j].keys()] = dp[j].values()
127 self.df.drop(['DNAseq','AAseq'],axis=1,inplace=True) 161 self.df.drop(['DNAseq', 'AAseq'], axis=1, inplace=True)
128 162
129 def Prediction(self): 163 def Prediction(self):
130 import os 164 """
131 import pickle 165 Predicts the percentage of each CDS being depolymerase.
132 import json 166 :return: None
133 import pandas as pd 167 """
134 import numpy as np 168 ft_scaler = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]), index=self.df.index, columns=self.df.columns)
135 from pathlib import Path
136 ft_scaler = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]), index=self.df.index,columns=self.df.columns)
137 ft_scaler = ft_scaler.drop(columns=[col for col in self.df if col not in self.feat[self.name]], axis=1) 169 ft_scaler = ft_scaler.drop(columns=[col for col in self.df if col not in self.feat[self.name]], axis=1)
138 scores = self.model.predict_proba(ft_scaler) 170 scores = self.model.predict_proba(ft_scaler)
139 pos_scores = np.empty((self.df.shape[0], 0), float) 171 pos_scores = np.empty((self.df.shape[0], 0), float)
140 for x in scores: 172 for x in scores:
141 pos_scores = np.append(pos_scores, round(x[1]*100)) 173 pos_scores = np.append(pos_scores, round(x[1]*100))
142 self.df_output.reset_index(inplace=True) 174 self.df_output.reset_index(inplace=True)
143 self.df_output['{} DPO Prediction (%)'.format(self.name)]= pos_scores 175 print(self.df_output.columns)
176 self.df_output.rename(columns={'index': 'CDS'}, inplace=True)
177 self.df_output['CDS'] += 1
178 self.df_output['{} DPO Prediction (%)'.format(self.name)] = pos_scores
179 print(self.df_output)
144 #self.df_output = self.df_output.sort_values(by='{} DPO Prediction (%)'.format(self.name), ascending=False) 180 #self.df_output = self.df_output.sort_values(by='{} DPO Prediction (%)'.format(self.name), ascending=False)
145 self.df_output.to_html('output.html', index=False, justify='center') 181 self.df_output.to_html('output.html', index=False, justify='center')
146 182
183
147 if __name__ == '__main__': 184 if __name__ == '__main__':
148 import os
149 import sys
150 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) 185 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
151 186
152 model = sys.argv[1] 187 model = sys.argv[1]
153 fasta_file = sys.argv[2] 188 fasta_file = sys.argv[2]
154 189
155 PDPO = PDPOPrediction(__location__,model,fasta_file) 190 PDPO = PDPOPrediction(__location__, model, fasta_file)
156 PDPO.Datastructure() 191 PDPO.Datastructure()
157 PDPO.Prediction() 192 PDPO.Prediction()
158 193