Mercurial > repos > jose_duarte > phagedpo
comparison DPOGALAXY.py @ 25:ce0de724097a draft
Uploaded
author | jose_duarte |
---|---|
date | Sun, 12 Dec 2021 10:49:25 +0000 |
parents | |
children | 5a0afb1578ea |
comparison
equal
deleted
inserted
replaced
24:18f34a3f8d5e | 25:ce0de724097a |
---|---|
1 | |
2 class PDPOPrediction: | |
3 def __init__(self, Folder = 'location', mdl='',seq_file = 'fasta_file.fasta',ttable=11): | |
4 import pickle | |
5 import pandas as pd | |
6 from Bio import SeqIO | |
7 import os | |
8 from pathlib import Path | |
9 self.data = {} | |
10 self.df_output = None | |
11 self.seqfile = seq_file | |
12 self.__location__ = os.path.realpath(os.path.join(os.getcwd(), Folder)) | |
13 | |
14 with open(os.path.join(self.__location__,mdl), 'rb') as m: | |
15 self.model = pickle.load(m) | |
16 if mdl == 'SVM4311': | |
17 with open(os.path.join(__location__,'d4311_SCALER'),'rb') as sl: | |
18 self.scaler = pickle.load(sl) | |
19 self.name = mdl | |
20 elif mdl == 'ANN7185': | |
21 with open(os.path.join(__location__,'d7185_SCALER'),'rb') as sc: | |
22 self.scaler = pickle.load(sc) | |
23 self.name = mdl | |
24 | |
25 for seq in SeqIO.parse(os.path.join(self.__location__,self.seqfile), 'fasta'): | |
26 DNA_seq = seq.seq | |
27 AA_seq = DNA_seq.translate(table=ttable) | |
28 descr_seq = seq.description.replace(' ','') | |
29 self.data[descr_seq]=[DNA_seq._data,AA_seq._data] | |
30 self.df = pd.DataFrame({'ID':list(self.data.keys()), | |
31 'DNAseq':[elem[0] for elem in self.data.values()], | |
32 'AAseq':[elem[1] for elem in self.data.values()]}) | |
33 self.df = self.df.set_index('ID') | |
34 | |
35 def Datastructure(self): | |
36 import pandas as pd | |
37 import pickle | |
38 from Bio.SeqUtils.ProtParam import ProteinAnalysis | |
39 from local_ctd import CalculateCTD | |
40 from local_AAComposition import CalculateAAComposition, CalculateDipeptideComposition | |
41 | |
42 def count_orf(orf_seq): | |
43 dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0} | |
44 for letter in range(len(orf_seq)): | |
45 for k in range(0, 4): | |
46 if orf_seq[letter] in list(dic.keys())[k][-1]: | |
47 dic[list(dic.keys())[k]] += 1 | |
48 dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / ( | |
49 dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100 | |
50 return dic | |
51 | |
52 def count_aa(aa_seq): | |
53 dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0, | |
54 'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0} | |
55 for letter in range(len(aa_seq)): | |
56 if aa_seq[letter] in dic.keys(): | |
57 dic[aa_seq[letter]] += 1 | |
58 return dic | |
59 | |
60 def sec_st_fr(aa_seq): | |
61 from Bio.SeqUtils.ProtParam import ProteinAnalysis | |
62 st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0} | |
63 stu = ProteinAnalysis(aa_seq).secondary_structure_fraction() | |
64 st_dic['Helix'] = stu[0] | |
65 st_dic['Turn'] = stu[1] | |
66 st_dic['Sheet'] = stu[2] | |
67 return st_dic | |
68 | |
69 self.feat={"SVM4311": ["DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet", | |
70 "_PolarizabilityC1", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SecondaryStrC1", | |
71 "_SecondaryStrC2", "_SecondaryStrC3", "_ChargeC2", "_ChargeC3", "_PolarityC1", "_NormalizedVDWVC1", | |
72 "_NormalizedVDWVC3", "_HydrophobicityC2", "_HydrophobicityC3", "_SecondaryStrT23", | |
73 "_NormalizedVDWVT13", "_PolarizabilityD1001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", | |
74 "_SolventAccessibilityD3001", "_SecondaryStrD1025", "_ChargeD1075","_ChargeD2001", "_ChargeD2025", | |
75 "_ChargeD3025", "_ChargeD3050", "_PolarityD1075", "_PolarityD3025","_NormalizedVDWVD1001", | |
76 "_NormalizedVDWVD3050", "_HydrophobicityD2001", "DG", "DT", "GD"], | |
77 "ANN7185": ["DNA-GC", "AA_Len", "Aromaticity", "IsoelectricPoint", "G", "A", "L", "V", "I", "P", "F", | |
78 "S", "T", "C", "Y", "N", "Q", "D", "E", "R", "K", "H", "W", "M", "Turn", "Sheet", "_PolarizabilityC1", | |
79 "_PolarizabilityC2", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SolventAccessibilityC2", | |
80 "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_ChargeC2", "_ChargeC3", "_PolarityC2", | |
81 "_NormalizedVDWVC2", "_NormalizedVDWVC3", "_HydrophobicityC1", "_HydrophobicityC2", "_SecondaryStrT13", | |
82 "_SecondaryStrT23", "_ChargeT12", "_ChargeT13", "_HydrophobicityT12", "_PolarizabilityD1001", | |
83 "_PolarizabilityD1025", "_PolarizabilityD1050", "_PolarizabilityD2001", "_PolarizabilityD3025", | |
84 "_PolarizabilityD3050", "_PolarizabilityD3075", "_SolventAccessibilityD1050", "_SolventAccessibilityD2001", | |
85 "_SolventAccessibilityD2025", "_SolventAccessibilityD2050", "_SolventAccessibilityD3025", | |
86 "_SolventAccessibilityD3050", "_SolventAccessibilityD3100", "_SecondaryStrD1025", "_SecondaryStrD1050", | |
87 "_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD2050", "_SecondaryStrD2075", "_ChargeD1050", | |
88 "_ChargeD1075", "_ChargeD1100", "_ChargeD2025", "_ChargeD3025", "_ChargeD3050", "_PolarityD2050", | |
89 "_PolarityD3050", "_NormalizedVDWVD1001", "_NormalizedVDWVD1050", "_NormalizedVDWVD2001", "_NormalizedVDWVD2025", | |
90 "_HydrophobicityD3001", "_HydrophobicityD3075", "AD", "AW", "AY", "RC", "RT", "NA", "NE", | |
91 "NG", "NP", "DE", "DQ", "DG", "DT", "DY", "CG", "CL", "CY", "CV", "EN", "QA", "QR", "QE", | |
92 "QI", "GA", "GR", "GD", "GQ", "GG", "GH", "GL", "GF", "GP", "GT", "GY", "HA", "HC", "HI", | |
93 "HK", "HP", "IC", "IG", "IS", "IT", "IW", "LA", "LR", "LH", "LI", "LK", "LP", "KQ", "KH", | |
94 "KS", "KT", "MQ", "MG", "MI", "FA", "FR", "FS", "FY", "PC", "PE", "PG", "PH", "PM", "PF", | |
95 "PT", "SA", "SD", "SC", "SQ", "SW", "TA", "TC", "TM", "WL", "WV", "YE", "YG", "YH", "YI", | |
96 "YL", "YK", "YM", "YS"]} | |
97 | |
98 self.df_output = self.df.copy() | |
99 self.df_output.drop(['DNAseq','AAseq'],axis=1,inplace=True) | |
100 dna_feat = {} | |
101 aa_len = {} | |
102 aroma_dic = {} | |
103 iso_dic = {} | |
104 aa_content = {} | |
105 st_dic_master = {} | |
106 CTD_dic = {} | |
107 dp = {} | |
108 for i in range(len(self.df)): | |
109 i_name = self.df.index[i] | |
110 dna_feat[i_name] = count_orf(self.df.iloc[i]['DNAseq']) | |
111 aa_len[i_name] = len(self.df.iloc[i]['AAseq']) | |
112 aroma_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity() | |
113 iso_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point() | |
114 aa_content[i_name] = count_aa(self.df.iloc[i]['AAseq']) | |
115 st_dic_master[i_name] = sec_st_fr(self.df.iloc[i]['AAseq']) | |
116 CTD_dic[i_name] = CalculateCTD(self.df.iloc[i]['AAseq']) | |
117 dp[i_name] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq']) | |
118 for j in self.df.index: | |
119 self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values | |
120 self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value | |
121 self.df.loc[j, 'Aromaticity'] = aroma_dic[j] | |
122 self.df.loc[j, 'IsoelectricPoint'] = iso_dic[j] | |
123 self.df.loc[j, aa_content[j].keys()] = aa_content[j].values() | |
124 self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values() | |
125 self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values() | |
126 self.df.loc[j, dp[j].keys()] = dp[j].values() | |
127 self.df.drop(['DNAseq','AAseq'],axis=1,inplace=True) | |
128 | |
129 def Prediction(self): | |
130 import os | |
131 import pickle | |
132 import json | |
133 import pandas as pd | |
134 import numpy as np | |
135 from pathlib import Path | |
136 ft_scaler = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]), index=self.df.index,columns=self.df.columns) | |
137 ft_scaler = ft_scaler.drop(columns=[col for col in self.df if col not in self.feat[self.name]], axis=1) | |
138 scores = self.model.predict_proba(ft_scaler) | |
139 pos_scores = np.empty((self.df.shape[0], 0), float) | |
140 for x in scores: | |
141 pos_scores = np.append(pos_scores, round(x[1]*100)) | |
142 self.df_output.reset_index(inplace=True) | |
143 self.df_output['{} DPO Prediction (%)'.format(self.name)]= pos_scores | |
144 #self.df_output = self.df_output.sort_values(by='{} DPO Prediction (%)'.format(self.name), ascending=False) | |
145 self.df_output.to_html('output.html', index=False, justify='center') | |
146 | |
147 if __name__ == '__main__': | |
148 import os | |
149 import sys | |
150 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) | |
151 | |
152 model = sys.argv[1] | |
153 fasta_file = sys.argv[2] | |
154 | |
155 PDPO = PDPOPrediction(__location__,model,fasta_file) | |
156 PDPO.Datastructure() | |
157 PDPO.Prediction() | |
158 |