annotate DPOGALAXY.py @ 12:808f6cdf2e9f draft

Uploaded
author jose_duarte
date Fri, 26 Nov 2021 12:07:54 +0000
parents 525fe9bb114b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
525fe9bb114b Uploaded
jose_duarte
parents:
diff changeset
1 #print('Hello world')
525fe9bb114b Uploaded
jose_duarte
parents:
diff changeset
2 #PS C:\Users\joseduarte\Documents\pythonfiles\phage> python pdpo_test.py
525fe9bb114b Uploaded
jose_duarte
parents:
diff changeset
3 #Hello world
525fe9bb114b Uploaded
jose_duarte
parents:
diff changeset
4
525fe9bb114b Uploaded
jose_duarte
parents:
diff changeset
class PDPOPrediction:
    """Score the sequences of a FASTA file with a pickled classifier.

    Typical use is ``__init__`` -> ``Datastructure()`` -> ``Prediction()``:

    * ``__init__`` unpickles the model (and its scaler) and reads the FASTA
      file, translating every record to a protein sequence.
    * ``Datastructure`` replaces ``self.df`` with one row of numeric features
      per sequence.
    * ``Prediction`` scales the features, keeps the columns listed for the
      selected model, and writes the scores to ``output.html``.
    """

    # Scaler pickle that accompanies each supported model name.
    _SCALER_FILES = {
        'SVM4311': 'd4311_SCALER',
        'RF5748': 'd5748_SCALER',
        'ANN4311': 'd4311_SCALER',
    }

    # Feature columns each model is given by ``Prediction``.
    MODEL_FEATURES = {
        "SVM4311": ["DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet",
                    "_PolarizabilityC1", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SecondaryStrC1",
                    "_SecondaryStrC2", "_SecondaryStrC3", "_ChargeC2", "_ChargeC3", "_PolarityC1", "_NormalizedVDWVC1",
                    "_NormalizedVDWVC3", "_HydrophobicityC2", "_HydrophobicityC3", "_SecondaryStrT23",
                    "_NormalizedVDWVT13", "_PolarizabilityD1001", "_SolventAccessibilityD1001",
                    "_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1025", "_ChargeD1075",
                    "_ChargeD2001", "_ChargeD2025", "_ChargeD3025", "_ChargeD3050", "_PolarityD1075", "_PolarityD3025",
                    "_NormalizedVDWVD1001", "_NormalizedVDWVD3050", "_HydrophobicityD2001", "DG", "DT", "GD"],
        "RF5748": ["DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet",
                   "_PolarizabilityC1", "_PolarizabilityC3", "_SecondaryStrC1", "_SecondaryStrC2", "_SecondaryStrC3",
                   "_ChargeC1", "_ChargeC2", "_ChargeC3", "_NormalizedVDWVC1", "_NormalizedVDWVC3", "_HydrophobicityC2",
                   "_HydrophobicityC3", "_SolventAccessibilityT12", "_SolventAccessibilityT13", "_SecondaryStrT23",
                   "_NormalizedVDWVT23", "_HydrophobicityT12", "_PolarizabilityD1001", "_SolventAccessibilityD1001",
                   "_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1001",
                   "_SecondaryStrD1025", "_ChargeD1025", "_ChargeD1075", "_ChargeD2001", "_ChargeD2025", "_ChargeD3025",
                   "_ChargeD3050", "_PolarityD1001", "_PolarityD1050", "_PolarityD1075", "_PolarityD3025",
                   "_NormalizedVDWVD1001", "_NormalizedVDWVD3001", "_HydrophobicityD1001", "_HydrophobicityD2001", "NG",
                   "DG", "DT", "GD", "GT"],
        "ANN4311": ["DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet",
                    "_PolarizabilityC1", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SecondaryStrC1",
                    "_SecondaryStrC2", "_SecondaryStrC3", "_ChargeC2", "_ChargeC3", "_PolarityC1", "_NormalizedVDWVC1",
                    "_NormalizedVDWVC3", "_HydrophobicityC2", "_HydrophobicityC3", "_SecondaryStrT23",
                    "_NormalizedVDWVT13", "_PolarizabilityD1001", "_SolventAccessibilityD1001",
                    "_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1025", "_ChargeD1075",
                    "_ChargeD2001", "_ChargeD2025", "_ChargeD3025", "_ChargeD3050", "_PolarityD1075", "_PolarityD3025",
                    "_NormalizedVDWVD1001", "_NormalizedVDWVD3050", "_HydrophobicityD2001", "DG", "DT", "GD"],
    }

    def __init__(self, Folder='location', mdl='', seq_file='fasta_file.fasta', ttable=11):
        """Load the model, its scaler and the sequences to score.

        Args:
            Folder: Directory (joined onto the current working directory)
                that holds the model pickle, the scaler pickle and the FASTA
                file.
            mdl: File name of the pickled model inside ``Folder``. A scaler
                is only loaded for 'SVM4311', 'RF5748' and 'ANN4311'.
            seq_file: FASTA file of nucleotide sequences, joined onto
                ``Folder``.
            ttable: Translation table passed to Biopython's
                ``Seq.translate``.

        Raises:
            OSError: If the model, scaler or FASTA file cannot be opened.
        """
        import os
        import pickle

        import pandas as pd
        from Bio import SeqIO

        self.data = {}
        self.df_output = None
        self.seqfile = seq_file
        self.__location__ = os.path.realpath(os.path.join(os.getcwd(), Folder))
        self.feat = {name: list(cols) for name, cols in self.MODEL_FEATURES.items()}

        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file; only point Folder at model/scaler files you trust.
        with open(os.path.join(self.__location__, mdl), 'rb') as model_file:
            self.model = pickle.load(model_file)

        # Always define scaler/name so Prediction() can report an unsupported
        # model clearly instead of failing on a missing attribute.
        self.name = mdl
        self.scaler = None
        scaler_name = self._SCALER_FILES.get(mdl)
        if scaler_name is not None:
            # Resolve against self.__location__: the scaler sits next to the
            # model, and no module-level __location__ exists on import.
            with open(os.path.join(self.__location__, scaler_name), 'rb') as scaler_file:
                self.scaler = pickle.load(scaler_file)

        fasta_path = os.path.join(self.__location__, self.seqfile)
        for record in SeqIO.parse(fasta_path, 'fasta'):
            dna_seq = record.seq
            aa_seq = dna_seq.translate(table=ttable)
            # The description with spaces removed is the row identifier;
            # a later record with the same description replaces the earlier.
            key = record.description.replace(' ', '')
            # str() is the public way to get the letters; the private
            # Seq._data attribute holds bytes in newer Biopython releases.
            self.data[key] = [str(dna_seq), str(aa_seq)]

        self.df = pd.DataFrame({
            'ID': list(self.data),
            'DNAseq': [pair[0] for pair in self.data.values()],
            'AAseq': [pair[1] for pair in self.data.values()],
        }).set_index('ID')

    @staticmethod
    def _count_orf(orf_seq):
        """Return A/C/T/G counts and GC percentage of a nucleotide string.

        Only upper-case A, C, T and G are counted. 'DNA-GC' is the share of
        C+G among those four, in percent, or 0.0 when none are present.
        """
        counts = {'DNA-' + base: orf_seq.count(base) for base in 'ACTG'}
        total = sum(counts.values())
        gc = counts['DNA-C'] + counts['DNA-G']
        # Guard the empty / all-ambiguous sequence instead of dividing by 0.
        counts['DNA-GC'] = (gc / total) * 100 if total else 0.0
        return counts

    @staticmethod
    def _count_aa(aa_seq):
        """Return the count of each of the 20 listed amino-acid letters."""
        return {aa: aa_seq.count(aa) for aa in 'GALVIPFSTCYNQDERKHWM'}

    @staticmethod
    def _sec_st_fr(aa_seq):
        """Return Biopython's secondary-structure fractions as a dict.

        The three values from ``secondary_structure_fraction()`` are stored,
        in the order returned, under 'Helix', 'Turn' and 'Sheet'.
        """
        from Bio.SeqUtils.ProtParam import ProteinAnalysis

        fractions = ProteinAnalysis(aa_seq).secondary_structure_fraction()
        return {'Helix': fractions[0], 'Turn': fractions[1], 'Sheet': fractions[2]}

    def Datastructure(self):
        """Replace ``self.df`` with one row of numeric features per sequence.

        Also sets ``self.df_output`` to an ID-indexed frame without columns,
        which ``Prediction`` fills, and refreshes ``self.feat``.

        Column order is: nucleotide counts, 'AA_Len', 'Aromaticity',
        'IsoelectricPoint', amino-acid counts, secondary-structure fractions,
        propy CTD descriptors, propy dipeptide composition.
        """
        import pandas as pd
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        from propy import AAComposition
        from propy import CTD

        self.feat = {name: list(cols) for name, cols in self.MODEL_FEATURES.items()}

        self.df_output = self.df.copy()
        self.df_output.drop(['DNAseq', 'AAseq'], axis=1, inplace=True)

        # Build every row as a plain dict and create the frame once, rather
        # than enlarging the frame cell by cell with .loc.
        rows = []
        for dna_seq, aa_seq in zip(self.df['DNAseq'], self.df['AAseq']):
            analysis = ProteinAnalysis(aa_seq)
            row = {}
            row.update(self._count_orf(dna_seq))
            row['AA_Len'] = len(aa_seq)
            row['Aromaticity'] = analysis.aromaticity()
            row['IsoelectricPoint'] = analysis.isoelectric_point()
            row.update(self._count_aa(aa_seq))
            row.update(self._sec_st_fr(aa_seq))
            row.update(CTD.CalculateCTD(aa_seq))
            row.update(AAComposition.CalculateDipeptideComposition(aa_seq))
            rows.append(row)

        self.df = pd.DataFrame(rows, index=self.df.index)

    def Prediction(self):
        """Score every sequence and write the ranking to ``output.html``.

        The whole feature frame is passed through the scaler, then reduced
        to the columns listed for the selected model. Column 1 of
        ``predict_proba`` is converted to a rounded percentage. The result
        is stored in ``self.df_output``, sorted from highest to lowest
        score, and written to ``output.html`` in the working directory.

        Raises:
            ValueError: If no scaler or feature list exists for the model
                name given to ``__init__``.
        """
        import numpy as np
        import pandas as pd

        if getattr(self, 'scaler', None) is None or self.name not in self.feat:
            raise ValueError(
                'No scaler/feature set available for model {!r}; expected one of {}'.format(
                    self.name, sorted(self._SCALER_FILES)))

        scaled = pd.DataFrame(self.scaler.transform(self.df),
                              index=self.df.index, columns=self.df.columns)
        wanted = self.feat[self.name]
        model_input = scaled.drop(columns=[col for col in scaled.columns if col not in wanted])

        scores = np.asarray(self.model.predict_proba(model_input))
        pos_scores = np.round(scores[:, 1] * 100)

        score_column = '{} DPO Prediction (%)'.format(self.name)
        self.df_output.reset_index(inplace=True)
        self.df_output[score_column] = pos_scores
        self.df_output = self.df_output.sort_values(by=score_column, ascending=False)
        self.df_output.to_html('output.html', index=False, justify='center')
525fe9bb114b Uploaded
jose_duarte
parents:
diff changeset
152
525fe9bb114b Uploaded
jose_duarte
parents:
diff changeset
if __name__ == '__main__':
    # Command-line entry point: score the sequences of a FASTA file with the
    # named model and write the ranking to output.html.
    import os
    import sys

    # Directory that contains this script. Kept as a module-level name
    # because code outside this block may read it as a global.
    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    # Fail with a usage message instead of a bare IndexError when the two
    # required arguments are missing.
    if len(sys.argv) < 3:
        sys.exit('usage: {} MODEL FASTA_FILE'.format(os.path.basename(sys.argv[0])))

    model = sys.argv[1]
    fasta_file = sys.argv[2]

    PDPO = PDPOPrediction(__location__, model, fasta_file)
    PDPO.Datastructure()
    PDPO.Prediction()
525fe9bb114b Uploaded
jose_duarte
parents:
diff changeset
164