Mercurial > repos > jose_duarte > phagedpo
comparison DPOGALAXY.py @ 35:a662eb3f87c2 draft
Uploaded
author | jose_duarte |
---|---|
date | Tue, 13 Jun 2023 09:53:42 +0000 |
parents | |
children | 9558da071ec9 |
comparison
equal
deleted
inserted
replaced
34:a20338e6e58f | 35:a662eb3f87c2 |
---|---|
1 import pickle | |
2 from Bio import SeqIO | |
3 import os | |
4 import pandas as pd | |
5 import numpy as np | |
6 from local_ctd import CalculateCTD | |
7 from local_AAComposition import CalculateDipeptideComposition | |
8 import sys | |
9 from Bio.SeqUtils.ProtParam import ProteinAnalysis | |
10 | |
11 | |
12 class PDPOPrediction: | |
13 | |
14 def __init__(self, folder='location', mdl='', seq_file='fasta_file.fasta', ttable=11): | |
15 """ | |
16 Initialize PhageDPO prediction. | |
17 :param folder: data path | |
18 :param mdl: ml model, in this case ANN or SVM | |
19 :param seq_file: fasta file | |
20 :param ttable: Translational table. By default, The Bacterial, Archaeal and Plant Plastid Code Table 11 | |
21 """ | |
22 self.records = [] | |
23 self.data = {} | |
24 self.df_output = None | |
25 self.seqfile = seq_file | |
26 self.__location__ = os.path.realpath(os.path.join(os.getcwd(), folder)) | |
27 | |
28 with open(os.path.join(self.__location__, mdl), 'rb') as m: | |
29 self.model0 = pickle.load(m) | |
30 self.model = self.model0.named_steps['clf'] | |
31 self.scaler = self.model0.named_steps['scl'] | |
32 self.selectk = self.model0.named_steps['selector'] | |
33 self.name = 'model' | |
34 | |
35 for seq in SeqIO.parse(os.path.join(self.__location__, self.seqfile), 'fasta'): | |
36 record = [] | |
37 DNA_seq = seq.seq | |
38 AA_seq = DNA_seq.translate(table=ttable) | |
39 descr_seq = seq.description.replace(' ', '') | |
40 self.data[descr_seq] = [DNA_seq._data, AA_seq._data] | |
41 record.append(seq.description) | |
42 record.append(DNA_seq._data) | |
43 record.append(AA_seq._data) | |
44 self.records.append(record) | |
45 | |
46 columns = ['ID', 'DNAseq', 'AAseq'] | |
47 self.df = pd.DataFrame(self.records, columns=columns) | |
48 #self.df = self.df.set_index('ID') | |
49 self.df.update(self.df.DNAseq[self.df.DNAseq.apply(type) == list].str[0]) | |
50 self.df.update(self.df.AAseq[self.df.AAseq.apply(type) == list].str[0]) | |
51 | |
52 def Datastructure(self): | |
53 """ | |
54 Create dataset with all features | |
55 """ | |
56 def count_orf(orf_seq): | |
57 """ | |
58 Function to count open reading frames | |
59 :param orf_seq: sequence to analyze | |
60 :return: dictionary with open reading frames | |
61 """ | |
62 dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0} | |
63 for letter in range(len(orf_seq)): | |
64 for k in range(0, 4): | |
65 if str(orf_seq[letter]) in list(dic.keys())[k][-1]: | |
66 dic[list(dic.keys())[k]] += 1 | |
67 dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / ( | |
68 dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100 | |
69 return dic | |
70 | |
71 def count_aa(aa_seq): | |
72 """ | |
73 Function to count amino acids | |
74 :param aa_seq: sequence to analyze | |
75 :return: dictionary with amino acid composition | |
76 """ | |
77 dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0, | |
78 'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0} | |
79 for letter in range(len(aa_seq)): | |
80 if aa_seq[letter] in dic.keys(): | |
81 dic[aa_seq[letter]] += 1 | |
82 return dic | |
83 | |
84 def sec_st_fr(aa_seq): | |
85 """ | |
86 Function to analyze secondary structure. Helix, Turn and Sheet | |
87 :param aa_seq: sequence to analyze | |
88 :return: dictionary with composition of each secondary structure | |
89 """ | |
90 st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0} | |
91 stu = ProteinAnalysis(aa_seq).secondary_structure_fraction() | |
92 st_dic['Helix'] = stu[0] | |
93 st_dic['Turn'] = stu[1] | |
94 st_dic['Sheet'] = stu[2] | |
95 return st_dic | |
96 | |
97 | |
98 self.df_output = self.df.copy() | |
99 self.df_output.drop(['DNAseq', 'AAseq'], axis=1, inplace=True) | |
100 dna_feat = {} | |
101 aa_len = {} | |
102 aroma_dic = {} | |
103 iso_dic = {} | |
104 aa_content = {} | |
105 st_dic_master = {} | |
106 CTD_dic = {} | |
107 dp = {} | |
108 self.df1 = self.df[['ID']].copy() | |
109 self.df.drop(['ID'], axis=1, inplace=True) | |
110 for i in range(len(self.df)): | |
111 i_name = self.df.index[i] | |
112 dna_feat[i] = count_orf(self.df.iloc[i]['DNAseq']) | |
113 aa_len[i] = len(self.df.iloc[i]['AAseq']) | |
114 aroma_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity() | |
115 iso_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point() | |
116 aa_content[i] = count_aa(self.df.iloc[i]['AAseq']) | |
117 st_dic_master[i] = sec_st_fr(self.df.iloc[i]['AAseq']) | |
118 CTD_dic[i] = CalculateCTD(self.df.iloc[i]['AAseq']) | |
119 dp[i] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq']) | |
120 for j in self.df.index: | |
121 self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values | |
122 self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value | |
123 self.df.loc[j, 'Aromaticity'] = aroma_dic[j] | |
124 self.df.loc[j, 'IsoelectricPoint'] = iso_dic[j] | |
125 self.df.loc[j, aa_content[j].keys()] = aa_content[j].values() | |
126 self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values() | |
127 self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values() | |
128 self.df.loc[j, dp[j].keys()] = dp[j].values() | |
129 self.df.drop(['DNAseq', 'AAseq'], axis=1, inplace=True) | |
130 | |
131 def Prediction(self): | |
132 """ | |
133 Predicts the percentage of each CDS being depolymerase. | |
134 :return: model prediction | |
135 """ | |
136 scores = self.model0.predict_proba(self.df.iloc[:, :]) | |
137 pos_scores = np.empty((self.df.shape[0], 0), float) | |
138 for x in scores: | |
139 pos_scores = np.append(pos_scores, round(x[1]*100)) | |
140 self.df_output.reset_index(inplace=True) | |
141 self.df_output.rename(columns={'index': 'CDS'}, inplace=True) | |
142 self.df_output['CDS'] += 1 | |
143 self.df_output['{} DPO Prediction (%)'.format(self.name)] = pos_scores | |
144 self.df_output.to_html('output.html', index=False, justify='center') | |
145 | |
146 | |
147 if __name__ == '__main__': | |
148 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) | |
149 | |
150 model = 'svm1495' | |
151 fasta_file = sys.argv[2] | |
152 | |
153 #model = "C:/Users/biosy/Desktop/phageDPO/DPO/svm1495" | |
154 #fasta_file = "C:/Users/biosy/Downloads/bacillus.fasta" | |
155 | |
156 PDPO = PDPOPrediction(__location__, model, fasta_file) | |
157 PDPO.Datastructure() | |
158 PDPO.Prediction() |