Mercurial > repos > jose_duarte > phagedpo
comparison DPOGALAXY.py @ 4:2152b92c19a1 draft
Uploaded
| author | jose_duarte |
|---|---|
| date | Wed, 24 Nov 2021 18:05:02 +0000 |
| parents | 525fe9bb114b |
| children |
comparison
equal
deleted
inserted
replaced
| 3:d4853259dec7 | 4:2152b92c19a1 |
|---|---|
| 1 #print('Hello world') | |
| 2 #PS C:\Users\joseduarte\Documents\pythonfiles\phage> python pdpo_test.py | |
| 3 #Hello world | |
| 4 | |
class PDPOPrediction:
    """Score every sequence of a FASTA file with one of the pickled DPO models.

    Typical use is ``PDPOPrediction(folder, model, fasta)`` followed by
    ``Datastructure()`` (feature extraction) and ``Prediction()`` (scoring and
    HTML report).

    Attributes set by ``__init__``:
        data: ``{description: [dna_string, protein_string]}`` per FASTA record.
        df: DataFrame indexed by ``ID`` with ``DNAseq`` / ``AAseq`` columns;
            replaced by the numeric feature table in ``Datastructure``.
        df_output: report table, ``None`` until ``Datastructure`` runs.
        model, scaler, name: unpickled estimator, unpickled scaler, model name.
    """

    # Feature columns shared by the SVM4311 and ANN4311 models.
    _FEATURES_4311 = (
        "DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet",
        "_PolarizabilityC1", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SecondaryStrC1",
        "_SecondaryStrC2", "_SecondaryStrC3", "_ChargeC2", "_ChargeC3", "_PolarityC1", "_NormalizedVDWVC1",
        "_NormalizedVDWVC3", "_HydrophobicityC2", "_HydrophobicityC3", "_SecondaryStrT23",
        "_NormalizedVDWVT13", "_PolarizabilityD1001", "_SolventAccessibilityD1001",
        "_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1025", "_ChargeD1075",
        "_ChargeD2001", "_ChargeD2025", "_ChargeD3025", "_ChargeD3050", "_PolarityD1075", "_PolarityD3025",
        "_NormalizedVDWVD1001", "_NormalizedVDWVD3050", "_HydrophobicityD2001", "DG", "DT", "GD",
    )

    # Feature columns used by the RF5748 model.
    _FEATURES_5748 = (
        "DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet",
        "_PolarizabilityC1", "_PolarizabilityC3", "_SecondaryStrC1", "_SecondaryStrC2", "_SecondaryStrC3",
        "_ChargeC1", "_ChargeC2", "_ChargeC3", "_NormalizedVDWVC1", "_NormalizedVDWVC3", "_HydrophobicityC2",
        "_HydrophobicityC3", "_SolventAccessibilityT12", "_SolventAccessibilityT13", "_SecondaryStrT23",
        "_NormalizedVDWVT23", "_HydrophobicityT12", "_PolarizabilityD1001", "_SolventAccessibilityD1001",
        "_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1001",
        "_SecondaryStrD1025", "_ChargeD1025", "_ChargeD1075", "_ChargeD2001", "_ChargeD2025", "_ChargeD3025",
        "_ChargeD3050", "_PolarityD1001", "_PolarityD1050", "_PolarityD1075", "_PolarityD3025",
        "_NormalizedVDWVD1001", "_NormalizedVDWVD3001", "_HydrophobicityD1001", "_HydrophobicityD2001", "NG",
        "DG", "DT", "GD", "GT",
    )

    # Model name -> columns handed to the estimator in ``Prediction``.
    feat = {
        "SVM4311": list(_FEATURES_4311),
        "RF5748": list(_FEATURES_5748),
        "ANN4311": list(_FEATURES_4311),
    }

    # Model name -> file name of the pickled scaler stored next to the model.
    _SCALER_FILES = {
        "SVM4311": "d4311_SCALER",
        "RF5748": "d5748_SCALER",
        "ANN4311": "d4311_SCALER",
    }

    def __init__(self, Folder='location', mdl='', seq_file='fasta_file.fasta', ttable=11):
        """Load the model, its scaler and the sequences to score.

        Args:
            Folder: directory holding the pickled model and scaler files;
                resolved against the current working directory.
            mdl: model name, one of ``SVM4311``, ``RF5748`` or ``ANN4311``;
                also the file name of the pickled model inside ``Folder``.
            seq_file: FASTA file of nucleotide sequences; joined onto
                ``Folder``, so an absolute path is used as given.
            ttable: translation table passed to ``Seq.translate``.

        Raises:
            ValueError: if ``mdl`` is not a supported model name.
        """
        # Fail fast, before the heavy imports and any disk access: an unknown
        # name would otherwise leave ``scaler``/``name`` unset and only break
        # later inside ``Prediction``.
        if mdl not in self._SCALER_FILES:
            raise ValueError(
                'Unknown model {!r}; expected one of: {}'.format(
                    mdl, ', '.join(sorted(self._SCALER_FILES))))

        import os
        import pickle

        import pandas as pd
        from Bio import SeqIO

        self.data = {}
        self.df_output = None
        self.seqfile = seq_file
        self.name = mdl
        self.__location__ = os.path.realpath(os.path.join(os.getcwd(), Folder))

        # NOTE(security): pickle executes arbitrary code on load; these files
        # must only ever come from the trusted tool directory.
        with open(os.path.join(self.__location__, mdl), 'rb') as model_file:
            self.model = pickle.load(model_file)
        # The scaler lives in the same folder as the model (self.__location__),
        # not in whatever module-level ``__location__`` happens to exist.
        scaler_path = os.path.join(self.__location__, self._SCALER_FILES[mdl])
        with open(scaler_path, 'rb') as scaler_file:
            self.scaler = pickle.load(scaler_file)

        fasta_path = os.path.join(self.__location__, self.seqfile)
        for record in SeqIO.parse(fasta_path, 'fasta'):
            dna_seq = record.seq
            aa_seq = dna_seq.translate(table=ttable)
            record_id = record.description.replace(' ', '')
            # str() is the public conversion; the private ``_data`` attribute
            # holds bytes on recent Biopython releases.
            self.data[record_id] = [str(dna_seq), str(aa_seq)]

        self.df = pd.DataFrame({
            'ID': list(self.data),
            'DNAseq': [pair[0] for pair in self.data.values()],
            'AAseq': [pair[1] for pair in self.data.values()],
        }).set_index('ID')

    @staticmethod
    def _count_orf(orf_seq):
        """Return A/C/T/G counts and GC percentage of an upper-case DNA string.

        Characters other than ``A``, ``C``, ``T``, ``G`` are ignored. The GC
        percentage is ``0.0`` when none of those four letters is present.
        """
        counts = {'DNA-' + base: orf_seq.count(base) for base in 'ACTG'}
        total = sum(counts.values())
        gc_count = counts['DNA-C'] + counts['DNA-G']
        counts['DNA-GC'] = (gc_count / total) * 100 if total else 0.0
        return counts

    @staticmethod
    def _count_aa(aa_seq):
        """Return the count of each of the 20 standard amino-acid letters."""
        return {residue: aa_seq.count(residue) for residue in 'GALVIPFSTCYNQDERKHWM'}

    def Datastructure(self):
        """Replace ``self.df`` with the numeric feature table.

        For every sequence the columns are produced in this fixed order:
        nucleotide counts and GC %, protein length, aromaticity, isoelectric
        point, amino-acid counts, secondary-structure fractions, propy CTD
        descriptors and propy dipeptide composition. The order is kept stable
        because the whole table is later passed to the scaler.

        Also initialises ``self.df_output`` as an ID-only frame for the report.
        """
        import pandas as pd
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        from propy import AAComposition
        from propy import CTD

        sequence_columns = ['DNAseq', 'AAseq']
        self.df_output = self.df.drop(columns=sequence_columns)

        records = []
        for dna_seq, aa_seq in zip(self.df['DNAseq'], self.df['AAseq']):
            analysis = ProteinAnalysis(aa_seq)
            fractions = analysis.secondary_structure_fraction()
            row = {}
            row.update(self._count_orf(dna_seq))
            row['AA_Len'] = len(aa_seq)
            row['Aromaticity'] = analysis.aromaticity()
            row['IsoelectricPoint'] = analysis.isoelectric_point()
            row.update(self._count_aa(aa_seq))
            row['Helix'] = fractions[0]
            row['Turn'] = fractions[1]
            row['Sheet'] = fractions[2]
            row.update(CTD.CalculateCTD(aa_seq))
            row.update(AAComposition.CalculateDipeptideComposition(aa_seq))
            records.append(row)

        if not records:
            # No sequences: keep an empty, ID-indexed frame.
            self.df = self.df.drop(columns=sequence_columns)
            return

        # Build the table in one shot; pin the column order explicitly
        # (first-seen order across all rows).
        columns = list(dict.fromkeys(key for row in records for key in row))
        self.df = pd.DataFrame(records, index=self.df.index, columns=columns)

    def Prediction(self):
        """Scale the features, score them and write ``output.html``.

        The scaler is applied to the full feature table; only the columns
        listed in ``self.feat[self.name]`` are then passed to the model. The
        second column of ``predict_proba`` is reported as a rounded percentage
        and the report is sorted by it, highest first. The file is written to
        the current working directory.
        """
        import numpy as np
        import pandas as pd

        scaled = pd.DataFrame(self.scaler.transform(self.df),
                              index=self.df.index, columns=self.df.columns)
        wanted = set(self.feat[self.name])
        selected = scaled[[col for col in scaled.columns if col in wanted]]

        scores = np.asarray(self.model.predict_proba(selected))
        pos_scores = np.round(scores[:, 1] * 100)

        score_column = '{} DPO Prediction (%)'.format(self.name)
        self.df_output = self.df_output.reset_index()
        self.df_output[score_column] = pos_scores
        self.df_output = self.df_output.sort_values(by=score_column, ascending=False)
        self.df_output.to_html('output.html', index=False, justify='center')
| 152 | |
if __name__ == '__main__':
    # Command-line entry point: <script> <model> <fasta_file>
    import os
    import sys

    # Exit with a usage message instead of a bare IndexError when an argument
    # is missing; extra arguments are still ignored, as before.
    if len(sys.argv) < 3:
        sys.exit('usage: {} <SVM4311|RF5748|ANN4311> <fasta_file>'.format(
            os.path.basename(sys.argv[0])))

    # Directory containing this script; the pickled model and scaler files are
    # looked up there. Kept as a module-level name because other code in this
    # file may read ``__location__`` directly.
    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    model = sys.argv[1]
    fasta_file = sys.argv[2]

    PDPO = PDPOPrediction(__location__, model, fasta_file)
    PDPO.Datastructure()
    PDPO.Prediction()
| 164 |
