Mercurial > repos > jose_duarte > phagedpo
diff DPOGALAXY.py @ 4:2152b92c19a1 draft
Uploaded
author | jose_duarte |
---|---|
date | Wed, 24 Nov 2021 18:05:02 +0000 |
parents | 525fe9bb114b |
children |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DPOGALAXY.py Wed Nov 24 18:05:02 2021 +0000 @@ -0,0 +1,164 @@
"""Score the sequences of a FASTA file with a pickled DPO classifier.

Run as a script with two arguments, the model name and the FASTA file::

    python DPOGALAXY.py <model> <fasta_file>

The model pickle and its scaler pickle are looked up next to this file and
the ranked predictions are written to ``output.html`` in the working
directory.
"""

import os
import pickle
import sys
from collections import Counter


class PDPOPrediction:
    """Load a model, derive sequence features and write DPO predictions.

    Typical use is ``PDPOPrediction(...)`` followed by ``Datastructure()``
    and then ``Prediction()``.

    Attributes set by ``__init__``:
        data: ``{record description without spaces: [DNA string, AA string]}``.
        df: DataFrame indexed by ``ID`` with ``DNAseq`` / ``AAseq`` columns;
            replaced by the numeric feature table in ``Datastructure``.
        df_output: ``None`` until ``Datastructure`` runs.
        model: object unpickled from ``<Folder>/<mdl>``.
        scaler: object unpickled from the matching scaler file, or ``None``
            when ``mdl`` is not one of the supported model names.
        name: the ``mdl`` argument.
    """

    # Scaler pickle that accompanies each supported model name.
    _SCALER_FILES = {
        'SVM4311': 'd4311_SCALER',
        'RF5748': 'd5748_SCALER',
        'ANN4311': 'd4311_SCALER',
    }

    # Key order here fixes the column order of the feature table.
    _NUCLEOTIDES = ('A', 'C', 'T', 'G')
    _AMINO_ACIDS = ('G', 'A', 'L', 'V', 'I', 'P', 'F', 'S', 'T', 'C',
                    'Y', 'N', 'Q', 'D', 'E', 'R', 'K', 'H', 'W', 'M')

    # SVM4311 and ANN4311 select exactly the same columns.
    _FEATURES_4311 = (
        "DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet",
        "_PolarizabilityC1", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SecondaryStrC1",
        "_SecondaryStrC2", "_SecondaryStrC3", "_ChargeC2", "_ChargeC3", "_PolarityC1", "_NormalizedVDWVC1",
        "_NormalizedVDWVC3", "_HydrophobicityC2", "_HydrophobicityC3", "_SecondaryStrT23",
        "_NormalizedVDWVT13", "_PolarizabilityD1001", "_SolventAccessibilityD1001",
        "_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1025", "_ChargeD1075",
        "_ChargeD2001", "_ChargeD2025", "_ChargeD3025", "_ChargeD3050", "_PolarityD1075", "_PolarityD3025",
        "_NormalizedVDWVD1001", "_NormalizedVDWVD3050", "_HydrophobicityD2001", "DG", "DT", "GD",
    )
    _FEATURES_5748 = (
        "DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet",
        "_PolarizabilityC1", "_PolarizabilityC3", "_SecondaryStrC1", "_SecondaryStrC2", "_SecondaryStrC3",
        "_ChargeC1", "_ChargeC2", "_ChargeC3", "_NormalizedVDWVC1", "_NormalizedVDWVC3", "_HydrophobicityC2",
        "_HydrophobicityC3", "_SolventAccessibilityT12", "_SolventAccessibilityT13", "_SecondaryStrT23",
        "_NormalizedVDWVT23", "_HydrophobicityT12", "_PolarizabilityD1001", "_SolventAccessibilityD1001",
        "_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1001",
        "_SecondaryStrD1025", "_ChargeD1025", "_ChargeD1075", "_ChargeD2001", "_ChargeD2025", "_ChargeD3025",
        "_ChargeD3050", "_PolarityD1001", "_PolarityD1050", "_PolarityD1075", "_PolarityD3025",
        "_NormalizedVDWVD1001", "_NormalizedVDWVD3001", "_HydrophobicityD1001", "_HydrophobicityD2001", "NG",
        "DG", "DT", "GD", "GT",
    )
    _FEATURES = {
        'SVM4311': _FEATURES_4311,
        'RF5748': _FEATURES_5748,
        'ANN4311': _FEATURES_4311,
    }

    def __init__(self, Folder='location', mdl='', seq_file='fasta_file.fasta', ttable=11):
        """Load the model, its scaler and the sequences to score.

        Args:
            Folder: directory holding the model and scaler pickles; resolved
                against the current working directory.
            mdl: file name of the pickled model inside ``Folder``. A scaler
                is loaded only for 'SVM4311', 'RF5748' and 'ANN4311'.
            seq_file: FASTA file of DNA sequences, joined onto ``Folder``
                (an absolute path is therefore used as given).
            ttable: translation table passed to ``Seq.translate``.

        Raises:
            OSError: if the model, scaler or FASTA file cannot be opened.
        """
        # Third-party imports stay local so the module itself can be
        # imported without Biopython/pandas installed.
        import pandas as pd
        from Bio import SeqIO

        self.data = {}
        self.df_output = None
        self.seqfile = seq_file
        self.name = mdl
        self.scaler = None
        self.__location__ = os.path.realpath(os.path.join(os.getcwd(), Folder))

        # NOTE(security): pickle.load executes arbitrary code; only point
        # this at model/scaler files from a trusted source.
        with open(os.path.join(self.__location__, mdl), 'rb') as model_file:
            self.model = pickle.load(model_file)

        scaler_file = self._SCALER_FILES.get(mdl)
        if scaler_file is not None:
            # Use the instance attribute: a bare ``__location__`` only exists
            # as a global when this file is executed as a script.
            with open(os.path.join(self.__location__, scaler_file), 'rb') as sc:
                self.scaler = pickle.load(sc)

        fasta_path = os.path.join(self.__location__, self.seqfile)
        for record in SeqIO.parse(fasta_path, 'fasta'):
            dna_seq = record.seq
            aa_seq = dna_seq.translate(table=ttable)
            record_id = record.description.replace(' ', '')
            # str() is the public way to get the letters; the private
            # ``_data`` attribute is bytes on recent Biopython releases.
            self.data[record_id] = [str(dna_seq), str(aa_seq)]

        self.df = pd.DataFrame({
            'ID': list(self.data),
            'DNAseq': [seqs[0] for seqs in self.data.values()],
            'AAseq': [seqs[1] for seqs in self.data.values()],
        }).set_index('ID')

    @staticmethod
    def _count_orf(orf_seq):
        """Return A/C/T/G counts and GC percentage of a DNA string.

        Only upper-case A, C, T and G are counted. 'DNA-GC' is the share of
        C+G among the counted bases in percent, or 0.0 when none is present.
        """
        tally = Counter(orf_seq)
        counts = {'DNA-' + base: tally[base] for base in PDPOPrediction._NUCLEOTIDES}
        total = sum(counts.values())
        gc_count = counts['DNA-C'] + counts['DNA-G']
        # Guard the empty / all-ambiguous sequence instead of dividing by zero.
        counts['DNA-GC'] = (gc_count / total) * 100 if total else 0.0
        return counts

    @staticmethod
    def _count_aa(aa_seq):
        """Return the count of each of the 20 standard amino-acid letters.

        Any other character (for example '*' or 'X') is ignored.
        """
        tally = Counter(aa_seq)
        return {aa: tally[aa] for aa in PDPOPrediction._AMINO_ACIDS}

    def Datastructure(self):
        """Replace ``self.df`` with the numeric feature table.

        For every sequence the columns are, in order: nucleotide counts and
        GC percentage, 'AA_Len', 'Aromaticity', 'IsoelectricPoint', the 20
        amino-acid counts, 'Helix'/'Turn'/'Sheet', the propy CTD descriptors
        and the propy dipeptide composition. Also sets ``self.feat`` (the
        columns each model uses) and resets ``self.df_output`` to an empty
        frame carrying only the ``ID`` index.
        """
        import pandas as pd
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        from propy import AAComposition
        from propy import CTD

        self.feat = {name: list(cols) for name, cols in self._FEATURES.items()}
        self.df_output = self.df.drop(columns=['DNAseq', 'AAseq'])

        rows = []
        for dna_seq, aa_seq in zip(self.df['DNAseq'], self.df['AAseq']):
            analysis = ProteinAnalysis(aa_seq)
            structure = analysis.secondary_structure_fraction()

            # Insertion order of this dict defines the column order that is
            # later handed to the scaler, so keep it stable.
            row = {}
            row.update(self._count_orf(dna_seq))
            row['AA_Len'] = len(aa_seq)
            row['Aromaticity'] = analysis.aromaticity()
            row['IsoelectricPoint'] = analysis.isoelectric_point()
            row.update(self._count_aa(aa_seq))
            row['Helix'] = structure[0]
            row['Turn'] = structure[1]
            row['Sheet'] = structure[2]
            row.update(CTD.CalculateCTD(aa_seq))
            row.update(AAComposition.CalculateDipeptideComposition(aa_seq))
            rows.append(row)

        # One construction instead of enlarging the frame cell by cell.
        self.df = pd.DataFrame(rows, index=self.df.index)

    def Prediction(self):
        """Scale the features, score them and write ``output.html``.

        Must be called after ``Datastructure``. All feature columns are
        passed through the scaler, the subset listed for the current model
        is fed to ``model.predict_proba``, and the second probability column
        is stored as a rounded percentage in
        ``'<name> DPO Prediction (%)'``. ``self.df_output`` ends up sorted by
        that column in descending order and is written, without the index,
        to ``output.html`` in the working directory.

        Raises:
            ValueError: if no scaler was loaded because the model name is
                not one of the supported ones.
        """
        if self.scaler is None:
            raise ValueError(
                'Unsupported model {!r}; expected one of {}'.format(
                    self.name, ', '.join(sorted(self._SCALER_FILES))))

        import numpy as np
        import pandas as pd

        scaled = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]),
                              index=self.df.index, columns=self.df.columns)
        wanted = set(self.feat[self.name])
        # Keep the table's own column order, not the order of the feature list.
        selected = scaled[[col for col in self.df.columns if col in wanted]]

        scores = np.asarray(self.model.predict_proba(selected))
        pos_scores = np.round(scores[:, 1] * 100).astype(float)

        column = '{} DPO Prediction (%)'.format(self.name)
        self.df_output.reset_index(inplace=True)
        self.df_output[column] = pos_scores
        self.df_output = self.df_output.sort_values(by=column, ascending=False)
        self.df_output.to_html('output.html', index=False, justify='center')


def main():
    """Command-line entry point: ``DPOGALAXY.py <model> <fasta_file>``."""
    if len(sys.argv) < 3:
        sys.exit('usage: {} <model> <fasta_file>'.format(sys.argv[0]))

    # Model and scaler pickles are expected next to this script.
    location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    model = sys.argv[1]
    fasta_file = sys.argv[2]

    predictor = PDPOPrediction(location, model, fasta_file)
    predictor.Datastructure()
    predictor.Prediction()


if __name__ == '__main__':
    main()