# HG changeset patch # User jose_duarte # Date 1676455021 0 # Node ID 5a0afb1578ea85a43a0314535dfdc2aeb1141c47 # Parent 3d94608aea7a197d7de3bfed7c9af789c9107c1c Uploaded diff -r 3d94608aea7a -r 5a0afb1578ea DPOGALAXY.py --- a/DPOGALAXY.py Mon Dec 13 11:19:23 2021 +0000 +++ b/DPOGALAXY.py Wed Feb 15 09:57:01 2023 +0000 @@ -1,55 +1,83 @@ +import pickle +from Bio import SeqIO +import os +import pandas as pd +import numpy as np +from local_ctd import CalculateCTD +from local_AAComposition import CalculateDipeptideComposition +import sys +from Bio.SeqUtils.ProtParam import ProteinAnalysis + class PDPOPrediction: - def __init__(self, Folder = 'location', mdl='',seq_file = 'fasta_file.fasta',ttable=11): - import pickle - import pandas as pd - from Bio import SeqIO - import os - from pathlib import Path + + def __init__(self, folder='location', mdl='', seq_file='fasta_file.fasta', ttable=11): + """ + Initialize PhageDPO prediction. + :param folder: data path + :param mdl: ml model, in this case ANN or SVM + :param seq_file: fasta file + :param ttable: Translational table. By default, The Bacterial, Archaeal and Plant Plastid Code Table 11 + """ + self.records = [] self.data = {} self.df_output = None self.seqfile = seq_file - self.__location__ = os.path.realpath(os.path.join(os.getcwd(), Folder)) + self.__location__ = os.path.realpath(os.path.join(os.getcwd(), folder)) - with open(os.path.join(self.__location__,mdl), 'rb') as m: + with open(os.path.join(self.__location__, mdl), 'rb') as m: self.model = pickle.load(m) if mdl == 'SVM4311': - with open(os.path.join(__location__,'d4311_SCALER'),'rb') as sl: + with open(os.path.join(__location__, 'd4311_SCALER'), 'rb') as sl: self.scaler = pickle.load(sl) self.name = mdl elif mdl == 'ANN7185': - with open(os.path.join(__location__,'d7185_SCALER'),'rb') as sc: + with open(os.path.join(__location__, 'd7185_SCALER'), 'rb') as sc: self.scaler = pickle.load(sc) self.name = mdl - for seq in SeqIO.parse(os.path.join(self.__location__,self.seqfile), 'fasta'): + for seq in SeqIO.parse(os.path.join(self.__location__, self.seqfile), 'fasta'): + record = [] DNA_seq = seq.seq AA_seq = DNA_seq.translate(table=ttable) - descr_seq = seq.description.replace(' ','') - self.data[descr_seq]=[DNA_seq._data,AA_seq._data] - self.df = pd.DataFrame({'ID':list(self.data.keys()), - 'DNAseq':[elem[0] for elem in self.data.values()], - 'AAseq':[elem[1] for elem in self.data.values()]}) - self.df = self.df.set_index('ID') + descr_seq = seq.description.replace(' ', '') + self.data[descr_seq] = [DNA_seq._data, AA_seq._data] + record.append(seq.description) + record.append(DNA_seq._data) + record.append(AA_seq._data) + self.records.append(record) + + columns = ['ID', 'DNAseq', 'AAseq'] + self.df = pd.DataFrame(self.records, columns=columns) + #self.df = self.df.set_index('ID') + self.df.update(self.df.DNAseq[self.df.DNAseq.apply(type) == list].str[0]) + self.df.update(self.df.AAseq[self.df.AAseq.apply(type) == list].str[0]) def Datastructure(self): - import pandas as pd - import pickle - from Bio.SeqUtils.ProtParam import ProteinAnalysis - from local_ctd import CalculateCTD - from local_AAComposition import CalculateAAComposition, CalculateDipeptideComposition - + """ + Create dataset with all features + """ def count_orf(orf_seq): + """ + Function to count open reading frames + :param orf_seq: sequence to analyze + :return: dictionary with open reading frames + """ dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0} for letter in range(len(orf_seq)): for k in range(0, 4): - if orf_seq[letter] in list(dic.keys())[k][-1]: + if str(orf_seq[letter]) in list(dic.keys())[k][-1]: dic[list(dic.keys())[k]] += 1 dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / ( dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100 return dic def count_aa(aa_seq): + """ + Function to count amino acids + :param aa_seq: sequence to analyze + :return: dictionary with amino acid composition + """ dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0, 'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0} for letter in range(len(aa_seq)): @@ -58,7 +86,11 @@ return dic def sec_st_fr(aa_seq): - from Bio.SeqUtils.ProtParam import ProteinAnalysis + """ + Function to analyze secondary structure. Helix, Turn and Sheet + :param aa_seq: sequence to analyze + :return: dictionary with composition of each secondary structure + """ st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0} stu = ProteinAnalysis(aa_seq).secondary_structure_fraction() st_dic['Helix'] = stu[0] @@ -96,7 +128,7 @@ "YL", "YK", "YM", "YS"]} self.df_output = self.df.copy() - self.df_output.drop(['DNAseq','AAseq'],axis=1,inplace=True) + self.df_output.drop(['DNAseq', 'AAseq'], axis=1, inplace=True) dna_feat = {} aa_len = {} aroma_dic = {} @@ -105,16 +137,18 @@ st_dic_master = {} CTD_dic = {} dp = {} + self.df1 = self.df[['ID']].copy() + self.df.drop(['ID'], axis=1, inplace=True) for i in range(len(self.df)): i_name = self.df.index[i] - dna_feat[i_name] = count_orf(self.df.iloc[i]['DNAseq']) - aa_len[i_name] = len(self.df.iloc[i]['AAseq']) - aroma_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity() - iso_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point() - aa_content[i_name] = count_aa(self.df.iloc[i]['AAseq']) - st_dic_master[i_name] = sec_st_fr(self.df.iloc[i]['AAseq']) - CTD_dic[i_name] = CalculateCTD(self.df.iloc[i]['AAseq']) - dp[i_name] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq']) + dna_feat[i] = count_orf(self.df.iloc[i]['DNAseq']) + aa_len[i] = len(self.df.iloc[i]['AAseq']) + aroma_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity() + iso_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point() + aa_content[i] = count_aa(self.df.iloc[i]['AAseq']) + st_dic_master[i] = sec_st_fr(self.df.iloc[i]['AAseq']) + CTD_dic[i] = CalculateCTD(self.df.iloc[i]['AAseq']) + dp[i] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq']) for j in self.df.index: self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value @@ -124,35 +158,36 @@ self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values() self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values() self.df.loc[j, dp[j].keys()] = dp[j].values() - self.df.drop(['DNAseq','AAseq'],axis=1,inplace=True) + self.df.drop(['DNAseq', 'AAseq'], axis=1, inplace=True) def Prediction(self): - import os - import pickle - import json - import pandas as pd - import numpy as np - from pathlib import Path - ft_scaler = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]), index=self.df.index,columns=self.df.columns) + """ + Predicts the percentage of each CDS being depolymerase. + :return: None + """ + ft_scaler = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]), index=self.df.index, columns=self.df.columns) ft_scaler = ft_scaler.drop(columns=[col for col in self.df if col not in self.feat[self.name]], axis=1) scores = self.model.predict_proba(ft_scaler) pos_scores = np.empty((self.df.shape[0], 0), float) for x in scores: pos_scores = np.append(pos_scores, round(x[1]*100)) self.df_output.reset_index(inplace=True) - self.df_output['{} DPO Prediction (%)'.format(self.name)]= pos_scores + print(self.df_output.columns) + self.df_output.rename(columns={'index': 'CDS'}, inplace=True) + self.df_output['CDS'] += 1 + self.df_output['{} DPO Prediction (%)'.format(self.name)] = pos_scores + print(self.df_output) #self.df_output = self.df_output.sort_values(by='{} DPO Prediction (%)'.format(self.name), ascending=False) self.df_output.to_html('output.html', index=False, justify='center') + if __name__ == '__main__': - import os - import sys __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) model = sys.argv[1] fasta_file = sys.argv[2] - PDPO = PDPOPrediction(__location__,model,fasta_file) + PDPO = PDPOPrediction(__location__, model, fasta_file) PDPO.Datastructure() PDPO.Prediction()