# HG changeset patch
# User jose_duarte
# Date 1639306075 0
# Node ID d5e832622d3b594f77503c03c4ecd288af5447c5
# Parent  ba3d3530104bacf2ad1a01d02ecae249f67c562a
Deleted selected files

diff -r ba3d3530104b -r d5e832622d3b ANN7185
Binary file ANN7185 has changed
diff -r ba3d3530104b -r d5e832622d3b DPOGALAXY.py
--- a/DPOGALAXY.py	Wed Dec 08 10:30:36 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,162 +0,0 @@
-#print('Hello world')
-#PS C:\Users\joseduarte\Documents\pythonfiles\phage> python pdpo_test.py
-#Hello world
-
-class PDPOPrediction:
-    def __init__(self, Folder = 'location', mdl='',seq_file = 'fasta_file.fasta',ttable=11):
-        import pickle
-        import pandas as pd
-        from Bio import SeqIO
-        import os
-        from pathlib import Path
-        self.data = {}
-        self.df_output = None
-        self.seqfile = seq_file
-        self.__location__ = os.path.realpath(os.path.join(os.getcwd(), Folder))
-
-        with open(os.path.join(self.__location__,mdl), 'rb') as m:
-            self.model = pickle.load(m)
-        if mdl == 'SVM4311':
-            with open(os.path.join(__location__,'d4311_SCALER'),'rb') as sl:
-                self.scaler = pickle.load(sl)
-                self.name = mdl
-        elif mdl == 'ANN7185':
-            with open(os.path.join(__location__,'d7185_SCALER'),'rb') as sc:
-                self.scaler = pickle.load(sc)
-                self.name = mdl
-
-        for seq in SeqIO.parse(os.path.join(self.__location__,self.seqfile), 'fasta'):
-            #name_seq = seq.id
-            DNA_seq = seq.seq
-            AA_seq = DNA_seq.translate(table=ttable)
-            descr_seq = seq.description.replace(' ','')
-            self.data[descr_seq]=[DNA_seq._data,AA_seq._data]
-        self.df = pd.DataFrame({'ID':list(self.data.keys()),
-                              'DNAseq':[elem[0] for elem in self.data.values()],
-                              'AAseq':[elem[1] for elem in self.data.values()]})
-        self.df = self.df.set_index('ID')
-
-    def Datastructure(self):
-        import pandas as pd
-        import pickle
-        from Bio.SeqUtils.ProtParam import ProteinAnalysis
-        from propy import CTD
-        from propy import AAComposition
-
-        def count_orf(orf_seq):
-            dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0}
-            for letter in range(len(orf_seq)):
-                for k in range(0, 4):
-                    if orf_seq[letter] in list(dic.keys())[k][-1]:
-                        dic[list(dic.keys())[k]] += 1
-            dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / (
-                    dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100
-            return dic
-
-        def count_aa(aa_seq):
-            dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0,
-                   'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0}
-            for letter in range(len(aa_seq)):
-                if aa_seq[letter] in dic.keys():
-                    dic[aa_seq[letter]] += 1
-            return dic
-
-        def sec_st_fr(aa_seq):
-            from Bio.SeqUtils.ProtParam import ProteinAnalysis
-            st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0}
-            stu = ProteinAnalysis(aa_seq).secondary_structure_fraction()
-            st_dic['Helix'] = stu[0]
-            st_dic['Turn'] = stu[1]
-            st_dic['Sheet'] = stu[2]
-            return st_dic
-
-        self.feat={"SVM4311": ["DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet",
-                               "_PolarizabilityC1", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SecondaryStrC1",
-                               "_SecondaryStrC2", "_SecondaryStrC3", "_ChargeC2", "_ChargeC3", "_PolarityC1", "_NormalizedVDWVC1",
-                               "_NormalizedVDWVC3", "_HydrophobicityC2", "_HydrophobicityC3", "_SecondaryStrT23",
-                               "_NormalizedVDWVT13", "_PolarizabilityD1001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001",
-                               "_SolventAccessibilityD3001", "_SecondaryStrD1025", "_ChargeD1075","_ChargeD2001", "_ChargeD2025",
-                               "_ChargeD3025", "_ChargeD3050", "_PolarityD1075", "_PolarityD3025","_NormalizedVDWVD1001",
-                               "_NormalizedVDWVD3050", "_HydrophobicityD2001", "DG", "DT", "GD"],
-                   "ANN7185": ["DNA-GC", "AA_Len", "Aromaticity", "IsoelectricPoint", "G", "A", "L", "V", "I", "P", "F",
-                               "S", "T", "C", "Y", "N", "Q", "D", "E", "R", "K", "H", "W", "M", "Turn", "Sheet", "_PolarizabilityC1",
-                               "_PolarizabilityC2", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SolventAccessibilityC2",
-                               "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_ChargeC2", "_ChargeC3", "_PolarityC2",
-                               "_NormalizedVDWVC2", "_NormalizedVDWVC3", "_HydrophobicityC1", "_HydrophobicityC2", "_SecondaryStrT13",
-                               "_SecondaryStrT23", "_ChargeT12", "_ChargeT13", "_HydrophobicityT12", "_PolarizabilityD1001",
-                               "_PolarizabilityD1025", "_PolarizabilityD1050", "_PolarizabilityD2001", "_PolarizabilityD3025",
-                               "_PolarizabilityD3050", "_PolarizabilityD3075", "_SolventAccessibilityD1050", "_SolventAccessibilityD2001",
-                               "_SolventAccessibilityD2025", "_SolventAccessibilityD2050", "_SolventAccessibilityD3025",
-                               "_SolventAccessibilityD3050", "_SolventAccessibilityD3100", "_SecondaryStrD1025", "_SecondaryStrD1050",
-                               "_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD2050", "_SecondaryStrD2075", "_ChargeD1050",
-                               "_ChargeD1075", "_ChargeD1100", "_ChargeD2025", "_ChargeD3025", "_ChargeD3050", "_PolarityD2050",
-                               "_PolarityD3050", "_NormalizedVDWVD1001", "_NormalizedVDWVD1050", "_NormalizedVDWVD2001", "_NormalizedVDWVD2025",
-                               "_HydrophobicityD3001", "_HydrophobicityD3075", "AD", "AW", "AY", "RC", "RT", "NA", "NE",
-                               "NG", "NP", "DE", "DQ", "DG", "DT", "DY", "CG", "CL", "CY", "CV", "EN", "QA", "QR", "QE",
-                               "QI", "GA", "GR", "GD", "GQ", "GG", "GH", "GL", "GF", "GP", "GT", "GY", "HA", "HC", "HI",
-                               "HK", "HP", "IC", "IG", "IS", "IT", "IW", "LA", "LR", "LH", "LI", "LK", "LP", "KQ", "KH",
-                               "KS", "KT", "MQ", "MG", "MI", "FA", "FR", "FS", "FY", "PC", "PE", "PG", "PH", "PM", "PF",
-                               "PT", "SA", "SD", "SC", "SQ", "SW", "TA", "TC", "TM", "WL", "WV", "YE", "YG", "YH", "YI",
-                               "YL", "YK", "YM", "YS"]}
-
-        self.df_output = self.df.copy()
-        self.df_output.drop(['DNAseq','AAseq'],axis=1,inplace=True)
-        dna_feat = {}
-        aa_len = {}
-        aroma_dic = {}
-        iso_dic = {}
-        aa_content = {}
-        st_dic_master = {}
-        CTD_dic = {}
-        dp = {}
-        for i in range(len(self.df)):
-            i_name = self.df.index[i]
-            dna_feat[i_name] = count_orf(self.df.iloc[i]['DNAseq'])
-            aa_len[i_name] = len(self.df.iloc[i]['AAseq'])
-            aroma_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity()
-            iso_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point()
-            aa_content[i_name] = count_aa(self.df.iloc[i]['AAseq'])
-            st_dic_master[i_name] = sec_st_fr(self.df.iloc[i]['AAseq'])
-            CTD_dic[i_name] = CTD.CalculateCTD(self.df.iloc[i]['AAseq'])
-            dp[i_name] = AAComposition.CalculateDipeptideComposition(self.df.iloc[i]['AAseq'])
-        for j in self.df.index:
-            self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values
-            self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value
-            self.df.loc[j, 'Aromaticity'] = aroma_dic[j]
-            self.df.loc[j, 'IsoelectricPoint'] = iso_dic[j]
-            self.df.loc[j, aa_content[j].keys()] = aa_content[j].values()
-            self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values()
-            self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values()
-            self.df.loc[j, dp[j].keys()] = dp[j].values()
-        self.df.drop(['DNAseq','AAseq'],axis=1,inplace=True)
-
-    def Prediction(self):
-        import os
-        import pickle
-        import json
-        import pandas as pd
-        import numpy as np
-        from pathlib import Path
-        ft_scaler = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]), index=self.df.index,columns=self.df.columns)
-        ft_scaler = ft_scaler.drop(columns=[col for col in self.df if col not in self.feat[self.name]], axis=1)
-        scores = self.model.predict_proba(ft_scaler)
-        pos_scores = np.empty((self.df.shape[0], 0), float)
-        for x in scores:
-            pos_scores = np.append(pos_scores, round(x[1]*100))
-        self.df_output.reset_index(inplace=True)
-        self.df_output['{} DPO Prediction (%)'.format(self.name)]= pos_scores
-        #self.df_output = self.df_output.sort_values(by='{} DPO Prediction (%)'.format(self.name), ascending=False)
-        self.df_output.to_html('output.html', index=False, justify='center')
-
-if __name__ == '__main__':
-    import os
-    import sys
-    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-
-    model = sys.argv[1]
-    fasta_file = sys.argv[2]
-
-    PDPO = PDPOPrediction(__location__,model,fasta_file)
-    PDPO.Datastructure()
-    PDPO.Prediction()
-
diff -r ba3d3530104b -r d5e832622d3b PhageDPO.xml
--- a/PhageDPO.xml	Wed Dec 08 10:30:36 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,68 +0,0 @@
-<tool id="PhageDPO" name="PhageDPO" version="0.1.0" python_template_version="3.5">
-	<description>
-Phage Depolymerase Finder
-	</description>
-    <requirements>
-        <requirement type="package" version="1.78">biopython</requirement>
-        <requirement type="package" version="0.24.1">scikit-learn</requirement>
-        <requirement type="package">numpy</requirement>
-        <requirement type="package" version="1.2.3">pandas</requirement>
-		<requirement type="package" version="1.0.2">propy3</requirement>
-    </requirements>
-    <command detect_errors="exit_code"><![CDATA[
-        python '$__tool_directory__/DPOGALAXY.py' '$adv.model' '${input1}'
-    ]]></command>
-    <inputs>
-		<param type="data" name="input1" format="fasta" label="Fasta file"/>
-		<section name = 'adv' title= 'Advanced Options' expanded = 'False'>
-		<param type = "select" name="model" label="Model">
-		   <option value="SVM4311" selected="yes">SVM4311</option>
-		   <option value="ANN7185">ANN7185</option>
-		</param>
-	</section>
-		
-    </inputs>
-    <outputs>
-		<data name="output1" format="html" from_work_dir="output.html" 
-		label="DPO Prediction"/>
-    </outputs>
-	<tests>
-        <test>
-			<param name="model" value="SVM4311"/>
-	        <param name="input1" value="fasta_file.fasta"/>
-            <output name="output1" file="output.html"/>
-        </test>
-    </tests>
-    <help><![CDATA[
-	
-========
-PhageDPO
-========
-
-Predicts the existance of Phage Polysaccharide Depolymerase.
-
-PhageDPO is a python script that predicts the existance of depolymerases (DPOs) using supervised machine learning models. 
-Two different datasets were used to develop two models: The SVM model was built using a dataset with 45 features and 4311 examples (1437 positives and 2874 negatives) and the ANN model was created using a dataset with 166 features and 7185 examples (1437 positives and 5748 negatives).
-
-**Inputs:**
-
-* fasta file: fasta file format contain the nucleotide sequences.
-
-**Advanced options:**
-	
-* Model: selection of the model to run: the SVM model (default) or the ANN model. The SVM model focus on true positive detection while avoiding false positives. On the other hand, the ANN model uses more negative data ensuring that all DPOs are identified.	
-	
-**Outputs:**
-
-The tool outputs an html file containing the name of the sequence and the percentage of positive prediction for DPO.	
-
-**Requirements:**
-	
-* Biopython
-* Sklearn 
-* Numpy
-* Pandas 
-* Propy	
-
-    ]]></help>
-</tool>
\ No newline at end of file
diff -r ba3d3530104b -r d5e832622d3b SVM4311
Binary file SVM4311 has changed
diff -r ba3d3530104b -r d5e832622d3b d4311_SCALER
Binary file d4311_SCALER has changed
diff -r ba3d3530104b -r d5e832622d3b d7185_SCALER
Binary file d7185_SCALER has changed