changeset 0:6b226c5907a1 draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fastg2protlib commit e777bdb1d28b1ffee75cb1a8ad782a50c10a5358"
author galaxyp
date Fri, 07 Aug 2020 06:17:31 -0400
parents
children
files app_validate.py application.py fastg2protlib-peptides.xml fastg2protlib-validate.xml macros.xml test-data/mgf_tst.tab test-data/tst_valid.db test-data/two.fastg
diffstat 8 files changed, 314 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/app_validate.py	Fri Aug 07 06:17:31 2020 -0400
@@ -0,0 +1,32 @@
+import argparse
+
+import fastg2protlib.fastg2protlib as fg
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run peptides for fastg")
+    parser.add_argument("msgf", help="Path MSGF+ tabular results.")
+    parser.add_argument(
+        "-d",
+        "--dbname",
+        default="results.db",
+        help="Name for the results database. Defaults to results.db",
+    )
+    parser.add_argument(
+        "-f",
+        "--fdr",
+        default=0.10,
+        type=float,
+        help="FDR cutoff for accepting PSM validation.",
+    )
+    parser.add_argument(
+        "-x",
+        "--decoy_header",
+        default="XXX_",
+        help="String used for marking decoy proteins.",
+    )
+
+    args = parser.parse_args()
+    fg.verified_proteins(
+        args.msgf, fdr_level=0.10, decoy_header="XXX_", db_name=args.dbname
+    )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/application.py	Fri Aug 07 06:17:31 2020 -0400
@@ -0,0 +1,88 @@
+import argparse
+
+import fastg2protlib.fastg2protlib as fg
+
+expasy_rules = [
+    "arg-c",
+    "asp-n",
+    "bnps-skatole",
+    "caspase 1",
+    "caspase 2",
+    "caspase 3",
+    "caspase 4",
+    "caspase 5",
+    "caspase 6",
+    "caspase 7",
+    "caspase 8",
+    "caspase 9",
+    "caspase 10",
+    "chymotrypsin high specificity",
+    "chymotrypsin low specificity",
+    "clostripain",
+    "cnbr",
+    "enterokinase",
+    "factor xa",
+    "formic acid",
+    "glutamyl endopeptidase",
+    "granzyme b",
+    "hydroxylamine",
+    "iodosobenzoic acid",
+    "lysc",
+    "ntcb",
+    "pepsin ph1.3",
+    "pepsin ph2.0",
+    "proline endopeptidase",
+    "proteinase k",
+    "staphylococcal peptidase i",
+    "thermolysin",
+    "thrombin",
+    "trypsin",
+    "trypsin_exception",
+]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run peptides for fastg")
+    parser.add_argument("fastg", help="Path to Spades formatted FASTG.")
+    parser.add_argument(
+        "-d",
+        "--dbname",
+        default="results.db",
+        help="Name for the results database. Defaults to results.db",
+    )
+    parser.add_argument(
+        "-c",
+        "--cleavage",
+        default="trypsin",
+        help="Cleavage rule from ExPASy cleavage rules. Defaults to trypsin.",
+    )
+    parser.add_argument(
+        "-p",
+        "--min_protein_length",
+        default=55,
+        type=int,
+        help="Minimum protein length in number of amino acids. Defaults to 55.",
+    )
+    parser.add_argument(
+        "-m",
+        "--min_peptide_length",
+        default=8,
+        type=int,
+        help="Minimum peptide length in amino acids. Defaults to eight.",
+    )
+    parser.add_argument(
+        "-l", "--plots", default=True, type=bool, help="Generate diagnostic plots.",
+    )
+
+    args = parser.parse_args()
+
+    print(args)
+
+    fg.peptides_for_fastg(
+        fastg_filename=args.fastg,
+        db_name=args.dbname,
+        cleavage=args.cleavage,
+        min_protein_length=(args.min_protein_length * 3),
+        min_peptide_length=args.min_peptide_length,
+        create_plots=args.plots,
+    )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastg2protlib-peptides.xml	Fri Aug 07 06:17:31 2020 -0400
@@ -0,0 +1,62 @@
+<tool id="fastg2protlib-peptides" name="FASTG2Protlib-Peptides" version="@VERSION@">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <description>Generate FASTA from FASTG</description>
+    <expand macro="pkg_requirement" />
+    <!-- Text-valued parameters are single-quoted so a value containing spaces
+         or shell metacharacters reaches application.py as one argument. -->
+    <command detect_errors="exit_code">
+        <![CDATA[
+        python '$__tool_directory__/application.py'
+        -m $min_peptide_length
+        -p $min_protein_length
+        -c '$cleavage'
+        -d 'results.db'
+        -l $show_plots
+        '$fastg_file'
+        ]]>
+    </command>
+    <inputs>
+        <param name="fastg_file" type="data" format="fastg" label="FASTG file" />
+        <param name="cleavage" type="select" label="Peptide Cleavage">
+            <option value="trypsin" selected="true">Trypsin</option>
+            <expand macro="cleavages" />
+        </param>
+        <param name="min_protein_length" type="integer" value="55" label="Minimum Protein Length in Amino Acids" />
+        <param name="min_peptide_length" type="integer" value="8" label="Minimum Peptide Length in Amino Acids" />
+        <param name="show_plots" type="boolean" checked="true" label="Create Diagnostic Plots" />
+    </inputs>
+    <outputs>
+        <data name="peptide_fasta" format="txt" from_work_dir="peptide.fasta" label="${on_string} Peptides from FASTG" />
+        <data name="results_db" format="sqlite" from_work_dir="results.db" label="${on_string} Results DB" />
+        <!-- The four plot outputs are only created when "Create Diagnostic Plots" is checked. -->
+        <data name="aa_count_plot" format="png" from_work_dir="aa_count_chart.png" label="${on_string} AA Count Plot">
+            <filter>show_plots == True</filter>
+        </data>
+        <data name="fastg_length_plot" format="png" from_work_dir="fastg_seq_lengths.png" label="${on_string} FASTG Sequence Length Plot">
+            <filter>show_plots == True</filter>
+        </data>
+        <data name="protein_length_plot" format="png" from_work_dir="protein_seq_lengths.png" label="${on_string} Protein Sequence Length Plot">
+            <filter>show_plots == True</filter>
+        </data>
+        <data name="gc_pct_plot" format="png" from_work_dir="gc_pct.png" label="${on_string} GC Percent Plot">
+            <filter>show_plots == True</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="fastg_file" value="two.fastg" />
+            <param name="cleavage" value="trypsin" />
+            <param name="min_protein_length" value="20" />
+            <param name="min_peptide_length" value="8" />
+            <param name="show_plots" value="false" />
+            <output name="peptide_fasta">
+                <assert_contents>
+                    <has_text text="IFLPFSTHSR" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <expand macro="help-text" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastg2protlib-validate.xml	Fri Aug 07 06:17:31 2020 -0400
@@ -0,0 +1,43 @@
+<tool id="fastg2protlib-validate" name="FASTG2Protlib-Validate" version="@VERSION@">
+    <description>Validate a candidate protein library</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="pkg_requirement" />
+    <!-- Runs app_validate.py with the database, FDR cutoff, decoy prefix and MSGF+ table. -->
+    <command detect_errors="exit_code">
+        <![CDATA[
+        python '$__tool_directory__/app_validate.py' 
+        -d '$database_file'
+        -f $fdr_level
+        -x '$decoy_header'
+        '$tabular_file'
+        ]]>
+    </command>
+    <inputs>
+        <param name="tabular_file" type="data" format="txt" label="MSGF+ tabular file" />
+        <param name="database_file" type="data" format="sqlite" label="Database Name"/>
+        <param name="fdr_level" type="float" value="0.10" label="FDR value for validation."/>
+        <param name="decoy_header" type="text" value="XXX_" label="Decoy protein header"/>
+    </inputs>
+    <!-- Both outputs are collected from fixed file names in the job working directory. -->
+    <outputs>
+        <data name="protein_fasta" format="fasta" from_work_dir="protein.fasta" label="Validated protein library"/>
+        <data name="protein_score" format="csv" from_work_dir="protein_scores.csv" label="Protein scores"/>
+    </outputs>
+    <tests>
+        <!-- Asserts that the FASTA output contains the sequence text RYSRPLSHL. -->
+        <test>
+            <param name="tabular_file" value="mgf_tst.tab" />
+            <param name="database_file" value="tst_valid.db" />
+            <param name="fdr_level" value="0.10" />
+            <param name="decoy_header" value="XXX_" />
+            <output name="protein_fasta">
+                <assert_contents>
+                    <has_text text="RYSRPLSHL" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <expand macro="help-text" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Fri Aug 07 06:17:31 2020 -0400
@@ -0,0 +1,34 @@
+<macros>
+    <!-- Wrapper version; also used as the required fastg2protlib package version. -->
+    <token name="@VERSION@">1.0.2</token>
+    <!-- Additional cleavage choices, expanded inside a select parameter. -->
+    <xml name="cleavages">
+        <option value="arg-c">Arg-c</option>
+        <option value="asp-n">Asp-n</option>
+        <option value="thrombin">Thrombin</option>
+    </xml>
+    <!-- Help section expanded by the tool wrappers. -->
+    <xml name="help-text">
+        <help>
+            <![CDATA[
+
+                FASTG2Protlib creates a validated protein FASTA library starting from FASTG output. The tool
+                operates in two steps.
+
+                **Generate Peptides from FASTG**
+                
+                Use the tool to generate peptides cleaved from putative proteins generated from a FASTG file.
+
+                **Generate Verified Protein Library**
+               
+                Use the tool to generate a verified protein library with MSGF+ verified peptides as input
+            ]]>
+        </help>
+    </xml>
+    <!-- Package requirement expanded by the tool wrappers. -->
+    <xml name="pkg_requirement">
+        <requirements>
+            <requirement type="package" version="@VERSION@">fastg2protlib</requirement>
+        </requirements>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mgf_tst.tab	Fri Aug 07 06:17:31 2020 -0400
@@ -0,0 +1,25 @@
+#SpecFile	SpecID	ScanNum	FragMethod	Precursor	IsotopeError	PrecursorError(ppm)	Charge	Peptide	Protein	DeNovoScore	MSGFScore	SpecEValue	EValue	QValue	PepQValue
+wendt005_mickela_20200214_17647_12_V.mzML	index=8575	-1	CID	501.26144	1	14.785407	3	+42.011IFLPFSTHSR+0.984	Pep_1|Protein_1(pre=-,post=-)	74	36	2.7071892E-10	2.855757E-4	0.0	0.0
+wendt005_mickela_20200214_17647_12_V.mzML	index=10628	-1	CID	631.3283	0	16.628782	2	RTVWSN+0.984GTSPR	Pep_2|Protein_1_29(pre=-,post=P)	61	36	6.954425E-10	7.203602E-4	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=6020	-1	CID	607.79266	1	-11.606342	2	+42.011AQ+0.984YWLSQFK	Pep_3|Protein_1_28(pre=-,post=-)	23	10	9.667708E-10	9.6076715E-4	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=11469	-1	CID	470.6093	1	-7.4827867	3	RLLLQ+0.984C+57.021PRVPR	Pep_4|Protein_2(pre=-,post=L)	68	35	1.2193706E-9	0.0012630607	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=15017	-1	CID	799.7483	0	2.747454	3	YFM+15.995YSIQYILIFYVQYVK	Pep_5|Protein_2_29(pre=-,post=-)	2	-17	2.587433E-9	0.0029598887	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=16951	-1	CID	424.5418	1	-19.301939	3	RC+57.021GPLQASEPR	Pep_6|Protein_4_16_31_32(pre=-,post=E)	69	41	3.636947E-9	0.0037672587	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=34154	-1	CID	768.88916	1	13.068233	2	+42.011STPVELEFSQ+0.984VEK	Pep_7|Protein_5_34(pre=-,post=-)	77	33	4.083382E-9	0.0043801093	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=12592	-1	CID	701.0163	0	-5.6593018	3	YQSTPNIYYILYMYIR	Pep_8|Protein_5_6_34_40(pre=-,post=-)	77	19	6.9333055E-9	0.0077557205	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=12620	-1	CID	485.91306	0	-10.551063	3	M+15.995SGIITN+0.984EISVFK	Pep_9|Protein_7_9_22(pre=-,post=-)	55	28	7.184936E-9	0.007707044	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=5150	-1	CID	479.9201	0	-11.509454	3	YFEGKPVIEEVK	Pep_10|Protein_7_22(pre=-,post=-)	87	44	7.371949E-9	0.007776514	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=26411	-1	CID	708.3275	0	-15.423832	2	PAQ+0.984PTGTRPC+57.021SSR	Pep_11|Protein_8_21(pre=R,post=-)	41	15	7.913002E-9	0.008488016	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=22717	-1	CID	783.3668	0	11.843052	2	+42.011EEQDTFAVNSQQK	XXX_Pep_22060|Protein_2878(pre=-,post=-)	135	39	7.923481E-9	0.008499257	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=8020	-1	CID	629.31287	0	13.772342	2	+42.011FQEPQQPWR	XXX_Pep_16062|Protein_2307(pre=-,post=-)	31	14	8.15928E-9	0.008108611	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=20351	-1	CID	497.26334	0	0.79782444	2	LVPASGMYR	XXX_Pep_9843|Protein_1629(pre=-,post=-)	28	16	8.401295E-9	0.0083491225	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=18867	-1	CID	559.29803	1	19.583662	3	+42.011LIGTATSVDEAIAN+0.984EK	XXX_Pep_14112|Protein_2085(pre=-,post=-)	57	21	8.928303E-9	0.009987362	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=43084	-1	CID	653.8416	1	0.048319984	2	+42.011YSNYILYTVK	XXX_Pep_3426|Protein_702(pre=-,post=-)	13	0	9.61208E-9	0.009760836	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=43084	-1	CID	653.8416	0	14.842637	2	+42.011YSN+0.984YILYTVK	XXX_Pep_3426|Protein_702(pre=-,post=-)	13	0	9.61208E-9	0.009760836	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=32362	-1	CID	1036.4833	0	5.6531625	3	AWIGMQ+0.984WNGIEWNAM+15.995EWIQLEWNGK	XXX_Pep_14712|Protein_2168(pre=-,post=-)	13	-31	1.0415514E-8	0.012581826	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=10370	-1	CID	461.22098	0	0.13233389	2	N+0.984DTQMLAK	XXX_Pep_7493|Protein_1335_1348(pre=-,post=-)	83	51	1.0437349E-8	0.0101322755	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=21871	-1	CID	523.7984	0	1.3982916	2	+42.011AYVLNISPK	XXX_Pep_31555|Protein_3895(pre=-,post=-)	86	43	1.0667454E-8	0.010601209	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=15561	-1	CID	581.9749	0	17.304827	3	GLDWDLAADLEGN+0.984IIK	XXX_Pep_17726|Protein_2472(pre=-,post=-)	107	48	1.0803276E-8	0.012084738	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=22491	-1	CID	522.26483	0	-13.790032	2	+42.011Q+0.984LEAVQ+0.984VGR	XXX_Pep_12397|Protein_1871(pre=-,post=-)	46	31	1.09720055E-8	0.01090387	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=9573	-1	CID	623.81696	0	-13.893293	2	+42.011RHALDGPWPR	XXX_Pep_17806|Protein_2476(pre=-,post=Q)	28	13	1.1109479E-8	0.011281409	0.088495575	0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML	index=16000	-1	CID	381.54922	1	13.824602	3	+42.011N+0.984TYLSFLIK	XXX_Pep_29178|Protein_3650(pre=-,post=-)	66	42	1.1131118E-8	0.011061994	0.088495575	0.0952381
Binary file test-data/tst_valid.db has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/two.fastg	Fri Aug 07 06:17:31 2020 -0400
@@ -0,0 +1,40 @@
+>EDGE_1_length_84_cov_1.0:EDGE_3_length_84_cov_1.0;
+CGTTATTCGCGCCCACTCTCCCATTTATCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACCATTCTCGACA
+>EDGE_1_length_84_cov_1.0';
+TGTCGAGAATGGTAAGAATATCTTAGCGGGCAATCGCATCGCATCCGCTTGCGCGGATAAATGGGAGAGTGGGCGCGAATAACG
+>EDGE_2_length_84_cov_1.0:EDGE_3_length_84_cov_1.0;
+CTGGTCCTGTTGACTACAATGGGCCCAACTCAATCACAGCTCGAGCGCCTTGAATAACATACTCATCTCTATACATTCTCGACA
+>EDGE_2_length_84_cov_1.0':EDGE_3_length_84_cov_1.0';
+TGTCGAGAATGTATAGAGATGAGTATGTTATTCAAGGCGCTCGAGCTGTGATTGAGTTGGGCCCATTGTAGTCAACAGGACCAG
+>EDGE_3_length_84_cov_1.0:EDGE_2_length_84_cov_1.0,EDGE_4_length_84_cov_1.0;
+CATTCTCGACATGCTGAGCTGAGACGGCGTCGATGCATAGCGGACTTTCGGTCAGTCGCAATTCCTCACGAGACTGGTCCTGTT
+>EDGE_3_length_84_cov_1.0':EDGE_2_length_84_cov_1.0',EDGE_1_length_84_cov_1.0';
+AACAGGACCAGTCTCGTGAGGAATTGCGACTGACCGAAAGTCCGCTATGCATCGACGCCGTCTCAGCTCAGCATGTCGAGAATG
+>EDGE_4_length_84_cov_1.0:EDGE_5_length_84_cov_1.0;
+CTGGTCCTGTTACAGAGCTGGCGTACGCGTTGAACACTTCACAGATGATAGGGATTCGGGTAAAGAGCGTGTCATTGGGGGCTT
+>EDGE_4_length_84_cov_1.0':EDGE_3_length_84_cov_1.0';
+AAGCCCCCAATGACACGCTCTTTACCCGAATCCCTATCATCTGTGAAGTGTTCAACGCGTACGCCAGCTCTGTAACAGGACCAG
+>EDGE_5_length_84_cov_1.0;
+ATTGGGGGCTTCATACATAGAGCAAGGGCGTCGAACGGTCGTGAAAGTCTTAGTACCGCACGTACCAACTTACTGAGGATATTG
+>EDGE_5_length_84_cov_1.0':EDGE_4_length_84_cov_1.0',EDGE_6_length_84_cov_1.0';
+CAATATCCTCAGTAAGTTGGTACGTGCGGTACTAAGACTTTCACGACCGTTCGACGCCCTTGCTCTATGTATGAAGCCCCCAAT
+>EDGE_6_length_84_cov_1.0:EDGE_5_length_84_cov_1.0;
+AAGAGGCCGCCACCGTTTTAGGGGGGGAAGGTTGAAGATCTCCTCTTCTCATGACTGAACTCGCGAGGGCCGTATTGGGGGCTT
+>EDGE_6_length_84_cov_1.0':EDGE_8_length_84_cov_1.0';
+AAGCCCCCAATACGGCCCTCGCGAGTTCAGTCATGAGAAGAGGAGATCTTCAACCTTCCCCCCCTAAAACGGTGGCGGCCTCTT
+>EDGE_7_length_84_cov_1.0:EDGE_8_length_84_cov_1.0;
+AAGAGGCCGCCAAAGAACAAAGGCTTACTGTGCGCAGAGGAACGCCCATTTAGCGGCTGGCGTTTTGAATCCTTTTAATATTGT
+>EDGE_7_length_84_cov_1.0':EDGE_8_length_84_cov_1.0';
+ACAATATTAAAAGGATTCAAAACGCCAGCCGCTAAATGGGCGTTCCTCTGCGCACAGTAAGCCTTTGTTCTTTGGCGGCCTCTT
+>EDGE_8_length_84_cov_1.0:EDGE_7_length_84_cov_1.0,EDGE_6_length_84_cov_1.0;
+TTTAATATTGTTTAATCCAATTCCCTCATTTAGGACCCTACCAAGTCAACATTGGTATATGAATGCGACCTCGAAGAGGCCGCC
+>EDGE_8_length_84_cov_1.0':EDGE_7_length_84_cov_1.0',EDGE_9_length_84_cov_1.0';
+GGCGGCCTCTTCGAGGTCGCATTCATATACCAATGTTGACTTGGTAGGGTCCTAAATGAGGGAATTGGATTAAACAATATTAAA
+>EDGE_9_length_84_cov_1.0:EDGE_8_length_84_cov_1.0;
+TAAAAATGACAGTGGTTGGTGCTCTAAACTTCATTTGGTTAACTCGTGTATCAGCGCGATAGGCTGTTAGAGGTTTAATATTGT
+>EDGE_9_length_84_cov_1.0';
+ACAATATTAAACCTCTAACAGCCTATCGCGCTGATACACGAGTTAACCAAATGAAGTTTAGAGCACCAACCACTGTCATTTTTA
+>EDGE_10_length_84_cov_1.0;
+ATGGCAAGGTACTTCCGGTCTTAATGAATGGCCGGGAAAGGTACGCACGCGGTATGGGGGGGTGAAGGGGCGAATAGACAGGCT
+>EDGE_10_length_84_cov_1.0':EDGE_10_length_84_cov_1.0;
+AGCCTGTCTATTCGCCCCTTCACCCCCCCATACCGCGTGCGTACCTTTCCCGGCCATTCATTAAGACCGGAAGTACCTTGCCAT