Mercurial > repos > galaxyp > fastg2protlib
changeset 0:6b226c5907a1 draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fastg2protlib commit e777bdb1d28b1ffee75cb1a8ad782a50c10a5358"
author | galaxyp |
---|---|
date | Fri, 07 Aug 2020 06:17:31 -0400 |
parents | |
children | |
files | app_validate.py application.py fastg2protlib-peptides.xml fastg2protlib-validate.xml macros.xml test-data/mgf_tst.tab test-data/tst_valid.db test-data/two.fastg |
diffstat | 8 files changed, 314 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""Command-line wrapper for protein library validation (app_validate.py).

Parses the path to an MSGF+ tabular result file together with the results
database name, the FDR cutoff and the decoy marker, and forwards them to
``fastg2protlib.fastg2protlib.verified_proteins``.
"""
import argparse


def build_parser():
    """Return the argument parser for the validation command line.

    Returns:
        argparse.ArgumentParser: parser exposing the positional ``msgf``
        path and the ``--dbname``, ``--fdr`` and ``--decoy_header`` options.
    """
    parser = argparse.ArgumentParser(
        description="Validate a candidate protein library"
    )
    parser.add_argument("msgf", help="Path MSGF+ tabular results.")
    parser.add_argument(
        "-d",
        "--dbname",
        default="results.db",
        help="Name for the results database. Defaults to results.db",
    )
    parser.add_argument(
        "-f",
        "--fdr",
        default=0.10,
        type=float,
        help="FDR cutoff for accepting PSM validation.",
    )
    parser.add_argument(
        "-x",
        "--decoy_header",
        default="XXX_",
        help="String used for marking decoy proteins.",
    )
    return parser


def verification_kwargs(args):
    """Map parsed options onto ``verified_proteins`` keyword arguments.

    The FDR cutoff and the decoy marker are read from the parsed command
    line instead of being fixed to their default values, so ``--fdr`` and
    ``--decoy_header`` actually take effect.

    Args:
        args: namespace produced by the parser from :func:`build_parser`.

    Returns:
        dict: the ``fdr_level``, ``decoy_header`` and ``db_name`` keywords.
    """
    return {
        "fdr_level": args.fdr,
        "decoy_header": args.decoy_header,
        "db_name": args.dbname,
    }


def main(argv=None):
    """Parse *argv* and run the validation step.

    Args:
        argv: list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # Imported here rather than at module level so that option parsing
    # (including --help and usage errors) does not require the library.
    import fastg2protlib.fastg2protlib as fg

    fg.verified_proteins(args.msgf, **verification_kwargs(args))


if __name__ == "__main__":
    main()
"""Command-line wrapper for peptide generation from FASTG (application.py).

Parses the FASTG path and the digestion/length options and forwards them to
``fastg2protlib.fastg2protlib.peptides_for_fastg``.
"""
import argparse

# Names of cleavage rules. The list is not referenced by the argument parser
# below; presumably it documents the values accepted for --cleavage
# (NOTE(review): confirm against fastg2protlib before enforcing it as choices).
expasy_rules = [
    "arg-c",
    "asp-n",
    "bnps-skatole",
    "caspase 1",
    "caspase 2",
    "caspase 3",
    "caspase 4",
    "caspase 5",
    "caspase 6",
    "caspase 7",
    "caspase 8",
    "caspase 9",
    "caspase 10",
    "chymotrypsin high specificity",
    "chymotrypsin low specificity",
    "clostripain",
    "cnbr",
    "enterokinase",
    "factor xa",
    "formic acid",
    "glutamyl endopeptidase",
    "granzyme b",
    "hydroxylamine",
    "iodosobenzoic acid",
    "lysc",
    "ntcb",
    "pepsin ph1.3",
    "pepsin ph2.0",
    "proline endopeptidase",
    "proteinase k",
    "staphylococcal peptidase i",
    "thermolysin",
    "thrombin",
    "trypsin",
    "trypsin_exception",
]

# Spellings accepted by parse_bool. The empty string stays falsy, matching
# what bool("") returned when the option used type=bool.
_TRUE_STRINGS = frozenset(["true", "t", "yes", "y", "on", "1"])
_FALSE_STRINGS = frozenset(["false", "f", "no", "n", "off", "0", ""])


def parse_bool(value):
    """Convert a command-line string into a boolean.

    ``type=bool`` treats every non-empty string (including ``"false"``) as
    True, so the flag could never be switched off from the command line.

    Args:
        value: text supplied after ``-l``/``--plots``.

    Returns:
        bool: True or False according to the (case-insensitive) spelling.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean
            spelling; argparse reports this as a usage error.
    """
    normalized = value.strip().lower()
    if normalized in _TRUE_STRINGS:
        return True
    if normalized in _FALSE_STRINGS:
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got %r" % (value,)
    )


def build_parser():
    """Return the argument parser for the peptide-generation command line.

    Returns:
        argparse.ArgumentParser: parser exposing the positional ``fastg``
        path and the ``--dbname``, ``--cleavage``, ``--min_protein_length``,
        ``--min_peptide_length`` and ``--plots`` options.
    """
    parser = argparse.ArgumentParser(description="Run peptides for fastg")
    parser.add_argument("fastg", help="Path to Spades formatted FASTG.")
    parser.add_argument(
        "-d",
        "--dbname",
        default="results.db",
        help="Name for the results database. Defaults to results.db",
    )
    parser.add_argument(
        "-c",
        "--cleavage",
        default="trypsin",
        help="Cleavage rule from ExPASy cleavage rules. Defaults to trypsin.",
    )
    parser.add_argument(
        "-p",
        "--min_protein_length",
        default=55,
        type=int,
        help="Minimum protein length in number of amino acids. Defaults to 55.",
    )
    parser.add_argument(
        "-m",
        "--min_peptide_length",
        default=8,
        type=int,
        help="Minimum peptide length in amino acids. Defaults to eight.",
    )
    parser.add_argument(
        "-l",
        "--plots",
        default=True,
        type=parse_bool,
        help="Generate diagnostic plots.",
    )
    return parser


def peptides_kwargs(args):
    """Map parsed options onto ``peptides_for_fastg`` keyword arguments.

    Args:
        args: namespace produced by the parser from :func:`build_parser`.

    Returns:
        dict: keyword arguments for ``peptides_for_fastg``.
    """
    return {
        "fastg_filename": args.fastg,
        "db_name": args.dbname,
        "cleavage": args.cleavage,
        # The option is given in amino acids and tripled before being passed
        # on; presumably the library expects a length in nucleotides
        # (three per residue) -- TODO confirm.
        "min_protein_length": args.min_protein_length * 3,
        "min_peptide_length": args.min_peptide_length,
        "create_plots": args.plots,
    }


def main(argv=None):
    """Parse *argv*, echo the parsed options and run peptide generation.

    Args:
        argv: list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # Imported here rather than at module level so that option parsing
    # (including --help and usage errors) does not require the library.
    import fastg2protlib.fastg2protlib as fg

    print(args)

    fg.peptides_for_fastg(**peptides_kwargs(args))


if __name__ == "__main__":
    main()
<tool id="fastg2protlib-peptides" name="FASTG2Protlib-Peptides" version="@VERSION@">
    <!-- fastg2protlib-peptides.xml: Galaxy wrapper that runs application.py on a FASTG dataset. -->
    <!-- macros.xml supplies the @VERSION@ token and the pkg_requirement, cleavages and help-text macros. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <description>Generate FASTA from FASTG</description>
    <expand macro="pkg_requirement" />
    <!--
        Option letters correspond to application.py's parser:
        -m min_peptide_length, -p min_protein_length, -c cleavage, -d dbname, -l plots,
        followed by the positional FASTG path.
        The database name is fixed to 'results.db' so it matches the results_db output below.
        $show_plots is substituted as text after -l; how that text is interpreted is up to
        application.py's argument parser.
    -->
    <command detect_errors="exit_code">
        <![CDATA[
        python '$__tool_directory__/application.py'
        -m $min_peptide_length
        -p $min_protein_length
        -c $cleavage
        -d 'results.db'
        -l $show_plots
        '$fastg_file'
        ]]>
    </command>
    <inputs>
        <param name="fastg_file" type="data" format="fastg" label="FASTG file" />
        <!-- Trypsin is the pre-selected rule; the remaining options come from the cleavages macro. -->
        <param name="cleavage" type="select" label="Peptide Cleavage">
            <option value="trypsin" selected="true">Trypsin</option>
            <expand macro="cleavages" />
        </param>
        <!-- Defaults mirror the defaults of application.py's -p (55) and -m (8) options. -->
        <param name="min_protein_length" type="integer" value="55" label="Minimum Protein Length in Amino Acids" />
        <param name="min_peptide_length" type="integer" value="8" label="Minimum Peptide Length in Amino Acids" />
        <param name="show_plots" type="boolean" checked="true" label="Create Diagnostic Plots" />
    </inputs>
    <outputs>
        <!-- Each output is picked up by file name from the job working directory (from_work_dir). -->
        <data name="peptide_fasta" format="txt" from_work_dir="peptide.fasta" label="${on_string} Peptides from FASTG" />
        <data name="results_db" format="sqlite" from_work_dir="results.db" label="${on_string} Results DB" />
        <!-- The four PNG outputs below are only declared when show_plots is checked. -->
        <data name="aa_count_plot" format="png" from_work_dir="aa_count_chart.png" label="${on_string} AA Count Plot">
            <filter>show_plots == True</filter>
        </data>
        <data name="fastg_length_plot" format="png" from_work_dir="fastg_seq_lengths.png" label="${on_string} FASTG Sequence Length Plot">
            <filter>show_plots == True</filter>
        </data>
        <data name="protein_length_plot" format="png" from_work_dir="protein_seq_lengths.png" label="${on_string} Protein Sequence Length Plot">
            <filter>show_plots == True</filter>
        </data>
        <data name="gc_pct_plot" format="png" from_work_dir="gc_pct.png" label="${on_string} GC Percent Plot">
            <filter>show_plots == True</filter>
        </data>
    </outputs>
    <tests>
        <!-- Runs test-data/two.fastg with plots disabled and checks that the peptide output contains IFLPFSTHSR. -->
        <test>
            <param name="fastg_file" value="two.fastg" />
            <param name="cleavage" value="trypsin" />
            <param name="min_protein_length" value="20" />
            <param name="min_peptide_length" value="8" />
            <param name="show_plots" value="false" />
            <output name="peptide_fasta">
                <assert_contents>
                    <has_text text="IFLPFSTHSR" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <expand macro="help-text" />
</tool>
<tool id="fastg2protlib-validate" name="FASTG2Protlib-Validate" version="@VERSION@">
    <!-- fastg2protlib-validate.xml: Galaxy wrapper that runs app_validate.py on MSGF+ tabular results. -->
    <!-- macros.xml supplies the @VERSION@ token and the pkg_requirement and help-text macros. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="pkg_requirement" />
    <description>Validate a candidate protein library</description>

    <!--
        Option letters correspond to app_validate.py's parser:
        -d dbname (here the path of the selected sqlite dataset), -f fdr, -x decoy_header,
        followed by the positional MSGF+ tabular file.
    -->
    <command detect_errors="exit_code">
        <![CDATA[
        python '$__tool_directory__/app_validate.py'
        -d '$database_file'
        -f $fdr_level
        -x '$decoy_header'
        '$tabular_file'
        ]]>
    </command>
    <inputs>
        <param name="tabular_file" type="data" format="txt" label="MSGF+ tabular file" />
        <!-- NOTE(review): presumably the Results DB produced by the FASTG2Protlib-Peptides tool; confirm. -->
        <param name="database_file" type="data" format="sqlite" label="Database Name"/>
        <!-- Defaults mirror the defaults of app_validate.py's -f (0.10) and -x (XXX_) options. -->
        <param name="fdr_level" type="float" value="0.10" label="FDR value for validation."/>
        <param name="decoy_header" type="text" value="XXX_" label="Decoy protein header"/>
    </inputs>
    <outputs>
        <!-- Both outputs are picked up by file name from the job working directory (from_work_dir). -->
        <data name="protein_fasta" format="fasta" from_work_dir="protein.fasta" label="Validated protein library"/>
        <data name="protein_score" format="csv" from_work_dir="protein_scores.csv" label="Protein scores"/>
    </outputs>
    <tests>
        <!-- Runs test-data/mgf_tst.tab against test-data/tst_valid.db and checks that the FASTA output contains RYSRPLSHL. -->
        <test>
            <param name="tabular_file" value="mgf_tst.tab" />
            <param name="database_file" value="tst_valid.db" />
            <param name="fdr_level" value="0.10" />
            <param name="decoy_header" value="XXX_" />
            <output name="protein_fasta">
                <assert_contents>
                    <has_text text="RYSRPLSHL" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <expand macro="help-text" />
</tool>
<macros>
    <!-- macros.xml: definitions shared by fastg2protlib-peptides.xml and fastg2protlib-validate.xml. -->
    <!-- @VERSION@ is used both as the version attribute of the two tools and as the version of the fastg2protlib package requirement below. -->
    <token name="@VERSION@">1.0.2</token>
    <!-- Additional cleavage choices; fastg2protlib-peptides.xml expands these after its pre-selected Trypsin option. -->
    <xml name="cleavages">
        <option value="arg-c">Arg-c</option>
        <option value="asp-n">Asp-n</option>
        <option value="thrombin">Thrombin</option></xml>
    <!-- Help section expanded at the end of both tool definitions. -->
    <xml name="help-text">
        <help>
            <![CDATA[

            FASTG2Protlib creates a validated protein FASTA library starting from FASTG output. The tool
            operates in two steps.

            **Generate Peptides from FASTG**

            Use the tool to generate peptides cleaved from putative proteins generated from a FASTG file.

            **Generate Verified Protein Library**

            Use the tool to generate a verified protein library with MSGF+ verified peptides as input
            ]]>
        </help>
    </xml>
    <!-- Package requirement expanded by both tool definitions. -->
    <xml name="pkg_requirement">
        <requirements>
            <requirement type="package" version="@VERSION@">fastg2protlib</requirement>
        </requirements>
    </xml>
</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mgf_tst.tab Fri Aug 07 06:17:31 2020 -0400 @@ -0,0 +1,25 @@ +#SpecFile SpecID ScanNum FragMethod Precursor IsotopeError PrecursorError(ppm) Charge Peptide Protein DeNovoScore MSGFScore SpecEValue EValue QValue PepQValue +wendt005_mickela_20200214_17647_12_V.mzML index=8575 -1 CID 501.26144 1 14.785407 3 +42.011IFLPFSTHSR+0.984 Pep_1|Protein_1(pre=-,post=-) 74 36 2.7071892E-10 2.855757E-4 0.0 0.0 +wendt005_mickela_20200214_17647_12_V.mzML index=10628 -1 CID 631.3283 0 16.628782 2 RTVWSN+0.984GTSPR Pep_2|Protein_1_29(pre=-,post=P) 61 36 6.954425E-10 7.203602E-4 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=6020 -1 CID 607.79266 1 -11.606342 2 +42.011AQ+0.984YWLSQFK Pep_3|Protein_1_28(pre=-,post=-) 23 10 9.667708E-10 9.6076715E-4 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=11469 -1 CID 470.6093 1 -7.4827867 3 RLLLQ+0.984C+57.021PRVPR Pep_4|Protein_2(pre=-,post=L) 68 35 1.2193706E-9 0.0012630607 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=15017 -1 CID 799.7483 0 2.747454 3 YFM+15.995YSIQYILIFYVQYVK Pep_5|Protein_2_29(pre=-,post=-) 2 -17 2.587433E-9 0.0029598887 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=16951 -1 CID 424.5418 1 -19.301939 3 RC+57.021GPLQASEPR Pep_6|Protein_4_16_31_32(pre=-,post=E) 69 41 3.636947E-9 0.0037672587 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=34154 -1 CID 768.88916 1 13.068233 2 +42.011STPVELEFSQ+0.984VEK Pep_7|Protein_5_34(pre=-,post=-) 77 33 4.083382E-9 0.0043801093 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=12592 -1 CID 701.0163 0 -5.6593018 3 YQSTPNIYYILYMYIR Pep_8|Protein_5_6_34_40(pre=-,post=-) 77 19 6.9333055E-9 0.0077557205 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=12620 -1 CID 485.91306 0 -10.551063 3 M+15.995SGIITN+0.984EISVFK Pep_9|Protein_7_9_22(pre=-,post=-) 55 28 7.184936E-9 
0.007707044 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=5150 -1 CID 479.9201 0 -11.509454 3 YFEGKPVIEEVK Pep_10|Protein_7_22(pre=-,post=-) 87 44 7.371949E-9 0.007776514 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=26411 -1 CID 708.3275 0 -15.423832 2 PAQ+0.984PTGTRPC+57.021SSR Pep_11|Protein_8_21(pre=R,post=-) 41 15 7.913002E-9 0.008488016 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=22717 -1 CID 783.3668 0 11.843052 2 +42.011EEQDTFAVNSQQK XXX_Pep_22060|Protein_2878(pre=-,post=-) 135 39 7.923481E-9 0.008499257 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=8020 -1 CID 629.31287 0 13.772342 2 +42.011FQEPQQPWR XXX_Pep_16062|Protein_2307(pre=-,post=-) 31 14 8.15928E-9 0.008108611 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=20351 -1 CID 497.26334 0 0.79782444 2 LVPASGMYR XXX_Pep_9843|Protein_1629(pre=-,post=-) 28 16 8.401295E-9 0.0083491225 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=18867 -1 CID 559.29803 1 19.583662 3 +42.011LIGTATSVDEAIAN+0.984EK XXX_Pep_14112|Protein_2085(pre=-,post=-) 57 21 8.928303E-9 0.009987362 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=43084 -1 CID 653.8416 1 0.048319984 2 +42.011YSNYILYTVK XXX_Pep_3426|Protein_702(pre=-,post=-) 13 0 9.61208E-9 0.009760836 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=43084 -1 CID 653.8416 0 14.842637 2 +42.011YSN+0.984YILYTVK XXX_Pep_3426|Protein_702(pre=-,post=-) 13 0 9.61208E-9 0.009760836 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=32362 -1 CID 1036.4833 0 5.6531625 3 AWIGMQ+0.984WNGIEWNAM+15.995EWIQLEWNGK XXX_Pep_14712|Protein_2168(pre=-,post=-) 13 -31 1.0415514E-8 0.012581826 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=10370 -1 CID 461.22098 0 0.13233389 2 N+0.984DTQMLAK XXX_Pep_7493|Protein_1335_1348(pre=-,post=-) 83 51 1.0437349E-8 
0.0101322755 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=21871 -1 CID 523.7984 0 1.3982916 2 +42.011AYVLNISPK XXX_Pep_31555|Protein_3895(pre=-,post=-) 86 43 1.0667454E-8 0.010601209 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=15561 -1 CID 581.9749 0 17.304827 3 GLDWDLAADLEGN+0.984IIK XXX_Pep_17726|Protein_2472(pre=-,post=-) 107 48 1.0803276E-8 0.012084738 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=22491 -1 CID 522.26483 0 -13.790032 2 +42.011Q+0.984LEAVQ+0.984VGR XXX_Pep_12397|Protein_1871(pre=-,post=-) 46 31 1.09720055E-8 0.01090387 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=9573 -1 CID 623.81696 0 -13.893293 2 +42.011RHALDGPWPR XXX_Pep_17806|Protein_2476(pre=-,post=Q) 28 13 1.1109479E-8 0.011281409 0.088495575 0.0952381 +wendt005_mickela_20200214_17647_12_V.mzML index=16000 -1 CID 381.54922 1 13.824602 3 +42.011N+0.984TYLSFLIK XXX_Pep_29178|Protein_3650(pre=-,post=-) 66 42 1.1131118E-8 0.011061994 0.088495575 0.0952381
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/two.fastg Fri Aug 07 06:17:31 2020 -0400 @@ -0,0 +1,40 @@ +>EDGE_1_length_84_cov_1.0:EDGE_3_length_84_cov_1.0; +CGTTATTCGCGCCCACTCTCCCATTTATCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACCATTCTCGACA +>EDGE_1_length_84_cov_1.0'; +TGTCGAGAATGGTAAGAATATCTTAGCGGGCAATCGCATCGCATCCGCTTGCGCGGATAAATGGGAGAGTGGGCGCGAATAACG +>EDGE_2_length_84_cov_1.0:EDGE_3_length_84_cov_1.0; +CTGGTCCTGTTGACTACAATGGGCCCAACTCAATCACAGCTCGAGCGCCTTGAATAACATACTCATCTCTATACATTCTCGACA +>EDGE_2_length_84_cov_1.0':EDGE_3_length_84_cov_1.0'; +TGTCGAGAATGTATAGAGATGAGTATGTTATTCAAGGCGCTCGAGCTGTGATTGAGTTGGGCCCATTGTAGTCAACAGGACCAG +>EDGE_3_length_84_cov_1.0:EDGE_2_length_84_cov_1.0,EDGE_4_length_84_cov_1.0; +CATTCTCGACATGCTGAGCTGAGACGGCGTCGATGCATAGCGGACTTTCGGTCAGTCGCAATTCCTCACGAGACTGGTCCTGTT +>EDGE_3_length_84_cov_1.0':EDGE_2_length_84_cov_1.0',EDGE_1_length_84_cov_1.0'; +AACAGGACCAGTCTCGTGAGGAATTGCGACTGACCGAAAGTCCGCTATGCATCGACGCCGTCTCAGCTCAGCATGTCGAGAATG +>EDGE_4_length_84_cov_1.0:EDGE_5_length_84_cov_1.0; +CTGGTCCTGTTACAGAGCTGGCGTACGCGTTGAACACTTCACAGATGATAGGGATTCGGGTAAAGAGCGTGTCATTGGGGGCTT +>EDGE_4_length_84_cov_1.0':EDGE_3_length_84_cov_1.0'; +AAGCCCCCAATGACACGCTCTTTACCCGAATCCCTATCATCTGTGAAGTGTTCAACGCGTACGCCAGCTCTGTAACAGGACCAG +>EDGE_5_length_84_cov_1.0; +ATTGGGGGCTTCATACATAGAGCAAGGGCGTCGAACGGTCGTGAAAGTCTTAGTACCGCACGTACCAACTTACTGAGGATATTG +>EDGE_5_length_84_cov_1.0':EDGE_4_length_84_cov_1.0',EDGE_6_length_84_cov_1.0'; +CAATATCCTCAGTAAGTTGGTACGTGCGGTACTAAGACTTTCACGACCGTTCGACGCCCTTGCTCTATGTATGAAGCCCCCAAT +>EDGE_6_length_84_cov_1.0:EDGE_5_length_84_cov_1.0; +AAGAGGCCGCCACCGTTTTAGGGGGGGAAGGTTGAAGATCTCCTCTTCTCATGACTGAACTCGCGAGGGCCGTATTGGGGGCTT +>EDGE_6_length_84_cov_1.0':EDGE_8_length_84_cov_1.0'; +AAGCCCCCAATACGGCCCTCGCGAGTTCAGTCATGAGAAGAGGAGATCTTCAACCTTCCCCCCCTAAAACGGTGGCGGCCTCTT +>EDGE_7_length_84_cov_1.0:EDGE_8_length_84_cov_1.0; +AAGAGGCCGCCAAAGAACAAAGGCTTACTGTGCGCAGAGGAACGCCCATTTAGCGGCTGGCGTTTTGAATCCTTTTAATATTGT 
+>EDGE_7_length_84_cov_1.0':EDGE_8_length_84_cov_1.0'; +ACAATATTAAAAGGATTCAAAACGCCAGCCGCTAAATGGGCGTTCCTCTGCGCACAGTAAGCCTTTGTTCTTTGGCGGCCTCTT +>EDGE_8_length_84_cov_1.0:EDGE_7_length_84_cov_1.0,EDGE_6_length_84_cov_1.0; +TTTAATATTGTTTAATCCAATTCCCTCATTTAGGACCCTACCAAGTCAACATTGGTATATGAATGCGACCTCGAAGAGGCCGCC +>EDGE_8_length_84_cov_1.0':EDGE_7_length_84_cov_1.0',EDGE_9_length_84_cov_1.0'; +GGCGGCCTCTTCGAGGTCGCATTCATATACCAATGTTGACTTGGTAGGGTCCTAAATGAGGGAATTGGATTAAACAATATTAAA +>EDGE_9_length_84_cov_1.0:EDGE_8_length_84_cov_1.0; +TAAAAATGACAGTGGTTGGTGCTCTAAACTTCATTTGGTTAACTCGTGTATCAGCGCGATAGGCTGTTAGAGGTTTAATATTGT +>EDGE_9_length_84_cov_1.0'; +ACAATATTAAACCTCTAACAGCCTATCGCGCTGATACACGAGTTAACCAAATGAAGTTTAGAGCACCAACCACTGTCATTTTTA +>EDGE_10_length_84_cov_1.0; +ATGGCAAGGTACTTCCGGTCTTAATGAATGGCCGGGAAAGGTACGCACGCGGTATGGGGGGGTGAAGGGGCGAATAGACAGGCT +>EDGE_10_length_84_cov_1.0':EDGE_10_length_84_cov_1.0; +AGCCTGTCTATTCGCCCCTTCACCCCCCCATACCGCGTGCGTACCTTTCCCGGCCATTCATTAAGACCGGAAGTACCTTGCCAT