view metanovo.xml @ 5:d6dcd3173bdf draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/metanovo commit fd12c123235f5067858bbd22b70fb0082d1e2199
author galaxyp
date Sat, 11 May 2024 16:46:46 +0000
parents 7a5ff5359b13
children
line wrap: on
line source

<tool id="metanovo" name="MetaNovo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.09">
  <description>
    Produce targeted databases for mass spectrometry analysis.
  </description>
  <macros>
    <!-- Version of the metanovo package; also used as the version of the package requirement below. -->
    <token name="@TOOL_VERSION@">1.9.4</token>
    <!-- Wrapper revision; appended to the tool version after "+galaxy" in the tool tag. -->
    <token name="@VERSION_SUFFIX@">4</token>
    <!-- Regex matching every character that is not a word character, '-' or '.'.
         The command and config.sh templates replace each match with '_' when they
         derive file names from dataset element identifiers. -->
    <token name="@SUBSTITUTION_RX@">[^\w\-\.]</token>
    <!-- NOTE(review): presumably provides the fixed_modifications and variable_modifications
         macros expanded in the inputs; that file is not visible here, so confirm. -->
    <import>macros_modifications.xml</import>
  </macros>
  <xrefs>
    <xref type="bio.tools">metanovo</xref>
  </xrefs>
  <requirements>
    <requirement type="package" version="@TOOL_VERSION@">metanovo</requirement>
  </requirements>
  <command>
    <![CDATA[
      ## Stage the inputs: the FASTA file and every MGF file are symlinked into
      ## dedicated directories under names derived from their element identifiers,
      ## with each character matched by the substitution regex token replaced by '_'.
      #import re
      #set $mgf_dir = 'mgf_files'
      #set $fasta_dir = 'fasta_file'
      #set fasta_name = re.sub('@SUBSTITUTION_RX@', '_', str($input_fasta.element_identifier))
      mkdir $mgf_dir &&
      mkdir $fasta_dir &&
      ln -s '$input_fasta' '$fasta_dir/$fasta_name' &&

      ## Collection mode links every element into the same folder so that a single
      ## job sees all MGF files; single mode links just the one dataset.
      #if $input_type.type == "collection"
      #for $input_mgf_item in $input_type.input_mgf_collection:
      #set mgf_name = re.sub('@SUBSTITUTION_RX@', '_', str($input_mgf_item.element_identifier))
      ln -s '$input_mgf_item' '$mgf_dir/$mgf_name' &&
      #end for
      #else
      #set mgf_name = re.sub('@SUBSTITUTION_RX@', '_', str($input_type.input_mgf.element_identifier))
      ln -s '$input_type.input_mgf' '$mgf_dir/$mgf_name' &&
      #end if

      ## the number of threads should be number of available threads-1 according to the docs
      ## (falls back to 3 slots when GALAXY_SLOTS is unset; never goes below 1).
      ## POSIX arithmetic expansion is used so this step does not rely on bash-only syntax.
      threads=\${GALAXY_SLOTS:-3} &&
      if [ "\$threads" -gt 1 ]; then
        threads=\$((threads - 1));
      fi &&
      ## The leading newline keeps THREAD_LIMIT on a line of its own even if the
      ## rendered config.sh does not end with a line break.
      printf '\nTHREAD_LIMIT=%s\n' "\$threads" >> config.sh &&

      metanovo.sh config.sh
    ]]>
  </command>

  <configfiles>
    <!-- config.sh: KEY=value settings file handed to metanovo.sh by the command block.
         References of the form section.param are filled in from the tool inputs; entries
         with literal values (JVM heap sizes, the dg_* switches, the pepnovo_* and novor_*
         settings) are fixed by this wrapper and not exposed to the user. The FASTA name is
         sanitised with the same regex token the command block uses, so both agree on the
         path. THREAD_LIMIT is not written here; the command block appends it at run time. -->
    <configfile filename="config.sh"><![CDATA[#slurp
#import re
MGF_FOLDER=mgf_files
#set fasta_name = re.sub('@SUBSTITUTION_RX@', '_', str($input_fasta.element_identifier))
FASTA_FILE=fasta_file/'$fasta_name'
OUTPUT_FOLDER=.
CHUNKSIZE=$processing_control.CHUNKSIZE
JVM_Xmx='10000M'
JVM_Xms='1024M'
mn_specificity='$metanovo_parameters.mn_specificity'
mn_enzymes='$metanovo_parameters.mn_enzymes'
mn_max_missed_cleavages=$metanovo_parameters.mn_max_missed_cleavages
dg_pepnovo=0
dg_pnovo=0
dg_novor=0
dg_directag=1
prec_tol=$spectrum_matching_parameters.prec_tol
prec_ppm=$spectrum_matching_parameters.prec_ppm
frag_tol=$spectrum_matching_parameters.frag_tol
frag_ppm=$spectrum_matching_parameters.frag_ppm
digestion=$spectrum_matching_parameters.digestion
enzyme='$spectrum_matching_parameters.enzyme'
specificity=$spectrum_matching_parameters.specificity
mc='$spectrum_matching_parameters.mc'
fixed_mods="$spectrum_matching_parameters.fixed_mods"
variable_mods="$spectrum_matching_parameters.variable_mods"
min_charge=$spectrum_matching_parameters.min_charge
max_charge=$spectrum_matching_parameters.max_charge
fi='$spectrum_matching_parameters.fi'
ri='$spectrum_matching_parameters.ri'
min_isotope='$spectrum_matching_parameters.min_isotope'
max_isotope='$spectrum_matching_parameters.max_isotope'
annotation_level=$spectrum_annotation.annotation_level
annotation_high_resolution=$spectrum_annotation.annotation_high_resolution
sequence_index_type=$sequence_matching.sequence_index_type
sequence_matching_type=$sequence_matching.sequence_matching_type
sequence_matching_x=$sequence_matching.sequence_matching_x
import_peptide_length_min=$import_filters.import_peptide_length_min
import_peptide_length_max=$import_filters.import_peptide_length_max
import_precursor_mz_ppm=$import_filters.import_precursor_mz_ppm
exclude_unknown_ptms=$import_filters.exclude_unknown_ptms
ptm_score=$ptm_localization.ptm_score
score_neutral_losses=$ptm_localization.score_neutral_losses
ptm_sequence_matching_type=$ptm_localization.ptm_sequence_matching_type
ptm_alignment=$ptm_localization.ptm_alignment
useGeneMapping=$gene_annotation.useGeneMapping
updateGeneMapping=$gene_annotation.updateGeneMapping
simplify_groups=$protein_inference.simplify_groups
simplify_score=$protein_inference.simplify_score
simplify_enzymaticity=$protein_inference.simplify_enzymaticity
simplify_evidence=$protein_inference.simplify_evidence
simplify_uncharacterized=$protein_inference.simplify_uncharacterized
psm_fdr=$validation_levels.psm_fdr
peptide_fdr=$validation_levels.peptide_fdr
protein_fdr=$validation_levels.protein_fdr
group_psms=$validation_levels.group_psms
group_peptides=$validation_levels.group_peptides
merge_subgroups=$validation_levels.merge_subgroups
protein_fraction_mw_confidence='$fraction_analysis.protein_fraction_mw_confidence'
pepnovo_hitlist_length=1
pepnovo_estimate_charge=1
pepnovo_correct_prec_mass=1
pepnovo_discard_spectra=1
pepnovo_fragmentation_model='CID_IT_TRYP'
pepnovo_generate_blast=0
directag_tic_cutoff=$directag.directag_tic_cutoff
directag_max_peak_count=$directag.directag_max_peak_count
directag_intensity_classes=$directag.directag_intensity_classes
directag_adjust_precursor=$directag.directag_adjust_precursor
directag_min_adjustment='$directag.directag_min_adjustment'
directag_max_adjustment='$directag.directag_max_adjustment'
directag_adjustment_step='$directag.directag_adjustment_step'
directag_charge_states='$directag.directag_charge_states'
directag_ms_charge_state='$directag.directag_ms_charge_state'
directag_duplicate_spectra='$directag.directag_duplicate_spectra'
directag_deisotoping='$directag.directag_deisotoping'
directag_isotope_tolerance='$directag.directag_isotope_tolerance'
directag_complement_tolerance='$directag.directag_complement_tolerance'
directag_tag_length='$directag.directag_tag_length'
directag_max_var_mods='$directag.directag_max_var_mods'
directag_max_tag_count='$directag.directag_max_tag_count'
directag_intensity_weight='$directag.directag_intensity_weight'
directag_fidelity_weight='$directag.directag_fidelity_weight'
directag_complement_weight='$directag.directag_complement_weight'
novor_fragmentation=HCD
novor_mass_analyzer=Trap
 ]]></configfile>
  </configfiles>

  <inputs>
    <!-- Chooses how MGF data is supplied. "single" takes one dataset (Galaxy can still
         map it over several datasets, giving one job each); "collection" feeds every
         element of a collection to one job. The MGF input is mandatory in whichever
         mode is selected, because the command block dereferences it unconditionally;
         it is therefore not marked optional, so an empty input is rejected when the
         form is submitted rather than failing inside the job. -->
    <conditional name="input_type">
      <param name="type" type="select" label="MGF Input Type" help="Submit either a single file, or a collection of files.">
        <option selected="true" value="single">Single file</option>
        <option value="collection">Collection</option>
      </param>
      <when value="single">
        <param name="input_mgf" type="data" format="mgf" label="MGF File" />
      </when>
      <when value="collection">
        <param name="input_mgf_collection" type="data_collection" label="MGF Collection" />
      </when>
    </conditional>

    <!-- FASTA database input; the command block links it into fasta_file/ under a
         sanitised name and config.sh points FASTA_FILE at that link. -->
    <param name="input_fasta" type="data" format="fasta" label="FASTA File" />

    <!-- Rendered into config.sh as CHUNKSIZE. -->
    <section name="processing_control" expanded="False" title="Processing Control">
      <param name="CHUNKSIZE" label="Size to split fasta for parallel processing" value="100000" type="integer" optional="true"/>
    </section>
    <!-- Rendered into config.sh as the mn_* keys; the two selects are written as
         single-quoted strings because their values may contain spaces and commas. -->
    <section name="metanovo_parameters" expanded="False" title="MetaNovo Parameters">
      <param argument="-mn_specificity" label="Enzyme Specificity" type="select">
        <option selected="true" value="specific">specific</option>
        <option value="semi-specific">semi-specific</option>
        <option value="unspecific">unspecific</option>
      </param>
      <param argument="-mn_enzymes" label="Enzyme Rule" type="select">
        <option value="Trypsin">Trypsin</option>
        <option selected="true" value="Trypsin, no P rule">Trypsin, no P rule</option>
        <option value="Whole protein">Whole protein</option>
      </param>
      <param argument="-mn_max_missed_cleavages" label="Number of enzymatic missed cleavages" value="2" type="integer" optional="true"/>
    </section>
    <!-- Rendered into config.sh under the same key names (prec_tol, frag_tol, enzyme, mc, ...).
         The option lists for fixed_mods and variable_mods come from macros that are not
         defined in this file. -->
    <section name="spectrum_matching_parameters" expanded="False" title="Spectrum Matching Parameters">
      <param argument="-prec_tol" label="Precursor ion mass tolerance" value="10.0" type="float" optional="true"/>
      <param argument="-prec_ppm" label="Precursor ion tolerance unit" type="select">
        <option value="0">Da</option>
        <option selected="true" value="1">ppm</option>
      </param>
      <param argument="-frag_tol" label="Fragment ion mass tolerance" value="0.05" type="float" optional="true"/>
      <param argument="-frag_ppm" label="Fragment ion tolerance unit" type="select">
        <option selected="true" value="0">Da</option>
        <option value="1">ppm</option>
      </param>
      <param argument="-digestion" label="Digestion" type="select">
        <option selected="true" value="0">Enzyme</option>
        <option value="1">Unspecific</option>
        <option value="2">Whole Protein</option>
      </param>
      <param argument="-enzyme" label="Enzyme" type="select" multiple="true">
        <option value="Trypsin">Trypsin</option>
        <option selected="true" value="Trypsin (no P rule)">Trypsin (no P rule)</option>
        <option value="Arg-C">Arg-C</option>
        <option value="Arg-C (no P rule)">Arg-C (no P rule)</option>
        <option value="Arg-N">Arg-N</option>
        <option value="Glu-C">Glu-C</option>
        <option value="Lys-C">Lys-C</option>
        <option value="Lys-C (no P rule)">Lys-C (no P rule)</option>
        <option value="Lys-N">Lys-N</option>
        <option value="Asp-N">Asp-N</option>
        <option value="Asp-N (ambic)">Asp-N (ambic)</option>
        <option value="Chymotrypsin">Chymotrypsin</option>
        <option value="Chymotrypsin (no P rule)">Chymotrypsin (no P rule)</option>
        <option value="Pepsin A">Pepsin A</option>
        <option value="CNBr">CNBr</option>
        <option value="Thermolysin">Thermolysin</option>
        <option value="LysargiNase">LysargiNase</option>
      </param>
      <param argument="-specificity" label="Specificity" type="select">
          <option selected="true" value="0">Specific</option>
          <option value="1">Semi-Specific</option>
          <option value="2">N-term Specific</option>
          <option value="3">C-term Specific</option>
      </param>
      <param argument="-mc" label="Number of allowed missed cleavages" value="2" type="text" optional="true" help="If more than one enzyme was used, please provide the missed cleavages for every enzyme in the same order, with a comma separated list, e.g. &quot;2, 1&quot;."/>
      <param argument="-fixed_mods" label="Fixed modifications as comma separated list" type="select" multiple="true">
        <expand macro="fixed_modifications"/>
      </param>
      <param argument="-variable_mods" label="Variable modifications as comma separated list" type="select" multiple="true">
        <expand macro="variable_modifications"/>
      </param>
      <param argument="-min_charge" label="Minimal charge to search for" value="2" type="integer" optional="true"/>
      <param argument="-max_charge" label="Maximal charge to search for" value="4" type="integer" optional="true"/>
      <param argument="-fi" label="Type of forward ion searched" value="b" type="text" optional="true"/>
      <param argument="-ri" label="Type of rewind ion searched" value="y" type="text" optional="true"/>
      <param argument="-min_isotope" label="Minimum precursor isotope" value="0" type="integer" optional="true"/>
      <param argument="-max_isotope" label="Maximum precursor isotope" value="1" type="integer" optional="true"/>
    </section>
    <!-- Rendered into config.sh as annotation_level and annotation_high_resolution;
         the boolean is emitted as 1 or 0 via truevalue/falsevalue. -->
    <section name="spectrum_annotation" expanded="False" title="Spectrum Annotation">
      <param argument="-annotation_level" label="The intensity threshold to consider for annotation" value="0.75" type="float" optional="true" help="Using percentiles, 0.75 means that the 25% most intense peaks will be annotated."/>
      <param argument="-annotation_high_resolution" label="If true the most accurate peak will be selected within the m/z tolerance." truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <!-- Rendered into config.sh as sequence_index_type, sequence_matching_type and
         sequence_matching_x. -->
    <section name="sequence_matching" expanded="False" title="Sequence Matching">
      <param argument="-sequence_index_type" label="sequence_index_type (deprecated)" value="0" type="integer" optional="true"/>
      <param argument="-sequence_matching_type" label="The peptide to protein sequence matching type" type="select">
          <option value="0">Character Sequence</option>
          <option value="1">Amino Acids</option>
          <option selected="true" value="2">Indistinguishable Amino Acids</option>
      </param>
      <param argument="-sequence_matching_x" label="The maximal share of Xs in a sequence, 0.25 means 25% of X's" value="0.25" type="float" optional="true"/>
    </section>
    <!-- Rendered into config.sh as the import_* keys and exclude_unknown_ptms;
         the boolean is emitted as 1 or 0 via truevalue/falsevalue. -->
    <section name="import_filters" expanded="False" title="Import Filters">
      <param argument="-import_peptide_length_min" label="The minimal peptide length to consider when importing identification files" value="8" type="integer" optional="true"/>
      <param argument="-import_peptide_length_max" label="The maximal peptide length to consider when importing identification files" value="30" type="integer" optional="true"/>
      <param argument="-import_precursor_mz_ppm" label="Maximal precursor ion deviation unit" type="select">
        <option selected="true" value="0">Da</option>
        <option value="1">ppm</option>
      </param>
      <param argument="-exclude_unknown_ptms" label="Peptides presenting unrecognized PTMs will be excluded" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <!-- Rendered into config.sh as ptm_score, score_neutral_losses,
         ptm_sequence_matching_type and ptm_alignment; booleans are emitted as 1 or 0. -->
    <section name="ptm_localization" expanded="False" title="PTM Localization">
      <param argument="-ptm_score" label="The PTM probabilistic score to use for modification localization" type="select">
        <option value="0">A-score</option>
        <option selected="true" value="1">PhosphoRS</option>
        <option value="2">None</option>
      </param>
      <param argument="-score_neutral_losses" label="Include neutral losses in spectrum annotation of the PTM score" truevalue="1" falsevalue="0" type="boolean" checked="false"/>
      <param argument="-ptm_sequence_matching_type" label="The modification to peptide sequence matching type" type="select">
          <option value="0">Character Sequence</option>
          <option selected="true" value="1">Amino Acids</option>
          <option value="2">Indistinguishable Amino Acids</option>
      </param>
      <param argument="-ptm_alignment" label="Align peptide ambiguously localized PTMs on confident sites" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <!-- Rendered into config.sh as useGeneMapping and updateGeneMapping (1 or 0). -->
    <section name="gene_annotation" expanded="False" title="Gene Annotation">
      <param argument="-useGeneMapping" label="Use and save gene mappings along with the project" truevalue="1" falsevalue="0" type="boolean" checked="true"  help="UniProt databases only"/>
      <param argument="-updateGeneMapping" label="Update gene mappings automatically from Ensembl" truevalue="1" falsevalue="0" type="boolean" checked="true" help="UniProt databases only"/>
    </section>
    <!-- Rendered into config.sh as the simplify_* keys; every flag is emitted as 1 or 0. -->
    <section name="protein_inference" expanded="False" title="Protein Inference">
      <param argument="-simplify_groups" label="Simplify protein groups" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param argument="-simplify_score" label="Simplify protein groups based on the PeptideShaker target/decoy score" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param argument="-simplify_enzymaticity" label="Simplify protein groups based on the peptide enzymaticity" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param argument="-simplify_evidence" label="Simplify protein groups based on the UniProt protein evidence" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param argument="-simplify_uncharacterized" label="Simplify protein groups based on the protein characterization" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <!-- Rendered into config.sh as psm_fdr, peptide_fdr, protein_fdr and the grouping
         flags (1 or 0). The FDR fields are integer typed, so only whole percentages
         can be entered. -->
    <section name="validation_levels" expanded="False" title="Validation Levels">
      <param argument="-psm_fdr" label="FDR at the PSM level in percent" value="1" type="integer" optional="true"/>
      <param argument="-peptide_fdr" label="FDR at the peptide level in percent" value="1" type="integer" optional="true"/>
      <param argument="-protein_fdr" label="FDR at the protein level in percent" value="1" type="integer" optional="true"/>
      <param argument="-group_psms" label="Group PSMs by charge for scoring and validation" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param argument="-group_peptides" label="Group peptides by modification status for scoring and validation" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param argument="-merge_subgroups" label="Merge small PSM and peptide groups for scoring and validation" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <!-- Rendered into config.sh as protein_fraction_mw_confidence (single-quoted). -->
    <section name="fraction_analysis" expanded="False" title="Fraction Analysis">
      <param argument="-protein_fraction_mw_confidence" label="Minimum confidence required for a protein in the fraction MW plot" value="95.0" type="float" optional="true"/>
    </section>
    <!-- Rendered into config.sh as the directag_* keys. config.sh also hard-codes
         dg_directag=1 with the other dg_* switches at 0.
         NOTE(review): presumably this makes DirecTag the only enabled de novo engine;
         confirm against the MetaNovo documentation. -->
    <section name="directag" expanded="False" title="DirecTag">
      <param argument="-directag_tic_cutoff" label="TIC cutoff in percent" value="85" type="integer" optional="true"/>
      <param argument="-directag_max_peak_count" label="Max peak count" value="400" type="integer" optional="true"/>
      <param argument="-directag_intensity_classes" label="Number of intensity classes" value="3" type="integer" optional="true"/>
      <param argument="-directag_adjust_precursor" label="Adjust precursor" truevalue="1" falsevalue="0" type="boolean" checked="false"/>
      <param argument="-directag_min_adjustment" label="Minimum precursor adjustment" value="-2.5" type="float" optional="true"/>
      <param argument="-directag_max_adjustment" label="Maximum precursor adjustment" value="2.5" type="float" optional="true"/>
      <param argument="-directag_adjustment_step" label="Precursor adjustment step" value="0.1" type="float" optional="true"/>
      <param argument="-directag_charge_states" label="Number of charge states considered" value="3" type="integer" optional="true"/>
      <param argument="-directag_ms_charge_state" label="Use charge state from M spectrum" truevalue="1" falsevalue="0" type="boolean" checked="false"/>
      <param argument="-directag_duplicate_spectra" label="Duplicate spectra per charge" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param argument="-directag_deisotoping" label="Deisotoping mode" type="select">
        <option selected="true" value="0">No deisotoping</option>
        <option value="1">Precursor only</option>
        <option value="2">Precursor and candidate</option>
      </param>
      <param argument="-directag_isotope_tolerance" label="Isotope mz tolerance" value="0.25" type="float" optional="true"/>
      <param argument="-directag_complement_tolerance" label="Complement mz tolerance" value="0.5" type="float" optional="true"/>
      <param argument="-directag_tag_length" label="Tag length" value="4" type="integer" optional="true"/>
      <param argument="-directag_max_var_mods" label="Maximum variable modifications per sequence" value="2" type="integer" optional="true"/>
      <param argument="-directag_max_tag_count" label="Maximum tag count" value="5" type="integer" optional="true"/>
      <param argument="-directag_intensity_weight" label="Intensity score weight" value="1.0" type="float" optional="true"/>
      <param argument="-directag_fidelity_weight" label="Fidelity score weight" value="1.0" type="float" optional="true"/>
      <param argument="-directag_complement_weight" label="Complement score weight" value="1.0" type="float" optional="true"/>
    </section>
  </inputs>
  <outputs>
    <!-- Both results are collected from the metanovo/ directory inside the job
         working directory (config.sh sets OUTPUT_FOLDER to "."). -->
    <data name="output_fasta"
          format="fasta"
          label="${tool.name} on ${on_string}: FASTA"
          from_work_dir="metanovo/metanovo.fasta"/>
    <data name="output_csv"
          format="csv"
          label="${tool.name} on ${on_string}: CSV"
          from_work_dir="metanovo/metanovo.csv"/>
  </outputs>
  <tests>
    <!-- Single-file mode (the default input type): one MGF file against one FASTA database. -->
    <test expect_num_outputs="2">
      <param name="input_mgf" ftype="mgf" value="sample_data_1.mgf"/>
      <param name="input_fasta" ftype="fasta" value="sample_fasta_single.fasta"/>
      <output name="output_csv" ftype="csv">
        <assert_contents>
          <!-- The CSV must contain the expected column header for a single sample. -->
          <has_text text=",index,Accession,Record,ID,PeptideCount,Peptides,ScanCount,Scans,Organism,Length,File,Sample sample_data_1 (msms),SAF sample_data_1,NSAF sample_data_1,Summed_NSAF,Protein_Prob,Organism_Prob,MSMS_Percent,Combined_Prob"/>
        </assert_contents>
      </output>
      <output name="output_fasta" ftype="fasta" file="sample_output_single.fasta"/>
    </test>
    <!-- Collection mode: two MGF files processed together in one job. -->
    <test expect_num_outputs="2">
      <param name="type" value="collection"/>
      <param name="input_mgf_collection">
        <collection type="list">
          <element name="sample_data_1.mgf" value="sample_data_1.mgf"/>
          <element name="sample_data_2.mgf" value="sample_data_2.mgf"/>
        </collection>
      </param>
      <param name="input_fasta" ftype="fasta" value="sample_fasta_collection.fasta"/>
      <output name="output_csv" ftype="csv">
        <assert_contents>
          <!-- The CSV must contain the expected column header covering both samples. -->
          <has_text text=",index,Accession,File,ID,Length,Organism,PeptideCount,Peptides,Record,SAF sample_data_1,SAF sample_data_2,Sample sample_data_1 (msms),Sample sample_data_2 (msms),ScanCount,Scans,NSAF sample_data_1,NSAF sample_data_2,Summed_NSAF,Protein_Prob,Organism_Prob,MSMS_Percent,Combined_Prob"/>
        </assert_contents>
      </output>
      <output name="output_fasta" ftype="fasta" file="sample_output_collection.fasta"/>
    </test>
  </tests>
  <help><![CDATA[
**MetaNovo**

MetaNovo searches MS/MS data against a FASTA database of known proteins.

Two outputs are produced:

- MetaNovo Output FASTA: the matching proteins produced by the search.
- MetaNovo Output CSV: information about the job and other useful metadata.

Two inputs are required: one or more MGF files and a FASTA database file.

Two different input types are available for the MGF input. The correct input configuration depends on the desired use case, as outlined below:

=======================================================  =============
Use case                                                 Configuration
=======================================================  =============
Single input MGF file, single output FASTA file          **Single file** input with **Single dataset** selected
Multiple input MGF files, multiple output FASTA files\*  **Single file** input with **Multiple datasets** OR **Dataset collection** selected
Multiple input MGF files, single output FASTA file       **Collection** input
=======================================================  =============

**\*** One for each MGF file.

In the second use case, a separate MetaNovo job is spawned for each input MGF. In the third use case, a single MetaNovo job runs with all MGF files in the collection as input.

If the third use case fails due to memory limitations, use the second use case instead. The resulting per-file FASTA databases can then be merged to generate a single reduced, compact database.
    ]]>
  </help>
  <citations>
    <!-- Reference cited for this tool, given as a DOI. -->
    <citation type="doi">10.1101/605550</citation>
  </citations>
</tool>