view metanovo.xml @ 4:7a5ff5359b13 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/metanovo commit c220dc85d59698a73b0f173d46e269e27264d6d8"
author galaxyp
date Fri, 22 Apr 2022 13:31:08 +0000
parents 4a851c02f558
children d6dcd3173bdf
line wrap: on
line source

<tool id="metanovo" name="MetaNovo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.09">
  <description>
    Produce targeted databases for mass spectrometry analysis.
  </description>
  <requirements>
    <requirement type="package" version="@TOOL_VERSION@">metanovo</requirement>
  </requirements>
  <macros>
    <token name="@TOOL_VERSION@">1.9.4</token>
    <token name="@VERSION_SUFFIX@">4</token>
    <token name="@SUBSTITUTION_RX@">[^\w\-\.]</token>
    <import>macros_modifications.xml</import>
  </macros>
  <command>
    <![CDATA[
      #import re
      #set $mgf_dir = 'mgf_files'
      #set $fasta_dir = 'fasta_file'
      #set fasta_name = re.sub('@SUBSTITUTION_RX@', '_', str($input_fasta.element_identifier))
      mkdir $mgf_dir &&
      mkdir $fasta_dir &&
      ln -s '$input_fasta' '$fasta_dir/$fasta_name' &&

      #if $input_type.type == "collection"
      #for $input_mgf_item in $input_type.input_mgf_collection:
      #set mgf_name = re.sub('@SUBSTITUTION_RX@', '_', str($input_mgf_item.element_identifier))
      ln -s '$input_mgf_item' '$mgf_dir/$mgf_name' &&
      #end for
      #else
      #set mgf_name = re.sub('@SUBSTITUTION_RX@', '_', str($input_type.input_mgf.element_identifier))
      ln -s '$input_type.input_mgf' '$mgf_dir/$mgf_name' &&
      #end if

      ## the number of threads should be number of available threads-1 according to the docs
      threads=\${GALAXY_SLOTS:-3} &&
      if [ \$threads -gt 1 ]; then
        (( threads-- ));
      fi &&
      echo "THREAD_LIMIT=\$threads" >> config.sh &&

      metanovo.sh config.sh
    ]]>
  </command>

  <configfiles>
    <configfile filename="config.sh"><![CDATA[#slurp
#import re
MGF_FOLDER=mgf_files
#set fasta_name = re.sub('@SUBSTITUTION_RX@', '_', str($input_fasta.element_identifier))
FASTA_FILE=fasta_file/'$fasta_name'
OUTPUT_FOLDER=.
CHUNKSIZE=$processing_control.CHUNKSIZE
JVM_Xmx='10000M'
JVM_Xms='1024M'
mn_specificity='$metanovo_parameters.mn_specificity'
mn_enzymes='$metanovo_parameters.mn_enzymes'
mn_max_missed_cleavages=$metanovo_parameters.mn_max_missed_cleavages
dg_pepnovo=0
dg_pnovo=0
dg_novor=0
dg_directag=1
prec_tol=$spectrum_matching_parameters.prec_tol
prec_ppm=$spectrum_matching_parameters.prec_ppm
frag_tol=$spectrum_matching_parameters.frag_tol
frag_ppm=$spectrum_matching_parameters.frag_ppm
digestion=$spectrum_matching_parameters.digestion
enzyme='$spectrum_matching_parameters.enzyme'
specificity=$spectrum_matching_parameters.specificity
mc='$spectrum_matching_parameters.mc'
fixed_mods="$spectrum_matching_parameters.fixed_mods"
variable_mods="$spectrum_matching_parameters.variable_mods"
min_charge=$spectrum_matching_parameters.min_charge
max_charge=$spectrum_matching_parameters.max_charge
fi='$spectrum_matching_parameters.fi'
ri='$spectrum_matching_parameters.ri'
min_isotope='$spectrum_matching_parameters.min_isotope'
max_isotope='$spectrum_matching_parameters.max_isotope'
annotation_level=$spectrum_annotation.annotation_level
annotation_high_resolution=$spectrum_annotation.annotation_high_resolution
sequence_index_type=$sequence_matching.sequence_index_type
sequence_matching_type=$sequence_matching.sequence_matching_type
sequence_matching_x=$sequence_matching.sequence_matching_x
import_peptide_length_min=$import_filters.import_peptide_length_min
import_peptide_length_max=$import_filters.import_peptide_length_max
import_precursor_mz_ppm=$import_filters.import_precursor_mz_ppm
exclude_unknown_ptms=$import_filters.exclude_unknown_ptms
ptm_score=$ptm_localization.ptm_score
score_neutral_losses=$ptm_localization.score_neutral_losses
ptm_sequence_matching_type=$ptm_localization.ptm_sequence_matching_type
ptm_alignment=$ptm_localization.ptm_alignment
useGeneMapping=$gene_annotation.useGeneMapping
updateGeneMapping=$gene_annotation.updateGeneMapping
simplify_groups=$protein_inference.simplify_groups
simplify_score=$protein_inference.simplify_score
simplify_enzymaticity=$protein_inference.simplify_enzymaticity
simplify_evidence=$protein_inference.simplify_evidence
simplify_uncharacterized=$protein_inference.simplify_uncharacterized
psm_fdr=$validation_levels.psm_fdr
peptide_fdr=$validation_levels.peptide_fdr
protein_fdr=$validation_levels.protein_fdr
group_psms=$validation_levels.group_psms
group_peptides=$validation_levels.group_peptides
merge_subgroups=$validation_levels.merge_subgroups
protein_fraction_mw_confidence='$fraction_analysis.protein_fraction_mw_confidence'
pepnovo_hitlist_length=1
pepnovo_estimate_charge=1
pepnovo_correct_prec_mass=1
pepnovo_discard_spectra=1
pepnovo_fragmentation_model='CID_IT_TRYP'
pepnovo_generate_blast=0
directag_tic_cutoff=$directag.directag_tic_cutoff
directag_max_peak_count=$directag.directag_max_peak_count
directag_intensity_classes=$directag.directag_intensity_classes
directag_adjust_precursor=$directag.directag_adjust_precursor
directag_min_adjustment='$directag.directag_min_adjustment'
directag_max_adjustment='$directag.directag_max_adjustment'
directag_adjustment_step='$directag.directag_adjustment_step'
directag_charge_states='$directag.directag_charge_states'
directag_ms_charge_state='$directag.directag_ms_charge_state'
directag_duplicate_spectra='$directag.directag_duplicate_spectra'
directag_deisotoping='$directag.directag_deisotoping'
directag_isotope_tolerance='$directag.directag_isotope_tolerance'
directag_complement_tolerance='$directag.directag_complement_tolerance'
directag_tag_length='$directag.directag_tag_length'
directag_max_var_mods='$directag.directag_max_var_mods'
directag_max_tag_count='$directag.directag_max_tag_count'
directag_intensity_weight='$directag.directag_intensity_weight'
directag_fidelity_weight='$directag.directag_fidelity_weight'
directag_complement_weight='$directag.directag_complement_weight'
novor_fragmentation=HCD
novor_mass_analyzer=Trap
 ]]></configfile>
  </configfiles>

  <inputs>
    <conditional name="input_type">
      <param name="type" type="select" label="MGF Input Type" help="Submit either a single file, or a collection of files.">
        <option selected="true" value="single">Single file</option>
        <option value="collection">Collection</option>
      </param>
      <when value="single">
        <param name="input_mgf" type="data" format="mgf" optional="true" label="MGF File" />
      </when>
      <when value="collection">
        <param name="input_mgf_collection" type="data_collection" optional="true" label="MGF Collection" />
      </when>
    </conditional>

    <param name="input_fasta" type="data" format="fasta" label="FASTA File" />

    <section name="processing_control" expanded="False" title="Processing Control">
      <param name="CHUNKSIZE" label="Size to split fasta for parallel processing" value="100000" type="integer" optional="true"/>
    </section>
    <section name="metanovo_parameters" expanded="False" title="MetaNovo Parameters">
      <param name="mn_specificity" argument="-mn_specificity" label="Enzyme Specificity" type="select">
        <option selected="true" value="specific">specific</option>
        <option value="semi-specific">semi-specific</option>
        <option value="unspecific">unspecific</option>
      </param>
      <param name="mn_enzymes" argument="-mn_enzymes" label="Enzyme Rule" type="select">
        <option value="Trypsin">Trypsin</option>
        <option selected="true" value="Trypsin, no P rule">Trypsin, no P rule</option>
        <option value="Whole protein">Whole protein</option>
      </param>
      <param name="mn_max_missed_cleavages" argument="-mn_max_missed_cleavages" label="Number of enzymatic missed cleavages" value="2" type="integer" optional="true"/>
    </section>
    <section name="spectrum_matching_parameters" expanded="False" title="Spectrum Matching Parameters">
      <param name="prec_tol" argument="-prec_tol" label="Precursor ion mass tolerance" value="10.0" type="float" optional="true"/>
      <param name="prec_ppm" argument="-prec_ppm" label="Precursor ion tolerance unit" type="select">
        <option value="0">Da</option>
        <option selected="true" value="1">ppm</option>
      </param>
      <param name="frag_tol" argument="-frag_tol" label="Fragment ion mass tolerance" value="0.05" type="float" optional="true"/>
      <param name="frag_ppm" argument="-frag_ppm" label="Fragment ion tolerance unit" type="select">
        <option selected="true" value="0">Da</option>
        <option value="1">ppm</option>
      </param>
      <param name="digestion" argument="-digestion" label="Digestion" type="select">
        <option selected="true" value="0">Enzyme</option>
        <option value="1">Unspecific</option>
        <option value="2">Whole Protein</option>
      </param>
      <param name="enzyme" argument="-enzyme" label="Enzyme" type="select" multiple="true">
        <option value="Trypsin">Trypsin</option>
        <option selected="true" value="Trypsin (no P rule)">Trypsin (no P rule)</option>
        <option value="Arg-C">Arg-C</option>
        <option value="Arg-C (no P rule)">Arg-C (no P rule)</option>
        <option value="Arg-N">Arg-N</option>
        <option value="Glu-C">Glu-C</option>
        <option value="Lys-C">Lys-C</option>
        <option value="Lys-C (no P rule)">Lys-C (no P rule)</option>
        <option value="Lys-N">Lys-N</option>
        <option value="Asp-N">Asp-N</option>
        <option value="Asp-N (ambic)">Asp-N (ambic)</option>
        <option value="Chymotrypsin">Chymotrypsin</option>
        <option value="Chymotrypsin (no P rule)">Chymotrypsin (no P rule)</option>
        <option value="Pepsin A">Pepsin A</option>
        <option value="CNBr">CNBr</option>
        <option value="Thermolysin">Thermolysin</option>
        <option value="LysargiNase">LysargiNase</option>
      </param>
      <param name="specificity" argument="-specificity" label="Specificity" type="select">
          <option selected="true" value="0">Specific</option>
          <option value="1">Semi-Specific</option>
          <option value="2">N-term Specific</option>
          <option value="3">C-term Specific</option>
      </param>
      <param name="mc" argument="-mc" label="Number of allowed missed cleavages" value="2" type="text" optional="true" help="If more than one enzyme was used, please provide the missed cleavages for every enzyme in the same order, with a comma separated list, e.g. &quot;2, 1&quot;."/>
      <param name="fixed_mods" argument="-fixed_mods" label="Fixed modifications as comma separated list" type="select" multiple="true">
        <expand macro="fixed_modifications"/>
      </param>
      <param name="variable_mods" argument="-variable_mods" label="Variable modifications as comma separated list" type="select" multiple="true">
        <expand macro="variable_modifications"/>
      </param>
      <param name="min_charge" argument="-min_charge" label="Minimal charge to search for" value="2" type="integer" optional="true"/>
      <param name="max_charge" argument="-max_charge" label="Maximal charge to search for" value="4" type="integer" optional="true"/>
      <param name="fi" argument="-fi" label="Type of forward ion searched" value="b" type="text" optional="true"/>
      <param name="ri" argument="-ri" label="Type of rewind ion searched" value="y" type="text" optional="true"/>
      <param name="min_isotope" argument="-min_isotope" label="Minimum precursor isotope" value="0" type="integer" optional="true"/>
      <param name="max_isotope" argument="-max_isotope" label="Maximum precursor isotope" value="1" type="integer" optional="true"/>
    </section>
    <section name="spectrum_annotation" expanded="False" title="Spectrum Annotation">
      <param name="annotation_level" argument="-annotation_level" label="The intensity threshold to consider for annotation" value="0.75" type="float" optional="true" help="Using percentiles, 0.75 means that the 25% most intense peaks will be annotated."/>
      <param name="annotation_high_resolution" argument="-annotation_high_resolution" label="If true the most accurate peak will be selected within the m/z tolerance." truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <section name="sequence_matching" expanded="False" title="Sequence Matching">
      <param name="sequence_index_type" argument="-sequence_index_type" label="sequence_index_type (deprecated)" value="0" type="integer" optional="true"/>
      <param name="sequence_matching_type" argument="-sequence_matching_type" label="The peptide to protein sequence matching type" type="select">
          <option value="0">Character Sequence</option>
          <option value="1">Amino Acids</option>
          <option selected="true" value="2">Indistinguishable Amino Acids</option>
      </param>
      <param name="sequence_matching_x" argument="-sequence_matching_x" label="The maximal share of Xs in a sequence, 0.25 means 25% of X's" value="0.25" type="float" optional="true"/>
    </section>
    <section name="import_filters" expanded="False" title="Import Filters">
      <param name="import_peptide_length_min" argument="-import_peptide_length_min" label="The minimal peptide length to consider when importing identification files" value="8" type="integer" optional="true"/>
      <param name="import_peptide_length_max" argument="-import_peptide_length_max" label="The maximal peptide length to consider when importing identification files" value="30" type="integer" optional="true"/>
      <param name="import_precursor_mz_ppm" argument="-import_precurosor_mz_ppm [sic]" label="Maximal precursor ion deviation unit" type="select">
        <option selected="true" value="0">Da</option>
        <option value="1">ppm</option>
      </param>
      <param name="exclude_unknown_ptms" argument="-exclude_unknown_ptms" label="Peptides presenting unrecognized PTMs will be excluded" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <section name="ptm_localization" expanded="False" title="PTM Localization">
      <param name="ptm_score" argument="-ptm_score" label="The PTM probabilistic score to use for modification localization" type="select">
        <option value="0">A-score</option>
        <option selected="true" value="1">PhosphoRS</option>
        <option value="2">None</option>
      </param>
      <param name="score_neutral_losses" argument="-score_neutral_losses" label="Include neutral losses in spectrum annotation of the PTM score" truevalue="1" falsevalue="0" type="boolean" checked="false"/>
      <param name="ptm_sequence_matching_type" argument="-ptm_sequence_matching_type" label="The modification to peptide sequence matching type" type="select">
          <option value="0">Character Sequence</option>
          <option selected="true" value="1">Amino Acids</option>
          <option value="2">Indistinguishable Amino Acids</option>
      </param>
      <param name="ptm_alignment" argument="-ptm_alignment" label="Align peptide ambiguously localized PTMs on confident sites" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <section name="gene_annotation" expanded="False" title="Gene Annotation">
      <param name="useGeneMapping" argument="-useGeneMapping" label="Use and save gene mappings along with the project" truevalue="1" falsevalue="0" type="boolean" checked="true"  help="UniProt databases only"/>
      <param name="updateGeneMapping" argument="-updateGeneMapping" label="Update gene mappings automatically from Ensembl" truevalue="1" falsevalue="0" type="boolean" checked="true" help="UniProt databases only"/>
    </section>
    <section name="protein_inference" expanded="False" title="Protein Inference">
      <param name="simplify_groups" argument="-simplify_groups" label="Simplify protein groups" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param name="simplify_score" argument="-simplify_score" label="Simplify protein groups based on the PeptideShaker target/decoy score" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param name="simplify_enzymaticity" argument="-simplify_enzymaticity" label="Simplify protein groups based on the peptide enzymaticity" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param name="simplify_evidence" argument="-simplify_evidence" label="Simplify protein groups based on the UniProt protein evidence" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param name="simplify_uncharacterized" argument="-simplify_uncharacterized" label="Simplify protein groups based on the protein characterization" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <section name="validation_levels" expanded="False" title="Validation Levels">
      <param name="psm_fdr" argument="-psm_fdr" label="FDR at the PSM level in percent" value="1" type="integer" optional="true"/>
      <param name="peptide_fdr" argument="-peptide_fdr" label="FDR at the peptide level in percent" value="1" type="integer" optional="true"/>
      <param name="protein_fdr" argument="-protein_fdr" label="FDR at the protein level in percent" value="1" type="integer" optional="true"/>
      <param name="group_psms" argument="-group_psms" label="Group PSMs by charge for scoring and validation" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param name="group_peptides" argument="-group_peptides" label="Group peptides by modification status for scoring and validation" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param name="merge_subgroups" argument="-group_peptides" label="Merge small PSM and peptide groups for scoring and validation" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
    </section>
    <section name="fraction_analysis" expanded="False" title="Fraction Analysis">
      <param name="protein_fraction_mw_confidence" argument="-protein_fraction_mw_confidence" label="Minimum confidence required for a protein in the fraction MW plot" value="95.0" type="float" optional="true"/>
    </section>
    <section name="directag" expanded="False" title="DirecTag">
      <param name="directag_tic_cutoff" argument="-directag_tic_cutoff" label="TIC cutoff in percent" value="85" type="integer" optional="true"/>
      <param name="directag_max_peak_count" argument="-directag_max_peak_count" label="Max peak count" value="400" type="integer" optional="true"/>
      <param name="directag_intensity_classes" argument="-directag_intensity_classes" label="Number of intensity classes" value="3" type="integer" optional="true"/>
      <param name="directag_adjust_precursor" argument="-directag_adjust_precursor" label="Adjust precursor" truevalue="1" falsevalue="0" type="boolean" checked="false"/>
      <param name="directag_min_adjustment" argument="-directag_min_adjustment" label="Minimum precursor adjustment" value="-2.5" type="float" optional="true"/>
      <param name="directag_max_adjustment" argument="-directag_max_adjustment" label="Maximum precursor adjustment" value="2.5" type="float" optional="true"/>
      <param name="directag_adjustment_step" argument="-directag_adjustment_step" label="Precursor adjustment step" value="0.1" type="float" optional="true"/>
      <param name="directag_charge_states" argument="-directag_charge_states" label="Number of charge states considered" value="3" type="integer" optional="true"/>
      <param name="directag_ms_charge_state" argument="-directag_ms_charge_state" label="Use charge state from M spectrum" truevalue="1" falsevalue="0" type="boolean" checked="false"/>
      <param name="directag_duplicate_spectra" argument="-directag_duplicate_spectra" label="Duplicate spectra per charge" truevalue="1" falsevalue="0" type="boolean" checked="true"/>
      <param name="directag_deisotoping" argument="-directag_deisotoping" label="Deisotoping mode" type="select">
        <option selected="true" value="0">No deisotoping</option>
        <option value="1">Precursor only</option>
        <option value="2">Precursor and candidate</option>
      </param>
      <param name="directag_isotope_tolerance" argument="-directag_isotope_tolerance" label="Isotope mz tolerance" value="0.25" type="float" optional="true"/>
      <param name="directag_complement_tolerance" argument="-directag_complement_tolerance" label="Complement mz tolerance" value="0.5" type="float" optional="true"/>
      <param name="directag_tag_length" argument="-directag_tag_length" label="Tag length" value="4" type="integer" optional="true"/>
      <param name="directag_max_var_mods" argument="-directag_max_var_mods" label="Maximum variable modifications per sequence" value="2" type="integer" optional="true"/>
      <param name="directag_max_tag_count" argument="-directag_max_tag_count" label="Maximum tag count" value="5" type="integer" optional="true"/>
      <param name="directag_intensity_weight" argument="-directag_intensity_weight" label="Intensity score weight" value="1.0" type="float" optional="true"/>
      <param name="directag_fidelity_weight" argument="-directag_fidelity_weight" label="Fidelity score weight" value="1.0" type="float" optional="true"/>
      <param name="directag_complement_weight" argument="-directag_complement_weight" label="Complement score weight" value="1.0" type="float" optional="true"/>
    </section>
  </inputs>
  <outputs>
    <data name="output_fasta" format="fasta" from_work_dir="metanovo/metanovo.fasta" label="${tool.name} on ${on_string}: FASTA"/>
    <data name="output_csv" format="csv" from_work_dir="metanovo/metanovo.csv" label="${tool.name} on ${on_string}: CSV"/>
  </outputs>
  <tests>
    <test expect_num_outputs="2">
      <param name="input_mgf" value="sample_data_1.mgf" ftype="mgf"/>
      <param name="input_fasta" value="sample_fasta_single.fasta" ftype="fasta"/>
      <output name="output_csv" ftype="csv">
          <assert_contents>
              <!-- Check header. -->
              <has_text text=",index,Accession,Record,ID,PeptideCount,Peptides,ScanCount,Scans,Organism,Length,File,Sample sample_data_1 (msms),SAF sample_data_1,NSAF sample_data_1,Summed_NSAF,Protein_Prob,Organism_Prob,MSMS_Percent,Combined_Prob"/>
          </assert_contents>
      </output>
      <output name="output_fasta" ftype="fasta" file="sample_output_single.fasta"/>
    </test>
    <test expect_num_outputs="2">
      <param name="type" value="collection"/>
      <param name="input_mgf_collection">
        <collection type="list">
          <element name="sample_data_1.mgf" value="sample_data_1.mgf" />
          <element name="sample_data_2.mgf" value="sample_data_2.mgf" />
        </collection>
      </param>
      <param name="input_fasta" value="sample_fasta_collection.fasta" ftype="fasta"/>
      <output name="output_csv" ftype="csv">
          <assert_contents>
              <!-- Check header. -->
              <has_text text=",index,Accession,File,ID,Length,Organism,PeptideCount,Peptides,Record,SAF sample_data_1,SAF sample_data_2,Sample sample_data_1 (msms),Sample sample_data_2 (msms),ScanCount,Scans,NSAF sample_data_1,NSAF sample_data_2,Summed_NSAF,Protein_Prob,Organism_Prob,MSMS_Percent,Combined_Prob"/>
          </assert_contents>
      </output>
      <output name="output_fasta" ftype="fasta" file="sample_output_collection.fasta"/>
    </test>
  </tests>
  <help><![CDATA[
**MetaNovo**

MetaNovo searches MS/MS data against a FASTA database of known proteins.

Two outputs are produced:

- MetaNovo Output FASTA: the matching proteins produced by the search.
- MetaNovo Output CSV: information about the job and other useful metadata.

Two inputs are required: an MGF file or files and a FASTA database file.

Two different input types are available for the MGF input. The correct input configuration depends on the desired use case, as outlined below:

=======================================================  =============
Use case                                                 Configuration
=======================================================  =============
Single input MGF file, single output FASTA file          **Single file** input with **Single dataset** selected
Multiple input MGF files, multiple output FASTA files\*  **Single file** input with **Multiple datasets** OR **Dataset collection** selected
Multiple input MGF files, single output FASTA file       **Collection** input
=======================================================  =============

**\*** One for each MGF file.

In the second use case, a separate MetaNovo job is spawned for each input MGF. In the third use case, a single MetaNovo job runs with all MGF files in the collection as input.

If the third use case fails due to memory limitations, users are recommended to use the second option. The multiple output FASTA databases may be merged to generate a reduced, compact database.
    ]]>
  </help>
  <citations>
      <citation type="doi">10.1101/605550</citation>
  </citations>
</tool>