changeset 2:25261529840c

Uploaded
author iracooke
date Mon, 04 Mar 2013 17:11:46 -0500
parents 49d15d40117d
children 4c2e97f928d7
files interprophet.xml interprophet_wrapper.rb peptide_prophet.xml peptide_prophet_wrapper.rb protein_prophet.xml protein_prophet_wrapper.rb repository_dependencies.xml tool_dependencies.xml
diffstat 8 files changed, 364 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interprophet.xml	Mon Mar 04 17:11:46 2013 -0500
@@ -0,0 +1,61 @@
+<tool id="proteomics_search_interprophet_1" name="InterProphet" version="1.0.0">
+	
+	<requirements>
+	    <requirement type="package" version="1.1.9">galaxy_protk</requirement>
+	    <requirement type="package" version="4.6.1">trans_proteomic_pipeline</requirement>
+   </requirements>
+
+
+  <description>Combine Peptide Prophet results from multiple search engines</description>
+
+  <command interpreter="ruby">
+
+	interprophet_wrapper.rb $output $use_nss $use_nrs $use_nse $use_nsi $use_nsm --minprob $minprob
+
+
+	## Inputs.
+	${first_input}
+	#for $input_file in $input_files:
+	${input_file.additional_input}
+	#end for  	
+
+  </command>
+
+  <inputs>
+
+	<param name="first_input" type="data" format="peptideprophet_pepxml" label="Peptide Prophet Results" help="These files will typically be outputs from search tools that have subsequently been run through peptide prophet"/> 	
+		
+	<repeat name="input_files" title="Additional PepXML Input Files">
+		<param format="peptideprophet_pepxml" name="additional_input" type="data" label="PepXML produced by Peptide Prophet" help=""/>
+	</repeat>
+	
+	<param name="use_nss" checked="true" type="boolean" label="Include NSS in Model" help="Include NSS (Number of Sibling Searches) in Statistical Model" truevalue="blank" falsevalue="--nonss"/>
+	<param name="use_nrs" checked="true" type="boolean" label="Include NRS in Model" help="Include NRS (Number of Replicate Spectra) in Statistical Model" truevalue="blank" falsevalue="--nonrs"/>
+	<param name="use_nse" checked="true" type="boolean" label="Include NSE in Model" help="Include NSE (Number of Sibling Experiments) in Statistical Model" truevalue="blank" falsevalue="--nonse"/>
+	<param name="use_nsi" checked="true" type="boolean" label="Include NSI in Model" help="Include NSI (Number of Sibling Ions) in Statistical Model" truevalue="blank" falsevalue="--nonsi"/>
+	<param name="use_nsm" checked="true" type="boolean" label="Include NSM in Model" help="Include NSM (Number of Sibling Modifications) in Statistical Model" truevalue="blank" falsevalue="--nonsm"/>
+	
+	<param name="minprob" type="text" label="Minimum threshold probability for reporting results"/>
+		
+  </inputs>
+  <outputs>
+    <data format="interprophet_pepxml" name="output" metadata_source="first_input" label="interprophet.${first_input.display_name}" from_work_dir="interprophet_output.pep.xml"/>
+  </outputs>
+
+ <help>
+
+**What it does**
+
+Takes a set of pepXML files (possibly generated using different search engines) and calculates updated identification probabilities for each peptide.  The updated probabilities are based on a statistical model that combines evidence from identifications across all of the input files, spectra, modified states and charge states. 
+
+----
+
+**Citation**
+
+If you use this tool please read and cite the paper describing iProphet
+
+Shteynberg D, et al. “iProphet: Improved statistical validation of peptide identifications in shotgun proteomics.” *Molecular and Cellular Proteomics* 10, M111.007690 (2011).
+
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interprophet_wrapper.rb	Mon Mar 04 17:11:46 2013 -0500
@@ -0,0 +1,56 @@
+require 'pathname'
+
+$VERBOSE=nil
+
+# Hard-coded argument order: output path, five model flags, "--minprob", [value], input files.
+# NOTE(review): minprob/minprob_val are parsed here but never passed on to interprophet.rb.
+actual_output_path_string=ARGV[0]
+use_nss=ARGV[1]
+use_nrs=ARGV[2]
+use_nse=ARGV[3]
+use_nsi=ARGV[4]
+use_nsm=ARGV[5]
+minprob=ARGV[6]
+# The value is presumably absent when the Galaxy text box is empty, so ARGV[7] may already be the first input file.
+minprob_val=File.exist?(ARGV[7].to_s) ? nil : ARGV[7]
+wd= Dir.pwd
+original_input_files=ARGV.drop(minprob_val ? 8 : 7) # skip the value too, so it is never symlinked as an input
+# End hard coded args #
+
+cmd=""
+
+output_substitution_cmds=""
+
+input_files=original_input_files.collect do |input|
+
+  # We append ".pep.xml" to the input file name because interprophet can't handle anything else
+  # In order for this to work properly we need to create a symbolic link in our working directory
+  #
+  original_input_path=Pathname.new("#{input}")
+  actual_input_path_string="#{wd}/#{original_input_path.basename}.pep.xml"
+
+  cmd << "ln -s #{input} #{actual_input_path_string};"
+  output_substitution_cmds << "ruby -pi -e \"gsub('#{actual_input_path_string}', '#{input}.pep.xml')\" interprophet_output.pep.xml;"
+  actual_input_path_string
+end
+
+cmd << "rvm 1.9.3@protk-1.1.9 do interprophet.rb"
+
+cmd << " --no-nss" unless use_nss=="blank"
+cmd << " --no-nrs" unless use_nrs=="blank"
+cmd << " --no-nse" unless use_nse=="blank"
+cmd << " --no-nsi" unless use_nsi=="blank"
+cmd << " --no-nsm" unless use_nsm=="blank"
+
+
+input_files.each { |input|
+  cmd << " #{input}"
+}
+
+
+cmd << " -o interprophet_output.pep.xml -r"
+
+cmd << ";#{output_substitution_cmds}"
+
+%x[#{cmd}]
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/peptide_prophet.xml	Mon Mar 04 17:11:46 2013 -0500
@@ -0,0 +1,83 @@
+<tool id="proteomics_search_peptide_prophet_1" name="Peptide Prophet" version="1.0.0">
+    <requirements>
+        <requirement type="package" version="1.1.9">galaxy_protk</requirement>
+        <requirement type="package" version="4.6.1">trans_proteomic_pipeline</requirement>
+   </requirements>
+   
+	<description>Calculate Peptide Prophet statistics on search results</description>
+
+	<command interpreter="ruby">peptide_prophet_wrapper.rb ${output} ${input_file}  -r $glyco $useicat $phospho $usepi $usert $accurate_mass $no_ntt $no_nmc $use_gamma $use_only_expect $force_fit $allow_alt_instruments $maldi
+	</command>
+
+	<inputs>
+	
+    <param name="input_file" type="data" format="raw_pepxml" multiple="false" label="Raw Search Results" help="These files will typically be outputs from omssa or xtandem search tools"/>
+
+	<param name="glyco" type="boolean" label="Expect true positives to have a glycocapture motif" truevalue="--glyco" falsevalue=""/>
+	<param name="useicat" type="boolean" label="Use icat information" truevalue="--useicat" falsevalue="--no-useicat"/>
+	<param name="phospho" type="boolean" label="Use phospho information" truevalue="--phospho" falsevalue=""/>
+	<param name="usepi" type="boolean" label="Use pI information" truevalue="--usepi" falsevalue=""/>
+	<param name="usert" type="boolean" label="Use hydrophobicity / RT information" truevalue="--usert" falsevalue=""/>
+	<param name="accurate_mass" type="boolean" label="Use accurate mass binning" truevalue="--accurate-mass" falsevalue=""/>
+	<param name="no_ntt" type="boolean" label="Don't use NTT model" truevalue="--no-ntt" falsevalue=""/>
+	<param name="no_nmc" type="boolean" label="Don't use NMC model" truevalue="--no-nmc" falsevalue=""/>
+	<param name="use_gamma" type="boolean" label="Use Gamma distribution to model the negatives" help="Applies only to X!Tandem results" truevalue="--usegamma" falsevalue=""/>
+	<param name="use_only_expect" type="boolean" label="Only use Expect Score as the discriminant" help="Applies only to X!Tandem results. 
+        Helpful for data with homologous top hits e.g. phospho or glyco" truevalue="--use-only-expect" falsevalue=""/>
+	<param name="force_fit" type="boolean" label="Force fitting" help="Bypasses automatic mixture model checks and forces fitting of a mixture model" truevalue="--force-fit" falsevalue=""/>
+	<param name="allow_alt_instruments" type="boolean" label="Allow multiple instrument types" help="Warning instead of exit with error if instrument types between runs is different" truevalue="--allow-alt-instruments" falsevalue=""/>
+	<param name="maldi" type="boolean" label="Maldi data" truevalue="-l" falsevalue=""/>
+	
+	
+  </inputs>
+  <outputs>
+    <data format="peptideprophet_pepxml" name="output" metadata_source="input_file" label="peptide_prophet.${input_file.display_name}.pep.xml" from_work_dir="peptide_prophet_output.pep.xml"/>
+  </outputs>
+
+<help>
+
+**What it does**
+
+Given raw search engine scores as inputs this tool estimates the accuracy of peptide assignments.  From a practical perspective it estimates the probability that each peptide assignment is correct (providing probabilities as outputs), given raw scores (possibly on some arbitrary scale) as inputs. 
+
+----
+
+**Citation**
+
+If you use this tool please read and cite the paper describing the statistical model implemented by Peptide Prophet
+
+Keller A., et al. “Empirical Statistical Model to Estimate the Accuracy of Peptide Identifications Made by MS/MS and Database Search” *Anal. Chem.* 74, 5383-5392 (2002).
+
+
+</help>
+
+
+<!--PeptideProphet options [following the 'O']:
+                 i [use icat information in PeptideProphet]
+                 f [do not use icat information in PeptideProphet]
+                 g [use N-glyc motif information in PeptideProphet]
+                 H [use Phospho information in PeptideProphet]
+                 m [maldi data]
+                 I [use pI information in PeptideProphet]
+                 R [use Hydrophobicity / RT information in PeptideProphet]
+                 F [force the fitting of the mixture model, bypass automatic mixture model checks]
+                 A [use accurate mass binning in PeptideProphet]
+                 w [warning instead of exit with error if instrument types between runs is different]
+                 x [exclude all entries with asterisked score values in PeptideProphet]
+                 l [leave alone all entries with asterisked score values in PeptideProphet]
+                 n [use hardcoded default initialization parameters of the distributions]
+                 P [use Non-parametric model, can only be used with decoy option]
+                 N [do not use the NTT model]
+                 M [do not use the NMC model]
+                 G [use Gamma Distribution to model the Negatives (applies only to X!Tandem data)]
+                 E [only use Expect Score as the Discriminant(applies only to X!Tandem data, 
+                    helpful for data with homologous top hits e.g. phospho or glyco)]
+                 d [report decoy hits with a computed probability based on the model learned]
+                 p [run ProteinProphet afterwards]
+                 t [do not create png data plot]
+                 u [do not assemble protein groups in ProteinProphet analysis]
+                 s [do not use Occam's Razor in ProteinProphet analysis to 
+                    derive the simplest protein list to explain observed peptides]
+-->
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/peptide_prophet_wrapper.rb	Mon Mar 04 17:11:46 2013 -0500
@@ -0,0 +1,36 @@
+require 'pathname'
+
+$VERBOSE=nil
+
+actual_output_path_string=ARGV.shift
+
+# Second argument is the original input file name ... we'll change this below
+original_input_file=ARGV[0]
+
+# Before doing anything we create a link to the input file in our working dir with ".pep.xml" appended to the input
+# name because peptide prophet can't handle anything else
+
+wd= Dir.pwd
+
+original_input_path=Pathname.new("#{original_input_file}")
+actual_input_path_string="#{wd}/#{original_input_path.basename}.pep.xml"
+full_tmp_output_path_string="#{wd}/peptide_prophet_output.pep.xml"
+
+cmd = "ln -s #{original_input_file} #{actual_input_path_string};"
+
+cmd << "rvm 1.9.3@protk-1.1.9 do peptide_prophet.rb"
+
+
+ARGV[0]="#{actual_input_path_string}"
+
+ARGV.each { |a|    
+  cmd << " #{a}" 
+}
+
+cmd << " -o peptide_prophet_output.pep.xml"
+
+# Finally we need to fix up the output file so any references to the temporary working file are changed to refs to the original input file
+cmd << ";ruby -pi -e \"gsub('#{actual_input_path_string}', '#{original_input_file}')\" peptide_prophet_output.pep.xml"
+cmd << ";ruby -pi -e \"gsub('#{full_tmp_output_path_string}', '#{actual_output_path_string}')\" peptide_prophet_output.pep.xml"
+
+%x[#{cmd}]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/protein_prophet.xml	Mon Mar 04 17:11:46 2013 -0500
@@ -0,0 +1,71 @@
+<tool id="proteomics_search_protein_prophet_1" name="Protein Prophet" version="1.0.0">
+	<requirements>
+	    <requirement type="package" version="1.1.9">galaxy_protk</requirement>
+	    <requirement type="package" version="4.6.1">trans_proteomic_pipeline</requirement>
+   </requirements>
+   
+  <description>Calculate Protein Prophet statistics on search results</description>
+
+
+<!-- Note .. the input file is assumed to be the first argument -->
+  <command>protein_prophet.rb --galaxy $input_file -r $iproph $nooccam $groupwts $normprotlen $logprobs $confem $allpeps $unmapped $instances $delude --minprob=$minprob --minindep=$minindep </command>
+  <inputs>
+	
+    <param name="input_file" type="data" format="peptideprophet_pepxml,interprophet_pepxml" multiple="false" label="Peptide Prophet Results" help="These files will typically be outputs from peptide prophet or interprophet"/>
+
+
+	<param name="iproph" selected="true" type="boolean" label="Inputs are from iProphet" truevalue="--iprophet-input" falsevalue=""/>
+	<param name="nooccam" type="boolean" label="Don't apply Occam's razor" help="When selected no attempt will be made to derive the simplest protein list explaining observed peptides" truevalue="--no-occam" falsevalue=""/>
+	<param name="groupwts" type="boolean" label="Use group weights" help="Check peptide's total weight (rather than actual weight) in the Protein Group against the threshold" truevalue="--group-wts" falsevalue=""/>
+	<param name="normprotlen" type="boolean" label="Normalize NSP using Protein Length" truevalue="--norm-protlen" falsevalue=""/>
+	<param name="logprobs" type="boolean" label="Use the log of probability in the confidence calculations" truevalue="--log-prob" falsevalue=""/>
+	<param name="confem" type="boolean" label="Use the EM to compute probability given the confidence" truevalue="--confem" falsevalue=""/>
+	<param name="allpeps" type="boolean" label="Consider all possible peptides in the database in the confidence model" truevalue="--allpeps" falsevalue=""/>
+	<param name="unmapped" type="boolean" label="Report results for unmapped proteins" truevalue="--unmapped" falsevalue=""/>
+	<param name="instances" type="boolean" label="Use Expected Number of Ion Instances to adjust the peptide probabilities prior to NSP adjustment" truevalue="--instances" falsevalue=""/>
+	<param name="delude" type="boolean" label="Do NOT use peptide degeneracy information when assessing proteins" truevalue="--delude" falsevalue=""/>
+
+	<param name="minprob" type="text" label="Minimum peptide prophet probability for peptides to be considered" value="0.05"/>
+	<param name="minindep" type="text" label="Minimum percentage of independent peptides required for a protein" value="0"/>
+	
+  </inputs>
+  <outputs>
+    <data format="protxml" name="output" metadata_source="input_file" label="protein_prophet.${input_file.display_name}.protXML" from_work_dir="protein_prophet_results.prot.xml"/>
+  </outputs>
+
+
+<!--NOPLOT: do not generate plot png file
+		NOOCCAM: non-conservative maximum protein list
+		GROUPWTS: check peptide's total weight in the Protein Group against the threshold (default: check peptide's actual weight against threshold)   
+		NORMPROTLEN: Normalize NSP using Protein Length
+		LOGPROBS: Use the log of the probabilities in the Confidence calculations
+		CONFEM: Use the EM to compute probability given the confidence 
+		ALLPEPS: Consider all possible peptides in the database in the confidence model
+		UNMAPPED: Report results for UNMAPPED proteins
+		INSTANCES: Use Expected Number of Ion Instances to adjust the peptide probabilities prior to NSP adjustment
+		DELUDE: do NOT use peptide degeneracy information when assessing proteins
+		
+		MINPROB: peptideProphet probability threshold (default=0.05) 
+		MININDEP: minimum percentage of independent peptides required for a protein (default=0) 
+		
+		
+-->
+
+  <help>
+
+**What it does**
+
+Given a set of peptide assignments from MS/MS spectra in the form of a pepXML file, this tool estimates probabilities at the protein level.  As output, the tool produces a protXML file, which contains proteins along with the estimated probabilities that those proteins were present.  Probabilities are estimated using a statistical model based on the number of peptides corresponding to that protein and the confidence that each of those peptides were assigned correctly.  It takes account of the fact that peptides may correspond to more than one protein. 
+
+----
+
+**Citation**
+
+If you use this tool please read and cite the paper describing the statistical model implemented by Protein Prophet
+
+Nesvizhskii A., et al. “A Statistical Model for Identifying Proteins by Tandem Mass Spectrometry” *Anal. Chem.* 75, 4646-4658 (2003).
+
+
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/protein_prophet_wrapper.rb	Mon Mar 04 17:11:46 2013 -0500
@@ -0,0 +1,35 @@
+require 'pathname'
+
+$VERBOSE=nil
+
+actual_output_path_string=ARGV.shift
+
+# Second argument is the original input file name ... we'll change this below
+original_input_file=ARGV[0]
+
+# Before doing anything we create a link to the input file in our working dir with ".pep.xml" appended to the input
+# name because peptide prophet can't handle anything else
+
+wd= Dir.pwd
+
+original_input_path=Pathname.new("#{original_input_file}")
+actual_input_path_string="#{wd}/#{original_input_path.basename}.pep.xml"
+
+cmd = "ln -s #{original_input_file} #{actual_input_path_string};"
+
+cmd << "rvm 1.9.3@protk-1.1.9 do protein_prophet.rb"
+
+
+ARGV[0]="#{actual_input_path_string}"
+
+ARGV.each { |a| 
+    
+  cmd << " #{a}" 
+}
+
+cmd << " -o protein_prophet_results.prot.xml"
+
+cmd << ";ruby -pi -e \"gsub('#{actual_input_path_string}', '#{original_input_file}.pep.xml')\" protein_prophet_results.prot.xml"
+
+%x[#{cmd}]
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/repository_dependencies.xml	Mon Mar 04 17:11:46 2013 -0500
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<repositories description="Proteomics datatypes and mascot2xml from the TPP">
+
+     <repository toolshed="http://toolshed.g2.bx.psu.edu" name="proteomics_datatypes" owner="iracooke" changeset_revision="84c6c70a4e5a"/>
+
+	<repository toolshed="http://toolshed.g2.bx.psu.edu" name="galaxy_protk" owner="iracooke" changeset_revision="51f7c347c955"/>
+
+     <repository toolshed="http://toolshed.g2.bx.psu.edu" name="protk_trans_proteomic_pipeline" owner="iracooke" changeset_revision="f302406a0297"/>
+
+ </repositories>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Mar 04 17:11:46 2013 -0500
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<tool_dependency>
+
+    <package name="galaxy_protk" version="1.1.9">
+	     <repository toolshed="http://toolshed.g2.bx.psu.edu" name="galaxy_protk" owner="iracooke" changeset_revision="51f7c347c955"/>
+    </package>
+
+	<package name="trans_proteomic_pipeline" version="4.6.1">
+        <repository toolshed="http://toolshed.g2.bx.psu.edu" name="protk_trans_proteomic_pipeline" owner="iracooke" changeset_revision="f302406a0297" />
+    </package>
+
+</tool_dependency>
\ No newline at end of file