diff msfilt.xml @ 0:d50f079096ee

Push to main toolshed
author pieter.lukasse@wur.nl
date Wed, 08 Jan 2014 11:39:16 +0100
children 72d4a37869ee
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/msfilt.xml	Wed Jan 08 11:39:16 2014 +0100
@@ -0,0 +1,229 @@
+<tool name="MsFilt" id="msfilt" version="1.0.2">
+	<description>Filters annotations based on MS/MS peptide identification and annotation quality measures</description>
+	<!-- 
+	   Command line of the tool: runs MsFilt.jar through the "java -jar" interpreter.
+	   The $-placeholders are filled in from the params declared in the inputs section,
+	   the files declared in the configfiles section and the datasets declared in the
+	   outputs section of this file.
+	   For remote debugging, start your listener on port 8000 and use the following as command interpreter:
+	       java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 
+	                    //////////////////////////
+	    -->
+	<command interpreter="java -jar ">
+	    MsFilt.jar 
+	    -apmlFile $apmlFile
+	    -datasetCode $apmlFile.metadata.base_name
+	    -rankingMetadataFile $rankingMetadataFile
+	    -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
+	    -annotationSourceConfigFile $annotationSourceConfigFile
+	    -outApml $outputApml
+	    -outNewIdsApml $outNewIdsApml
+	    -outFullCSV $outputCSV 
+	    -outRankingTable $outRankingTable
+	    -outProteinCoverageCSV $outProteinCoverageCSV
+	    -fpCriteriaExpression "$fpCriteriaExpression"
+	    -filterOutFPAnnotations $filterOutFPAnnotations
+	    -fpCriteriaExpressionForIds "$fpCriteriaExpressionForIds"
+	    -filterOutFPIds $filterOutFPIds
+	    -filterOutUnannotatedAlignments $filterOutUnannotatedAlignments
+	    -addRawRankingInfo $addRawRankingInfo
+	    -addScaledIntensityInfo $addScaledIntensityInfo
+	    -addRawIntensityInfo $addRawIntensityInfo
+    	-outReport $htmlReportFile
+	    -outReportPicturesPath $htmlReportFile.files_path
+	</command>
+	<inputs>
+		<!-- Input datasets: an optional APML quantification file and a repeatable list of identification files. -->
+   		<param name="apmlFile" type="data" format="apml" optional="true" 
+   		         label="(Optional) Peptide quantification file (APML)" 
+   		         help="The APML contents as aligned and annotated feature lists. E.g. produced by 
+   		               SEDMAT or Quantiline tools." />
+   		<repeat name="annotationSourceFiles" title="(Optional) Peptide identification files" help="Full set of MS/MS peptide identification files, including peptides that could not be quantified.">
+   			<param name="identificationsFile" type="data" format="apml,mzidentml,prims.fileset.zip" label="Identifications file (APML or MZIDENTML or MZIDENTML fileSet)" />
+   			<!-- NOTE(review): the label below mentions mzml while the format attribute lists mzidentml; confirm which datatype is intended. -->
+   			<param name="spectraFile" type="data" format="mzidentml,prims.fileset.zip" optional="true" label="(Optional) Spectra fileSet (mzml file or fileSet)"
+   				   help="Select this in case your Identifications file is MZIDENTML or MZIDENTML fileSet" />
+   		</repeat>
+     	<!-- 
+     	<param name="maxNrRankings" type="integer" size="10" value="0" label="Maximum nr. of items to leave in the final ranking (set=0 for no limit) " />
+     	-->
+     	<!--  TODO add info somewhere that deltaRt is 'corrected deltaRt' -->
+		<!-- Free-text configuration, one "alias => name,weight" entry per line. The default value separates
+		     entries with explicit CR/LF character references; the attribute and the param tag are closed
+		     right after the last entry so the document stays well-formed. -->
+		<param name="rankingWeightConfig" type="text" area="true" size="13x70" label="Quality Measures (qm's) and ranking weights configuration" 
+		help="Here you may specify a weight for each of the Quality Measures (QMs). These are used for the final QM score and possibly for ranking (e.g. in case of label-free data
+		processed by SEDMAT). The format is: QM alias => QM name,weight. "
+value="qmDRT =&gt; delta rt (standard score),1
+&#xd;&#xa;qmDMA =&gt; delta mass annotation (standard score),1
+&#xd;&#xa;qmDMP =&gt; delta mass psm (standard score),1
+&#xd;&#xa;qmBSCR =&gt; best peptide score (standard score),1
+&#xd;&#xa;qmALCV =&gt; alignment coverage (fraction),1
+&#xd;&#xa;qmSTCV =&gt; score type coverage (fraction),1
+&#xd;&#xa;qmPACV =&gt; peptide's best proteinAnnotCoverage (standard score),1
+&#xd;&#xa;qmPICV =&gt; peptide's best proteinIdentifCoverage (standard score),1
+&#xd;&#xa;qmANS =&gt; annotation sources (count),1
+&#xd;&#xa;qmCSEV =&gt; charge states evidence (count),0.2
+&#xd;&#xa;qmBCSP=&gt; best correlation with source or product peptide (correl),1
+&#xd;&#xa;qmBCCS =&gt; best correlation with other charge state (correl),1
+&#xd;&#xa;qmBCOS =&gt; best correlation with other sibling peptide (correl),1"/>
+		<!-- Free-text configuration, one "alias => name,type,mode" entry per line (same layout as above). -->
+		<param name="statisticalMeasuresConfig" type="text" area="true" size="6x70" label="Statistical measures configuration" 
+		help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values). 
+		The format is: SM alias => SM name,type,mode[min/max]. "
+value="smXTD =&gt; MS:1001330,XSLASH!Tandem:expect,min
+&#xd;&#xa;pvCSVEX =&gt; p_value,CSV_EXPORT,min
+&#xd;&#xa;smLIKELIHOOD =&gt; LOGLIKELIHOOD,PLGS/Databank-search,max"/>
+		<!-- Filtering switches and the False Positive criteria expressions handed to the command line. -->
+     	<param name="filterOutUnannotatedAlignments" type="boolean" checked="true" 
+     	label="Filter out unannotated alignments" 
+     	help="This helps decrease the output file size (features with no annotation are then not reported anymore)"/>
+		<param name="filterOutFPAnnotations" type="boolean" checked="true" 
+     	label="Filter out False Positive (FP) annotations" />
+		<param name="fpCriteriaExpression" type="text" size="120" label="False Positive (FP) criteria for annotations" 
+		help="Criteria (in standard score measures) for classifying an annotation as False Positive (FP). 
+		You can build logical rules using the QM aliases above, the keywords 'and', 'or' and parenthesis. 
+		Comparisons can be made with '==,&lt;,&gt;,&lt;=,&gt;='" 
+		value="qmDRT &lt;0 or qmDMA &lt;-0.5 or (qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"/>
+     	<param name="filterOutFPIds" type="boolean" checked="true" 
+     	label="Filter out False Positive (FP) peptide identifications" />
+		<param name="fpCriteriaExpressionForIds" type="text" size="120" 
+		label="False Positive (FP) criteria for identifications" 
+		help="Criteria (in standard score measures) for classifying a peptide identification as False Positive (FP). 
+		Here you can use a subset of the quality measures (qmDMP, qmBSCR, qmSTCV, qmPICV, qmCSEV) and all statistical measures."
+		value="(qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"/>
+		<!-- Optional extra columns for the full CSV output. -->
+     	<param name="addRawRankingInfo" type="boolean" checked="false" 
+     	label="Include the raw scores/values of the ranking attributes in the CSV output" 
+     	help="This will result in one extra column per ranking attribute, each column holding the original data for this attribute (before normalization)."/>
+     	<param name="addScaledIntensityInfo" type="boolean" checked="false" 
+     	label="Include computed scaled intensity values in the CSV output" 
+     	help="The autoscaled and 'z-score'scaled (aka 'standard-score'scaled) intensity values are then added to the full CSV output file"/>
+     	<param name="addRawIntensityInfo" type="boolean" checked="false" 
+     	label="Include the raw intensity values in the CSV output" 
+     	help="The original intensity values (as found in the input file) are then added to the full CSV output file"/>
+	</inputs>
+	<configfiles>
+		<!-- Holds the text of the rankingWeightConfig param; referenced in the command as $rankingMetadataFile. -->
+		<configfile name="rankingMetadataFile">${rankingWeightConfig}</configfile>
+		<!-- Holds the text of the statisticalMeasuresConfig param; referenced in the command as $statisticalMeasuresConfigFile. -->
+		<configfile name="statisticalMeasuresConfigFile">${statisticalMeasuresConfig}</configfile>
+		<!-- For every entry of the annotationSourceFiles repeat this template emits a line
+		     "identificationsFile|spectraFile", followed by a line holding "apml" when the
+		     identifications dataset's datatype is an instance of the class registered for the
+		     'apml' extension, and "mzid" otherwise. Referenced in the command as $annotationSourceConfigFile. -->
+		<configfile name="annotationSourceConfigFile">## start comment
+		## iterate over the selected files and store their names in the config file
+		#for $i, $s in enumerate( $annotationSourceFiles )
+			${s.identificationsFile}|${s.spectraFile}
+			## also print out the datatype in the next line, based on previously configured datatype
+			#if isinstance( $s.identificationsFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__):
+				apml
+			#else:
+        		mzid
+      		#end if
+		#end for
+		## end comment</configfile>
+	</configfiles>
+	<outputs>
+		<!-- A dataset whose filter expression evaluates to false is not created. -->
+		<!-- Filtered quantifications (APML); requires the APML input. -->
+		<data name="outputApml"
+		      format="apml"
+		      metadata_source="apmlFile"
+		      label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: quantifications (filtered APML)">
+			<filter>( apmlFile != None )</filter>
+		</data>
+		<!-- Filtered identifications (APML); requires the FP identification filter to be switched on. -->
+		<data name="outNewIdsApml"
+		      format="apml"
+		      label="${tool.name} on ${on_string}: identifications (filtered APML)">
+			<filter>( filterOutFPIds == True )</filter>
+		</data>
+		<!-- Full CSV; requires the APML input. -->
+		<data name="outputCSV"
+		      format="csv"
+		      metadata_source="apmlFile"
+		      label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Full CSV">
+			<filter>( apmlFile != None )</filter>
+		</data>
+		<!-- Ranking table (CSV); requires the APML input. -->
+		<data name="outRankingTable"
+		      format="csv"
+		      metadata_source="apmlFile"
+		      label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Ranking table (CSV)">
+			<filter>( apmlFile != None )</filter>
+		</data>
+		<!-- Protein coverage details (CSV); requires at least one identification file entry. -->
+		<data name="outProteinCoverageCSV"
+		      format="csv"
+		      label="${tool.name} on ${on_string}: Protein coverage details (CSV)">
+			<filter>( len(list(enumerate(annotationSourceFiles))) > 0 )</filter>
+		</data>
+		<!-- HTML report; declared without a filter. -->
+		<data name="htmlReportFile"
+		      format="html"
+		      label="${tool.name} on ${on_string} - HTML report"/>
+	</outputs>
+	<!-- No functional tests are defined for this tool; the tests section is intentionally left empty here. -->
+	<tests>
+	</tests>
+  <help>
+.. class:: infomark
+This tool takes in peptide quantification results (e.g. either by SEDMAT for label-free data or by Quantiline for labeled data)
+and calculates a number of quality measures that can help in assessing the correctness of the quantification assignment and of the MS/MS peptide 
+identification itself. The user can use any combination of quality measures (qm's) and statistical measures (sm's) to filter out
+low scoring entries.   
+.. class:: infomark
+In the label-free data processed by SEDMAT it is possible that a feature quantification gets assigned to different peptides. This means
+we have an ambiguous assignment. In such a case
+this tool also does a ranking of the different assignments according to their quality measures so that the best scoring assignment
+gets ranked as first.  
+**List of abbreviations**
+QM: Quality Measure
+SM: Statistical Measure (e.g. p-value, e-value from MS/MS identification) 
+PSM:  "Peptide to Spectrum Match" (aka peptide identification)
+FP: False Positive
+**Filtering options details**
+The FP criteria will be applied to an annotation even if the corresponding quality measures involved 
+in the expression can NOT ALL be determined. QMs that cannot be determined, get the value 0 (zero) which is 
+equal to giving it the average value. 
+The output report shows some plots that visualize the filtering done. This can help in fine-tuning the filtering criteria.
+**Output details**
+*APML output*
+This tool returns the given APML alignment file further annotated at the alignment level with the best ranking 
+peptides of each respective alignment. This APML can be used in subsequent Galaxy tools like the proteomics tools
+from NBIC.  
+The APML output can also be used for the Protein Inference step (see Quantifere tool).
+*CSV output*
+It also returns a CSV format output with the full quality measures and scoring and ranking details. The user could use
+this to manually determine new weights for some of the quality measures by techniques such as 
+linear regression. In other words, this CSV can then be used to fine-tune the weights in a next run. 
+Many of the quality measures (QMs) are normalized to their Standard Score (aka z-score). 
+`See Standard Score for more details...`__ 
+Next to giving insight into how the ranking was established, a more complete version of this CSV file is also
+generated for tools that cannot or won't process the APML output format.  
+Below is a brief overview of the CSV and an illustration of the ranking done in case of ambiguous peptide-to-feature assignments
+(explained above, can happen in case of label-free data processing by SEDMAT).
+.. image:: $PATH_TO_IMAGES/msfilt_csv_out.png 
+.. __: javascript:window.open('http://en.wikipedia.org/wiki/Standard_score','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')
+  </help>