view msfilt.xml @ 20:125a6afa800c

fix to NapQ
author pieter.lukasse@wur.nl
date Mon, 26 Jan 2015 06:31:52 +0100
parents ad911e9aaf33
children
line wrap: on
line source

<tool name="MsFilt" id="msfilt" version="1.0.4">
	<description>Filters annotations based on MS/MS peptide identification and annotation quality measures</description>
	<!-- 
	   For remote debugging start your listener on port 8000 and use the following as command interpreter:
	       java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 
	                    //////////////////////////
	    -->
	<!--
	   Command line for MsFilt.jar. Every option below is filled in by Galaxy from the
	   inputs, configfiles and outputs sections of this tool.
	   The two pepxml options are appended only when the "pepxmlInSet" checkbox is ticked.
	   That parameter is a boolean declared with truevalue="Yes" and falsevalue="No", so the
	   template compares its string value against "Yes" instead of comparing against True.
	   Do not place XML comments inside the command element: they would split its text.
	-->
	<command interpreter="java -jar ">
	    MsFilt.jar
	    -apmlFile $apmlFile
	    -datasetCode $apmlFile.metadata.base_name
	    -rankingMetadataFile $rankingMetadataFile
	    -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
	    -annotationSourceConfigFile $annotationSourceConfigFile
	    -outApml $outputApml
	    -outNewIdsApml $outNewIdsApml
	    -outFullCSV $outputCSV
	    -outRankingTable $outRankingTable
	    -outProteinCoverageCSV $outProteinCoverageCSV
	    -fpCriteriaExpression "$fpCriteriaExpression"
	    -filterOutFPAnnotations $filterOutFPAnnotations
	    -fpCriteriaExpressionForIds "$fpCriteriaExpressionForIds"
	    -filterOutFPIds $filterOutFPIds
	    -filterOutUnannotatedAlignments $filterOutUnannotatedAlignments
	    -addRawRankingInfo $addRawRankingInfo
	    -addScaledIntensityInfo $addScaledIntensityInfo
	    -addRawIntensityInfo $addRawIntensityInfo
	    -outReport $htmlReportFile
	    -outReportPicturesPath $htmlReportFile.files_path
	    #if str($containsPepxml.pepxmlInSet) == "Yes"
	        -pepxmlDataType $containsPepxml.pepxmlDataType
	        -pepxmlGeneratedBy $containsPepxml.pepxmlGeneratedBy
	    #end if
	</command>
	
	<inputs>
	 	
   		<!-- Optional APML dataset with the quantification results. It is passed to MsFilt.jar as
   		     the apmlFile option; the outputs whose filter is "apmlFile != None" are only created
   		     when this dataset is supplied. -->
   		<param name="apmlFile" type="data" format="apml" optional="true" 
   		         label="(Optional) Peptide quantification file (APML)" 
   		         help="The APML contents as aligned and annotated feature lists. E.g. produced by 
   		               SEDMAT or Quantiline tools." />
   		
   		<!-- Repeatable list of identification datasets. The annotationSourceConfigFile configfile
   		     iterates over this list and writes one entry per selected dataset. -->
   		<repeat name="annotationSourceFiles"
   		        title="(Optional) Peptide identification files"
   		        help="Full set of MS/MS peptide identification files, including peptides that could not be quantified.">
   			<param name="identificationsFile"
   			       type="data"
   			       format="apml,pepxml,mzidentml,prims.fileset.zip"
   			       label="Identifications file (APML, pepxml, MZIDENTML or MZIDENTML fileSet)"/>
   		</repeat>
   		
   		<!-- PEPXML-specific settings. The two select parameters are shown only when the
   		     pepxmlInSet checkbox is ticked (it then takes its truevalue "Yes").
   		     NOTE(review): both selects default to a placeholder option whose value is empty;
   		     if the placeholder is left selected, the matching pepxml option on the command
   		     line presumably receives an empty argument. Confirm how MsFilt.jar handles that. -->
   		<conditional name="containsPepxml">
   			<param name="pepxmlInSet"
   			       type="boolean"
   			       checked="false"
   			       truevalue="Yes"
   			       falsevalue="No"
   			       label="Identifications set contains one or more files in pepxml format"
   			       help="Indicate whether one or more (Optional) Peptide identification files is in pepxml format. Support for pepxml is still considered 'beta'."/>
   			<when value="Yes">
   				<!-- How the LC-MS runs are laid out inside the pepxml file(s). -->
   				<param name="pepxmlDataType"
   				       type="select"
   				       label=">> Type of data stored in the pepxml"
   				       help="Options marked with (*) are ProteomeDiscoverer specific scenarios">
   					<option value="" selected="true">--Please select--</option>
   					<option value="single_2d">2D LC-MS runs, one per msms_run_summary</option>
   					<option value="multi_2d">(*) 2D LC-MS runs, multiple runs (e.g. rx.F1 to rx.FN) merged as a 'single' msms_run_summary</option>
   					<option value="single_1d">1D LC-MS runs, one per msms_run_summary</option>
   				</param>
   				<!-- Which tool produced the pepxml file(s). -->
   				<param name="pepxmlGeneratedBy"
   				       type="select"
   				       label=">> pepxml generated by"
					help="Some tools, like ProteomeDiscoverer 1.4, have specific issues in their pepxml generation logic. Correctly indicating the tool used here will ensure known issues are taken 
					     into consideration when the file is parsed.">
   					<option value="" selected="true">--Please select--</option>
   					<option value="proteome_discoverer_v1.4">ProteomeDiscoverer 1.4</option>
   					<option value="other">Other</option>
   				</param>
   			</when>
   			<!-- Nothing extra to configure when no pepxml files are present. -->
   			<when value="No"/>
   		</conditional>
   		<!-- End of PEPXML-specific settings. -->
   		
     	<!-- 
     	<param name="maxNrRankings" type="integer" size="10" value="0" label="Maximum nr. of items to leave in the final ranking (set=0 for no limit) " />
     	-->
     	<!--  TODO add info somewhere that deltaRt is 'corrected deltaRt' -->
		<!-- Free-text configuration of the Quality Measures and their weights. The text is written
		     unchanged into the rankingMetadataFile configfile. Each entry of the default value
		     has the form "alias => name,weight"; entries are separated by CR LF character
		     references, so the layout of the value attribute below must not be reformatted.
		     NOTE(review): the qmBCSP entry has no space before "=>", unlike every other entry.
		     Confirm that the parser in MsFilt.jar tolerates this. -->
		<param name="rankingWeightConfig" type="text" area="true" size="13x70" label="Quality Measures (qm's) and ranking weights configuration" 
		help="Here you may specify a weight for each of the Quality Measures (QMs). These are used for the final QM score and possibly for ranking (e.g. in case of label-free data
		processed by SEDMAT). The format is: QM alias => QM name,weight. "
value="qmDRT =&gt; delta rt (standard score),1
&#xd;&#xa;qmDMA =&gt; delta mass annotation (standard score),1
&#xd;&#xa;qmDMP =&gt; delta mass psm (standard score),1
&#xd;&#xa;qmBSCR =&gt; best peptide score (standard score),1
&#xd;&#xa;qmALCV =&gt; alignment coverage (fraction),1
&#xd;&#xa;qmSTCV =&gt; score type coverage (fraction),1
&#xd;&#xa;qmPACV =&gt; peptide's best proteinAnnotCoverage (standard score),1
&#xd;&#xa;qmPICV =&gt; peptide's best proteinIdentifCoverage (standard score),1
&#xd;&#xa;qmANS =&gt; annotation sources (count),1
&#xd;&#xa;qmCSEV =&gt; charge states evidence (count),0.2
&#xd;&#xa;qmBCSP=&gt; best correlation with source or product peptide (correl),1
&#xd;&#xa;qmBCCS =&gt; best correlation with other charge state (correl),1
&#xd;&#xa;qmBCOS =&gt; best correlation with other sibling peptide (correl),1
"/>

		<!-- Free-text configuration of the statistical measures found in the MS/MS results. The
		     text is written unchanged into the statisticalMeasuresConfigFile configfile. Each
		     entry of the default value has the form "alias => name,type,mode" with mode being
		     min or max; entries are separated by CR LF character references, so the layout of
		     the value attribute below must not be reformatted.
		     NOTE(review): smPercoQval is configured with mode "max" while smPercoPEP uses "min".
		     q-values are normally "smaller is better"; confirm that "max" is intended here. -->
		<param name="statisticalMeasuresConfig" type="text" area="true" size="8x70" label="Statistical measures configuration" 
		help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values). 
		The format is: SM alias => SM name,type,mode[min/max]. "
value="smXTD =&gt; MS:1001330,XSLASH!Tandem:expect,min
&#xd;&#xa;pvCSVEX =&gt; p_value,CSV_EXPORT,min
&#xd;&#xa;smAUTO_LIKELIHOOD =&gt; AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max
&#xd;&#xa;smLIKELIHOOD =&gt; LOGLIKELIHOOD,PLGS/Databank-search,max
&#xd;&#xa;smPercoProb =&gt; Percolator: probability,Percolator probability,max
&#xd;&#xa;smPercoPEP =&gt; Percolator: PEP,Percolator PEP,min
&#xd;&#xa;smPercoQval =&gt; Percolator: q-Value,Percolator q-Value,max
"/>

     	<!-- Switch handed to MsFilt.jar as the filterOutUnannotatedAlignments option; on by default. -->
     	<param name="filterOutUnannotatedAlignments"
     	       type="boolean"
     	       checked="true"
     	       label="Filter out unannotated alignments"
     	       help="This helps decrease the output file size (features with no annotation are then not reported anymore)"/>

     	<!-- Switch handed to MsFilt.jar as the filterOutFPAnnotations option; on by default. -->
     	<param name="filterOutFPAnnotations"
     	       type="boolean"
     	       checked="true"
     	       label="Filter out False Positive (FP) annotations"/>

		<!-- Logical expression that classifies an annotation as False Positive. It is passed,
		     double-quoted, to MsFilt.jar as the fpCriteriaExpression option. The expression may
		     use the aliases defined in the quality measure and statistical measure
		     configuration fields. -->
		<param name="fpCriteriaExpression" type="text" size="120" label="False Positive (FP) criteria for annotations" 
		help="Criteria (in standard score measures) for classifying an annotation as False Positive (FP). 
		You can build logical rules using the QM aliases above, the keywords 'and', 'or' and parenthesis. 
		Comparisons can be made with '==,&lt;,&gt;,&lt;=,&gt;='" 
		value="qmDRT &lt;0 or qmDMA &lt;-0.5 or (qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"/>


     	<!-- Switch handed to MsFilt.jar as the filterOutFPIds option; on by default. The
     	     outNewIdsApml output is only created when this is enabled. -->
     	<param name="filterOutFPIds"
     	       type="boolean"
     	       checked="true"
     	       label="Filter out False Positive (FP) peptide identifications"/>

     	<!-- Logical expression that classifies a peptide identification as False Positive. It is
     	     passed, double-quoted, to MsFilt.jar as the fpCriteriaExpressionForIds option. -->
		<param name="fpCriteriaExpressionForIds" type="text" size="120"
		value="(qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"
		label="False Positive (FP) criteria for identifications" 
		help="Criteria (in standard score measures) for classifying a peptide identification as False Positive (FP). 
		Here you can use a subset of the quality measures (qmDMP, qmBSCR, qmSTCV, qmPICV, qmCSEV) and all statistical measures."/>
				
     	
     	<!-- Optional extra columns for the CSV output; all three switches are off by default
     	     and are handed to MsFilt.jar as options of the same name. -->
     	<param name="addRawRankingInfo"
     	       type="boolean"
     	       checked="false"
     	       label="Include the raw scores/values of the ranking attributes in the CSV output"
     	       help="This will result in one extra column per ranking attribute, each column holding the original data for this attribute (before normalization)."/>

     	<param name="addScaledIntensityInfo"
     	       type="boolean"
     	       checked="false"
     	       label="Include computed scaled intensity values in the CSV output"
     	       help="The autoscaled and 'z-score'scaled (aka 'standard-score'scaled) intensity values are then added to the full CSV output file"/>

     	<param name="addRawIntensityInfo"
     	       type="boolean"
     	       checked="false"
     	       label="Include the raw intensity values in the CSV output"
     	       help="The original intensity values (as found in the input file) are then added to the full CSV output file"/>
     	
     	
	</inputs>
	<configfiles>
		<!-- The text of the rankingWeightConfig input, written out unchanged. -->
		<configfile name="rankingMetadataFile">${rankingWeightConfig}</configfile>
		<!-- The text of the statisticalMeasuresConfig input, written out unchanged. -->
		<configfile name="statisticalMeasuresConfigFile">${statisticalMeasuresConfig}</configfile>
		<!-- For every entry of the annotationSourceFiles repeat: the dataset path followed by a
		     type keyword on the next line. The keyword is "pepxml" or "apml" when the dataset's
		     datatype is an instance of the corresponding registered datatype class, and "mzid"
		     for anything else (so the prims.fileset.zip format also ends up as "mzid").
		     The template body emits its indentation as part of the file; do not reformat it.
		     NOTE(review): the type check relies on $__app__ being exposed to the template;
		     confirm this is still available in the targeted Galaxy version. -->
		<configfile name="annotationSourceConfigFile">## start comment
		## iterate over the selected files and store their names in the config file
		#for $i, $s in enumerate( $annotationSourceFiles )
			${s.identificationsFile}
			## also print out the datatype in the next line, based on previously configured datatype
			#if isinstance( $s.identificationsFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('pepxml').__class__):
				pepxml
			#elif isinstance( $s.identificationsFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__):
				apml
			#else:
        		mzid
      		#end if
		#end for
		## end comment</configfile>
	</configfiles>
	<outputs>
		<!-- Filtered quantifications. Not created when the filter expression is false,
		     i.e. when no APML input was supplied. -->
		<data name="outputApml"
		      format="apml"
		      metadata_source="apmlFile"
		      label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: quantifications (filtered APML)">
			<filter>( apmlFile != None )</filter>
		</data>

		<!-- Filtered identifications. Only created when FP identification filtering is on. -->
		<data name="outNewIdsApml"
		      format="apml"
		      label="${tool.name} on ${on_string}: identifications (filtered APML)">
			<filter>( filterOutFPIds == True )</filter>
		</data>

		<!-- Full CSV. Only created when an APML input was supplied. -->
		<data name="outputCSV"
		      format="csv"
		      metadata_source="apmlFile"
		      label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Full CSV">
			<filter>( apmlFile != None )</filter>
		</data>

		<!-- Ranking table. Only created when an APML input was supplied. -->
		<data name="outRankingTable"
		      format="csv"
		      metadata_source="apmlFile"
		      label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Ranking table (CSV)">
			<filter>( apmlFile != None )</filter>
		</data>

		<!-- Protein coverage details. Not created when the filter expression is false,
		     i.e. when no identification files were added. -->
		<data name="outProteinCoverageCSV"
		      format="csv"
		      label="${tool.name} on ${on_string}: Protein coverage details (CSV)">
			<filter>( len(list(enumerate(annotationSourceFiles))) > 0 )</filter>
		</data>

		<!-- HTML report; declared without a filter. -->
		<data name="htmlReportFile"
		      format="html"
		      label="${tool.name} on ${on_string} - HTML report"/>
	</outputs>
	<tests>
		<!-- No functional tests are defined for this tool. -->
	</tests>
  <help>
  
.. class:: infomark
  
This tool takes in peptide quantification results (e.g. either by SEDMAT for label-free data or by Quantiline for labeled data)
and calculates a number of quality measures that can help in assessing the correctness of the quantification assignment and of the MS/MS peptide 
identification itself. The user can use any combination of quality measures (qm's) and statistical measures (sm's) to filter out
low scoring entries.   

.. class:: infomark

In the label-free data processed by SEDMAT it is possible that a feature quantification gets assigned to different peptides. This means
we have an ambiguous assignment. In such a case
this tool also does a ranking of the different assignments according to their quality measures so that the best scoring assignment
gets ranked as first.  

-----

**List of abbreviations**

QM: Quality Measure

SM: Statistical Measure (e.g. p-value, e-value from MS/MS identification) 

PSM:  "Peptide to Spectrum Match" (aka peptide identification)

FP: False Positive

-----

**Filtering options details**

The FP criteria will be applied to an annotation even if the corresponding quality measures involved 
in the expression can NOT ALL be determined. QMs that cannot be determined get the value 0 (zero), which is 
equivalent to giving them the average value. 

The output report shows some plots that visualize the filtering done. This can help in fine-tuning the right filtering
criteria.


*Filter criteria examples for filtering identifications*

================================== ===========================================================================
Data type                          Example filter criteria
================================== ===========================================================================
QExactive/ProteomeDiscoverer data  * (qmBSCR&lt;-1.0) or (!isNaN(smPercoProb) and 18-20*smPercoProb&gt;qmBSCR)
X!Tandem identifications data      * (qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD&gt;0.01)
================================== ===========================================================================


-----

**Output details**

*APML output*

This tool returns the given APML alignment file further annotated at the alignment level with the best ranking 
peptides of each respective alignment. This APML can be used in subsequent Galaxy tools like the proteomics tools
from NBIC.  

The APML output can also be used for the Protein Inference step (see Quantifere tool).

*CSV output*

It also returns a CSV format output with the full quality measures and scoring and ranking details. The user could use
this to manually determine new weights for some of the quality measures by techniques such as 
linear regression. In other words, this CSV can then be used to fine-tune the weights in a next run. 

Many of the quality measures (QMs) are normalized to their Standard Score (aka z-score). 
`See Standard Score for more details...`__ 

Next to giving insight into how the ranking was established, a more complete version of this CSV file is also
generated for tools that cannot or won't process the APML output format.  

Below a brief overview of the CSV and an illustration of the ranking done in case of ambiguous peptides to feature assignments
(explained above, can happen in case of label-free data processing by SEDMAT).


.. image:: $PATH_TO_IMAGES/msfilt_csv_out.png 



.. __: javascript:window.open('http://en.wikipedia.org/wiki/Standard_score','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')




  </help>
</tool>