view msfilt.xml @ 7:6a95bdfbe36d

fixes in thread-safety of filtering step
author pieter.lukasse@wur.nl
date Tue, 04 Feb 2014 16:32:21 +0100
parents d50f079096ee
children 72d4a37869ee
line wrap: on
line source

<tool name="MsFilt" id="msfilt" version="1.0.2">
	<!-- One-line summary of the tool's purpose. -->
	<description>Filters annotations based on MS/MS peptide identification and annotation quality measures</description>
	<!-- 
	   For remote debugging, start your listener on port 8000 and use the following as the command interpreter:
	       java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 
	                    //////////////////////////
	    -->
	<!--
	    Command line: runs MsFilt.jar through the "java -jar " interpreter.
	    Each $name below refers to an input parameter, a configfile or an
	    output dataset declared in this tool definition.
	    $apmlFile.metadata.base_name is quoted, like the two FP criteria
	    expressions, so that a base name containing spaces stays a single
	    command-line argument.
	-->
	<command interpreter="java -jar ">
	    MsFilt.jar
	    -apmlFile $apmlFile
	    -datasetCode "$apmlFile.metadata.base_name"
	    -rankingMetadataFile $rankingMetadataFile
	    -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
	    -annotationSourceConfigFile $annotationSourceConfigFile
	    -outApml $outputApml
	    -outNewIdsApml $outNewIdsApml
	    -outFullCSV $outputCSV
	    -outRankingTable $outRankingTable
	    -outProteinCoverageCSV $outProteinCoverageCSV
	    -fpCriteriaExpression "$fpCriteriaExpression"
	    -filterOutFPAnnotations $filterOutFPAnnotations
	    -fpCriteriaExpressionForIds "$fpCriteriaExpressionForIds"
	    -filterOutFPIds $filterOutFPIds
	    -filterOutUnannotatedAlignments $filterOutUnannotatedAlignments
	    -addRawRankingInfo $addRawRankingInfo
	    -addScaledIntensityInfo $addScaledIntensityInfo
	    -addRawIntensityInfo $addRawIntensityInfo
	    -outReport $htmlReportFile
	    -outReportPicturesPath $htmlReportFile.files_path
	</command>
	
	<inputs>
	 	
   		<!--
   		    Optional APML input dataset. It is passed to MsFilt.jar as -apmlFile,
   		    and its base_name metadata is used for -datasetCode and in the labels
   		    of the APML-dependent outputs.
   		-->
   		<param name="apmlFile" type="data" format="apml" optional="true" 
   		         label="(Optional) Peptide quantification file (APML)" 
   		         help="The APML contents as aligned and annotated feature lists. E.g. produced by 
   		               SEDMAT or Quantiline tools." />
   		
   		<!--
   		    Repeatable group of identification inputs. Each entry supplies an
   		    identifications file and an optional spectra file; the entries are
   		    written out by the annotationSourceConfigFile configfile.
   		    NOTE(review): spectraFile accepts format "mzidentml,prims.fileset.zip"
   		    while its label mentions an mzml file; confirm whether "mzml" was
   		    intended.
   		-->
   		<repeat
   		    name="annotationSourceFiles"
   		    title="(Optional) Peptide identification files"
   		    help="Full set of MS/MS peptide identification files, including peptides that could not be quantified.">
   			<param
   			    name="identificationsFile"
   			    type="data"
   			    format="apml,mzidentml,prims.fileset.zip"
   			    label="Identifications file (APML or MZIDENTML or MZIDENTML fileSet)"/>
   			<param
   			    name="spectraFile"
   			    type="data"
   			    format="mzidentml,prims.fileset.zip"
   			    optional="true"
   			    label="(Optional) Spectra fileSet (mzml file or fileSet)"
   			    help="Select this in case your Identifications file is MZIDENTML or MZIDENTML fileSet"/>
   		</repeat>
   		
     	<!-- 
     	<param name="maxNrRankings" type="integer" size="10" value="0" label="Maximum nr. of items to leave in the final ranking (set=0 for no limit) " />
     	-->
     	<!--  TODO add info somewhere that deltaRt is 'corrected deltaRt' -->
		<!--
		    Text area holding one "QM alias => QM name,weight" entry per line.
		    Entries in the default value are separated by CR LF character
		    references. The content is written to the rankingMetadataFile
		    configfile, which is handed to MsFilt.jar as -rankingMetadataFile.
		    The qmBCSP entry uses " => " like all the others, matching the
		    format described in the help text.
		-->
		<param name="rankingWeightConfig" type="text" area="true" size="13x70" label="Quality Measures (qm's) and ranking weights configuration" 
		help="Here you may specify a weight for each of the Quality Measures (QMs). These are used for the final QM score and possibly for ranking (e.g. in case of label-free data
		processed by SEDMAT). The format is: QM alias => QM name,weight. "
value="qmDRT =&gt; delta rt (standard score),1
&#xd;&#xa;qmDMA =&gt; delta mass annotation (standard score),1
&#xd;&#xa;qmDMP =&gt; delta mass psm (standard score),1
&#xd;&#xa;qmBSCR =&gt; best peptide score (standard score),1
&#xd;&#xa;qmALCV =&gt; alignment coverage (fraction),1
&#xd;&#xa;qmSTCV =&gt; score type coverage (fraction),1
&#xd;&#xa;qmPACV =&gt; peptide's best proteinAnnotCoverage (standard score),1
&#xd;&#xa;qmPICV =&gt; peptide's best proteinIdentifCoverage (standard score),1
&#xd;&#xa;qmANS =&gt; annotation sources (count),1
&#xd;&#xa;qmCSEV =&gt; charge states evidence (count),0.2
&#xd;&#xa;qmBCSP =&gt; best correlation with source or product peptide (correl),1
&#xd;&#xa;qmBCCS =&gt; best correlation with other charge state (correl),1
&#xd;&#xa;qmBCOS =&gt; best correlation with other sibling peptide (correl),1
"/>

		<!--
		    Text area holding one "SM alias => SM name,type,mode" entry per line.
		    Entries in the default value are separated by CR LF character
		    references. The content is written to the
		    statisticalMeasuresConfigFile configfile, which is handed to
		    MsFilt.jar as -statisticalMeasuresConfigFile.
		-->
		<param name="statisticalMeasuresConfig" type="text" area="true" size="6x70" label="Statistical measures configuration" 
		help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values). 
		The format is: SM alias => SM name,type,mode[min/max]. "
value="smXTD =&gt; MS:1001330,XSLASH!Tandem:expect,min
&#xd;&#xa;pvCSVEX =&gt; p_value,CSV_EXPORT,min
&#xd;&#xa;smAUTO_LIKELIHOOD =&gt; AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max
&#xd;&#xa;smLIKELIHOOD =&gt; LOGLIKELIHOOD,PLGS/Databank-search,max
"/>

     	<!-- Boolean switch, checked by default; passed to MsFilt.jar as -filterOutUnannotatedAlignments. -->
     	<param
     	    name="filterOutUnannotatedAlignments"
     	    type="boolean"
     	    checked="true"
     	    label="Filter out unannotated alignments"
     	    help="This helps decrease the output file size (features with no annotation are then not reported anymore)"/>

		<!-- Boolean switch, checked by default; passed to MsFilt.jar as -filterOutFPAnnotations. -->
		<param
		    name="filterOutFPAnnotations"
		    type="boolean"
		    checked="true"
		    label="Filter out False Positive (FP) annotations"/>

		<!--
		    Logical expression over the QM and SM aliases that classifies an
		    annotation as False Positive. It is passed, quoted, to MsFilt.jar as
		    -fpCriteriaExpression. The help text lists the comparison operators
		    as a comma-separated list.
		-->
		<param name="fpCriteriaExpression" type="text" size="120" label="False Positive (FP) criteria for annotations" 
		help="Criteria (in standard score measures) for classifying an annotation as False Positive (FP). 
		You can build logical rules using the QM aliases above, the keywords 'and', 'or' and parentheses. 
		Comparisons can be made with '==,&lt;,&gt;,&lt;=,&gt;='" 
		value="qmDRT &lt;0 or qmDMA &lt;-0.5 or (qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"/>


     	<!--
     	    Boolean switch, checked by default; passed to MsFilt.jar as
     	    -filterOutFPIds. The outNewIdsApml output is only created when this
     	    is true.
     	-->
     	<param
     	    name="filterOutFPIds"
     	    type="boolean"
     	    checked="true"
     	    label="Filter out False Positive (FP) peptide identifications"/>
		
		<!--
		    Logical expression that classifies a peptide identification as False
		    Positive. It is passed, quoted, to MsFilt.jar as
		    -fpCriteriaExpressionForIds.
		-->
		<param name="fpCriteriaExpressionForIds" type="text" size="120" 
		label="False Positive (FP) criteria for identifications" 
		help="Criteria (in standard score measures) for classifying a peptide identification as False Positive (FP). 
		Here you can use a subset of the quality measures (qmDMP, qmBSCR, qmSTCV, qmPICV, qmCSEV) and all statistical measures."
		value="(qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"/>
				
     	
     	<!-- Boolean switch, unchecked by default; passed to MsFilt.jar as -addRawRankingInfo. -->
     	<param
     	    name="addRawRankingInfo"
     	    type="boolean"
     	    checked="false"
     	    label="Include the raw scores/values of the ranking attributes in the CSV output"
     	    help="This will result in one extra column per ranking attribute, each column holding the original data for this attribute (before normalization)."/>
     	
     	<!-- Boolean switch, unchecked by default; passed to MsFilt.jar as -addScaledIntensityInfo. -->
     	<param
     	    name="addScaledIntensityInfo"
     	    type="boolean"
     	    checked="false"
     	    label="Include computed scaled intensity values in the CSV output"
     	    help="The autoscaled and 'z-score'scaled (aka 'standard-score'scaled) intensity values are then added to the full CSV output file"/>
     	
     	<!-- Boolean switch, unchecked by default; passed to MsFilt.jar as -addRawIntensityInfo. -->
     	<param
     	    name="addRawIntensityInfo"
     	    type="boolean"
     	    checked="false"
     	    label="Include the raw intensity values in the CSV output"
     	    help="The original intensity values (as found in the input file) are then added to the full CSV output file"/>
     	
     	
	</inputs>
	<!--
	    Generated configuration files. Each configfile name is referenced as a
	    $variable on the command line (-rankingMetadataFile,
	    -statisticalMeasuresConfigFile and -annotationSourceConfigFile).
	-->
	<configfiles>
		<!-- Contents of the rankingWeightConfig text area, written out unchanged. -->
		<configfile name="rankingMetadataFile">${rankingWeightConfig}</configfile>
		<!-- Contents of the statisticalMeasuresConfig text area, written out unchanged. -->
		<configfile name="statisticalMeasuresConfigFile">${statisticalMeasuresConfig}</configfile>
		<!--
		    For every entry of the annotationSourceFiles repeat, this template
		    emits "identificationsFile|spectraFile" followed by a type line:
		    "apml" when the identifications dataset's datatype is an instance
		    of the class registered for the 'apml' extension, otherwise "mzid".
		    The loop index $i is not used in the template body.
		-->
		<configfile name="annotationSourceConfigFile">## start comment
		## iterate over the selected files and store their names in the config file
		#for $i, $s in enumerate( $annotationSourceFiles )
			${s.identificationsFile}|${s.spectraFile}
			## also print out the datatype in the next line, based on previously configured datatype
			#if isinstance( $s.identificationsFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__):
				apml
			#else:
        		mzid
      		#end if
		#end for
		## end comment</configfile>
	</configfiles>
	<!--
	    Output datasets. A data element that carries a filter is only created
	    when its filter expression evaluates to true.
	-->
	<outputs>
		<!-- Filtered quantifications (APML); only when an input APML file was given. -->
		<data
		    name="outputApml"
		    format="apml"
		    label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: quantifications (filtered APML)"
		    metadata_source="apmlFile">
			<filter>( apmlFile != None )</filter>
		</data>
		<!-- Filtered identifications (APML); only when FP identifications are filtered out. -->
		<data
		    name="outNewIdsApml"
		    format="apml"
		    label="${tool.name} on ${on_string}: identifications (filtered APML)">
			<filter>( filterOutFPIds == True )</filter>
		</data>
		<!-- Full CSV; only when an input APML file was given. -->
		<data
		    name="outputCSV"
		    format="csv"
		    label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Full CSV"
		    metadata_source="apmlFile">
			<filter>( apmlFile != None )</filter>
		</data>
		<!-- Ranking table (CSV); only when an input APML file was given. -->
		<data
		    name="outRankingTable"
		    format="csv"
		    label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Ranking table (CSV)"
		    metadata_source="apmlFile">
			<filter>( apmlFile != None )</filter>
		</data>
		<!-- Protein coverage details (CSV); only when at least one annotation source entry exists. -->
		<data
		    name="outProteinCoverageCSV"
		    format="csv"
		    label="${tool.name} on ${on_string}: Protein coverage details (CSV)">
			<filter>( len(list(enumerate(annotationSourceFiles))) > 0 )</filter>
		</data>
		<!-- HTML report; always created (no filter). -->
		<data
		    name="htmlReportFile"
		    format="html"
		    label="${tool.name} on ${on_string} - HTML report"/>
	</outputs>
	<tests>
		<!-- No functional tests are defined for this tool. -->
	</tests>
  <help>
  
.. class:: infomark
  
This tool takes in peptide quantification results (e.g. either by SEDMAT for label-free data or by Quantiline for labeled data)
and calculates a number of quality measures that can help in assessing the correctness of the quantification assignment and of the MS/MS peptide 
identification itself. The user can use any combination of quality measures (qm's) and statistical measures (sm's) to filter out
low scoring entries.   

.. class:: infomark

In the label-free data processed by SEDMAT it is possible that a feature quantification gets assigned to different peptides. This means
we have an ambiguous assignment. In such a case
this tool also does a ranking of the different assignments according to their quality measures so that the best scoring assignment
gets ranked as first.  

-----

**List of abbreviations**

QM: Quality Measure

SM: Statistical Measure (e.g. p-value, e-value from MS/MS identification) 

PSM:  "Peptide to Spectrum Match" (aka peptide identification)

FP: False Positive

-----

**Filtering options details**

The FP criteria will be applied to an annotation even if the corresponding quality measures involved 
in the expression can NOT ALL be determined. QMs that cannot be determined, get the value 0 (zero) which is 
equal to giving it the average value. 

The output report shows some plots that visualize the filtering done. This can help in fine-tuning the right filtering
criteria.

-----

**Output details**

*APML output*

This tool returns the given APML alignment file, further annotated at the alignment level with the best ranking 
peptides of each respective alignment. This APML can be used in subsequent Galaxy tools like the proteomics tools
from NBIC.  

The APML output can also be used for the Protein Inference step (see Quantifere tool).

*CSV output*

It also returns a CSV format output with the full quality measures and scoring and ranking details. The user could use
this to manually determine new weights for some of the quality measures by techniques such as 
linear regression. In other words, this CSV can then be used to fine-tune the weights in a next run. 

Many of the quality measures (QMs) are normalized to their Standard Score (aka z-score). 
`See Standard Score for more details...`__ 

Next to giving insight into how the ranking was established, a more complete version of this CSV file is also
generated for tools that cannot or won't process the APML output format.  

Below is a brief overview of the CSV and an illustration of the ranking done in case of ambiguous peptide-to-feature assignments
(explained above, can happen in case of label-free data processing by SEDMAT).


.. image:: $PATH_TO_IMAGES/msfilt_csv_out.png 



.. __: javascript:window.open('http://en.wikipedia.org/wiki/Standard_score','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')




  </help>
</tool>