view quantifere.xml @ 18:ad911e9aaf33

small fix in msfilt report output
author pieter.lukasse@wur.nl
date Fri, 01 Aug 2014 17:22:37 +0200
parents 40ec8770780d
children d31c6978d9d0
line wrap: on
line source

<tool name="Quantifere" id="quantifere1" version="1.0.3">
	<description>Protein Inference by Peptide Quantification patterns</description>
	<!-- 
	   For remote debugging, start your listener on port 8000 and use the following as command interpreter:
	       java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 
	                    //////////////////////////
	    -->
	<!--
	   Command line handed to Quantifere.jar. The *FilesList and statisticalMeasuresConfigFile
	   variables refer to the files generated in the configfiles section; outputCSV,
	   outputInferenceLogCSV, outputSummaryAnnotationCSV and htmlReportFile are the datasets
	   declared in the outputs section. The fraction naming codes are appended only when the
	   "Data is from 2D LC-MS" option is enabled.
	-->
	<command interpreter="java -jar ">
	    Quantifere.jar
	    -annotatedQuantificationFilesList $annotatedQuantificationFilesList
	    -identificationFilesList $identificationFilesList
	    -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
	    -quantificationDataToUse $quantificationDataToUse
	    -minCorrel $minCorrel
	    -minProtCoverage $minProtCoverage
	    -minAboveAverageHits $minAboveAverageHits
	    -minNrIdsForInferencePeptide $minNrIdsForInferencePeptide
	    -refineModel $refineModel
	    -functionalAnnotationCSV $functionalAnnotationCSV
	    -outputCSV $outputCSV
	    -outputInferenceLogCSV $outputInferenceLogCSV
	    -outputSummaryAnnotationCSV $outputSummaryAnnotationCSV
	    -outReport $htmlReportFile
	    -outReportPicturesPath $htmlReportFile.files_path
	    ## Test the boolean for truthiness instead of comparing it with True: the template may see a
	    ## parameter wrapper (rendering as "Yes"/"No") rather than a plain Python bool.
	    #if $is2D_LC_MS.fractions
	        -namingConventionCodesForFractions $is2D_LC_MS.namingConventionCodesForFractions
	    #end if
	</command>
	
	<inputs>
	 	
   		<!-- Repeatable list of quantification inputs; each entry holds one APML dataset. -->
   		<repeat title="Peptide (filtered) quantification files (APML)"
   		        name="annotatedQuantificationFiles"
   		        help="The APML contents as aligned, annotated and scored feature lists, 
   		as produced by MsFilt tool. Select one or more files. For 2D-LC-MS we expect one file per fraction.">
   			<param type="data"
   			       format="apml"
   			       name="annotatedQuantificationFile"
   			       label="File (APML format)"
   			       size="50"/>
   		</repeat>
   		
   		<!-- Repeatable list of identification inputs; each entry holds one APML or mzid dataset. -->
   		<repeat title="Peptide (filtered) identification files (MS/MS identifications)"
   		        name="identificationFiles"
   		        help="Full set of MS/MS peptide identification files, including peptides that could not be quantified.
   		This set of identifications is ideally filtered on some quality and 
   		statistical measures (e.g. as is done by MsFilt). Tip: to base the inference only on the 
   		selected peptide quantification files, you
   		can select the same quantification files here as well. Select one or more files.">
   			<param type="data"
   			       format="apml,mzid"
   			       name="identificationFile"
   			       label="File (APML or MZIDENTML format)"
   			       size="50"/>
   		</repeat>
   		
   		<!-- Optional 2D LC-MS handling: the "Yes" case adds a text field for the fraction codes. -->
   		<conditional name="is2D_LC_MS">
     		<param type="boolean"
     		       name="fractions"
     		       checked="false"
     		       truevalue="Yes"
     		       falsevalue="No"
     		       label="Data is from 2D LC-MS"
     		       help="Data acquisition was done in multiple fractions."/>
     		<when value="Yes">
     			<!-- could do regular expressions as well but this would be hard for biologists, e.g. _F\d\b -->
     			<param type="text"
     			       name="namingConventionCodesForFractions"
     			       value=""
     			       size="100"
     			       label="Part of run/file name that identifies the 2D LC-MS fraction"
     			       help="Add the CSV list of codes that occur in the file names 
     				and that stand for a fraction code. E.g. '_F1,_F2,_F3,etc.' In this
     				way different peptide identifications from the same sample but measured 
     				in different fractions can be merged together. Otherwise each (fraction) file
     				is seen as a separate sample."/>
     		</when>
     		<when value="No"/>
     	</conditional>
   		
   		<!-- Free-text area pre-filled with one "alias => name,type,mode" entry per statistical measure;
   		     the entries in the default value are separated by CR/LF character references. -->
   		<param name="statisticalMeasuresConfig" type="text" area="true" size="8x70" label="Statistical measures configuration" 
				help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values). 
				The format is: SM alias => SM name,type,mode[min/max]. Leaving this configuration out while these are present in the
				dataset will have the effect that they will be wrongly used as a regular scoring scheme, having effect on for example
				the filter criteria below like 'Minimum number of peptide matches with a score above average' ."
value="smXTD =&gt; MS:1001330,XSLASH!Tandem:expect,min
&#xd;&#xa;pvCSVEX =&gt; p_value,CSV_EXPORT,min
&#xd;&#xa;smAUTO_LIKELIHOOD =&gt; AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max
&#xd;&#xa;smLIKELIHOOD =&gt; LOGLIKELIHOOD,PLGS/Databank-search,max
&#xd;&#xa;smPercoProb =&gt; Percolator: probability,Percolator probability,max
&#xd;&#xa;smPercoPEP =&gt; Percolator: PEP,Percolator PEP,min
&#xd;&#xa;smPercoQval =&gt; Percolator: q-Value,Percolator q-Value,max
"/>
<!-- Keep the value attribute above flush left like this to avoid extra white space inside the value.
     NOTE(review): the q-Value entry uses mode "max" while the PEP entry uses "min"; if the mode
     states whether lower or higher values are better, "min" may have been intended. Confirm. -->
   		<!-- Chooses the quantification data type; "auto" is the preselected option. -->
   		<param type="select"
   		       name="quantificationDataToUse"
   		       label="Quantification data to use"
   		       help="Quantification data to use for the pattern clustering and inference steps. NB: check if the chosen data is also 
   		      present in your file, or choose 'auto' to let Quantifere check which quantification type is present in most peptides.">
   			<option selected="true" value="auto">auto</option>
   			<option value="getIntensity">(TODO)raw intensities</option>
   			<option value="getApexIntensity">(TODO)apex intensities</option>
   			<option value="getNormalizedIntensity">(TODO)normalized intensities</option>
   		</param>
   		<!-- TODO: make the default of minCorrel depend on the quantification type selected above -->
		<param type="float"
		       name="minCorrel"
		       value="0.85"
		       size="10"
		       label="Minimum correlation in a cluster"
		       help="Features will be grouped by their protein annotation and 
		sample intensity values correlation. Set here the minimum correlation expected between grouped members. This is used to guide the clustering algorithm."/>

		<!-- Simple extra heuristics that remove some "noise" protein hits. -->
		<param type="float"
		       name="minProtCoverage"
		       value="5.0"
		       size="10"
		       label="Minimum protein coverage (%)"
		       help="This will remove proteins that have a too small 
		portion of their sequence covered by peptide matches."/>

		<param type="integer"
		       name="minAboveAverageHits"
		       value="1"
		       size="10"
		       label="Minimum number of different peptide matches with a score above average"
		       help="This will remove proteins that do not have enough reasonable peptides hits."/>

		<param type="integer"
		       name="minNrIdsForInferencePeptide"
		       value="1"
		       size="10"
		       label="Minimum number of peptide identifications for inference peptides"
		       help="Minimum number of peptide identifications a peptide needs to be used as inference peptide for secondary proteins."/>


     	<!-- Optional annotation mapping dataset. -->
     	<param type="data"
     	       format="csv,txt,tsv"
     	       optional="true"
     	       name="functionalAnnotationCSV"
     	       label="(Functional)annotation mapping file (csv or tsv format)"
     	       help="Optional file that maps protein accessions to a network, pathway or other higher level annotations. In this file a header line is expected with these 2 columns (names and lower case is important): accession,annotation"/>

     	<!-- Model refinement switch, ticked by default. -->
     	<param type="boolean"
     	       checked="true"
     	       name="refineModel"
     	       label="Refine matches model"
     	       help="This will let the algorithm search for a reduced set of secondary protein matches that still explains the variation in the peptide quantification patterns"/>

     	<!-- Ticked by default; the HTML report output is filtered on this flag. -->
     	<param type="boolean"
     	       checked="true"
     	       name="summaryReport"
     	       label="Generate summary report"/>
     	
	</inputs>
	<configfiles>
		<!-- Generated file that lists the dataset of every entry in the annotatedQuantificationFiles repeat,
		     one entry per loop iteration. -->
		<configfile name="annotatedQuantificationFilesList">## start comment
		## iterate over the selected files and store their names in the config file
		#for $i, $s in enumerate( $annotatedQuantificationFiles )
			${s.annotatedQuantificationFile}
		#end for
		## end comment</configfile>
		
		<!-- Generated file that lists, for every entry in the identificationFiles repeat, the dataset followed
		     by a line holding "apml" or "mzid". "apml" is written when the dataset's datatype is an instance of
		     the class registered for the 'apml' extension; anything else is reported as "mzid".
		     NOTE(review): the check relies on $__app__ being exposed to the template; verify that this is
		     available on the targeted Galaxy version. -->
		<configfile name="identificationFilesList">## start comment
		## iterate over the selected files and store their names in the config file
		#for $i, $s in enumerate( $identificationFiles )
			${s.identificationFile}
			## also print out the datatype in the next line, based on previously configured datatype
			#if isinstance( $s.identificationFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__):
				apml
			#else:
        		mzid
      		#end if
		#end for
		## end comment</configfile>
		<!-- Generated file that holds the text of the statisticalMeasuresConfig parameter. -->
		<configfile name="statisticalMeasuresConfigFile">## start comment
			${statisticalMeasuresConfig}
		</configfile>
	</configfiles>
	<outputs>
		<!-- Declared without a filter: the protein list and the inference log. -->
		<data format="csv"
		      name="outputCSV"
		      label="${tool.name} on ${on_string}: Proteins list (CSV)"/>
		<data format="csv"
		      name="outputInferenceLogCSV"
		      label="${tool.name} on ${on_string}: Inference log (CSV)"/>

		<!-- HTML report: the file is not created when the filter expression is false. -->
		<data format="html"
		      name="htmlReportFile"
		      label="${tool.name} on ${on_string} - HTML report">
			<filter>( summaryReport == True )</filter>
		</data>

		<!-- Annotation summary: the file is not created when the filter expression is false. -->
		<data format="csv"
		      name="outputSummaryAnnotationCSV"
		      label="${tool.name} on ${on_string} - Functional annotation summary (CSV)">
			<filter>( functionalAnnotationCSV != None )</filter>
		</data>
	</outputs>
	<tests>
		<!-- No functional tests are defined for this tool yet. -->
	</tests>
  <help>
  
.. class:: infomark
  
This tool takes Peptide Quantification patterns and uses them to perform Protein Inference of both Primary Protein 
identifications and Secondary Protein identifications. This last class of protein identifications 
cannot be obtained by traditional protein inference methods, which look only at peptide identifications and 
their quality parameters. 


-----

**List of definitions**

Primary Protein identification: protein identification belonging to the minimum set of proteins needed
to account for the observed peptides.  

Secondary Protein identification: extra protein identifications that do not belong to the minimum set
of proteins mentioned above. 

raw intensities: the intensity value resulting from the integration of the feature peak area

apex intensities: the intensity value at the highest point of the feature peak

normalized intensities: the intensity value normalized by some means

-----

**Minimum correlation in a cluster**

TODO - add doc.

-----

**Output details**

*Proteins list (CSV)*

This is the list of primary and secondary proteins and their calculated inference score. Proteins 
with exactly the same peptide hits are also grouped together and labeled as primary_group and secondary_group
instead of simply primary and secondary.


*Inference log (CSV)*

This CSV table shows all data, both inferred and ruled out proteins. This can be used by the user to 
troubleshoot the inference process and understand why certain proteins might have been ruled out. 
The CSV is provided in such a format that the data can easily be explored in a Cytoscape network. 

The figure below shows an example of the data being explored in Cytoscape using also the 
`Cytoscape chartplugin`_ to visualize the quantification data when selecting the peptide nodes. 

.. image:: $PATH_TO_IMAGES/quantifere_cyto_out.png 

.

.. _Cytoscape chartplugin: http://apps.cytoscape.org/apps/chartplugin



  </help>
</tool>