Mercurial > repos > pieterlukasse > prims_proteomics

diff quantifere.xml @ 0:d50f079096ee
Push to main toolshed
author: pieter.lukasse@wur.nl
date: Wed, 08 Jan 2014 11:39:16 +0100
children: 73c7c6589202
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/quantifere.xml	Wed Jan 08 11:39:16 2014 +0100
@@ -0,0 +1,206 @@
+<tool name="Quantifere" id="quantifere1" version="1.0.2">
+	<description>Protein Inference by Peptide Quantification patterns</description>
+	<!-- 
+	   For remote debugging start your listener on port 8000 and use the following as command interpreter:
+	       java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 
+	                    //////////////////////////
+	   Note: the 'fractions' boolean is tested via str() against its truevalue "Yes" in the command template below. -->
+	<command interpreter="java -jar ">
+	    Quantifere.jar 
+	    -annotatedQuantificationFilesList $annotatedQuantificationFilesList
+	    -identificationFilesList $identificationFilesList
+    	-statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
+	    -quantificationDataToUse $quantificationDataToUse
+	    -minCorrel $minCorrel
+	    -minProtCoverage $minProtCoverage
+	    -minAboveAverageHits $minAboveAverageHits
+	    -minNrIdsForInferencePeptide $minNrIdsForInferencePeptide
+	    -refineModel $refineModel
+	    -functionalAnnotationCSV $functionalAnnotationCSV
+	    -outputCSV $outputCSV
+	    -outputInferenceLogCSV $outputInferenceLogCSV
+	    -outputSummaryAnnotationCSV $outputSummaryAnnotationCSV
+	    -outReport $htmlReportFile
+	    -outReportPicturesPath $htmlReportFile.files_path
+	    #if str($is2D_LC_MS.fractions) == "Yes"
+        	-namingConventionCodesForFractions $is2D_LC_MS.namingConventionCodesForFractions
+        #end if
+	</command>
+	
+	<inputs>
+	 	
+   		<repeat name="annotatedQuantificationFiles" title="Peptide (filtered) quantification files (APML)" 
+   		help="The APML contents as aligned, annotated and scored feature lists, 
+   		as produced by MsFilt tool. Select one or more files. For 2D-LC-MS we expect one file per fraction.">
+   			<param name="annotatedQuantificationFile" size="50" type="data" format="apml" label="File (APML format)" />
+   		</repeat>
+   		
+   		<repeat name="identificationFiles" title="Peptide (filtered) identification files (MS/MS identifications)" 
+   		help="Full set of MS/MS peptide identification files, including peptides that could not be quantified.
+   		This set of identifications is ideally filtered on some quality and 
+   		statistical measures (e.g. as is done by MsFilt). Tip: to base the inference only on the 
+   		selected peptide quantification files, you
+   		can select the same quantification files here as well. Select one or more files.">
+   			<param name="identificationFile" size="50" type="data" format="apml,mzid" label="File (APML or MZIDENTML format)" />
+   		</repeat>
+   		
+   		<conditional name="is2D_LC_MS">
+     		<param name="fractions" type="boolean" truevalue="Yes" falsevalue="No" checked="false" label="Data is from 2D LC-MS"
+     		help="Data acquisition was done in multiple fractions."/>
+     		<!-- Unchecked (default) case: no extra inputs. Declared explicitly so that both values of "fractions" have a matching when-block. -->
+     		<when value="No"/>
+     		<when value="Yes"> 
+     			<param name="namingConventionCodesForFractions" type="text" size="100" value="" label="Part of run/file name that identifies the 2D LC-MS fraction" 
+     			help="Add the CSV list of codes that occur in the file names 
+     				and that stand for a fraction code. E.g. '_F1,_F2,_F3,etc.' In this
+     				way different peptide identifications from the same sample but measured 
+     				in different fractions can be merged together. Otherwise each (fraction) file
+     				is seen as a separate sample."/> <!-- could do regular expressions as well but this would be hard for biologists, e.g. _F\d\b -->
+     		</when>
+     	</conditional>
+   		
+   		<param name="statisticalMeasuresConfig" type="text" area="true" size="6x70" label="Statistical measures configuration" 
+				help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values). 
+				The format is: SM alias => SM name,type,mode[min/max]. Leaving this configuration out while these are present in the
+				dataset will have the effect that they will be wrongly used as a regular scoring scheme, having effect on for example
+				the filter criteria below like 'Minimum number of peptide matches with a score above average' ."
+value="smXTD =&gt; MS:1001330,XSLASH!Tandem:expect,min
+&#xd;&#xa;pvCSVEX =&gt; p_value,CSV_EXPORT,min
+&#xd;&#xa;smAUTO_LIKELIHOOD =&gt; AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max
+&#xd;&#xa;smLIKELIHOOD =&gt; LOGLIKELIHOOD,PLGS/Databank-search,max
+"/>
+<!-- keep value attribute above aligned like this to avoid white spaces in the value -->				
+   		<param name="quantificationDataToUse" type="select" 
+   		label="Quantification data to use" 
+   		help="Quantification data to use for the pattern clustering and inference steps. NB: check if the chosen data is also 
+   		      present in your file, or choose 'auto' to let Quantifere check which quantification type is present in most peptides.">
+	    	<option value="auto" selected="true">auto</option>
+	    	<option value="getIntensity">(TODO)raw intensities</option>
+	    	<option value="getApexIntensity">(TODO)apex intensities</option>
+	    	<option value="getNormalizedIntensity">(TODO)normalized intensities</option>
+		</param>
+   		<!-- TODO let minCorrel default value vary according to quantification type chosen above -->
+		<param name="minCorrel" type="float" size="10" value="0.85" label="Minimum correlation in a cluster" help="Features will be grouped by their protein annotation and 
+		sample intensity values correlation. Set here the minimum correlation expected between grouped members. This is used to guide the clustering algorithm."/>
+
+		<!--  simple extra heuristics to remove some "noise" protein hits  -->
+		<param name="minProtCoverage" type="float" size="10" value="5.0" label="Minimum protein coverage (%)" help="This will remove proteins that have a too small 
+		portion of their sequence covered by peptide matches."/>
+		
+		<param name="minAboveAverageHits" type="integer" size="10" value="1" label="Minimum number of different peptide matches with a score above average" 
+		help="This will remove proteins that do not have enough reasonable peptides hits."/>
+
+		<param name="minNrIdsForInferencePeptide" type="integer" size="10" value="1" label="Minimum number of peptide identifications for inference peptides" 
+		help="Minimum number of peptide identifications a peptide needs to be used as inference peptide for secondary proteins."/>
+
+
+     	<param name="functionalAnnotationCSV" type="data" format="csv,txt,tsv" optional="true" 
+     	label="(Functional)annotation mapping file (csv or tsv format)" 
+     	help="Optional file that maps protein accessions to a network, pathway or other higher level annotations. In this file a header line is expected with these 2 columns (names and lower case is important): accession,annotation"/>
+     	
+     	<param name="refineModel" type="boolean" checked="true" label="Refine matches model" 
+     	help="This will let the algorithm search for a reduced set of secondary protein matches that still explains the variation in the peptide quantification patterns"/>
+     	
+     	
+     	<param name="summaryReport" type="boolean" checked="true" label="Generate summary report"/>
+     	
+	</inputs>
+	<configfiles> <!-- each configfile is referenced by its name in the command section, e.g. -identificationFilesList $identificationFilesList -->
+		<configfile name="annotatedQuantificationFilesList">## start comment
+		## iterate over the selected files and store their names in the config file
+		#for $i, $s in enumerate( $annotatedQuantificationFiles )
+			${s.annotatedQuantificationFile}
+		#end for
+		## end comment</configfile>
+		<!-- identificationFilesList: for each selected file, the file reference followed on the next line by its type (apml or mzid) -->
+		<configfile name="identificationFilesList">## start comment
+		## iterate over the selected files and store their names in the config file
+		#for $i, $s in enumerate( $identificationFiles )
+			${s.identificationFile}
+			## also print out the datatype in the next line, based on previously configured datatype
+			#if isinstance( $s.identificationFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__):
+				apml
+			#else:
+        		mzid
+      		#end if
+		#end for
+		## end comment</configfile>
+		<configfile name="statisticalMeasuresConfigFile">## start comment
+			${statisticalMeasuresConfig}
+		</configfile> <!-- holds the text entered in the statisticalMeasuresConfig parameter -->
+	</configfiles>
+	<outputs>
+	  <data name="outputCSV" format="csv" label="${tool.name} on ${on_string}: Proteins list (CSV)" />
+	  <data name="outputInferenceLogCSV" format="csv" label="${tool.name} on ${on_string}: Inference log (CSV)"/>
+	  <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report">
+	 	<!-- Controlled by the "Generate summary report" input (summaryReport): if the expression is false, the file is not created -->
+	  	<filter>( summaryReport == True )</filter>
+	  </data>
+	  <data name="outputSummaryAnnotationCSV" format="csv" label="${tool.name} on ${on_string} - Functional annotation summary (CSV)">
+	 	<!-- Controlled by the optional functionalAnnotationCSV input: if the expression is false (no mapping file given), the file is not created -->
+	  	<filter>( functionalAnnotationCSV != None )</filter>
+	  </data>
+	</outputs>
+	<tests>
+	</tests>
+  <help>
+  
+.. class:: infomark
+  
+This tool takes Peptide Quantification patterns and uses this to do Protein Inference of both Primary Protein 
+identifications as well as Secondary Protein identifications. This last class of protein identifications 
+can not be done by traditional protein inference methods that look only at peptide identifications and 
+their quality parameters. 
+
+
+-----
+
+**List of definitions**
+
+Primary Protein identification: protein identification belonging to the minimum set of proteins needed
+to account for the observed peptides.  
+
+Secondary Protein identification: extra protein identifications that do not belong to the minimum set
+of proteins mentioned above. 
+
+raw intensities : is the intensity value resulting from the integration of the feature peak area
+
+apex intensities: is the intensity value as on the highest point of the feature peak
+
+normalized intensities : is the intensity normalized by some means
+
+-----
+
+**Minimum correlation in a cluster**
+
+TODO - add doc.
+
+-----
+
+**Output details**
+
+*Proteins list (CSV)*
+
+This is the list of primary and secondary proteins and their calculated inference score. Proteins 
+with exactly the same peptide hits are also grouped together and labeled as primary_group and secondary_group
+instead of simply primary and secondary.
+
+
+*Inference log (CSV)*
+
+This CSV table shows all data, both inferred and ruled out proteins. This can be used by the user to 
+troubleshoot the inference process and understand why certain proteins might have been ruled out. 
+The CSV is provided in such a format that the data can easily be explored in a Cytoscape network. 
+
+The figure below shows an example of the data being explored in Cytoscape using also the 
+`Cytoscape chartplugin`_ to visualize the quantification data when selecting the peptide nodes. 
+
+.. image:: $PATH_TO_IMAGES/quantifere_cyto_out.png 
+
+
+.. _Cytoscape chartplugin: http://apps.cytoscape.org/apps/chartplugin
+
+
+
+  </help>
+</tool>
author	pieter.lukasse@wur.nl
date	Wed, 08 Jan 2014 11:39:16 +0100
parents
children	73c7c6589202