Mercurial > repos > pieterlukasse > prims_proteomics
diff msfilt.xml @ 0:d50f079096ee
Push to main toolshed
author | pieter.lukasse@wur.nl |
---|---|
date | Wed, 08 Jan 2014 11:39:16 +0100 |
parents | |
children | 72d4a37869ee |
line wrap: on
line diff
<tool name="MsFilt" id="msfilt" version="1.0.2">
  <description>Filters annotations based on MS/MS peptide identification and annotation quality measures</description>
  <!--
     For remote debugging start your listener on port 8000 and use the following as command interpreter:
       java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000
       //////////////////////////
  -->
  <!--
     Command line handed to MsFilt.jar: the optional APML input, the three config files
     generated in the configfiles section below, the output locations, and the filter switches.
  -->
  <command interpreter="java -jar ">
    MsFilt.jar
      -apmlFile $apmlFile
      -datasetCode $apmlFile.metadata.base_name
      -rankingMetadataFile $rankingMetadataFile
      -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
      -annotationSourceConfigFile $annotationSourceConfigFile
      -outApml $outputApml
      -outNewIdsApml $outNewIdsApml
      -outFullCSV $outputCSV
      -outRankingTable $outRankingTable
      -outProteinCoverageCSV $outProteinCoverageCSV
      -fpCriteriaExpression "$fpCriteriaExpression"
      -filterOutFPAnnotations $filterOutFPAnnotations
      -fpCriteriaExpressionForIds "$fpCriteriaExpressionForIds"
      -filterOutFPIds $filterOutFPIds
      -filterOutUnannotatedAlignments $filterOutUnannotatedAlignments
      -addRawRankingInfo $addRawRankingInfo
      -addScaledIntensityInfo $addScaledIntensityInfo
      -addRawIntensityInfo $addRawIntensityInfo
      -outReport $htmlReportFile
      -outReportPicturesPath $htmlReportFile.files_path
  </command>

  <inputs>

    <param name="apmlFile" type="data" format="apml" optional="true"
           label="(Optional) Peptide quantification file (APML)"
           help="The APML contents as aligned and annotated feature lists. E.g. produced by SEDMAT or Quantiline tools."/>

    <!-- NOTE(review): the spectraFile label mentions "mzml" while its format attribute lists "mzidentml"; confirm which is intended. -->
    <repeat name="annotationSourceFiles" title="(Optional) Peptide identification files" help="Full set of MS/MS peptide identification files, including peptides that could not be quantified.">
      <param name="identificationsFile" type="data" format="apml,mzidentml,prims.fileset.zip" label="Identifications file (APML or MZIDENTML or MZIDENTML fileSet)"/>
      <param name="spectraFile" type="data" format="mzidentml,prims.fileset.zip" optional="true" label="(Optional) Spectra fileSet (mzml file or fileSet)"
             help="Select this in case your Identifications file is MZIDENTML or MZIDENTML fileSet"/>
    </repeat>

    <!--
    <param name="maxNrRankings" type="integer" size="10" value="0" label="Maximum nr. of items to leave in the final ranking (set=0 for no limit) " />
    -->
    <!-- TODO add info somewhere that deltaRt is 'corrected deltaRt' -->
    <!--
       The default value holds one entry per line. Line breaks inside an attribute are written as
       character references (&#10;) because literal newlines are normalized to spaces by XML parsers.
    -->
    <param name="rankingWeightConfig" type="text" area="true" size="13x70" label="Quality Measures (qm's) and ranking weights configuration"
           help="Here you may specify a weight for each of the Quality Measures (QMs). These are used for the final QM score and possibly for ranking (e.g. in case of label-free data processed by SEDMAT). The format is: QM alias =&gt; QM name,weight. "
value="qmDRT =&gt; delta rt (standard score),1
&#10;qmDMA =&gt; delta mass annotation (standard score),1
&#10;qmDMP =&gt; delta mass psm (standard score),1
&#10;qmBSCR =&gt; best peptide score (standard score),1
&#10;qmALCV =&gt; alignment coverage (fraction),1
&#10;qmSTCV =&gt; score type coverage (fraction),1
&#10;qmPACV =&gt; peptide's best proteinAnnotCoverage (standard score),1
&#10;qmPICV =&gt; peptide's best proteinIdentifCoverage (standard score),1
&#10;qmANS =&gt; annotation sources (count),1
&#10;qmCSEV =&gt; charge states evidence (count),0.2
&#10;qmBCSP =&gt; best correlation with source or product peptide (correl),1
&#10;qmBCCS =&gt; best correlation with other charge state (correl),1
&#10;qmBCOS =&gt; best correlation with other sibling peptide (correl),1
&#10;"/>

    <!-- Same one-entry-per-line convention as rankingWeightConfig above. -->
    <param name="statisticalMeasuresConfig" type="text" area="true" size="6x70" label="Statistical measures configuration"
           help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values). The format is: SM alias =&gt; SM name,type,mode[min/max]. "
value="smXTD =&gt; MS:1001330,XSLASH!Tandem:expect,min
&#10;pvCSVEX =&gt; p_value,CSV_EXPORT,min
&#10;smAUTO_LIKELIHOOD =&gt; AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max
&#10;smLIKELIHOOD =&gt; LOGLIKELIHOOD,PLGS/Databank-search,max
&#10;"/>

    <param name="filterOutUnannotatedAlignments" type="boolean" checked="true"
           label="Filter out unannotated alignments"
           help="This helps decrease the output file size (features with no annotation are then not reported anymore)"/>

    <param name="filterOutFPAnnotations" type="boolean" checked="true"
           label="Filter out False Positive (FP) annotations"/>

    <!-- '<' and '>' in the expressions below are escaped: a literal '<' is not allowed inside an XML attribute value. -->
    <param name="fpCriteriaExpression" type="text" size="120" label="False Positive (FP) criteria for annotations"
           help="Criteria (in standard score measures) for classifying an annotation as False Positive (FP). You can build logical rules using the QM aliases above, the keywords 'and', 'or' and parentheses. Comparisons can be made with '==,&lt;,&gt;,&lt;=,&gt;='"
           value="qmDRT &lt;0 or qmDMA &lt;-0.5 or (qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"/>


    <param name="filterOutFPIds" type="boolean" checked="true"
           label="Filter out False Positive (FP) peptide identifications"/>

    <param name="fpCriteriaExpressionForIds" type="text" size="120"
           label="False Positive (FP) criteria for identifications"
           help="Criteria (in standard score measures) for classifying a peptide identification as False Positive (FP). Here you can use a subset of the quality measures (qmDMP, qmBSCR, qmSTCV, qmPICV, qmCSEV) and all statistical measures."
           value="(qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"/>


    <param name="addRawRankingInfo" type="boolean" checked="false"
           label="Include the raw scores/values of the ranking attributes in the CSV output"
           help="This will result in one extra column per ranking attribute, each column holding the original data for this attribute (before normalization)."/>

    <param name="addScaledIntensityInfo" type="boolean" checked="false"
           label="Include computed scaled intensity values in the CSV output"
           help="The autoscaled and 'z-score'scaled (aka 'standard-score'scaled) intensity values are then added to the full CSV output file"/>

    <param name="addRawIntensityInfo" type="boolean" checked="false"
           label="Include the raw intensity values in the CSV output"
           help="The original intensity values (as found in the input file) are then added to the full CSV output file"/>


  </inputs>
  <!--
     Files generated from the parameters above and passed to MsFilt.jar on the command line:
     the two text-area parameters are written out verbatim, and the repeat block is expanded
     into "identificationsFile|spectraFile" lines, each followed by a datatype line (apml or mzid).
  -->
  <configfiles>
    <configfile name="rankingMetadataFile">${rankingWeightConfig}</configfile>
    <configfile name="statisticalMeasuresConfigFile">${statisticalMeasuresConfig}</configfile>
    <configfile name="annotationSourceConfigFile">## start comment
      ## iterate over the selected files and store their names in the config file
      #for $i, $s in enumerate( $annotationSourceFiles )
        ${s.identificationsFile}|${s.spectraFile}
        ## also print out the datatype in the next line, based on previously configured datatype
        #if isinstance( $s.identificationsFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__):
          apml
        #else:
          mzid
        #end if
      #end for
      ## end comment</configfile>
  </configfiles>
  <outputs>
    <data name="outputApml" format="apml" label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: quantifications (filtered APML)" metadata_source="apmlFile">
      <!-- If the expression is false, the file is not created -->
      <filter>( apmlFile != None )</filter>
    </data>
    <data name="outNewIdsApml" format="apml"
          label="${tool.name} on ${on_string}: identifications (filtered APML)">
      <filter>( filterOutFPIds == True )</filter>
    </data>
    <data name="outputCSV" format="csv" label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Full CSV" metadata_source="apmlFile">
      <filter>( apmlFile != None )</filter>
    </data>
    <data name="outRankingTable" format="csv" label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Ranking table (CSV)" metadata_source="apmlFile">
      <filter>( apmlFile != None )</filter>
    </data>
    <data name="outProteinCoverageCSV" format="csv" label="${tool.name} on ${on_string}: Protein coverage details (CSV)">
      <!-- If the expression is false, the file is not created -->
      <filter>( len(list(enumerate(annotationSourceFiles))) &gt; 0 )</filter>
    </data>
    <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report"/>
  </outputs>
  <tests>
  </tests>
  <help>

.. class:: infomark

This tool takes in peptide quantification results (e.g. either by SEDMAT for label-free data or by Quantiline for labeled data)
and calculates a number of quality measures that can help in assessing the correctness of the quantification assignment and of the MS/MS peptide
identification itself. The user can use any combination of quality measures (qm's) and statistical measures (sm's) to filter out
low scoring entries.

.. class:: infomark

In the label-free data processed by SEDMAT it is possible that a feature quantification gets assigned to different peptides. This means
we have an ambiguous assignment. In such a case
this tool also does a ranking of the different assignments according to their quality measures so that the best scoring assignment
gets ranked as first.

-----

**List of abbreviations**

QM: Quality Measure

SM: Statistical Measure (e.g. p-value, e-value from MS/MS identification)

PSM: "Peptide to Spectrum Match" (aka peptide identification)

FP: False Positive

-----

**Filtering options details**

The FP criteria will be applied to an annotation even if the corresponding quality measures involved
in the expression can NOT ALL be determined. QMs that cannot be determined, get the value 0 (zero) which is
equal to giving it the average value.

The output report shows some plots that visualize the filtering done. This can help in fine-tuning the right filtering
criteria.

-----

**Output details**

*APML output*

This tool returns the given APML alignment file further annotated at the alignment level with the best ranking
peptides of each respective alignment. This APML can be used in subsequent Galaxy tools like the proteomics tools
from NBIC.

The APML output can also be used for the Protein Inference step (see Quantifere tool).

*CSV output*

It also returns a CSV format output with the full quality measures and scoring and ranking details. The user could use
this to manually determine new weights for some of the quality measures by techniques such as
linear regression. In other words, this CSV can then be used to fine-tune the weights in a next run.

Many of the quality measures (QMs) are normalized to their Standard Score (aka z-score).
`See Standard Score for more details...`__

Next to giving insight into how the ranking was established, a more complete version of this CSV file is also
generated for tools that cannot or won't process the APML output format.

Below is a brief overview of the CSV and an illustration of the ranking done in case of ambiguous peptides to feature assignments
(explained above, can happen in case of label-free data processing by SEDMAT).


.. image:: $PATH_TO_IMAGES/msfilt_csv_out.png



.. __: javascript:window.open('http://en.wikipedia.org/wiki/Standard_score','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')




  </help>
</tool>