view cgatools/tools/cgatools_1.5/calldiff.xml @ 0:182426b32995 draft default tip

Uploaded
author completegenomics
date Mon, 18 Jun 2012 20:15:00 -0400
parents
children
line wrap: on
line source

<tool id="cg_calldiff" name="calldiff(beta) 1.5" version="1.0.0">
<!--
This tool creates a GUI for the calldiff function of cgatools from Complete Genomics, Inc.
written 6-18-2012 by bcrain@completegenomics.com
-->

  <!-- Short summary of the tool; the original author notes it is what appears in the toolbar. -->
  <description>compares two Complete Genomics variant files.</description>

  <!-- External dependency: the tool needs the cgatools binary. -->
  <requirements>
    <requirement type="binary">cgatools</requirement>
  </requirements>

  <!--
  Command template (Cheetah, executed through the shell).
  Step 1 prints the first line of the bare cgatools output (presumably the
  version banner; confirm against cgatools).
  Step 2 runs calldiff in beta mode on the two variant files.
  Path-valued arguments are single-quoted so that a path containing blanks
  reaches cgatools as one argument instead of being split by the shell.
  The report list is assembled by the shell: the selected report names are
  echoed and the blanks between them are turned into commas by sed.
  The genome and calibration directories are only passed when the
  SomaticOutput report was requested.
  This comment is kept outside the command element on purpose, so that it
  can never interfere with the command text read from that element.
  -->
  <command>
    cgatools | head -1;
    cgatools calldiff --beta
    --reference '${crr.fields.path}'
    --variantsA '${data_sources.inputA}'
    --variantsB '${data_sources.inputB}'
    $validation
    $diploid
    --locus-stats-column-count $column
    --max-hypothesis-count $hypothesis
    --output-prefix cg_
    --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/  */,/g'`
    #if $somatic.report6 == "SomaticOutput"
      --genome-rootA '${somatic.genomeA}'
      --genome-rootB '${somatic.genomeB}'
      --calibration-root '${somatic.calibration}'
    #end if
  </command>

  <!--
  One tabular dataset per report file. Each dataset is taken from the file
  with the cg_ prefix in the job working directory, and its filter ties it
  to the form field that requests the corresponding report.
  The VariantOutput report yields two datasets (annotated file A and file B).
  -->
  <outputs>
    <data name="output1" format="tabular" from_work_dir="cg_SuperlocusOutput.tsv" label="${tool.name} on ${on_string}: SuperlocusOutput">
      <filter>(report1 == 'SuperlocusOutput')</filter>
    </data>
    <data name="output2" format="tabular" from_work_dir="cg_SuperlocusStats.tsv" label="${tool.name} on ${on_string}: SuperlocusStats">
      <filter>(report2 == 'SuperlocusStats')</filter>
    </data>
    <data name="output3" format="tabular" from_work_dir="cg_LocusOutput.tsv" label="${tool.name} on ${on_string}: LocusOutput">
      <filter>(report3 == 'LocusOutput')</filter>
    </data>
    <data name="output4" format="tabular" from_work_dir="cg_LocusStats.tsv" label="${tool.name} on ${on_string}: LocusStats">
      <filter>(report4 == 'LocusStats')</filter>
    </data>
    <data name="output5a" format="tabular" from_work_dir="cg_VariantsA.tsv" label="${tool.name} on ${on_string}: VariantsA">
      <filter>(report5 == 'VariantOutput')</filter>
    </data>
    <data name="output5b" format="tabular" from_work_dir="cg_VariantsB.tsv" label="${tool.name} on ${on_string}: VariantsB">
      <filter>(report5 == 'VariantOutput')</filter>
    </data>
    <data name="output6" format="tabular" from_work_dir="cg_SomaticOutput.tsv" label="${tool.name} on ${on_string}: SomaticOutput">
      <filter>(somatic['report6'] == 'SomaticOutput')</filter>
    </data>
  </outputs>
  
  <inputs>
		<!-- Reference genome selector; its choices come from the cg_crr_files data table. -->
		<param name="crr" type="select" label="Reference genome (.crr file)">
			<options from_data_table="cg_crr_files"/>
		</param>
	
		<!--
		Source of the two variant files: either datasets already imported into
		Galaxy ("in") or filesystem paths typed in by the user ("out").
		Both branches define inputA and inputB.
		-->
		<conditional name="data_sources">
			<param name="data_source" type="select" label="Where are the input varfiles?">
				<option value="in" selected="true">imported into Galaxy</option>
				<option value="out">located outside Galaxy (available only for local Galaxy instances)</option>
			</param>
			<when value="in">
				<!--
				Variant files chosen from the history. The validators reject datasets
				without a build and check the dataset dbkey against cg_crr_files.loc.
				-->
				<param name="inputA" type="data" format="cg_var" label="Var file A">
					<validator type="unspecified_build"/>
					<validator type="dataset_metadata_in_file"
					           filename="cg_crr_files.loc"
					           metadata_name="dbkey"
					           metadata_column="1"
					           message="cgatools is not currently available for this build."/>
				</param>
				<param name="inputB" type="data" format="cg_var" label="Var file B">
					<validator type="unspecified_build"/>
					<validator type="dataset_metadata_in_file"
					           filename="cg_crr_files.loc"
					           metadata_name="dbkey"
					           metadata_column="1"
					           message="cgatools is not currently available for this build."/>
				</param>
			</when>
			<when value="out">
				<!-- Free-text paths to the two variant files outside Galaxy. -->
				<param name="inputA" type="text" label="Variant file A (/path/varfile)" size="300" help="Variant file can be compressed (gz, bz2), e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000/ASM/var-GS00000YYYY-ASM.tsv.bz2"/>
				<param name="inputB" type="text" label="Variant file B (/path/varfile)" size="300" help="Variant file can be compressed (gz, bz2), e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000/ASM/var-GS00000YYYY-ASM.tsv.bz2."/>
			</when>
		</conditional>
		
		<!--
		Comparison options. For the two select fields the option value is the
		literal command-line flag (or nothing), which the command template
		inserts as is.
		-->
		<param name="diploid" type="select"
		       label="Use diploid variant model"
		       help="Uses varScoreEAF instead of varScoreVAF in somatic score computations. Also, uses diploid variant model instead of variable allele mixture model.">
			<option value="">no</option>
			<option value="--diploid">yes</option>
		</param>

		<param name="column" type="integer" value="15"
		       label="Number of columns for locus compare classification in the locus stats file (default 15)"/>

		<param name="hypothesis" type="integer" value="32"
		       label="Maximum number of possible phasings to consider for a superlocus (default 32)"/>

		<param name="validation" type="select"
		       label="Reference cover validation"
		       help="Turns on/off validation that all bases of a chromosome are covered by calls of the variant file.">
			<option value="">on</option>
			<option value="--no-reference-cover-validation">off</option>
		</param>

		<!--
		Report switches. Each "yes" value is the report name that the command
		template places in the report list and that the output filters test for.
		-->
		<param name="report1" type="select" label="Create report SuperlocusOutput">
			<option value="">no</option>
			<option value="SuperlocusOutput">yes</option>
		</param>
		<param name="report2" type="select" label="Create report SuperlocusStats">
			<option value="">no</option>
			<option value="SuperlocusStats">yes</option>
		</param>
		<param name="report3" type="select" label="Create report LocusOutput">
			<option value="">no</option>
			<option value="LocusOutput">yes</option>
		</param>
		<param name="report4" type="select" label="Create report LocusStats">
			<option value="">no</option>
			<option value="LocusStats">yes</option>
		</param>
		<param name="report5" type="select"
		       label="Create report VariantOutput"
		       help="Both variant files annotated by comparison results.If the somatic output report is requested, file A is also annotated with the same score ranks as produced in that report.">
			<option value="">no</option>
			<option value="VariantOutput">yes</option>
		</param>
		
		<!--
		Optional SomaticOutput report. Choosing "yes" reveals three directory
		fields whose values the command template passes to calldiff as the
		genome A root, the genome B root and the calibration root.
		Every option of the select has its own when case; the "no" case is
		intentionally empty so the default choice always matches a case.
		-->
		<conditional name="somatic">
			<param name="report6" type="select" label="Create report SomaticOutput" help="This report can only be generated on local Galaxy instances. Report for the list of simple variations that are present only in file 'A', annotated with the score that indicates the probability of the variation being truly somatic. Note: generating this report slows calldiff by 10x-20x.">
				<option value="">no</option>
				<option value="SomaticOutput">yes</option>
			</param>
			<when value=""/>
			<when value="SomaticOutput">
				<param name="genomeA" type="text" size="300" label="Directory for genome A (/path/dir)" help="The 'A' genome directory, e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories."/>
				<param name="genomeB" type="text" size="300" label="Directory for genome B (/path/dir)" help="The 'B' genome directory, e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories."/>
				<param name="calibration" type="text" size="300" label="Directory calibration data (/path/dir)" help="The directory containing calibration data. For example, there should exist a file calibration-root/0.0.0/metrics.tsv. Calibration data can be downloaded from ftp://ftp.completegenomics.com/ScoreCalibrationFiles/var-calibration-v1.tgz"/>
			</when>
		</conditional>
		
  </inputs>

  <help>
  
**What it does**

This tool compares two Complete Genomics variant files.

**cgatools 1.5.0 Documentation**

User guide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf

Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf

**Command line reference**::

		COMMAND NAME
		  calldiff - Compares two Complete Genomics variant files.
		
		DESCRIPTION
		  Compares two Complete Genomics variant files. Divides the genome up into 
		  superloci of nearby variants, then compares the superloci. Also refines the
		  comparison to determine per-call or per-locus comparison results.
				
		  Comparison results are usually described by a semi-colon separated string, 
		  one per allele. Each allele's comparison result is one of the following 
		  classifications:
				
		    ref-identical   The alleles of the two variant files are identical, and
		                    they are consistent with the reference.
		    alt-identical   The alleles of the two variant files are identical, and
		                    they are inconsistent with the reference.
		    ref-consistent  The alleles of the two variant files are consistent, 
		                    and they are consistent with the reference.
		    alt-consistent  The alleles of the two variant files are consistent, 
		                    and they are inconsistent with the reference.
		    onlyA           The alleles of the two variant files are inconsistent, 
		                    and only file A is inconsistent with the reference.
		    onlyB           The alleles of the two variant files are inconsistent, 
		                    and only file B is inconsistent with the reference.
		    mismatch        The alleles of the two variant files are inconsistent, 
		                    and they are both inconsistent with the reference.
		    phase-mismatch  The two variant files would be consistent if the 
		                    hapLink field had been empty, but they are 
		                    inconsistent.
		    ploidy-mismatch The superlocus did not have uniform ploidy.
				
		  In some contexts, this classification is rolled up into a simplified 
		  classification, which is one of "identical", "consistent", "onlyA", 
		  "onlyB", or "mismatch".
				
		  A good place to start looking at the results is the superlocus-output file.
		  It has columns defined as follows:
				
		    SuperlocusId   An identifier given to the superlocus.
		    Chromosome     The name of the chromosome.
		    Begin          The 0-based offset of the start of the superlocus.
		    End            The 0-based offset of the base one past the end of the 
		                   superlocus.
		    Classification The match classification of the superlocus.
		    Reference      The reference sequence.
		    AllelesA       A semicolon-separated list of the alleles (one per 
		                   haplotype) for variant file A, for the phasing with the 
		                   best comparison result.
		    AllelesB       A semicolon-separated list of the alleles (one per 
		                   haplotype) for variant file B, for the phasing with the 
		                   best comparison result.
				
		  The locus-output file contains, for each locus in file A and file B that is
		  not consistent with the reference, an annotated set of calls for the locus.
		  The calls are annotated with the following columns:
				
		    SuperlocusId            The id of the superlocus containing the locus.
		    File                    The variant file (A or B).
		    LocusClassification     The locus classification is determined by the 
		                            varType column of the call that is inconsistent
		                            with the reference, concatenated with a 
		                            modifier that describes whether the locus is 
		                            heterozygous, homozygous, or contains no-calls.
		                            If there is no one variant in the locus (i.e., 
		                            it is heterozygous alt-alt), the locus 
		                            classification begins with "other".
		    LocusDiffClassification The match classification for the locus. This is
		                            defined to be the best of the comparison of the
		                            locus to the same region in the other file, or 
		                            the comparison of the superlocus.
				
		  The somatic output file contains a list of putative somatic variations of 
		  genome A. The output includes only those loci that can be classified as 
		  snp, del, ins or sub in file A, and are called reference in the file B. 
		  Every locus is annotated with the following columns:
				
		    VarCvgA                 The totalReadCount from file A for this locus 
		                            (computed on the fly if file A is not a 
		                            masterVar file).
		    VarScoreA               The varScoreVAF from file A, or varScoreEAF if 
		                            the "--diploid" option is used.
		    RefCvgB                 The maximum of the uniqueSequenceCoverage 
		                            values for the locus in genome B.
		    RefScoreB               Minimum of the reference scores of the locus in
		                            genome B.
		    SomaticCategory         The category used for determining the 
		                            calibrated scores and the SomaticRank.
		    VarScoreACalib          The calibrated variant score of file A, under 
		                            the model selected by using or not using the 
		                            "--diploid" option, and corrected for the count
		                            of heterozygous variants observed in this 
		                            genome. See user guide for more information.
		    VarScoreBCalib          The calibrated reference score of file B, under
		                            the model selected by using or not using the 
		                            "--diploid" option, and corrected for the count
		                            of heterozygous variants observed in this 
		                            genome. See user guide for more information.
		    SomaticRank             The estimated rank of this somatic mutation, 
		                            amongst all true somatic mutations within this 
		                            SomaticCategory. The value is a number between 
		                            0 and 1; a value of 0.012 means, for example, 
		                            that an estimated 1.2% of the true somatic 
		                            mutations in this somaticCategory have a 
		                            somaticScore less than the somaticScore for 
		                            this mutation. See user guide for more 
		                            information.
		    SomaticScore            An integer that provides a total order on 
		                            quality for all somatic mutations. It is equal 
		                            to -10*log10( P(false)/P(true) ), under the 
		                            assumption that this genome has a rate of 
		                            somatic mutation equal to 1/Mb for 
		                            SomaticCategory snp, 1/10Mb for SomaticCategory
		                            ins, 1/10Mb for SomaticCategory del, and 1/20Mb
		                            for SomaticCategory sub. The computation is 
		                            based on the assumptions described in the user 
		                            guide, and is affected by choice of variant 
		                            model selected by using or not using the 
		                            "--diploid" option.
		    SomaticQuality          Equal to VQHIGH for all somatic mutations where
		                            SomaticScore &gt;= -10. Otherwise, this column is 
		                            empty.
				
		OPTIONS
		  -h [ --help ] 
		      Print this help message.
		
		  --reference arg
		      The input crr file.
		
		  --variantsA arg
		      The "A" input variant file.
		
		  --variantsB arg
		      The "B" input variant file.
		
		  --output-prefix arg
		      The path prefix for all output reports.
		
		  --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats)
		      Comma-separated list of reports to generate. (Beware any reports whose 
		      name begins with "Debug".) A report is one of:
		        SuperlocusOutput      Report for superlocus classification.
		        SuperlocusStats       Report for superlocus classification stats.
		        LocusOutput           Report for locus classification.
		        LocusStats            Report for locus stats.
		        VariantOutput         Both variant files annotated by comparison 
		                              results. If the somatic output report is 
		                              requested, file A is also annotated with the 
		                              same score ranks as produced in that report.
		        SomaticOutput         Report for the list of simple variations that
		                              are present only in file "A", annotated with 
		                              the score that indicates the probability of 
		                              the variation being truly somatic. Requires 
		                              beta, genome-rootA, and genome-rootB options 
		                              to be provided as well. Note: generating this
		                              report slows calldiff by 10x-20x.
		        DebugCallOutput       Report for call classification.
		        DebugSuperlocusOutput Report for debug superlocus information.
		        DebugSomaticOutput    Report for distribution estimates used for 
		                              somatic rescoring. Only produced if 
		                              SomaticOutput is also turned on.
		
		  --diploid 
		      Uses varScoreEAF instead of varScoreVAF in somatic score computations. 
		      Also, uses diploid variant model instead of variable allele mixture 
		      model.
		
		  --locus-stats-column-count arg (=15)
		      The number of columns for locus compare classification in the locus 
		      stats file.
		
		  --max-hypothesis-count arg (=32)
		      The maximum number of possible phasings to consider for a superlocus.
		
		  --no-reference-cover-validation 
		      Turns off validation that all bases of a chromosome are covered by 
		      calls of the variant file.
		
		  --genome-rootA arg
		      The "A" genome directory, for example /data/GS00118-DNA_A01; this 
		      directory is expected to contain ASM/REF and ASM/EVIDENCE 
		      subdirectories.
		
		  --genome-rootB arg
		      The "B" genome directory.
		
		  --calibration-root arg
		      The directory containing calibration data. For example, there should 
		      exist a file calibration-root/0.0.0/metrics.tsv.
		
		  --beta 
		      This flag enables the SomaticOutput report, which is beta 
		      functionality.
		
		SUPPORTED FORMAT_VERSION
		  0.3 or later
  </help>
</tool>