comparison tools/naive_variant_caller.xml @ 1:ae6edc0012ba

Populate naive_variant_caller repository.
author Daniel Blankenberg <dan@bx.psu.edu>
date Thu, 29 Aug 2013 10:54:14 -0400
parents
children 8398666758e3
comparison
equal deleted inserted replaced
0:0fa83c466e9d 1:ae6edc0012ba
1 <tool id="naive_variant_caller" name="Naive Variant Caller" version="0.0.1">
2 <description> - tabulate variable sites from BAM datasets</description>
3 <requirements> <!-- Three package dependencies: numpy plus the pyBamParser and pyBamTools libraries. -->
4 <requirement version="1.7.1" type="package">numpy</requirement>
5 <requirement version="0.0.1" type="package">pyBamParser</requirement>
6 <requirement version="0.0.1" type="package">pyBamTools</requirement>
7 </requirements>
8 <stdio> <!-- Any non-zero exit code, positive or negative, is treated as a fatal error. -->
9 <exit_code err_level="fatal" range="1:"/>
10 <exit_code err_level="fatal" range=":-1"/>
11 </stdio>
12 <command interpreter="python">naive_variant_caller.py
13 -o "${output_vcf}"
14 ## One -b/-i pair per entry of the input_bams repeat: the BAM dataset and the index stored in its metadata.
15 #for $input_bam in $reference_source.input_bams:
16 -b "${input_bam.input_bam}"
17 -i "${input_bam.input_bam.metadata.bam_index}"
18 #end for
19 ## Cached selection passes the path field of the chosen data table entry; an optional history dataset is passed directly when set.
20 #if str($reference_source.reference_source_selector) != "history":
21 -r "${reference_source.ref_file.fields.path}"
22 #elif $reference_source.ref_file:
23 -r "${reference_source.ref_file}"
24 #end if
25 ## NOTE(review): start and end are optional, so this can render as chrom:- ; confirm the script accepts that form.
26 #for $region in $regions:
27 --region "${region.chromosome}:${region.start}-${region.end}"
28 #end for
29 ## Boolean params expand to their flag (truevalue) or to an empty string (falsevalue).
30 ${variants_only}
31
32 ${use_strand}
33 ## ploidy uses the same plain placeholder form as every other option.
34 --ploidy "${ploidy}"
35
36 --min_support_depth "${min_support_depth}"
37 ## Optional quality thresholds are only emitted when a value was entered.
38 #if str($min_base_quality):
39 --min_base_quality "${min_base_quality}"
40 #end if
41
42 #if str($min_mapping_quality):
43 --min_mapping_quality "${min_mapping_quality}"
44 #end if
45
46 --coverage_dtype "${coverage_dtype}"
47
48 --allow_out_of_bounds_positions
49
50 </command>
51 <inputs>
52 <conditional name="reference_source"> <!-- Both branches define the same names (input_bams, ref_file) so the command template can address them uniformly. -->
53 <param name="reference_source_selector" type="select" label="Choose the source for the reference list">
54 <option value="cached">Locally cached</option>
55 <option value="history">History</option>
56 </param>
57 <when value="cached"> <!-- Built-in reference: each BAM must have a build set and that build must appear in the sam_fa_indexes data table. -->
58 <repeat name="input_bams" title="BAM file" min="1" >
59 <param name="input_bam" type="data" format="bam" label="BAM file">
60 <validator type="unspecified_build" />
61 <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="value" message="Sequences are not currently available for the specified build." /> <!-- fixme!!! this needs to be a select -->
62 </param>
63 </repeat>
64 <param name="ref_file" type="select" label="Using reference genome" >
65 <options from_data_table="sam_fa_indexes">
66 <!-- <filter type="data_meta" key="dbkey" ref="input_bam" column="dbkey"/> does not yet work in a repeat...-->
67 </options>
68 <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
69 </param>
70 </when>
71 <when value="history"> <!-- FIX ME!!!! (original author note; the intended fix is not stated). This branch has no build validators, and ref_file is optional. -->
72 <repeat name="input_bams" title="BAM file" min="1" >
73 <param name="input_bam" type="data" format="bam" label="BAM file" >
74 </param>
75 </repeat>
76 <param name="ref_file" type="data" format="fasta" label="Using reference file" optional="True" />
77 </when>
78 </conditional>
79
80 <repeat min="0" name="regions" title="Restrict to regions"> <!-- Zero or more regions; each entry adds one region argument to the command line. -->
81 <param label="Chromosome" name="chromosome" type="text" value="" optional="False"/>
82 <param label="Start" name="start" type="integer" value="" optional="True"/>
83 <param label="End" name="end" type="integer" value="" optional="True"/>
84 </repeat>
85
86 <!-- TODO: enhance filtering. The two quality thresholds are optional and are only passed to the script when a value is entered. -->
87 <param name="min_support_depth" type="integer" value="0" min="0" label="Minimum number of reads needed to consider a REF/ALT" />
88 <param name="min_base_quality" type="integer" value="" label="Minimum base quality" optional="True" />
89 <param name="min_mapping_quality" type="integer" value="" label="Minimum mapping quality" optional="True" />
90 <!-- Ploidy is always passed; it defaults to 2 and must be at least 1. -->
91 <param name="ploidy" type="integer" value="2" min="1" label="Ploidy" />
92 <param name="variants_only" type="boolean" truevalue="--variants_only" falsevalue="" checked="False" label="Only write out positions with possible alternate alleles"/>
93 <!-- Boolean params render as their flag when checked and as an empty string otherwise. -->
94 <param name="use_strand" type="boolean" truevalue="--use_strand" falsevalue="" checked="False" label="Report counts by strand"/>
95 <!-- The selected dtype name is passed through unchanged; uint16 is preselected. -->
96 <param name="coverage_dtype" type="select" label="Choose the dtype to use for storing coverage information" help="This affects the maximum recorded value for a position, e.g. uint8 would be 255 coverage, but will require the least amount of RAM">
97 <option value="uint8">uint8</option>
98 <option value="uint16" selected="True">uint16</option>
99 <option value="uint32">uint32</option>
100 <option value="uint64">uint64</option>
101 </param>
102
103 </inputs>
104 <outputs> <!-- Single VCF dataset; the command template writes to it via the -o option. -->
105 <data name="output_vcf" format="vcf"/>
106 </outputs>
107 <help>
108 **What it does**
109
110 This tool is a naive variant caller that processes aligned sequencing reads from the BAM format and produces a VCF file containing per position variant calls. This tool allows multiple BAM files to be provided as input and utilizes read group information to make calls for individual samples.
111
112 User-configurable options allow filtering reads that do not pass mapping or base quality thresholds and minimum per base read depth; users can also specify the ploidy and whether to consider each strand separately.
113
114 In addition to calling alternate alleles based upon simple ratios of nucleotides at a position, per base nucleotide counts are also provided. A custom tag, NC, is used within the Genotype fields. The NC field is a comma-separated listing of nucleotide counts in the form of &lt;nucleotide&gt;=&lt;count&gt;, where a plus or minus character is prepended to indicate strand, if the strandedness option was specified.
115
116
117 ------
118
119 **Inputs**
120
121 Accepts one or more BAM input files and a reference genome from the built-in list or from a FASTA file in your history.
122
123
124 **Outputs**
125
126 The output is in VCF format.
127
128 Example VCF output line, without reporting by strand:
129 ``chrM 16029 . T G,A,C . . AC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095 GT:AC:AF:NC 0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:A=9,C=5,T=9629,G=15,``
130
131 Example VCF output line, when reporting by strand:
132 ``chrM 16029 . T G,A,C . . AC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095 GT:AC:AF:NC 0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:+T=3972,-A=9,-C=5,-T=5657,-G=15,``
133
134 **Options**
135
136 Reference Genome:
137
138 Ensure that you have selected the correct reference genome, either from the list of built-in genomes or by selecting the corresponding FASTA file from your history.
139
140 Restrict to regions:
141
142 You can specify any number of regions on which you would like to receive results. You can specify just a chromosome name, or a chromosome name and start position, or a chromosome name and start and end position for the set of desired regions.
143
144 Minimum number of reads needed to consider a REF/ALT:
145
146 This value declares the minimum number of reads containing a particular base at each position in order to list and use said allele in genotyping calls. Default is 0.
147
148 Minimum base quality:
149
150 The minimum base quality score needed for the position in a read to be used for nucleotide counts and genotyping. Default is no filter.
151
152 Minimum mapping quality:
153
154 The minimum mapping quality score needed to consider a read for nucleotide counts and genotyping. Default is no filter.
155
156 Ploidy:
157
158 The number of genotype calls to make at each reported position.
159
160 Only write out positions with possible alternate alleles:
161
162 When set, only positions that have at least one non-reference nucleotide that passes the declared filters will be present in the output.
163
164 Report counts by strand:
165
166 When set, nucleotide counts (NC) will be reported in reference to the aligned read's source strand. Reported as: &lt;strand&gt;&lt;BASE&gt;=&lt;COUNT&gt;.
167
168 Choose the dtype to use for storing coverage information:
169
170 This controls the maximum depth value for each nucleotide/position/strand (when specified). Smaller dtypes require less memory, but have lower maximum limits.
171
172 +--------+----------------------------+
173 | name | maximum coverage value |
174 +========+============================+
175 | uint8 | 255 |
176 +--------+----------------------------+
177 | uint16 | 65,535 |
178 +--------+----------------------------+
179 | uint32 | 4,294,967,295 |
180 +--------+----------------------------+
181 | uint64 | 18,446,744,073,709,551,615 |
182 +--------+----------------------------+
183
184 ------
185
186 **Citation**
187
188 If you use this tool, please cite Blankenberg D, et al. *In preparation.*
189
190 </help>
191 <tests> <!-- One functional test: history reference, a single BAM, no regions, default filters, uint8 coverage. -->
192 <test>
193 <param name="reference_source_selector" value="history"/>
194 <param name="input_bam" ftype="bam" value="fake_phiX174_reads_1.bam"/>
195 <param name="ref_file" ftype="fasta" value="phiX174.fasta"/>
196 <param name="regions" value="0"/>
197 <param name="min_support_depth" value="0"/>
198 <param name="min_base_quality" value=""/>
199 <param name="min_mapping_quality" value=""/>
200 <param name="ploidy" value="2"/>
201 <param name="variants_only" value="False"/>
202 <param name="use_strand" value="False"/>
203 <param name="coverage_dtype" value="uint8"/>
204 <output name="output_vcf" compare="contains" file="fake_phiX174_reads_1_test_out_1.vcf"/>
205 </test>
206 </tests>
207
208 </tool>