annotate computeGCBias.xml @ 16:16310f8b24d5 draft

Uploaded
author bgruening
date Mon, 16 Dec 2013 15:13:08 -0500
parents b4c5dd45778a
children 5ea8782d650c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
135f3bae5c56 Uploaded
bgruening
parents: 9
diff changeset
1 <tool id="deeptools_computeGCBias" name="computeGCBias" version="1.0.2">
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
2 <description>to see whether your samples should be normalized for GC bias</description>
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
3 <expand macro="requirements" />
10
135f3bae5c56 Uploaded
bgruening
parents: 9
diff changeset
4 <expand macro="stdio" />
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
5 <macros>
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
6 <import>deepTools_macros.xml</import>
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
7 </macros>
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
8 <command>
5
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
9 ln -s $bamInput local_bamInput.bam;
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
10 ln -s $bamInput.metadata.bam_index local_bamInput.bam.bai;
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
11
5
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
12 computeGCBias
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
13 @THREADS@
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
14
5
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
15 --bamfile 'local_bamInput.bam'
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
16 --GCbiasFrequenciesFile $outFileName
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
17 --fragmentLength $fragmentLength
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
18
5
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
19 @reference_genome_source@
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
20
5
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
21 #if $effectiveGenomeSize.effectiveGenomeSize_opt == "specific":
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
22 --effectiveGenomeSize $effectiveGenomeSize.effectiveGenomeSize
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
23 #else:
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
24 --effectiveGenomeSize $effectiveGenomeSize.effectiveGenomeSize_opt
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
25 #end if
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
26
5
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
27 #if $advancedOpt.showAdvancedOpt == "yes":
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
28 #if str($advancedOpt.region.value) != '':
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
29 --region '$advancedOpt.region'
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
30 #end if
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
31
5
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
32 --sampleSize '$advancedOpt.sampleSize'
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
33 --regionSize '$advancedOpt.regionSize'
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
34
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
35 #if $advancedOpt.filterOut:
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
36 --filterOut $advancedOpt.filterOut
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
37 #end if
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
38
5
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
39 #if $advancedOpt.extraSampling:
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
40 --extraSampling $advancedOpt.extraSampling
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
41 #end if
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
42 #end if
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
43
13
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
44 #if str($image_format) != 'none':
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
45 --biasPlot $outImageName
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
46 --plotFileFormat $image_format
5
c54d31467be4 Uploaded
bgruening
parents: 0
diff changeset
47 #end if
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
48 </command>
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
49 <inputs>
9
c53a73b8eef9 Uploaded
bgruening
parents: 5
diff changeset
50 <param name="bamInput" format="bam" type="data" label="BAM file"
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
51 help="The BAM file must be sorted."/>
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
52
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
53 <expand macro="reference_genome_source" />
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
54 <expand macro="effectiveGenomeSize" />
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
55
13
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
56 <param name="fragmentLength" type="integer" value="300" min="1"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
57 label="Fragment length used for the sequencing"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
58 help ="If paired-end reads are used, the fragment length is computed from the BAM file."/>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
59 <conditional name="advancedOpt">
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
60 <param name="showAdvancedOpt" type="select" label="Show advanced options" >
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
61 <option value="no" selected="true">no</option>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
62 <option value="yes">yes</option>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
63 </param>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
64 <when value="no" />
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
65 <when value="yes">
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
66 <param name="region" type="text" value=""
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
67 label="Region of the genome to limit the operation to"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
68 help="This is useful when testing parameters to reduce the computing time. The format is chr:start:end, for example &quot;chr10&quot; or &quot;chr10:456700:891000&quot;" />
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
69
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
70 <param name="sampleSize" type="integer" value="50000000" min="1"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
71 label="Number of sampling points to be considered" />
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
72
13
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
73 <param name="regionSize" type="integer" value="300" min="1"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
74 label="Region size"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
75 help ="To plot the reads per GC over a region, the size of the region is required (see below for more details of the method). By default, the bin size is set to 300 bp, which is close to the standard fragment size for many sequencing applications. However, if the depth of sequencing is low, a larger bin size will be required, otherwise many bins will not overlap with any read."/>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
76
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
77 <param name="filterOut" type="data" format="bed" optional="true"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
78 label="BED file containing genomic regions to be excluded from the estimation of the correction"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
79 help="Such regions usually contain repetitive regions and peaks that if included will bias the correction. It is recommended to filter out known repetitive regions if multi-reads (reads that map to more than one genomic position) were excluded. In the case of ChIP-seq data, it is recommended to first use a peak caller to identify and filter out the identified peaks." />
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
80 <param name="extraSampling" type="data" format="bed" optional="true"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
81 label="BED file containing genomic regions for which extra sampling is required because they are underrepresented in the genome"
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
82 help="" />
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
83 </when>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
84 </conditional>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
85 <param name="image_format" type="select" label="GC bias plot" help="If given, a diagnostic image summarizing the GC bias found on the sample will be created.">
16
16310f8b24d5 Uploaded
bgruening
parents: 13
diff changeset
86 <option value="none">No image</option>
16310f8b24d5 Uploaded
bgruening
parents: 13
diff changeset
87 <option value="png" selected="true">Image in png format</option>
13
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
88 <option value="pdf">Image in pdf format</option>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
89 <option value="svg">Image in svg format</option>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
90 <option value="eps">Image in eps format</option>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
91 <option value="emf">Image in emf format</option>
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
92 </param>
13
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
93 </inputs>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
94 <outputs>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
95 <data format="tabular" name="outFileName" />
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
96 <data format="png" name="outImageName" label="${tool.name} GC-bias Plot">
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
97 <filter>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
98 ((
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
99 image_format != 'none'
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
100 ))
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
101 </filter>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
102 <change_format>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
103 <when input="image_format" value="pdf" format="pdf" />
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
104 <when input="image_format" value="svg" format="svg" />
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
105 <when input="image_format" value="eps" format="eps" />
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
106 <when input="image_format" value="emf" format="emf" />
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
107 </change_format>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
108 </data>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
109 </outputs>
b4c5dd45778a Uploaded
bgruening
parents: 10
diff changeset
110 <help>
0
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
111
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
112 **What it does**
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
113
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
114 This tool computes the GC bias using the method proposed by Benjamini and Speed (2012), Nucleic Acids Res. (see below for more explanations).
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
115 The output is used to plot the bias and can also be used later on to correct the bias with the tool correctGCbias.
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
116 There are two plots produced by the tool: a boxplot showing the absolute read numbers per genomic-GC bin and an x-y plot
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
117 depicting the ratio of observed/expected reads per genomic GC content bin.
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
118
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
119 -----
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
120
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
121 **Summary of the method used**
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
122
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
123 In order to estimate how many reads with what kind of GC content one should have sequenced, we first need to determine how many regions the specific
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
124 reference genome contains for each amount of GC content, i.e. how many regions in the genome have 50% GC (or 10% GC or 90% GC or...).
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
125 We then sample a large number of equally sized genome bins and count how many times we see a bin with 50% GC (or 10% GC or 90% or...). These EXPECTED values are independent of any
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
126 sequencing as they only depend on the respective reference genome (i.e. they will most likely vary between mouse and fruit fly due to their genomes' different GC contents).
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
127 The OBSERVED values are based on the reads from the sequenced sample. Instead of noting how many genomic regions there are per GC content, we now count the reads per GC content.
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
128 In an ideal sample without GC bias, the ratio of OBSERVED/EXPECTED values should be close to 1 regardless of the GC content. Due to PCR (over)amplifications, the majority of ChIP samples
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
129 usually show a significant bias towards reads with high GC content (>50%).
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
130
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
131 .. image:: $PATH_TO_IMAGES/QC_GCplots_input.png
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
132
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
133
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
134 **Output files**:
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
135
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
136 - Diagnostic plot
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
137
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
138 - box plot of absolute read numbers per genomic GC bin
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
139 - x-y plot of observed/expected read ratios per genomic GC content bin
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
140
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
141 - Data matrix
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
142
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
143 - to be used for GC correction with correctGCbias
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
144
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
145
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
146 -----
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
147
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
148 .. class:: infomark
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
149
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
150 @REFERENCES@
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
151
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
152 </help>
d957e25e18a3 Uploaded
bgruening
parents:
diff changeset
153 </tool>