annotate computeGCBias.xml @ 27:bf1b1dcdd67b draft

Uploaded
author bgruening
date Mon, 17 Mar 2014 16:23:58 -0400
parents 2ad3b027dfcd
children f7712a057440
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
27
bf1b1dcdd67b Uploaded
bgruening
parents: 26
diff changeset
1 <tool id="deeptools_computeGCBias" name="computeGCBias" version="1.0.5">
26
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
2 <description>to see whether your samples should be normalized for GC bias</description>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
3 <expand macro="requirements" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
4 <expand macro="stdio" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
5 <macros>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
6 <token name="@BINARY@">computeGCBias</token>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
7 <import>deepTools_macros.xml</import>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
8 </macros>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
9 <command>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
10 ln -s $bamInput local_bamInput.bam;
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
11 ln -s $bamInput.metadata.bam_index local_bamInput.bam.bai;
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
12
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
13 computeGCBias
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
14 @THREADS@
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
15
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
16 --bamfile 'local_bamInput.bam'
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
17 --GCbiasFrequenciesFile $outFileName
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
18 --fragmentLength $fragmentLength
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
19
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
20 @reference_genome_source@
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
21
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
22 #if $effectiveGenomeSize.effectiveGenomeSize_opt == "specific":
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
23 --effectiveGenomeSize $effectiveGenomeSize.effectiveGenomeSize
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
24 #else:
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
25 --effectiveGenomeSize $effectiveGenomeSize.effectiveGenomeSize_opt
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
26 #end if
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
27
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
28 #if str($region).strip() != '':
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
29 --region '$region'
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
30 #end if
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
31
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
32 #if $advancedOpt.showAdvancedOpt == "yes":
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
33
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
34 --sampleSize '$advancedOpt.sampleSize'
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
35 --regionSize '$advancedOpt.regionSize'
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
36
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
37 #if $advancedOpt.filterOut:
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
38 --filterOut $advancedOpt.filterOut
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
39 #end if
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
40
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
41 #if $advancedOpt.extraSampling:
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
42 --extraSampling $advancedOpt.extraSampling
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
43 #end if
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
44 #end if
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
45
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
46 #if str($image_format) != 'none':
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
47 --biasPlot $outImageName
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
48 --plotFileFormat $image_format
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
49 #end if
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
50 </command>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
51 <inputs>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
52 <param name="bamInput" format="bam" type="data" label="BAM file"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
53 help="The BAM file must be sorted."/>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
54
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
55 <expand macro="reference_genome_source" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
56 <expand macro="effectiveGenomeSize" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
57
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
58 <param name="fragmentLength" type="integer" value="300" min="1"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
59 label="Fragment length used for the sequencing"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
60 help ="If paired-end reads are used, the fragment length is computed from the BAM file."/>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
61
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
62 <expand macro="region_limit_operation" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
63
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
64 <conditional name="advancedOpt">
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
65 <param name="showAdvancedOpt" type="select" label="Show advanced options" >
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
66 <option value="no" selected="true">no</option>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
67 <option value="yes">yes</option>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
68 </param>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
69 <when value="no" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
70 <when value="yes">
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
71 <param name="sampleSize" type="integer" value="50000000" min="1"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
72 label="Number of sampling points to be considered" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
73
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
74 <param name="regionSize" type="integer" value="300" min="1"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
75 label="Region size"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
76 help ="To plot the reads per GC over a region, the size of the region is required (see below for more details of the method). By default, the bin size is set to 300 bp, which is close to the standard fragment size for many sequencing applications. However, if the depth of sequencing is low, a larger bin size will be required, otherwise many bins will not overlap with any read."/>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
77
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
78 <param name="filterOut" type="data" format="bed" optional="true"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
79 label="BED file containing genomic regions to be excluded from the estimation of the correction"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
80 help="Such regions usually contain repetitive regions and peaks that if included will bias the correction. It is recommended to filter out known repetitive regions if multi-reads (reads that map to more than one genomic position) were excluded. In the case of ChIP-seq data, it is recommended to first use a peak caller to identify and filter out the identified peaks." />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
81 <param name="extraSampling" type="data" format="bed" optional="true"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
82 label="BED file containing genomic regions for which extra sampling is required because they are underrepresented in the genome"
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
83 help="" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
84 </when>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
85 </conditional>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
86 <param name="image_format" type="select" label="GC bias plot" help="If given, a diagnostic image summarizing the GC bias found on the sample will be created.">
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
87 <option value="none">No image</option>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
88 <option value="png" selected="true">Image in png format</option>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
89 <option value="pdf">Image in pdf format</option>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
90 <option value="svg">Image in svg format</option>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
91 <option value="eps">Image in eps format</option>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
92 <option value="emf">Image in emf format</option>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
93 </param>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
94 </inputs>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
95 <outputs>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
96 <data format="tabular" name="outFileName" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
97 <data format="png" name="outImageName" label="${tool.name} GC-bias Plot">
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
98 <filter>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
99 ((
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
100 image_format != 'none'
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
101 ))
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
102 </filter>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
103 <change_format>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
104 <when input="image_format" value="pdf" format="pdf" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
105 <when input="image_format" value="svg" format="svg" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
106 <when input="image_format" value="eps" format="eps" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
107 <when input="image_format" value="emf" format="emf" />
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
108 </change_format>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
109 </data>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
110 </outputs>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
111 <help>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
112
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
113 **What it does**
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
114
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
115 This tool computes the GC bias using the method proposed by Benjamini and Speed (2012) Nucleic Acids Res. (see below for more explanations).
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
116 The output is used to plot the bias and can also be used later on to correct the bias with the tool correctGCbias.
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
117 There are two plots produced by the tool: a boxplot showing the absolute read numbers per genomic-GC bin and an x-y plot
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
118 depicting the ratio of observed/expected reads per genomic GC content bin.
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
119
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
120 -----
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
121
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
122 **Summary of the method used**
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
123
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
124 In order to estimate how many reads with what kind of GC content one should have sequenced, we first need to determine how many regions the specific
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
125 reference genome contains for each amount of GC content, i.e. how many regions in the genome have 50% GC (or 10% GC or 90% GC or...).
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
126 We then sample a large number of equally sized genome bins and count how many times we see a bin with 50% GC (or 10% GC or 90% or...). These EXPECTED values are independent of any
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
127 sequencing as they depend only on the respective reference genome (i.e. they will most likely vary between mouse and fruit fly due to their genomes' different GC contents).
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
128 The OBSERVED values are based on the reads from the sequenced sample. Instead of noting how many genomic regions there are per GC content, we now count the reads per GC content.
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
129 In an ideal sample without GC bias, the ratio of OBSERVED/EXPECTED values should be close to 1 regardless of the GC content. Due to PCR (over)amplifications, the majority of ChIP samples
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
130 usually shows a significant bias towards reads with high GC content (>50%).
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
131
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
132 .. image:: $PATH_TO_IMAGES/QC_GCplots_input.png
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
133
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
134
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
135 You can find more details on the computeGCBias wiki page: https://github.com/fidelram/deepTools/wiki/QC#wiki-computeGCbias
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
136
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
137
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
138 **Output files**:
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
139
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
140 - Diagnostic plot
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
141
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
142 - box plot of absolute read numbers per genomic GC bin
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
143 - x-y plot of observed/expected read ratios per genomic GC content bin
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
144
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
145 - Data matrix
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
146
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
147 - to be used for GC correction with correctGCbias
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
148
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
149
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
150 -----
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
151
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
152 @REFERENCES@
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
153
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
154 </help>
2ad3b027dfcd Uploaded
bgruening
parents:
diff changeset
155 </tool>