Mercurial > repos > bgruening > diffbind
diff diffbind.xml @ 9:6171163112de draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/diffbind commit 9de99de5fb4c62f889814ea43b8800ce8d28eb83
author | iuc |
---|---|
date | Sun, 28 Jan 2018 05:10:25 -0500 |
parents | |
children | d7725c5596ab |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/diffbind.xml Sun Jan 28 05:10:25 2018 -0500 @@ -0,0 +1,402 @@ +<tool id="diffbind" name="DiffBind" version="2.6.5.0"> + <description> differential binding analysis of ChIP-Seq peak data</description> + <requirements> + <requirement type="package" version="2.6.5">bioconductor-diffbind</requirement> + <requirement type="package" version="1.20.0">r-getopt</requirement> + <!--added rmysql requirement to remove: "Warning: namespace ‘RMySQL’ is not available"--> + <requirement type="package" version="0.10.11">r-rmysql</requirement> + </requirements> + <stdio> + <regex match="Execution halted" + source="both" + level="fatal" + description="Execution halted." /> + <regex match="Input-Error 01" + source="both" + level="fatal" + description="Error in your input parameters: Make sure you only apply factors to selected samples." /> + <regex match="Error in" + source="both" + level="fatal" + description="An undefined error occurred, please check your input carefully and contact your administrator." 
/> + </stdio> + <version_command><![CDATA[ +echo $(R --version | grep version | grep -v GNU)", DiffBind version" $(R --vanilla --slave -e "library(DiffBind); cat(sessionInfo()\$otherPkgs\$DiffBind\$Version)" 2> /dev/null | grep -v -i "WARNING: ")," getopt version" $(R --vanilla --slave -e "library(getopt); cat(sessionInfo()\$otherPkgs\$getopt\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rmysql version" $(R --vanilla --slave -e "library(RMySQL); cat(sessionInfo()\$otherPkgs\$RMySQL\$Version)" 2> /dev/null | grep -v -i "WARNING: ") + ]]></version_command> + <command><![CDATA[ + ## DiffBind seems to need file extensions to work properly, so every BAM dataset and its index is linked to a numbered .bam / .bai name; dataset paths are single-quoted for the shell + #set $counter = 1 + #for $sample in $samples: + ln -s '$sample.bamreads' #echo str($counter) + "_bamreads.bam"# && + ln -s '${sample.bamreads.metadata.bam_index}' #echo str($counter) + "_bamreads.bai"# && + #if str( $sample.bamcontrol ) != 'None': + ln -s '$sample.bamcontrol' #echo str($counter) + "_bamcontrol.bam"# && + ln -s '${sample.bamcontrol.metadata.bam_index}' #echo str($counter) + "_bamcontrol.bai"# && + #end if + #set $counter = $counter + 1 + #end for + + Rscript '$__tool_directory__/diffbind.R' + -i '$infile' + -o '$outfile' + -p '$plots' + -f $format + -t $th + + #if $binding_affinity_matrix: + -b + #end if +]]> + </command> + <configfiles> +<configfile name="infile"><![CDATA[ +#set $counter = 1 +#for $sample in $samples: +#if str( $sample.bamcontrol ) != 'None' and $counter == 1: +SampleID,Tissue,Factor,Condition,Replicate,bamReads,bamControl,Peaks +#elif $counter == 1: +SampleID,Tissue,Factor,Condition,Replicate,bamReads,Peaks +#end if +#if str( $sample.bamcontrol ) != 'None': +$sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,#echo str($counter) + '_bamcontrol.bam'#,$sample.peaks +#else: +$sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,$sample.peaks +#end if +#set 
$counter = $counter + 1 +#end for]]></configfile> + </configfiles> + <inputs> + <repeat name="samples" title="Samples" min="2"> + <param name="sample_id" type="text" value="Sample ID" label="Specify a sample id" help="e.g. BT474.1-" /> + <param name="tissue" type="text" value="Tissue" label="Specify the tissue" help="e.g. BT474" /> + <param name="factor" type="text" value="Factor Name" label="Specify a factor name" help="e.g. ER" /> + <param name="condition" type="text" value="Condition" label="Specify the condition" help="e.g. Resistant" /> + <param name="replicate" type="integer" value="1" label="Specify the replicate number" help="e.g. 1" /> + <param name="bamreads" type="data" format="bam" label="Read BAM file" help="Specify the Read BAM file, used for Peak calling."/> + <param name="bamcontrol" type="data" format="bam" optional="True" label="Control BAM file" help="If specifying a control BAM file for this sample, then all samples are required to specify one."/> + <param name="peaks" type="data" format="bed" label="Peak file" help="Result of your Peak calling experiment."/> + </repeat> + <param name="th" type="float" value="1" min="0" max="1" + label="FDR Threshold" + help="Significance threshold; all sites with FDR less than or equal to this value will be included in the report. A value of 1 will include all binding sites in the report. Default: 1"/> + <param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true" + label="Visualising the analysis results" + help="output an additional PDF file" /> + <param name="format" type="select" label="Output Format"> + <option value="bed">BED</option> + <option value="gff">GFF</option> + <option value="wig">WIG</option> + </param> + <param name="binding_affinity_matrix" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output binding affinity matrix?" 
help="Output a table of the binding scores" /> + </inputs> + <outputs> + <data name="outfile" format="bed" label="${tool.name} on ${on_string}: Differentially bound sites"> + <change_format> + <when input="format" value="wig" format="wig" /> + <when input="format" value="gff" format="gff" /> + </change_format> + </data> + <data name="plots" format="pdf" label="${tool.name} on ${on_string}: Plots"> + <filter>pdf == True</filter> + </data> + <data name="binding_matrix" format="tabular" from_work_dir="bmatrix.tab" label="${tool.name} on ${on_string}: Binding matrix"> + <filter>binding_affinity_matrix == True</filter> + </data> + </outputs> + <tests> + <test> + <repeat name="samples"> + <param name="sample_id" value="BT4741" /> + <param name="tissue" value="BT474" /> + <param name="factor" value="ER" /> + <param name="condition" value="Resistant" /> + <param name="replicate" value="1" /> + <param name="bamreads" ftype="bam" value="BT474_ER_1.bam" /> + <param name="peaks" ftype="bed" value="BT474_ER_1.bed.gz" /> + </repeat> + <repeat name="samples"> + <param name="sample_id" value="BT4742" /> + <param name="tissue" value="BT474" /> + <param name="factor" value="ER" /> + <param name="condition" value="Resistant" /> + <param name="replicate" value="2" /> + <param name="bamreads" ftype="bam" value="BT474_ER_2.bam" /> + <param name="peaks" ftype="bed" value="BT474_ER_2.bed.gz" /> + </repeat> + <repeat name="samples"> + <param name="sample_id" value="MCF71" /> + <param name="tissue" value="MCF7" /> + <param name="factor" value="ER" /> + <param name="condition" value="Responsive" /> + <param name="replicate" value="1" /> + <param name="bamreads" ftype="bam" value="MCF7_ER_1.bam" /> + <param name="peaks" ftype="bed" value="MCF7_ER_1.bed.gz" /> + </repeat> + <repeat name="samples"> + <param name="sample_id" value="MCF72" /> + <param name="tissue" value="MCF7" /> + <param name="factor" value="ER" /> + <param name="condition" value="Responsive" /> + <param name="replicate" value="2" /> + 
<param name="bamreads" ftype="bam" value="MCF7_ER_2.bam" /> + <param name="peaks" ftype="bed" value="MCF7_ER_2.bed.gz" /> + </repeat> + <param name="pdf" value="True" /> + <param name="binding_affinity_matrix" value="True" /> + <output name="outfile" value="out_diffbind.bed" /> + <output name="binding_matrix" value="out_binding.matrix" /> + </test> + </tests> + <help><![CDATA[ + +.. class:: infomark + +**What it does** + +DiffBind_ is a `Bioconductor package`_ that provides functions for processing ChIP-Seq data enriched for genomic loci where specific +protein/DNA binding occurs, including peak sets identified by ChIP-Seq peak callers and +aligned sequence read datasets. It is designed to work with multiple peak sets simultaneously, +representing different ChIP experiments (antibodies, transcription factor and/or histone +marks, experimental conditions, replicates) as well as managing the results of multiple peak +callers. + +The primary emphasis of DiffBind is on identifying sites that are differentially bound +between two sample groups. It includes functions to support the processing of peak sets, +including overlapping and merging peak sets, counting sequencing reads overlapping intervals +in peak sets, and identifying statistically significantly differentially bound sites based on +evidence of binding affinity (measured by differences in read densities). To this end it uses +statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages +edgeR and DESeq2). Additionally, the package builds on R graphics routines to provide a +set of standardized plots to aid in binding analysis. 
+ +The `DiffBind User Guide`_ includes a brief overview of the processing flow, followed by four sections of +examples: the first focusing on the core task of obtaining differentially bound sites based on +affinity data, the second working through the main plotting routines, the third discussing the +use of a blocking factor, and the fourth revisiting occupancy data (peak calls) in more detail, +as well as comparing the results of an occupancy-based analysis with an affinity-based one. +Finally, certain technical aspects of how these analyses are accomplished are detailed. + +Note: DiffBind requires a minimum of four samples (two groups with two replicates each). + +.. _DiffBind: https://bioconductor.org/packages/release/bioc/html/DiffBind.html +.. _`Bioconductor package`: https://bioconductor.org/packages/release/bioc/html/DiffBind.html +.. _`DiffBind User Guide`: https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf + +**Inputs** + +DiffBind works primarily with peaksets, which are sets of genomic intervals representing +candidate protein binding sites. Each interval consists of a chromosome, a start and end +position, and usually a score of some type indicating confidence in, or strength of, the peak. +Associated with each peakset are metadata relating to the experiment from which the peakset +was derived. Additionally, files containing mapped sequencing reads (generally .bam files) can +be associated with each peakset (one for the ChIP data, and optionally another representing +a control sample). + +**Sample Information** + +You have to specify your sample information in the tool form above. 
+ +Example: + + ============= ========== ========== ============= ============= + **SampleID** **Tissue** **Factor** **Condition** **Replicate** + ------------- ---------- ---------- ------------- ------------- + BT4741 BT474 ER Resistant 1 + BT4742 BT474 ER Resistant 2 + MCF71 MCF7 ER Responsive 1 + MCF72 MCF7 ER Responsive 2 + MCF73 MCF7 ER Responsive 3 + T47D1 T47D ER Responsive 1 + T47D2 T47D ER Responsive 2 + MCF7r1 MCF7 ER Resistant 1 + MCF7r2 MCF7 ER Resistant 2 + ZR751 ZR75 ER Responsive 1 + ZR752 ZR75 ER Responsive 2 + ============= ========== ========== ============= ============= + +Or provide a sample sheet tabular file such as below. + +Example: + + ======== ====== ====== ========== ========== ========= ==================== ========= ===================== ================= ========== + SampleID Tissue Factor Condition Treatment Replicate bamReads ControlID bamControl Peaks PeakCaller + ======== ====== ====== ========== ========== ========= ==================== ========= ===================== ================= ========== + BT4741 BT474 ER Resistant Full-Media 1 Chr18_BT474_ER_1.bam BT474c Chr18_BT474_input.bam BT474_ER_1.bed.gz bed + BT4742 BT474 ER Resistant Full-Media 2 Chr18_BT474_ER_2.bam BT474c Chr18_BT474_input.bam BT474_ER_2.bed.gz bed + MCF71 MCF7 ER Responsive Full-Media 1 Chr18_MCF7_ER_1.bam MCF7c Chr18_MCF7_input.bam MCF7_ER_1.bed.gz bed + MCF72 MCF7 ER Responsive Full-Media 2 Chr18_MCF7_ER_2.bam MCF7c Chr18_MCF7_input.bam MCF7_ER_2.bed.gz bed + MCF73 MCF7 ER Responsive Full-Media 3 Chr18_MCF7_ER_3.bam MCF7c Chr18_MCF7_input.bam MCF7_ER_3.bed.gz bed + T47D1 T47D ER Responsive Full-Media 1 Chr18_T47D_ER_1.bam T47Dc Chr18_T47D_input.bam T47D_ER_1.bed.gz bed + T47D2 T47D ER Responsive Full-Media 2 Chr18_T47D_ER_2.bam T47Dc Chr18_T47D_input.bam T47D_ER_2.bed.gz bed + MCF7r1 MCF7 ER Resistant Full-Media 1 Chr18_TAMR_ER_1.bam TAMRc Chr18_TAMR_input.bam TAMR_ER_1.bed.gz bed + MCF7r2 MCF7 ER Resistant Full-Media 2 Chr18_TAMR_ER_2.bam TAMRc 
Chr18_TAMR_input.bam TAMR_ER_2.bed.gz bed + ZR751 ZR75 ER Responsive Full-Media 1 Chr18_ZR75_ER_1.bam ZR75c Chr18_ZR75_input.bam ZR75_ER_1.bed.gz bed + ZR752 ZR75 ER Responsive Full-Media 2 Chr18_ZR75_ER_2.bam ZR75c Chr18_ZR75_input.bam ZR75_ER_2.bed.gz bed + ======== ====== ====== ========== ========== ========= ==================== ========= ===================== ================= ========== + + +**Peak files** + +Result of your Peak calling experiment in bed format, one file for each sample is required. + +Example: + + ======= ======= ======= =============== ======= + 1 2 3 4 **5** + ======= ======= ======= =============== ======= + chr18 215562 216063 MACS_peak_16037 56.11 + chr18 311530 312105 MACS_peak_16038 222.49 + chr18 356656 357315 MACS_peak_16039 92.06 + chr18 371110 372092 MACS_peak_16040 123.86 + chr18 395116 396464 MACS_peak_16041 1545.39 + chr18 399014 400382 MACS_peak_16042 1835.19 + chr18 499134 500200 MACS_peak_16043 748.32 + chr18 503518 504552 MACS_peak_16044 818.30 + chr18 531672 532274 MACS_peak_16045 159.30 + chr18 568326 569282 MACS_peak_16046 601.11 + ======= ======= ======= =============== ======= + +* A BAM file which contains the mapped sequencing reads can be associated with each peakset. +* Control BAM files represent a control dataset and are optional, but have to be specified for all samples when used. + + +**Outputs** + +As output format you can choose BED, GFF, or WIG. 
+ +Example: + +======== ====== =======+ +seqnames ranges strand Conc Conc_Resistant + +2452 chr18 [64490686, 64491186] * | 6.36 1.39 +1291 chr18 [34597713, 34598213] * | 5.33 0.22 +976 chr18 [26860997, 26861497] * | 7.3 3.13 +2338 chr18 [60892900, 60893400] * | 7.13 1.84 +2077 chr18 [55569087, 55569587] * | 5.52 1.89 + +Conc_Responsive Fold p-value FDR +<numeric> <numeric> <numeric> <numeric> +2452 7 -5.61 3.57e-10 1.02e-06 +1291 5.97 -5.75 1.1e-09 1.57e-06 +976 7.92 -4.79 1.1e-08 1.05e-05 +2338 7.77 -5.93 1.68e-08 1.17e-05 +2077 6.13 -4.23 2.36e-08 1.17e-05 + +The value columns show the +Conc mean read concentration over all the samples (the default calculation uses log2 normalized ChIP read counts with control read counts subtracted) +Conc_Resistant mean concentration over the first (Resistant) group +Conc_Responsive mean concentration over second (Responsive) group +Fold column shows the difference in mean concentrations between the two groups (Conc_Resistant - Conc_Responsive), with a positive value indicating increased binding affinity in the Resistant group and a negative value indicating increased binding affinity in the Responsive group. +p-value confidence measure for identifying these sites as differentially bound +FDR a multiple testing corrected FDR p-value + + +**Binding Affinity Matrix** + +The final result of counting is a binding affinity matrix containing a (normalized) read count for each sample at every potential binding site. With this matrix, the samples can be re-clustered using affinity, rather than occupancy, data. The binding affinity matrix can be used for QC plotting as well as for subsequent +differential analysis. 
+ +Example: + + ====== ====== ====== ========== ========== ========= ====== ========= ==== + ID Tissue Factor Condition Treatment Replicate Caller Intervals FRiP + ====== ====== ====== ========== ========== ========= ====== ========= ==== + BT4741 BT474 ER Resistant Full-Media 1 counts 2845 0.16 + BT4742 BT474 ER Resistant Full-Media 2 counts 2845 0.15 + MCF71 MCF7 ER Responsive Full-Media 1 counts 2845 0.27 + MCF72 MCF7 ER Responsive Full-Media 2 counts 2845 0.17 + MCF73 MCF7 ER Responsive Full-Media 3 counts 2845 0.23 + T47D1 T47D ER Responsive Full-Media 1 counts 2845 0.10 + T47D2 T47D ER Responsive Full-Media 2 counts 2845 0.06 + MCF7r1 MCF7 ER Resistant Full-Media 1 counts 2845 0.20 + MCF7r2 MCF7 ER Resistant Full-Media 2 counts 2845 0.13 + ZR751 ZR75 ER Responsive Full-Media 1 counts 2845 0.32 + ZR752 ZR75 ER Responsive Full-Media 2 counts 2845 0.22 + ====== ====== ====== ========== ========== ========= ====== ========= ==== + + + +**More Information** + +Generally, processing data with DiffBind involves five phases: + + #. Reading in peaksets + #. Occupancy analysis + #. Counting reads + #. Differential binding affinity analysis + #. Plotting and reporting + + + * **Reading in peaksets**: + +The first step is to read in a set of peaksets and associated +metadata. Peaksets are derived either from ChIP-Seq peak callers, such as MACS +([1]), or using some other criterion (e.g. genomic windows, or all the promoter regions +in a genome). The easiest way to read in peaksets is using a comma-separated value +(csv) sample sheet with one line for each peakset. (Spreadsheets in Excel® format, with +a .xls or .xlsx suffix, are also accepted.) A single experiment can have more than +one associated peakset; e.g. if multiple peak callers are used for comparison purposes +each sample would have more than one line in the sample sheet. 
Once the peaksets +are read in, a merging function finds all overlapping peaks and derives a single set of +unique genomic intervals covering all the supplied peaks (a consensus peakset for the +experiment). + + * **Occupancy analysis**: + +Peaksets, especially those generated by peak callers, provide +an insight into the potential occupancy of the protein being ChIPed for at specific +genomic loci. After the peaksets have been loaded, it can be useful to perform some +exploratory plotting to determine how these occupancy maps agree with each other, +e.g. between experimental replicates (re-doing the ChIP under the same conditions), +between different peak callers on the same experiment, and within groups of samples +representing a common experimental condition. DiffBind provides functions to enable +overlaps to be examined, as well as functions to determine how well similar samples +cluster together. Beyond quality control, the product of an occupancy analysis may be +a consensus peakset, representing an overall set of candidate binding sites to be used +in further analysis. + + * **Counting reads**: + +Once a consensus peakset has been derived, DiffBind can use the +supplied sequence read files to count how many reads overlap each interval for each +unique sample. The peaks in the consensus peakset may be re-centered and trimmed +based on calculating their summits (point of greatest read overlap) in order to provide +more standardized peak intervals. The final result of counting is a binding affinity matrix +containing a (normalized) read count for each sample at every potential binding site. +With this matrix, the samples can be re-clustered using affinity, rather than occupancy, +data. The binding affinity matrix is used for QC plotting as well as for subsequent +differential analysis. 
+ + * **Differential binding affinity analysis**: + +The core functionality of DiffBind is the +differential binding affinity analysis, which enables binding sites to be identified that +are statistically significantly differentially bound between sample groups. To accomplish +this, first a contrast (or contrasts) is established, dividing the samples into groups to +be compared. Next the core analysis routines are executed, by default using DESeq2 . +This will assign a p-value and FDR to each candidate binding site indicating confidence +that they are differentially bound. + + * **Plotting and reporting**: + +Once one or more contrasts have been run, DiffBind provides +a number of functions for reporting and plotting the results. MA plots give an +overview of the results of the analysis, while correlation heatmaps and PCA plots show +how the groups cluster based on differentially bound sites. Boxplots show the distribution +of reads within differentially bound sites corresponding to whether they gain or +lose affinity between the two sample groups. A reporting mechanism enables differentially +bound sites to be extracted for further processing, such as annotation, motif, and +pathway analyses. + +**References** + +DiffBind Authors: Rory Stark, Gordon Brown (2011) +Wrapper authors: Bjoern Gruening, Pavankumar Videm + +]]> + </help> + <citations> + <citation type="doi">doi:10.1038/nature10730</citation> + </citations> +</tool>