Mercurial > repos > shians > shrnaseq
diff hairpinTool.xml @ 2:076ca575208f
First commit
author | shian_su <registertonysu@gmail.com> |
---|---|
date | Fri, 21 Feb 2014 12:52:56 +1100 |
parents | |
children | 3d04308a99f9 |
line wrap: on
line diff
<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.5">
  <!--
    Galaxy wrapper around hairpinTool.R (run through Rscript). It declares the
    form shown to the user and turns the chosen values into the positional
    argument list described above the command element.
  -->
  <description>
    Analyse hairpin differential representation using edgeR
  </description>

  <requirements>
    <requirement type="R-module">edgeR</requirement>
    <requirement type="R-module">limma</requirement>
  </requirements>

  <!-- Any non-zero exit status of the script is reported as a fatal error. -->
  <stdio>
    <exit_code range="1:" level="fatal" description="Tool exception" />
  </stdio>

  <!--
    Positional arguments handed to hairpinTool.R, in order:
      1. Input type: "fastq" or "counts".
      2. fastq mode: one "fastq::PATH" argument per FastQ file, then the
         hairpin annotation, the sample annotation and four read positions
         (barcode start, barcode end, hairpin start, hairpin end); the
         positions are 1 5 37 57 unless the user overrides them.
         counts mode: counts table, hairpin annotation, sample annotation,
         followed by the literal values 0 0 0.
      3. CPM filter: minimum CPM and minimum samples, or -Inf and -Inf when
         filtering is switched off.
      4. FDR threshold, logFC threshold, analysis mode, the HTML output path
         and the extra-files directory of that output.
      5. classic mode: the two groups to compare.
         glm mode: the contrasts, the gene level analysis flag, then minimum
         hairpins, selection method and selection (0 0 0 when the gene level
         analysis is switched off).
    File paths are quoted so that paths containing spaces stay one argument.
  -->
  <command interpreter="Rscript">
    hairpinTool.R $inputOpt.type
    #if $inputOpt.type=="fastq":
      #for $fas in $inputOpt.fastq:
        "fastq::$fas.file"
      #end for

      "$inputOpt.hairpin"
      "$inputOpt.samples"

      #if $inputOpt.positions.option=="yes":
        $inputOpt.positions.barstart
        $inputOpt.positions.barend
        $inputOpt.positions.hpstart
        $inputOpt.positions.hpend
      #else:
        1
        5
        37
        57
      #end if
    #else:
      "$inputOpt.counts"
      "$inputOpt.anno"
      "$inputOpt.factors"
      0 0 0
    #end if

    #if $filterCPM.option=="yes":
      $filterCPM.cpmReq
      $filterCPM.sampleReq
    #else:
      -Inf
      -Inf
    #end if

    $fdr
    $lfc
    $workMode.mode
    "$outFile"
    "$outFile.files_path"

    #if $workMode.mode=="classic":
      "$workMode.pair1"
      "$workMode.pair2"
    #else:
      "$workMode.contrast"
      $workMode.roast.option
      #if $workMode.roast.option=="yes":
        $workMode.roast.hairpinReq
        $workMode.roast.select.option
        "$workMode.roast.select.selection"
      #else:
        0
        0
        0
      #end if
    #end if
  </command>

  <inputs>
    <!-- Input source: raw FastQ reads or a ready-made table of counts. -->
    <conditional name="inputOpt">
      <param name="type" type="select" label="Input File Type">
        <option value="fastq">FastQ File</option>
        <option value="counts">Table of Counts</option>
      </param>

      <when value="fastq">
        <param name="hairpin" type="data" format="tabular"
               label="Hairpin Annotation"/>

        <param name="samples" type="data" format="tabular"
               label="Sample Annotation"/>

        <repeat name="fastq" title="FastQ Files">
          <param name="file" type="data" format="fastq"
                 label="FastQ File"/>
        </repeat>

        <conditional name="positions">
          <param name="option" type="select"
                 label="Specify Barcode and Hairpin Locations?"
                 help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57.">
            <option value="no" selected="True">No</option>
            <option value="yes">Yes</option>
          </param>

          <when value="yes">
            <param name="barstart" type="integer" value="1"
                   label="Barcode Starting Position"/>
            <param name="barend" type="integer" value="5"
                   label="Barcode Ending Position"/>

            <param name="hpstart" type="integer" value="37"
                   label="Hairpin Starting Position"/>

            <param name="hpend" type="integer" value="57"
                   label="Hairpin Ending Position"/>
          </when>

          <when value="no"/>
        </conditional>
      </when>

      <when value="counts">
        <param name="counts" type="data" format="tabular" label="Counts Table"/>
        <param name="anno" type="data" format="tabular"
               label="Hairpin Annotation"/>
        <param name="factors" type="data" format="tabular"
               label="Sample Annotation"/>
      </when>
    </conditional>

    <!-- Optional removal of hairpins with low counts per million. -->
    <conditional name="filterCPM">
      <param name="option" type="select" label="Filter Low CPM?"
             help="Ignore hairpins with very low representation when performing
                   analysis.">
        <option value="yes">Yes</option>
        <option value="no">No</option>
      </param>

      <when value="yes">
        <!-- No upper bound: CPM is not a proportion, so cut-offs above 1
             are legitimate. -->
        <param name="cpmReq" type="float" value="0.5" min="0"
               label="Minimum CPM"/>

        <param name="sampleReq" type="integer" value="1" min="0"
               label="Minimum Samples"
               help="Filter out all the hairpins that do not meet the minimum
                     CPM in at least this many samples."/>
      </when>

      <when value="no"/>
    </conditional>

    <!-- Statistical method: two-group exact test or GLM with contrasts. -->
    <conditional name="workMode">
      <param name="mode" type="select" label="Analysis Type"
             help="Classic Exact Tests are useful for simple comparisons across
                   two sampling groups. Generalised linear models allow for more
                   complex contrasts and gene level analysis to be made.">
        <option value="classic">Classic Exact Test</option>
        <option value="glm">Generalised Linear Model</option>
      </param>

      <when value="classic">
        <param name="pair1" type="text" label="Compare" size="40"/>
        <param name="pair2" type="text" label="To" size="40"
               help="The analysis will subtract values of this group from those
                     in the group above to establish the difference."/>
      </when>

      <when value="glm">
        <param name="contrast" type="text" size="60"
               label="Contrasts of interest"
               help="Specify equations defining contrasts to be made. Eg.
                     KD-Control will result in positive fold change if KD has
                     greater expression and negative if Control has greater
                     expression."/>

        <conditional name="roast">
          <param name="option" type="select"
                 label="Perform Gene Level Analysis?"
                 help="Analyse LogFC tendencies for hairpins belonging
                       to the same gene.">
            <option value="no">No</option>
            <option value="yes">Yes</option>
          </param>

          <when value="yes">
            <param name="hairpinReq" type="integer" value="2" min="2"
                   label="Minimum Hairpins"
                   help="Only genes with at least this many hairpins will
                         be analysed."/>

            <conditional name="select">
              <param name="option" type="select"
                     label="Gene Selection Method">
                <option value="rank">By p-value Rank</option>
                <option value="geneID">By Gene Identifier</option>
              </param>
              <when value="rank">
                <param name="selection" type="text" size="40" value="1:5"
                       label="Ranks of Top Genes to Plot"
                       help="Genes are ranked in ascending p-value for
                             differential representation, individual ranks can
                             be entered separated by comma or a range separated
                             by colon."/>
              </when>
              <when value="geneID">
                <param name="selection" type="text" size="80" value=""
                       label="Symbols of Genes to Plot"
                       help="Select genes based on their identifier in the
                             'Gene' column of the hairpin annotation file.
                             Please ensure exact match with the values in input
                             file and separate selections with commas."/>
              </when>
            </conditional>
          </when>

          <when value="no"/>
        </conditional>
      </when>
    </conditional>

    <param name="fdr" type="float" value="0.05" min="0" max="1"
           label="FDR Threshold"
           help="All observations below this threshold will be highlighted
                 in the smear plot."/>
    <param name="lfc" type="float" value="0" min="0"
           label="Absolute LogFC Threshold"
           help="In addition to meeting the FDR requirement, the absolute
                 value of the log-fold-change of the observation must be above
                 this threshold to be highlighted."/>
  </inputs>

  <outputs>
    <data format="html" name="outFile" label="shRNAseq Analysis"/>
  </outputs>

  <!-- The help text is reStructuredText and must stay at column 0. -->
  <help>
.. class:: infomark

**What it does**

Given tables containing information about the hairpins and their associated
barcodes, information about the samples and a fastq file containing the hairpin
reads, this tool will generate plots and tables for the analysis of differential
representation.

-----

.. class:: infomark

**INPUTS**

**Input File Type:**

This tool is able to generate counts from a raw FastQ file given the
information regarding the samples and hairpins. Alternatively, if a table of
counts has already been generated it can be used directly.

**Counts Table (Counts Input):**

A tab delimited text table of information regarding the counts of hairpins.
Should have a column 'ID' to denote the hairpins that counts correspond to. Each
additional column should have titles corresponding to the label for the sample.

Example::

  ID          Sample1   Sample2   Sample3
  Control1    49802     48014     40148
  Control2    12441     16352     14232
  Control3    9842      9148      9111
  Hairpin1    3300      3418      2914
  Hairpin2    91418     95812     93174
  Hairpin3    32985     31975     35104
  Hairpin4    12082     14081     14981
  Hairpin5    2491      2769      2691
  Hairpin6    1294      1486      1642
  Hairpin7    49501     49076     47611
  ...

**Hairpin Annotation:**

A tab delimited text table of information regarding the hairpins. Should have
columns 'ID', 'Sequences' and 'Gene' to uniquely identify the hairpin, align it
with the reads to produce counts and identify which gene the hairpin acts on.

NOTE: the column names are case sensitive and should be input exactly as they
are shown here.

Example::

  ID          Sequences                Gene
  Control1    TCTCGCTTGGGCGAGAGTAAG    2
  Control2    CCGCCTGAAGTCTCTGATTAA    2
  Control3    AGGAATTATAATGCTTATCTA    2
  Hairpin1    AAGGCAGAGACTGACCACCTA    4
  Hairpin2    GAGCGACCTGGTGTTACTCTA    4
  Hairpin3    ATGGTGTAAATAGAGCTGTTA    4
  Hairpin4    CAGCTCATCTTCTGTGAAGAA    4
  Hairpin5    CAGCTCTGTGGGTCAGAAGAA    4
  Hairpin6    CCAGGCACAGATCTCAAGATA    4
  Hairpin7    ATGACAAGAAAGACATCTCAA    7
  ...

**Sample Annotation (FastQ Input):**

A tab delimited text table of information regarding the samples. Should have
columns 'ID', 'Sequences' and 'group' to uniquely identify each sample, identify
the sample in the reads by its barcode sequence and correctly group replicates
for analysis. Additional columns may be inserted for annotation purposes and
will not interfere with analysis as long as the necessary columns are present.

NOTE: the column names are case sensitive and should be input exactly as they
are shown here.

Example::

  ID    Sequences    group            Replicate
  3     GAAAG        Day 2            1
  6     GAACC        Day 10           1
  9     GAAGA        Day 5 GFP neg    1
  16    GAATT        Day 5 GFP pos    1
  18    GACAC        Day 2            2
  21    GACCA        Day 10           2
  28    GACGT        Day 5 GFP neg    2
  31    GACTG        Day 5 GFP pos    2
  33    GAGAA        Day 2            3
  40    GAGCT        Day 10           3
  ...

**Specify Barcode and Hairpin Locations (FastQ Input):**

It is assumed that in the sequencing reads the first 5 bases are the
barcodes and that bases 37-57 are the hairpins. If this is not the case then the
values of the positions can be changed, however it still requires the barcodes
and hairpins to be in a consistent location and in a continuous sequence.

**Filter Low CPM?:**

Often in a large screen there may be members with very low counts which are of
no interest in the experiment; these may be filtered out to speed up
computations. Filtering will be based on counts per million in a required number
of samples.

**Analysis Type:**

 * **Classic Exact Test:** This allows two experimental groups to be compared and
   p-values for differential representation derived for each hairpin. Simple and
   fast for straightforward comparisons. In this option you will have the option of
   "*Compare* x *To* y" which implicitly subtracts the data from y from that of x
   to produce the comparison.

 * **Generalised Linear Model:** This allows for complex contrasts to be specified
   and also gene level analysis to be performed. If this option is chosen then
   contrasts must be explicitly stated in equations and multiple contrasts can be
   made. In addition there will be the option to analyse hairpins on a per-gene
   basis to see if hairpins belonging to a particular gene have any overall
   tendencies for the direction of their log-fold-change.

**FDR Threshold:**

The smear plot in the output will have hairpins highlighted to signify
significant differential representation. The significance is determined by
controlling the false discovery rate, only those with a FDR lower than the
threshold will be highlighted in the plot.

-----

**Citations:**

.. class:: infomark

limma_

Please cite the paper below for the limma software itself. Please also try
to cite the appropriate methodology articles that describe the statistical
methods implemented in limma, depending on which limma functions you are
using. The methodology articles are listed in Section 2.1 of the limma
User's Guide.

 * Smyth, GK (2005). Limma: linear models for microarray data. In:
   'Bioinformatics and Computational Biology Solutions using R and
   Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry,
   W. Huber (eds), Springer, New York, pages 397-420.

.. class:: infomark

edgeR_

Please cite the first paper for the software itself and the other papers for
the various original statistical methods implemented in edgeR. See
Section 1.2 in the User's Guide for more detail.

 * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
   package for differential expression analysis of digital gene expression
   data. Bioinformatics 26, 139-140

 * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing
   differences in tag abundance. Bioinformatics 23, 2881-2887

 * Robinson MD and Smyth GK (2008). Small-sample estimation of negative
   binomial dispersion, with applications to SAGE data.
   Biostatistics, 9, 321-332

 * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis
   of multifactor RNA-Seq experiments with respect to biological variation.
   Nucleic Acids Research 40, 4288-4297

.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
  </help>
</tool>