diff hairpinTool.xml @ 2:076ca575208f

First commit
author shian_su <registertonysu@gmail.com>
date Fri, 21 Feb 2014 12:52:56 +1100
parents
children 3d04308a99f9
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hairpinTool.xml	Fri Feb 21 12:52:56 2014 +1100
@@ -0,0 +1,409 @@
+<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.5">
+  <description>
+    Analyse hairpin differential representation using edgeR
+  </description>
+    
+  <requirements>
+    <requirement type="R-module">edgeR</requirement>
+    <requirement type="R-module">limma</requirement>
+  </requirements>
+  
+  <stdio>
+    <exit_code range="1:" level="fatal" description="Tool exception" />
+  </stdio>
+  
+  <command interpreter="Rscript">
+  hairpinTool.R $inputOpt.type
+                #if $inputOpt.type=="fastq":
+                  #for $i, $fas in enumerate($inputOpt.fastq):
+                    fastq::$fas.file
+                  #end for
+    
+                  $inputOpt.hairpin
+                  $inputOpt.samples
+                    
+                  #if $inputOpt.positions.option=="yes":
+                    $inputOpt.positions.barstart
+                    $inputOpt.positions.barend
+                    $inputOpt.positions.hpstart
+                    $inputOpt.positions.hpend
+                  #else:
+                    1
+                    5
+                    37
+                    57
+                  #end if
+                #else:
+                  $inputOpt.counts
+                  $inputOpt.anno
+                  "$inputOpt.factors"
+                  0 0 0
+                #end if
+          
+                #if $filterCPM.option=="yes":
+                  $filterCPM.cpmReq
+                  $filterCPM.sampleReq
+                #else:
+                  -Inf
+                  -Inf
+                #end if
+          
+                $fdr
+                $lfc
+                $workMode.mode
+                $outFile
+                $outFile.files_path
+          
+                #if $workMode.mode=="classic":
+                  "$workMode.pair1"
+                  "$workMode.pair2"
+                #else:
+                  "$workMode.contrast"
+                  $workMode.roast.option
+                  #if $workMode.roast.option=="yes":
+                    $workMode.roast.hairpinReq
+                    $workMode.roast.select.option
+                    "$workMode.roast.select.selection"
+                  #else:
+                    0
+                    0
+                    0
+                  #end if
+                #end if
+  </command>
+  
+  <inputs>
+    <conditional name="inputOpt">
+      <param name="type" type="select" label="Input File Type">
+        <option value="fastq">FastQ File</option>
+        <option value="counts">Table of Counts</option>
+      </param>
+    
+      <when value="fastq">
+        <param name="hairpin" type="data" format="tabular" 
+               label="Hairpin Annotation"/>
+          
+        
+        <param name="samples" type="data" format="tabular" 
+               label="Sample Annotation"/>
+               
+        <repeat name="fastq" title="FastQ Files">       
+          <param name="file" type="data" format="fastq"/>
+        </repeat>
+          
+        <conditional name="positions">
+          <param name="option" type="select" 
+                 label="Specify Barcode and Hairpin Locations?"
+                 help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57.">
+            <option value="no" selected="True">No</option>
+            <option value="yes">Yes</option>
+          </param>
+          
+          <when value="yes">
+            <param name="barstart" type="integer" value="1"
+                   label="Barcode Starting Position"/>
+            <param name="barend" type="integer" value="5"
+                   label="Barcode Ending Position"/>
+            
+            <param name="hpstart" type="integer" value="37"
+                   label="Hairpin Starting Position"/>
+               
+            <param name="hpend" type="integer" value="57"
+                   label="Hairpin Ending Position"/>
+          </when>
+          
+          <when value="no"/>
+        </conditional>
+      </when>
+      
+      <when value="counts">
+        <param name="counts" type="data" format="tabular" label="Counts Table"/>
+        <param name="anno" type="data" format="tabular" 
+               label="Hairpin Annotation"/>
+        <param name="factors" type="data" format="tabular"
+               label="Sample Annotation"/> 
+      </when>
+    </conditional>
+    
+    <conditional name="filterCPM">
+      <param name="option" type="select" label="Filter Low CPM?"
+       help="Ignore hairpins with very low representation when performing 
+             analysis.">
+        <option value="yes">Yes</option>
+       	<option value="no">No</option>
+      </param>
+      
+        <when value="yes">
+          <param name="cpmReq" type="float" value="0.5" min="0"
+                 label="Minimum CPM" help="Counts per million a hairpin must reach to pass the filter."/>
+                 
+          <param name="sampleReq" type="integer" value="1" min="0"
+                 label="Minimum Samples" 
+                 help="Filter out all the hairpins that do not meet the minimum 
+                       CPM in at least this many samples."/>
+        </when>
+        
+        <when value="no"/>
+        
+    </conditional>
+    
+    <conditional name="workMode">
+      <param name="mode" type="select" label="Analysis Type"
+       help="Classic Exact Tests are useful for simple comparisons across
+             two sampling groups. Generalised linear models allow for more 
+             complex contrasts and gene level analysis to be made.">
+        <option value="classic">Classic Exact Test</option>
+        <option value="glm">Generalised Linear Model</option>
+      </param>
+      
+      <when value="classic">
+        <param name="pair1" type="text" label="Compare" size="40"/>
+        <param name="pair2" type="text" label="To" size="40"
+               help="The analysis will subtract values of this group from those
+                     in the group above to establish the difference."/>
+      </when>
+      
+      <when value="glm">
+        <param name="contrast" type="text" size="60"
+               label="Contrasts of interest"
+               help="Specify equations defining contrasts to be made. Eg. 
+                     KD-Control will result in positive fold change if KD has
+                     greater expression and negative if Control has greater
+                     expression."/>
+               
+        <conditional name="roast">
+          <param name="option" type="select" 
+                 label="Perform Gene Level Analysis?"
+                 help="Analyse LogFC tendencies for hairpins belonging
+                       to the same gene.">
+            <option value="no">No</option>
+            <option value="yes">Yes</option>
+          </param>
+          
+          <when value="yes">
+            <param name="hairpinReq" type="integer" value="2" min="2"
+                   label="Minimum Hairpins"
+                   help="Only genes with at least this many hairpins will
+                         be analysed."/>
+                         
+            <conditional name="select">
+              <param name="option" type="select"
+                     label="Gene Selection Method">
+                <option value="rank">By p-value Rank</option>
+                <option value="geneID">By Gene Identifier</option>
+              </param>
+              <when value="rank">
+                <param name="selection" type="text" size="40" value="1:5"
+                       label="Ranks of Top Genes to Plot"
+                       help="Genes are ranked in ascending p-value for
+                             differential representation, individual ranks can
+                             be entered separated by comma or a range separated
+                             by colon."/>
+              </when>
+              <when value="geneID">
+                <param name="selection" type="text" size="80" value=""
+                       label="Symbols of Genes to Plot"
+                       help="Select genes based on their identifier in the
+                             'Gene' column of the hairpin annotation file.
+                             Please ensure exact match with the values in input
+                             file and separate selections with commas."/>
+              </when>
+            </conditional>
+
+            
+          </when>
+          
+          <when value="no"/>
+        </conditional>
+      </when>
+    </conditional>
+    
+    <param name="fdr" type="float" value="0.05" min="0" max="1"
+           label="FDR Threshold"
+           help="All observations below this threshold will be highlighted
+                 in the smear plot."/>
+    <param name="lfc" type="float" value="0" min="0" 
+           label="Absolute LogFC Threshold"
+           help="In addition to meeting the FDR requirement, the absolute 
+                 value of the log-fold-change of the observation must be above
+                 this threshold to be highlighted."/>
+  </inputs>
+
+  <outputs>
+    <data format="html" name="outFile" label="shRNAseq Analysis"/>
+  </outputs>
+  
+  <help>
+.. class:: infomark
+
+**What it does**
+
+Given tables containing information about the hairpins and their associated
+barcodes, information about the samples and a FastQ file containing the hairpin
+reads, this tool will generate plots and tables for the analysis of differential
+representation.
+
+-----
+
+.. class:: infomark
+
+**INPUTS**
+
+**Input File Type:**
+
+This tool is able to generate counts from a raw FastQ file given the
+information regarding the samples and hairpins. Alternatively, if a table of
+counts has already been generated, it can be used directly instead.
+
+**Counts Table (Counts Input):**
+
+A tab delimited text table of information regarding the counts of hairpins.
+Should have a column 'ID' to denote the hairpins that counts correspond to. Each
+additional column should have titles corresponding to the label for the sample.
+
+Example::
+
+  ID  Sample1 Sample2 Sample3
+  Control1 49802 48014 40148
+  Control2 12441 16352 14232
+  Control3 9842  9148  9111
+  Hairpin1 3300  3418  2914
+  Hairpin2 91418 95812 93174
+  Hairpin3 32985 31975 35104
+  Hairpin4 12082 14081 14981
+  Hairpin5 2491  2769  2691
+  Hairpin6 1294  1486  1642
+  Hairpin7 49501 49076 47611
+  ...
+  
+**Hairpin Annotation:**
+
+A tab delimited text table of information regarding the hairpins. Should have
+columns 'ID', 'Sequences' and 'Gene' to uniquely identify the hairpin, align it
+with the reads to produce counts and identify which gene the hairpin acts on.
+
+NOTE: the column names are case sensitive and should be input exactly as they
+are shown here.
+
+Example::
+
+  ID	Sequences	Gene
+  Control1	TCTCGCTTGGGCGAGAGTAAG	2
+  Control2	CCGCCTGAAGTCTCTGATTAA	2
+  Control3	AGGAATTATAATGCTTATCTA	2
+  Hairpin1	AAGGCAGAGACTGACCACCTA	4
+  Hairpin2	GAGCGACCTGGTGTTACTCTA	4
+  Hairpin3	ATGGTGTAAATAGAGCTGTTA	4
+  Hairpin4	CAGCTCATCTTCTGTGAAGAA	4
+  Hairpin5	CAGCTCTGTGGGTCAGAAGAA	4
+  Hairpin6	CCAGGCACAGATCTCAAGATA	4
+  Hairpin7	ATGACAAGAAAGACATCTCAA	7
+  ...
+  
+**Sample Annotation (FastQ Input):**
+
+A tab delimited text table of information regarding the samples. Should have
+columns 'ID', 'Sequences' and 'group' to uniquely identify each sample, identify
+the sample in the reads by its barcode sequence and correctly group replicates
+for analysis. Additional columns may be inserted for annotation purposes and will
+not interfere with analysis as long as the necessary columns are present.
+
+NOTE: the column names are case sensitive and should be input exactly as they
+are shown here.
+
+Example::
+
+  ID	Sequences	group	Replicate
+  3	GAAAG	Day 2	1
+  6	GAACC	Day 10	1
+  9	GAAGA	Day 5 GFP neg	1
+  16	GAATT	Day 5 GFP pos	1
+  18	GACAC	Day 2	2
+  21	GACCA	Day 10	2
+  28	GACGT	Day 5 GFP neg	2
+  31	GACTG	Day 5 GFP pos	2
+  33	GAGAA	Day 2	3
+  40	GAGCT	Day 10	3
+  ...
+  
+**Specify Barcode and Hairpin Locations (FastQ Input):**
+
+It is assumed that in the sequencing reads the first 5 bases are the
+barcodes and that bases 37-57 are the hairpins. If this is not the case then the
+values of the positions can be changed, however it still requires the barcodes
+and hairpins to be in a consistent location and in a continuous sequence.
+
+**Filter Low CPM?:**
+
+Often in a large screen there may be members with very low counts which are of no
+interest in the experiment; these may be filtered out to speed up computations.
+Filtering will be based on counts per million in a required number of samples.
+
+**Analysis Type:**
+
+ * **Classic Exact Test:** This allows two experimental groups to be compared and
+   p-values for differential representation derived for each hairpin. Simple and
+   fast for straightforward comparisons. In this option you will have the option of
+   "*Compare* x *To* y" which implicitly subtracts the data from y from that of x
+   to produce the comparison.
+
+ * **Generalised Linear Model:** This allows for complex contrasts to be specified
+   and also gene level analysis to be performed. If this option is chosen then
+   contrasts must be explicitly stated in equations and multiple contrasts can be
+   made. In addition there will be the option to analyse hairpins on a per-gene
+   basis to see if hairpins belonging to a particular gene have any overall
+   tendencies for the direction of their log-fold-change.
+
+**FDR Threshold:**
+The smear plot in the output will have hairpins highlighted to signify
+significant differential representation. The significance is determined by
+controlling the false discovery rate; only those with an FDR lower than the
+threshold will be highlighted in the plot.
+
+-----
+
+**Citations:**
+
+.. class:: infomark
+
+limma_
+
+Please cite the paper below for the limma software itself.  Please also try
+to cite the appropriate methodology articles that describe the statistical
+methods implemented in limma, depending on which limma functions you are
+using.  The methodology articles are listed in Section 2.1 of the limma 
+User's Guide.
+
+	* Smyth, GK (2005). Limma: linear models for microarray data. In: 
+	  'Bioinformatics and Computational Biology Solutions using R and 
+	  Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, 
+	  W. Huber (eds), Springer, New York, pages 397-420.
+
+.. class:: infomark
+
+edgeR_
+
+Please cite the first paper for the software itself and the other papers for
+the various original statistical methods implemented in edgeR.  See 
+Section 1.2 in the User's Guide for more detail.
+
+	* Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor 
+	  package for differential expression analysis of digital gene expression 
+	  data. Bioinformatics 26, 139-140
+	  
+	* Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing 
+	  differences in tag abundance. Bioinformatics 23, 2881-2887
+	  
+	* Robinson MD and Smyth GK (2008). Small-sample estimation of negative 
+	  binomial dispersion, with applications to SAGE data.
+	  Biostatistics, 9, 321-332
+	  
+	* McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis 
+	  of multifactor RNA-Seq experiments with respect to biological variation. 
+	  Nucleic Acids Research 40, 4288-4297
+
+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+  </help>
+</tool>
+