Mercurial > repos > iuc > deseq2

--- a/deseq2.R	Mon Nov 29 18:16:48 2021 +0000
+++ b/deseq2.R	Fri Aug 26 11:16:15 2022 +0000
@@ -31,8 +31,9 @@
 #   3 "mean"

 # setup R error handling to go to stderr
-options(show.error.messages = F, error = function() {
-  cat(geterrmessage(), file = stderr()); q("no", 1, F)
+options(show.error.messages = FALSE, error = function() {
+  cat(geterrmessage(), file = stderr())
+  q("no", 1, FALSE)
 })

 # we need that to not crash galaxy with an UTF8 error on German LC settings.
@@ -69,7 +70,9 @@
   "outlier_filter_off", "b", 0, "logical",
   "auto_mean_filter_off", "c", 0, "logical",
   "beta_prior_off", "d", 0, "logical",
-  "alpha_ma", "A", 1, "numeric"
+  "alpha_ma", "A", 1, "numeric",
+  "prefilter", "P", 0, "logical",
+  "prefilter_value", "V", 1, "numeric"
 ), byrow = TRUE, ncol = 4)
 opt <- getopt(spec)

@@ -239,7 +242,7 @@
             size_factors <- estimateSizeFactorsForMatrix(counts(dds))
         }
     }
-    write.table(size_factors, file = opt$sizefactorsfile, sep = "\t", col.names = F, quote = FALSE)
+    write.table(size_factors, file = opt$sizefactorsfile, sep = "\t", col.names = FALSE, quote = FALSE)
 }

 apply_batch_factors <- function(dds, batch_factors) {
@@ -253,7 +256,7 @@
   dds_data <- colData(dds)
   # Merge dds_data with batch_factors using indexes, which are sample names
   # Set sort to False, which maintains the order in dds_data
-  reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = F)
+  reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = FALSE)
   batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)]
   for (factor in colnames(batch_factors)) {
     dds[[factor]] <- batch_factors[[factor]]
@@ -263,7 +266,7 @@
 }

 if (!is.null(opt$batch_factors)) {
-  batch_factors <- read.table(opt$batch_factors, sep = "\t", header = T)
+  batch_factors <- read.table(opt$batch_factors, sep = "\t", header = TRUE)
   dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors)
   batch_design <- colnames(batch_factors)[-c(1, 2)]
   design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + ")))
@@ -280,6 +283,12 @@
   cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
 }

+# minimal pre-filtering
+if (!is.null(opt$prefilter)) {
+    keep <- rowSums(counts(dds)) >= opt$prefilter_value
+    dds <- dds[keep, ]
+}
+
 # optional outlier behavior
 if (is.null(opt$outlier_replace_off)) {
   min_rep <- 7
--- a/deseq2.xml	Mon Nov 29 18:16:48 2021 +0000
+++ b/deseq2.xml	Fri Aug 26 11:16:15 2022 +0000
@@ -92,6 +92,10 @@
     #if $batch_factors:
         --batch_factors '$batch_factors'
     #end if
+    #if $advanced_options.prefilter_conditional.prefilter:
+        $advanced_options.prefilter_conditional.prefilter
+        -V $advanced_options.prefilter_conditional.prefilter_value
+    #end if
     #if $advanced_options.outlier_replace_off:
         -a
     #end if
@@ -194,6 +198,19 @@
             <param name="auto_mean_filter_off" type="boolean" truevalue="1" falsevalue="0" checked="false"
                 label="Turn off independent filtering"
                 help=" DESeq2 performs independent filtering by default using the mean of normalized counts as a filter statistic" />
+            <conditional name="prefilter_conditional">
+                <param name="prefilter" type="select" label="Perform pre-filtering" help="While it is not necessary to pre-filter
+                    low count genes before running the DESeq2 functions, there are two reasons which make pre-filtering useful:
+                    by removing rows in which there are very few reads, we reduce the required memory, and we increase the speed.
+                    It can also improve visualizations, as features with no information for differential expression are not plotted.">
+                    <option value="-P">Enabled</option>
+                    <option value="" selected="true">Disabled</option>
+                </param>
+                <when value="-P">
+                    <param name="prefilter_value" type="integer" min="0" value="10" label="Pre-filter value" help="Keep only rows that have at least N reads total." />
+                </when>
+                <when value=""/>
+            </conditional>
         </section>
         <section name="output_options" title="Output options">
             <param name="output_selector" type="select" multiple="True" optional="true" display="checkboxes" label="Output selector">
@@ -272,6 +289,7 @@
             <output name="deseq_out" >
                 <assert_contents>
                     <has_text_matching expression="FBgn0003360\t1933\.9504.*\t-2\.8399.*\t0\.1309.*\t-21\.68.*\t.*e-104\t.*e-101" />
+                    <has_n_lines n="3999"/>
                 </assert_contents>
             </output>
         </test>
@@ -579,6 +597,39 @@
                 </assert_contents>
             </output>
         </test>
+        <!--Test prefilter parameter -->
+        <test expect_num_outputs="2">
+            <repeat name="rep_factorName">
+                <param name="factorName" value="Treatment"/>
+                <repeat name="rep_factorLevel">
+                    <param name="factorLevel" value="Treated"/>
+                    <param name="countsFile" value="GSM461179_treat_single.counts,GSM461180_treat_paired.counts,GSM461181_treat_paired.counts"/>
+                </repeat>
+                <repeat name="rep_factorLevel">
+                    <param name="factorLevel" value="Untreated"/>
+                    <param name="countsFile" value="GSM461176_untreat_single.counts,GSM461177_untreat_paired.counts,GSM461178_untreat_paired.counts,GSM461182_untreat_single.counts"/>
+                </repeat>
+            </repeat>
+            <section name="advanced_options">
+                <conditional name="prefilter_conditional">
+                    <param name="prefilter" value="-P"/>
+                    <param name="prefilter_value" value="10"/>
+                </conditional>
+            </section>
+            <section name="output_options">
+                <param name="output_selector" value="normCounts"/>
+            </section>
+            <output name="counts_out">
+                <assert_contents>
+                    <has_n_lines n="2922"/>
+                </assert_contents>
+            </output>
+            <output name="deseq_out" >
+                <assert_contents>
+                    <has_n_lines n="2921"/>  <!-- Smaller value than in the first test -->
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help><![CDATA[
 .. class:: infomark
--- a/deseq2_macros.xml	Mon Nov 29 18:16:48 2021 +0000
+++ b/deseq2_macros.xml	Fri Aug 26 11:16:15 2022 +0000
@@ -33,7 +33,7 @@
         </requirements>
     </xml>
     <token name="@TOOL_VERSION@">2.11.40.7</token>
-    <token name="@SUFFIX_VERSION@">1</token>
+    <token name="@SUFFIX_VERSION@">2</token>
     <xml name="edam_ontology">
         <edam_topics>
             <edam_topic>topic_3308</edam_topic>
--- a/get_deseq_dataset.R	Mon Nov 29 18:16:48 2021 +0000
+++ b/get_deseq_dataset.R	Fri Aug 26 11:16:15 2022 +0000
@@ -14,7 +14,7 @@
     }
   }

-  if (!use_txi & has_header) {
+  if (!use_txi && has_header) {
       countfiles <- lapply(as.character(sample_table$filename), read.delim, row.names = 1)
       tbl <- do.call("cbind", countfiles)
       colnames(tbl) <- rownames(sample_table) # take sample ids from header
@@ -35,7 +35,7 @@
         colData = subset(sample_table, select = -filename),
         design = design_formula
       )
-  } else if (!use_txi & !has_header) {
+  } else if (!use_txi && !has_header) {

     # construct the object from HTSeq files
     dds <- DESeqDataSetFromHTSeqCount(