Mercurial > repos > iuc > limma_voom

--- a/limma_voom.R	Sat Feb 09 07:35:04 2019 -0500
+++ b/limma_voom.R	Mon Feb 18 17:49:24 2019 -0500
@@ -297,35 +297,43 @@

 } else {
     # Process the single count matrix
-    counts <- read.table(opt$matrixPath, header=TRUE, sep="\t", strip.white=TRUE, stringsAsFactors=FALSE)
+    counts <- read.table(opt$matrixPath, header=TRUE, sep="\t", strip.white=TRUE, stringsAsFactors=FALSE, check.names=FALSE)
     row.names(counts) <- counts[, 1]
     counts <- counts[ , -1]
     countsRows <- nrow(counts)

     # Process factors
     if (is.null(opt$factInput)) {
-            factorData <- read.table(opt$factFile, header=TRUE, sep="\t", strip.white=TRUE)
-            # order samples as in counts matrix
-            factorData <- factorData[match(colnames(counts), factorData[, 1]), ]
-            factors <- factorData[, -1, drop=FALSE]
+        factorData <- read.table(opt$factFile, header=TRUE, sep="\t", strip.white=TRUE)
+        if(!setequal(factorData[, 1], colnames(counts)))
+            stop("Sample IDs in counts and factors files don't match")
+        # order samples as in counts matrix
+        factorData <- factorData[match(colnames(counts), factorData[, 1]), ]
+        factors <- factorData[, -1, drop=FALSE]
     }  else {
-            factors <- unlist(strsplit(opt$factInput, "|", fixed=TRUE))
-            factorData <- list()
-            for (fact in factors) {
-                newFact <- unlist(strsplit(fact, split="::"))
-                factorData <- rbind(factorData, newFact)
-            } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor.
+        factors <- unlist(strsplit(opt$factInput, "|", fixed=TRUE))
+        factorData <- list()
+        for (fact in factors) {
+            newFact <- unlist(strsplit(fact, split="::"))
+            factorData <- rbind(factorData, newFact)
+        } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor.

-            # Set the row names to be the name of the factor and delete first row
-            row.names(factorData) <- factorData[, 1]
-            factorData <- factorData[, -1]
-            factorData <- sapply(factorData, sanitiseGroups)
-            factorData <- sapply(factorData, strsplit, split=",")
-            factorData <- sapply(factorData, make.names)
-            # Transform factor data into data frame of R factor objects
-            factors <- data.frame(factorData)
+        # Set the row names to be the name of the factor and delete first row
+        row.names(factorData) <- factorData[, 1]
+        factorData <- factorData[, -1]
+        factorData <- sapply(factorData, sanitiseGroups)
+        factorData <- sapply(factorData, strsplit, split=",")
+        # Transform factor data into data frame of R factor objects
+        factors <- data.frame(factorData)
     }
 }
+# check there are the same number of samples in counts and factors
+if(nrow(factors) != ncol(counts)) {
+    stop("There are a different number of samples in the counts files and factors")
+}
+# make groups valid R names, required for makeContrasts
+factors <- sapply(factors, make.names)
+factors <- data.frame(factors)

  # if annotation file provided
 if (haveAnno) {
@@ -339,6 +347,14 @@
 contrastData <- unlist(strsplit(opt$contrastData, split=","))
 contrastData <- sanitiseEquation(contrastData)
 contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)
+# in case input groups start with numbers this will make the names valid R names, required for makeContrasts
+cons <- NULL
+for (i in contrastData) {
+    i <- strsplit(i, split="-")
+    i <- lapply(i, make.names)
+    i <- lapply(i, paste, collapse="-")
+    cons <- append(cons, unlist(i))
+}

 plots <- character()
 if (!is.null(opt$plots)) {
@@ -362,8 +378,8 @@
 mdvolOutPng <- character()
 topOut <- character()
 glimmaOut <- character()
-for (i in 1:length(contrastData)) {
-    con <- contrastData[i]
+for (i in 1:length(cons)) {
+    con <- cons[i]
     con <- gsub("\\(|\\)", "", con)
     mdOutPdf[i] <- makeOut(paste0("mdplot_", con, ".pdf"))
     volOutPdf[i] <- makeOut(paste0("volplot_", con, ".pdf"))
@@ -409,6 +425,7 @@
 # Creating naming data
 samplenames <- colnames(data$counts)
 sampleanno <- data.frame("sampleID"=samplenames, factors)
+row.names(factors) <- samplenames # for "Summary of experimental data" table

 # Creating colours for the groups
 cols <- as.numeric(factors[, 1])
@@ -451,7 +468,7 @@

     if (wantFilt) {
         print("Outputting filtered counts")
-        filt_counts <- data.frame(data$genes, data$counts)
+        filt_counts <- data.frame(data$genes, data$counts, check.names=FALSE)
         write.table(filt_counts, file=filtOut, row.names=FALSE, sep="\t", quote=FALSE)
         linkData <- rbind(linkData, data.frame(Label=paste0(deMethod, "_", "filtcounts.tsv"), Link=paste0(deMethod, "_", "filtcounts"), stringsAsFactors=FALSE))
     }
@@ -520,8 +537,6 @@
 y <- new("DGEList", data)

 print("Generating Design")
-# Name rows of factors according to their sample
-row.names(factors) <- names(data$counts)
 factorList <- sapply(names(factors), pasteListName)
 formula <- "~0"
 for (i in 1:length(factorList)) {
@@ -540,7 +555,7 @@

 # Generate contrasts information
 print("Generating Contrasts")
-contrasts <- makeContrasts(contrasts=contrastData, levels=design)
+contrasts <- makeContrasts(contrasts=cons, levels=design)

 ################################################################################
 ### Data Output
@@ -768,7 +783,7 @@

      # Save normalised counts (log2cpm)
     if (wantNorm) {
-        norm_counts <- data.frame(vData$genes, vData$E)
+        norm_counts <- data.frame(vData$genes, vData$E, check.names=FALSE)
         write.table(norm_counts, file=normOut, row.names=FALSE, sep="\t", quote=FALSE)
         linkData <- rbind(linkData, c((paste0(deMethod, "_", "normcounts.tsv")), (paste0(deMethod, "_", "normcounts"))))
     }
@@ -827,8 +842,8 @@
                        lfc=opt$lfcReq)
 sumStatus <- summary(status)

-for (i in 1:length(contrastData)) {
-    con <- contrastData[i]
+for (i in 1:length(cons)) {
+    con <- cons[i]
     con <- gsub("\\(|\\)", "", con)
     # Collect counts for differential expression
     upCount[i] <- sumStatus["Up", i]
@@ -979,7 +994,7 @@
     }
 }
 sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount)
-row.names(sigDiff) <- contrastData
+row.names(sigDiff) <- cons

 # Save relevant items as rda object
 if (wantRda) {
--- a/limma_voom.xml	Sat Feb 09 07:35:04 2019 -0500
+++ b/limma_voom.xml	Mon Feb 18 17:49:24 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="limma_voom" name="limma" version="3.38.3">
+<tool id="limma_voom" name="limma" version="3.38.3+galaxy1">
     <description>
         Perform differential expression with limma-voom or limma-trend
     </description>
@@ -162,14 +162,14 @@

             <when value="files">
                 <repeat name="rep_factor" title="Factor" min="1">
-                    <param name="factorName" type="text" label="Name" help="Name of experiment factor of interest (e.g. Genotype). One factor must be entered and there must be two or more groups per factor. Optional additional factors (e.g. Batch) can be entered using the Insert Factor button below, see Help section for more information. NOTE: Please only use letters, numbers or underscores, and the first character of each factor must be a letter">
+                    <param name="factorName" type="text" label="Name" help="Name of experiment factor of interest (e.g. Genotype). One factor must be entered and there must be two or more groups per factor. Optional additional factors (e.g. Batch) can be entered using the Insert Factor button below, see Help section for more information. NOTE: Please only use letters, numbers or underscores.">
                     <sanitizer>
                         <valid initial="string.letters,string.digits"><add value="_" /></valid>
                     </sanitizer>
                     </param>
                     <repeat name="rep_group" title="Group" min="2" default="2">
                         <param name="groupName" type="text" label="Name"
-                        help="Name of group that the counts files belong to (e.g. WT or Mut). NOTE: Please only use letters, numbers or underscores (case sensitive), and the first character of each group must be a letter">
+                        help="Name of group that the counts files belong to (e.g. WT or Mut). NOTE: Please only use letters, numbers or underscores (case sensitive).">
                         <sanitizer>
                             <valid initial="string.letters,string.digits"><add value="_" /></valid>
                         </sanitizer>
@@ -184,7 +184,7 @@

                 <conditional name="fact">
                     <param name="ffile" type="select" label="Input factor information from file?"
-                        help="You can choose to input the factor and group information for the samples from a file or manually enter below. NOTE: Please only use letters, numbers or underscores (case sensitive), and the first character of each sample, factor and group must be a letter">
+                        help="You can choose to input the factor and group information for the samples from a file or manually enter below. NOTE: Please only use letters, numbers or underscores (case sensitive), the group names MUST not contain hyphens.">
                         <option value="no">No</option>
                         <option value="yes">Yes</option>
                     </param>
@@ -194,12 +194,12 @@
                     <when value="no" >
                         <repeat name="rep_factor" title="Factor" min="1">
                             <param name="factorName" type="text" label="Factor Name"
-                                help="Name of experiment factor of interest (e.g. Genotype). One factor must be entered and there must be two or more groups per factor. Additional factors (e.g. Batch) can be entered using the Insert Factor button below, see Help section below. NOTE: Please only use letters, numbers or underscores, and the first character of each factor must be a letter">
+                                help="Name of experiment factor of interest (e.g. Genotype). One factor must be entered and there must be two or more groups per factor. Additional factors (e.g. Batch) can be entered using the Insert Factor button below, see Help section below. NOTE: Please only use letters, numbers or underscores.">
                                 <validator type="empty_field" />
                                 <validator type="regex" message="Please only use letters, numbers or underscores">^[\w]+$</validator>
                             </param>
                             <param name="groupNames" type="text" label="Groups"
-                                help="Enter the group names for the samples separated with commas e.g. WT,WT,WT,Mut,Mut,Mut. The order of the names must match the order of the samples in the columns of the count matrix. NOTE: Please only use letters, numbers or underscores (case sensitive), and the first character of each group must be a letter">
+                                help="Enter the group names for the samples separated with commas e.g. WT,WT,WT,Mut,Mut,Mut. The order of the names must match the order of the samples in the columns of the count matrix. NOTE: Please only use letters, numbers or underscores (case sensitive), the group names MUST not contain hyphens.">
                                 <validator type="empty_field" />
                                 <validator type="regex" message="Please only use letters, numbers or underscores, and separate levels by commas">^[\w,]+$</validator>
                             </param>
@@ -353,7 +353,7 @@

     <tests>
         <!-- Ensure report is output -->
-        <test>
+        <test expect_num_outputs="2">
             <param name="format" value="matrix" />
             <param name="counts" value="matrix.txt" />
             <repeat name="rep_factor">
@@ -391,7 +391,7 @@
             </output>
        </test>
         <!-- Ensure annotation file input works -->
-        <test>
+        <test expect_num_outputs="2">
             <param name="format" value="matrix" />
             <param name="annoOpt" value="yes" />
             <param name="geneanno" value="anno.txt" />
@@ -415,7 +415,7 @@
             </output_collection>
         </test>
         <!-- Ensure Rscript and RData file can be output -->
-        <test>
+        <test expect_num_outputs="3">
             <param name="format" value="matrix" />
             <param name="rscript" value="True"/>
             <param name="rdaOption" value="true" />
@@ -441,7 +441,7 @@
             </output>
         </test>
         <!-- Ensure secondary factors work -->
-        <test>
+        <test expect_num_outputs="2">
             <param name="format" value="matrix" />
             <param name="counts" value="matrix.txt" />
             <repeat name="rep_factor">
@@ -467,7 +467,7 @@
             </output_collection>
         </test>
         <!-- Ensure factors file with unordered samples works -->
-        <test>
+        <test expect_num_outputs="2">
             <param name="format" value="matrix" />
             <param name="ffile" value="yes" />
             <param name="finfo" value="factorinfo.txt" />
@@ -487,7 +487,7 @@
             </output_collection>
         </test>
         <!-- Ensure filtered and normalised count outputs works-->
-        <test>
+        <test expect_num_outputs="4">
             <param name="format" value="matrix" />
             <param name="filtCounts" value="true" />
             <param name="normCounts" value="true" />
@@ -529,7 +529,7 @@
             </output>
         </test>
         <!-- Ensure multiple counts files input works -->
-        <test>
+        <test expect_num_outputs="3">
             <param name="format" value="files" />
             <repeat name="rep_factor">
                 <param name="factorName" value="Genotype"/>
@@ -589,7 +589,7 @@
             </output>
         </test>
         <!-- Ensure limma-trend option works -->
-        <test>
+        <test expect_num_outputs="2">
             <param name="format" value="matrix" />
             <param name="counts" value="matrix.txt" />
             <repeat name="rep_factor">
@@ -602,7 +602,6 @@
             <param name="normalisationOption" value="TMM" />
             <param name="topgenes" value="6" />
             <param name="de_select" value="trend" />
-            <param name="rdaOption" value="true" />
             <output name="outReport" >
                 <assert_contents>
                     <has_text text="The limma-trend method was used" />
@@ -618,7 +617,7 @@
             </output_collection>
         </test>
         <!-- Ensure limma-trend option with annotation works -->
-        <test>
+        <test expect_num_outputs="2">
             <param name="format" value="matrix" />
             <param name="counts" value="matrix.txt" />
             <param name="annoOpt" value="yes" />
@@ -633,7 +632,6 @@
             <param name="normalisationOption" value="TMM" />
             <param name="topgenes" value="6" />
             <param name="de_select" value="trend" />
-            <param name="rdaOption" value="true" />
             <output name="outReport" >
                 <assert_contents>
                     <has_text text="The limma-trend method was used" />
@@ -648,6 +646,43 @@
                 </element>
             </output_collection>
         </test>
+        <!-- Ensure samples and groups beginning with numbers can be handled -->
+        <test expect_num_outputs="3">
+            <param name="format" value="matrix" />
+            <param name="counts" value="matrix_num.txt" />
+            <param name="annoOpt" value="yes" />
+            <param name="geneanno" value="anno.txt" />
+            <repeat name="rep_factor">
+                <param name="factorName" value="Group"/>
+                <param name="groupNames" value="2,2,2,1,1,1" />
+            </repeat>
+            <repeat name="rep_contrast">
+                <param name="contrast" value="2-1" />
+            </repeat>
+            <param name="filt_select" value="yes" />
+            <param name="format_select" value="counts"/>
+            <param name="cntReq" value="10"/>
+            <param name="count_select" value="sample"/>
+            <param name="cntSampleReq" value="3"/>
+            <param name="normalisationOption" value="TMM" />
+            <param name="normCounts" value="true" />
+            <param name="topgenes" value="6" />
+            <param name="de_select" value="voom" />
+            <output_collection name="outTables" count="1">
+                <element name="limma-voom_X2-X1" ftype="tabular" >
+                    <assert_contents>
+                        <has_text_matching expression="EntrezID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
+                        <has_text_matching expression="11304.*0.45.*15.52.*4.94.*7.74.*0.0001.*5.27" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output name="outNorm" ftype="tabular" >
+                <assert_contents>
+                    <has_text_matching expression="EntrezID.*2-1.*2-2.*2-3.*1-1.*1-2.*1-3" />
+                    <has_text_matching expression="11304.*15.7.*15.8.*15.6.*15.3.*15.2.*15.2" />
+                </assert_contents>
+            </output>
+        </test>
     </tests>

     <help><![CDATA[
@@ -732,9 +767,9 @@
     Mut3       Mut          b3
     ========== ============ =========

-*Factor Name:* The name of the experimental factor being investigated e.g. Genotype, Treatment. One factor must be entered, the name should start with a letter and spaces must not be used. Optionally, additional factors can be included, these are variables that might influence your experiment e.g. Batch, Gender, Subject. If additional factors are entered, an additive linear model will be used.
+*Factor Name:* The name of the experimental factor being investigated e.g. Genotype, Treatment. One factor must be entered and spaces must not be used. Optionally, additional factors can be included; these are variables that might influence your experiment e.g. Batch, Gender, Subject. If additional factors are entered, an additive linear model will be used.

-*Groups:* The names of the groups for the factor. The names should start with a letter, and only contain letters, numbers and underscores, other characters such as spaces and hyphens must not be used. If entered into the tool form above, the order must be the same as the samples (to which the groups correspond) are listed in the columns of the counts matrix, with the values separated by commas.
+*Groups:* The names of the groups for the factor. The names should only contain letters, numbers and underscores; other characters such as spaces and hyphens MUST NOT be used. If entered into the tool form above, the groups must be given in the same order as the samples (to which the groups correspond) are listed in the columns of the counts matrix, with the values separated by commas. If a group name begins with a number, an X will be added as a prefix (e.g. group 2 becomes X2, so the contrast 2-1 is reported as X2-X1).

 **Contrasts of Interest:**
 The contrasts you wish to make between levels.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/matrix_num.txt	Mon Feb 18 17:49:24 2019 -0500
@@ -0,0 +1,7 @@
+GeneID	2-1	2-2	2-3	1-1	1-2	1-3
+11287	1463	1441	1495	1699	1528	1601
+11298	1345	1291	1346	1905	1744	1834
+11302	5	6	5	6	8	7
+11303	1574	1519	1654	2099	1974	2100
+11304	361	397	346	356	312	337
+11305	1762	1942	2027	2528	2438	2493