Mercurial > repos > artbio > repenrich2
changeset 1:6d59fbca2db4 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 4dd520dee5c3c0c526e8319a74c4890da032300f
author | artbio |
---|---|
date | Sat, 20 Apr 2024 14:46:12 +0000 (9 months ago) |
parents | 4905a332a094 |
children | cfb06f8e8f52 |
files | edgeR_repenrich.R edgeR_repenrich2.R edger-repenrich.xml edger-repenrich2.xml macros.xml repenrich2.xml |
diffstat | 6 files changed, 377 insertions(+), 377 deletions(-) [+] |
line wrap: on
line diff
--- a/edgeR_repenrich.R Sat Apr 20 11:56:53 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,176 +0,0 @@ -# setup R error handling to go to stderr -options(show.error.messages = FALSE, error = function() { - cat(geterrmessage(), file = stderr()) - q("no", 1, FALSE) -}) - -# To not crash galaxy with an UTF8 error with not-US LC settings. -loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") - -# load libraries -library("getopt") -library("tools") -library("rjson") -suppressPackageStartupMessages({ - library("edgeR") - library("limma") -}) - -options(stringAsFactors = FALSE, useFancyQuotes = FALSE) - -# get options, using the spec as defined by the enclosed list. -spec <- matrix( - c( - "quiet", "q", 0, "logical", - "outfile", "o", 1, "character", - "countsfile", "n", 1, "character", - "factorName", "N", 1, "character", - "levelNameA", "A", 1, "character", - "levelNameB", "B", 1, "character", - "levelAfiles", "a", 1, "character", - "levelBfiles", "b", 1, "character", - "plots", "p", 1, "character" - ), - byrow = TRUE, ncol = 4 -) -opt <- getopt(spec) - -# build levels A and B file lists -filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error") -filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error") -listA <- list() -indice <- 0 -listA[["level"]] <- opt$levelNameA -for (file in filesA) { - indice <- indice + 1 - listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE) -} -listB <- list() -indice <- 0 -listB[["level"]] <- opt$levelNameB -for (file in filesB) { - indice <- indice + 1 - listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE) -} - -# build a counts table -counts <- data.frame(row.names = listA[[2]][, 1]) -for (element in names(listA[-1])) { - counts <- cbind(counts, listA[[element]][, 4]) -} -for (element in names(listB[-1])) { - counts <- cbind(counts, listB[[element]][, 4]) -} -colnames(counts) <- c(names(listA[-1]), names(listB[-1])) -sizes <- colSums(counts) - -# build a meta data object -meta <- data.frame( - row.names = colnames(counts), - condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))), - libsize = sizes -) - - -# Define the library size and conditions for the GLM -libsize <- meta$libsize -condition <- factor(meta$condition) -design <- model.matrix(~ 0 + condition) -colnames(design) <- levels(condition) - -# Build a DGE object for the GLM -y <- DGEList(counts = counts, lib.size = libsize) - -# Normalize the data -y <- calcNormFactors(y) - -# Estimate the variance -y <- estimateGLMCommonDisp(y, design) -y <- estimateGLMTrendedDisp(y, design) -y <- estimateGLMTagwiseDisp(y, design) - -# Builds and outputs an object to contain the normalized read abundance in counts per million of reads -cpm <- cpm(y, log = FALSE, lib.size = libsize) -cpm <- as.data.frame(cpm) -colnames(cpm) <- colnames(counts) -if (!is.null(opt$countsfile)) { - normalizedAbundance <- data.frame(Tag = rownames(cpm)) - normalizedAbundance <- cbind(normalizedAbundance, cpm) - write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) -} - -# Conduct fitting of the GLM -yfit <- glmFit(y, design) - -# Initialize result matrices to contain the results of the GLM -results <- matrix(nrow = dim(counts)[1], ncol = 0) -logfc <- matrix(nrow = dim(counts)[1], ncol = 0) - -# Make the comparisons for the GLM -my.contrasts <- makeContrasts( - paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB), - levels = design -) - -# Define the contrasts used in the comparisons -allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB) - -# Conduct a for loop that will do the fitting of the GLM for each comparison -# Put the results into the results objects -lrt <- glmLRT(yfit, contrast = my.contrasts[, 1]) -res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table -results <- cbind(results, res[, c(1, 5)]) -logfc <- cbind(logfc, res[c(1)]) - -# Add the repeat types back into the results. -# We should still have the same order as the input data -results$class <- listA[[2]][, 2] -results$type <- listA[[2]][, 3] -# Sort the results table by the FDR -results <- results[with(results, order(FDR)), ] - -# Plot Fold Changes for repeat classes and types - -# open the device and plots -if (!is.null(opt$plots)) { - pdf(opt$plots) - plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples") - plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot") - logFC <- results[, "logFC"] - # Plot the repeat classes - classes <- with(results, reorder(class, -logFC, median)) - classes - par(mar = c(6, 10, 4, 1)) - boxplot(logFC ~ classes, - data = results, outline = FALSE, horizontal = TRUE, - las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class") - ) - abline(v = 0) - # Plot the repeat types - types <- with(results, reorder(type, -logFC, median)) - boxplot(logFC ~ types, - data = results, outline = FALSE, horizontal = TRUE, - las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type") - ) - abline(v = 0) - # volcano plot - TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10)) - colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score") - color <- ifelse(TEdata$FDR < 0.05, "red", "black") - s <- subset(TEdata, FDR < 0.01) - with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)")) - text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5) -} - -# close the plot device -if (!is.null(opt$plots)) { - cat("closing plot device\n") - dev.off() -} - -# Save the results -results <- cbind(TE_item = rownames(results), results) -colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type") -results$log2FC <- format(results$log2FC, digits = 5) -results$FDR <- format(results$FDR, digits = 5) -write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/edgeR_repenrich2.R Sat Apr 20 14:46:12 2024 +0000 @@ -0,0 +1,176 @@ +# setup R error handling to go to stderr +options(show.error.messages = FALSE, error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) +}) + +# To not crash galaxy with an UTF8 error with not-US LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +# load libraries +library("getopt") +library("tools") +library("rjson") +suppressPackageStartupMessages({ + library("edgeR") + library("limma") +}) + +options(stringAsFactors = FALSE, useFancyQuotes = FALSE) + +# get options, using the spec as defined by the enclosed list. +spec <- matrix( + c( + "quiet", "q", 0, "logical", + "outfile", "o", 1, "character", + "countsfile", "n", 1, "character", + "factorName", "N", 1, "character", + "levelNameA", "A", 1, "character", + "levelNameB", "B", 1, "character", + "levelAfiles", "a", 1, "character", + "levelBfiles", "b", 1, "character", + "plots", "p", 1, "character" + ), + byrow = TRUE, ncol = 4 +) +opt <- getopt(spec) + +# build levels A and B file lists +filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error") +filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error") +listA <- list() +indice <- 0 +listA[["level"]] <- opt$levelNameA +for (file in filesA) { + indice <- indice + 1 + listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE) +} +listB <- list() +indice <- 0 +listB[["level"]] <- opt$levelNameB +for (file in filesB) { + indice <- indice + 1 + listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE) +} + +# build a counts table +counts <- data.frame(row.names = listA[[2]][, 1]) +for (element in names(listA[-1])) { + counts <- cbind(counts, listA[[element]][, 4]) +} +for (element in names(listB[-1])) { + counts <- cbind(counts, listB[[element]][, 4]) +} +colnames(counts) <- c(names(listA[-1]), names(listB[-1])) +sizes <- colSums(counts) + +# build a meta data object +meta <- data.frame( + row.names = colnames(counts), + condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))), + libsize = sizes +) + + +# Define the library size and conditions for the GLM +libsize <- meta$libsize +condition <- factor(meta$condition) +design <- model.matrix(~ 0 + condition) +colnames(design) <- levels(condition) + +# Build a DGE object for the GLM +y <- DGEList(counts = counts, lib.size = libsize) + +# Normalize the data +y <- calcNormFactors(y) + +# Estimate the variance +y <- estimateGLMCommonDisp(y, design) +y <- estimateGLMTrendedDisp(y, design) +y <- estimateGLMTagwiseDisp(y, design) + +# Builds and outputs an object to contain the normalized read abundance in counts per million of reads +cpm <- cpm(y, log = FALSE, lib.size = libsize) +cpm <- as.data.frame(cpm) +colnames(cpm) <- colnames(counts) +if (!is.null(opt$countsfile)) { + normalizedAbundance <- data.frame(Tag = rownames(cpm)) + normalizedAbundance <- cbind(normalizedAbundance, cpm) + write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) +} + +# Conduct fitting of the GLM +yfit <- glmFit(y, design) + +# Initialize result matrices to contain the results of the GLM +results <- matrix(nrow = dim(counts)[1], ncol = 0) +logfc <- matrix(nrow = dim(counts)[1], ncol = 0) + +# Make the comparisons for the GLM +my.contrasts <- makeContrasts( + paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB), + levels = design +) + +# Define the contrasts used in the comparisons +allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB) + +# Conduct a for loop that will do the fitting of the GLM for each comparison +# Put the results into the results objects +lrt <- glmLRT(yfit, contrast = my.contrasts[, 1]) +res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table +results <- cbind(results, res[, c(1, 5)]) +logfc <- cbind(logfc, res[c(1)]) + +# Add the repeat types back into the results. +# We should still have the same order as the input data +results$class <- listA[[2]][, 2] +results$type <- listA[[2]][, 3] +# Sort the results table by the FDR +results <- results[with(results, order(FDR)), ] + +# Plot Fold Changes for repeat classes and types + +# open the device and plots +if (!is.null(opt$plots)) { + pdf(opt$plots) + plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples") + plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot") + logFC <- results[, "logFC"] + # Plot the repeat classes + classes <- with(results, reorder(class, -logFC, median)) + classes + par(mar = c(6, 10, 4, 1)) + boxplot(logFC ~ classes, + data = results, outline = FALSE, horizontal = TRUE, + las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class") + ) + abline(v = 0) + # Plot the repeat types + types <- with(results, reorder(type, -logFC, median)) + boxplot(logFC ~ types, + data = results, outline = FALSE, horizontal = TRUE, + las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type") + ) + abline(v = 0) + # volcano plot + TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10)) + colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score") + color <- ifelse(TEdata$FDR < 0.05, "red", "black") + s <- subset(TEdata, FDR < 0.01) + with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)")) + text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5) +} + +# close the plot device +if (!is.null(opt$plots)) { + cat("closing plot device\n") + dev.off() +} + +# Save the results +results <- cbind(TE_item = rownames(results), results) +colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type") +results$log2FC <- format(results$log2FC, digits = 5) +results$FDR <- format(results$FDR, digits = 5) +write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)
--- a/edger-repenrich.xml Sat Apr 20 11:56:53 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,199 +0,0 @@ -<tool id="edger-repenrich2" name="edgeR-repenrich" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> - <description>Determines differentially expressed features from RepEnrich2 counts</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="edgeR_requirements"/> - <stdio> - <regex match="Execution halted" - source="both" - level="fatal" - description="Execution halted." /> - <regex match="Error in" - source="both" - level="fatal" - description="An undefined error occurred, please check your input carefully and contact your administrator." /> - <regex match="Fatal error" - source="both" - level="fatal" - description="An undefined error occurred, please check your input carefully and contact your administrator." /> - </stdio> - <version_command> - <![CDATA[ - echo $(R --version | grep version | grep -v GNU)", edgeR version" $(R --vanilla --slave -e "library(edgeR) && - cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ") - ]]> - </version_command> - <command> - <![CDATA[ - #import json - Rscript '${__tool_directory__}/edgeR_repenrich.R' - --factorName '$factorName' - - --levelNameA '$factorLevel_A' - #set $factorlevelsA = list() - #for $file in $countsFiles_A: - $factorlevelsA.append(str($file)) - #end for - $factorlevelsA.reverse() - --levelAfiles '#echo json.dumps(factorlevelsA)#' - - --levelNameB '$factorLevel_B' - #set $factorlevelsB = list() - #for $file in $countsFiles_B: - $factorlevelsB.append(str($file)) - #end for - $factorlevelsB.reverse() - --levelBfiles '#echo json.dumps(factorlevelsB)#' - - -o 'edger_out' - - -p '$plots' - #if $normCounts: - -n '$counts_out' - #end if - -o '$edger_out' - ]]> - </command> - <inputs> - <param name="factorName" type="text" value="FactorName" label="Specify a factor name, e.g. genotype or age or drug_x" - help="Only letters, numbers and underscores will be retained in this field"> - <sanitizer> - <valid initial="string.letters,string.digits"><add value="_" /></valid> - </sanitizer> - </param> - <param name="factorLevel_A" type="text" value="FactorLevel1" label="Specify a factor level, typical values could be 'mutant' or 'Drug_X'" - help="Only letters, numbers and underscores will be retained in this field"> - <sanitizer> - <valid initial="string.letters,string.digits"><add value="_" /></valid> - </sanitizer> - </param> - <param name="countsFiles_A" type="data" format="tabular" multiple="true" label="Counts file(s)" help="Count files must have been generated by repenrich" /> - <param name="factorLevel_B" type="text" value="FactorLevel2" label="Specify a factor level, typical values could be 'wildtype' or 'control'" - help="Only letters, numbers and underscores will be retained in this field"> - <sanitizer> - <valid initial="string.letters,string.digits"><add value="_" /></valid> - </sanitizer> - </param> - <param name="countsFiles_B" type="data" format="tabular" multiple="true" label="Counts file(s)" help="Count files must have been generated by repenrich tool" /> - <param name="normCounts" type="boolean" truevalue="1" falsevalue="0" checked="false" - label="Output normalized counts table" /> - </inputs> - <outputs> - <data format="tabular" name="edger_out" label="edgeR: ${factorLevel_A} compared to ${factorLevel_B}"> - <actions> - <action name="column_names" type="metadata" default="Tag,log2(FC),FDR,Class,Type" /> - </actions> - </data> - <data format="pdf" name="plots" label="edgeR plots" /> - <data format="tabular" name="counts_out" label="Normalized counts file"> - <filter>normCounts == True</filter> - </data> - </outputs> - <tests> - <test expect_num_outputs="3"> - <param name="factorName" value="Genotype"/> - <param name="factorLevel_A" value="Mutant"/> - <param name="countsFiles_A" value="355_fraction_counts.tab,356_fraction_counts.tab"/> - <param name="factorLevel_B" value="Wildtype"/> - <param name="countsFiles_B" value="353_fraction_counts.tab,354_fraction_counts.tab"/> - <param name="normCounts" value="True"/> - <output name="counts_out" file="Normalized_counts_file.tab"/> - <output name="plots" file="edgeR_plots.pdf"/> - <output name="edger_out" file="edgeR_result_file.tab"/> - - </test> - </tests> - <help> -<![CDATA[ -.. class:: infomark - -**What it does** - -Estimate Distance between samples (MDS) and Biological Coefficient Variation (BCV) in count -data from high-throughput sequencing assays and test for differential expression using edgeR_. - -**Inputs** - -edger-repenrich takes count tables generated by repenrich as inputs. A repenrich count table looks -like: - -============== ========== ========== ========== -LSU-rRNA_Dme rRNA rRNA 3659329 --------------- ---------- ---------- ---------- -FW3_DM LINE Jockey 831 --------------- ---------- ---------- ---------- -DMTOM1_LTR LTR Gypsy 1004 --------------- ---------- ---------- ---------- -R1_DM LINE R1 7343 --------------- ---------- ---------- ---------- -TAHRE LINE Jockey 4560 --------------- ---------- ---------- ---------- -G4_DM LINE Jockey 3668 --------------- ---------- ---------- ---------- -BS LINE Jockey 7296 --------------- ---------- ---------- ---------- -Stalker2_I-int LTR Gypsy 12252 --------------- ---------- ---------- ---------- -Stalker3_LTR LTR Gypsy 593 --------------- ---------- ---------- ---------- -TABOR_I-int LTR Gypsy 3947 --------------- ---------- ---------- ---------- -G7_DM LINE Jockey 162 --------------- ---------- ---------- ---------- -BEL_I-int LTR Pao 23757 --------------- ---------- ---------- ---------- -Gypsy6_I-int LTR Gypsy 7489 -============== ========== ========== ========== - -Count tables must be generated for each sample individually. Here, edgeR_ is handling a -single factor (genotype, age, treatment, etc) that effect your experiment. This factor has -two levels/states (for instance, "wild-type" and "mutant". You need to select appropriate -count table from your history for each factor level. - -The following table gives some examples of factors and their levels: - -========= ============== =============== -Factor Factorlevel1 Factorlevel2 ---------- -------------- --------------- -Treatment Treated Untreated ---------- -------------- --------------- -Genotype Knockdown Wildtype ---------- -------------- --------------- -TimePoint Day4 Day1 ---------- -------------- --------------- -Gender Female Male -========= ============== =============== - -*Note*: Output log2 fold changes are based on primary factor level 1 vs. factor level2. -Here the order of factor levels is important. For example, for the factor 'Treatment' given -in above table, edgeR computes fold changes of 'Treated' samples against 'Untreated', -i.e. the values correspond to up or down regulations of genes in Treated samples. - -**Output** - -edgeR_ generates a tabular file containing the different columns and results visualized in -a PDF: - -====== ============================================================================= -Column Description ------- ----------------------------------------------------------------------------- - 1 Tag (transposon element ID) - 2 the logarithm (to basis 2) of the fold change (See the note in inputs section) - 3 p value adjusted for multiple testing with the Benjamini-Hochberg procedure - which controls false discovery rate (FDR) - 4 Class the transposon belongs to - 5 Type the transposon belongs to -====== ============================================================================= - -.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html -]]> - -**Note**: This edgeR_ wrapper was adapted from code available at -https://github.com/nskvir/RepEnrich - - </help> - <citations> - <citation type="doi">10.1093/bioinformatics/btp616</citation> - </citations> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/edger-repenrich2.xml Sat Apr 20 14:46:12 2024 +0000 @@ -0,0 +1,199 @@ +<tool id="edger-repenrich2" name="edgeR-repenrich2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>Determines differentially expressed features from RepEnrich2 counts</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="edgeR_requirements"/> + <stdio> + <regex match="Execution halted" + source="both" + level="fatal" + description="Execution halted." /> + <regex match="Error in" + source="both" + level="fatal" + description="An undefined error occurred, please check your input carefully and contact your administrator." /> + <regex match="Fatal error" + source="both" + level="fatal" + description="An undefined error occurred, please check your input carefully and contact your administrator." /> + </stdio> + <version_command> + <![CDATA[ + echo $(R --version | grep version | grep -v GNU)", edgeR version" $(R --vanilla --slave -e "library(edgeR) && + cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ") + ]]> + </version_command> + <command> + <![CDATA[ + #import json + Rscript '${__tool_directory__}/edgeR_repenrich2.R' + --factorName '$factorName' + + --levelNameA '$factorLevel_A' + #set $factorlevelsA = list() + #for $file in $countsFiles_A: + $factorlevelsA.append(str($file)) + #end for + $factorlevelsA.reverse() + --levelAfiles '#echo json.dumps(factorlevelsA)#' + + --levelNameB '$factorLevel_B' + #set $factorlevelsB = list() + #for $file in $countsFiles_B: + $factorlevelsB.append(str($file)) + #end for + $factorlevelsB.reverse() + --levelBfiles '#echo json.dumps(factorlevelsB)#' + + -o 'edger_out' + + -p '$plots' + #if $normCounts: + -n '$counts_out' + #end if + -o '$edger_out' + ]]> + </command> + <inputs> + <param name="factorName" type="text" value="FactorName" label="Specify a factor name, e.g. genotype or age or drug_x" + help="Only letters, numbers and underscores will be retained in this field"> + <sanitizer> + <valid initial="string.letters,string.digits"><add value="_" /></valid> + </sanitizer> + </param> + <param name="factorLevel_A" type="text" value="FactorLevel1" label="Specify a factor level, typical values could be 'mutant' or 'Drug_X'" + help="Only letters, numbers and underscores will be retained in this field"> + <sanitizer> + <valid initial="string.letters,string.digits"><add value="_" /></valid> + </sanitizer> + </param> + <param name="countsFiles_A" type="data" format="tabular" multiple="true" label="Counts file(s)" help="Count files must have been generated by repenrich" /> + <param name="factorLevel_B" type="text" value="FactorLevel2" label="Specify a factor level, typical values could be 'wildtype' or 'control'" + help="Only letters, numbers and underscores will be retained in this field"> + <sanitizer> + <valid initial="string.letters,string.digits"><add value="_" /></valid> + </sanitizer> + </param> + <param name="countsFiles_B" type="data" format="tabular" multiple="true" label="Counts file(s)" help="Count files must have been generated by repenrich tool" /> + <param name="normCounts" type="boolean" truevalue="1" falsevalue="0" checked="false" + label="Output normalized counts table" /> + </inputs> + <outputs> + <data format="tabular" name="edger_out" label="edgeR: ${factorLevel_A} compared to ${factorLevel_B}"> + <actions> + <action name="column_names" type="metadata" default="Tag,log2(FC),FDR,Class,Type" /> + </actions> + </data> + <data format="pdf" name="plots" label="edgeR plots" /> + <data format="tabular" name="counts_out" label="Normalized counts file"> + <filter>normCounts == True</filter> + </data> + </outputs> + <tests> + <test expect_num_outputs="3"> + <param name="factorName" value="Genotype"/> + <param name="factorLevel_A" value="Mutant"/> + <param name="countsFiles_A" value="355_fraction_counts.tab,356_fraction_counts.tab"/> + <param name="factorLevel_B" value="Wildtype"/> + <param name="countsFiles_B" value="353_fraction_counts.tab,354_fraction_counts.tab"/> + <param name="normCounts" value="True"/> + <output name="counts_out" file="Normalized_counts_file.tab"/> + <output name="plots" file="edgeR_plots.pdf"/> + <output name="edger_out" file="edgeR_result_file.tab"/> + + </test> + </tests> + <help> +<![CDATA[ +.. class:: infomark + +**What it does** + +Estimate Distance between samples (MDS) and Biological Coefficient Variation (BCV) in count +data from high-throughput sequencing assays and test for differential expression using edgeR_. + +**Inputs** + +edger-repenrich takes count tables generated by repenrich as inputs. A repenrich count table looks +like: + +============== ========== ========== ========== +LSU-rRNA_Dme rRNA rRNA 3659329 +-------------- ---------- ---------- ---------- +FW3_DM LINE Jockey 831 +-------------- ---------- ---------- ---------- +DMTOM1_LTR LTR Gypsy 1004 +-------------- ---------- ---------- ---------- +R1_DM LINE R1 7343 +-------------- ---------- ---------- ---------- +TAHRE LINE Jockey 4560 +-------------- ---------- ---------- ---------- +G4_DM LINE Jockey 3668 +-------------- ---------- ---------- ---------- +BS LINE Jockey 7296 +-------------- ---------- ---------- ---------- +Stalker2_I-int LTR Gypsy 12252 +-------------- ---------- ---------- ---------- +Stalker3_LTR LTR Gypsy 593 +-------------- ---------- ---------- ---------- +TABOR_I-int LTR Gypsy 3947 +-------------- ---------- ---------- ---------- +G7_DM LINE Jockey 162 +-------------- ---------- ---------- ---------- +BEL_I-int LTR Pao 23757 +-------------- ---------- ---------- ---------- +Gypsy6_I-int LTR Gypsy 7489 +============== ========== ========== ========== + +Count tables must be generated for each sample individually. Here, edgeR_ is handling a +single factor (genotype, age, treatment, etc) that effect your experiment. This factor has +two levels/states (for instance, "wild-type" and "mutant". You need to select appropriate +count table from your history for each factor level. + +The following table gives some examples of factors and their levels: + +========= ============== =============== +Factor Factorlevel1 Factorlevel2 +--------- -------------- --------------- +Treatment Treated Untreated +--------- -------------- --------------- +Genotype Knockdown Wildtype +--------- -------------- --------------- +TimePoint Day4 Day1 +--------- -------------- --------------- +Gender Female Male +========= ============== =============== + +*Note*: Output log2 fold changes are based on primary factor level 1 vs. factor level2. +Here the order of factor levels is important. For example, for the factor 'Treatment' given +in above table, edgeR computes fold changes of 'Treated' samples against 'Untreated', +i.e. the values correspond to up or down regulations of genes in Treated samples. + +**Output** + +edgeR_ generates a tabular file containing the different columns and results visualized in +a PDF: + +====== ============================================================================= +Column Description +------ ----------------------------------------------------------------------------- + 1 Tag (transposon element ID) + 2 the logarithm (to basis 2) of the fold change (See the note in inputs section) + 3 p value adjusted for multiple testing with the Benjamini-Hochberg procedure + which controls false discovery rate (FDR) + 4 Class the transposon belongs to + 5 Type the transposon belongs to +====== ============================================================================= + +.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html +]]> + +**Note**: This edgeR_ wrapper was adapted from code available at +https://github.com/nskvir/RepEnrich + + </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btp616</citation> + </citations> +</tool>
--- a/macros.xml Sat Apr 20 11:56:53 2024 +0000 +++ b/macros.xml Sat Apr 20 14:46:12 2024 +0000 @@ -1,6 +1,6 @@ <macros> <token name="@TOOL_VERSION@">2.31.1</token> - <token name="@VERSION_SUFFIX@">0</token> + <token name="@VERSION_SUFFIX@">1</token> <token name="@PROFILE@">23.0</token> <xml name="repenrich_requirements">
--- a/repenrich2.xml Sat Apr 20 11:56:53 2024 +0000 +++ b/repenrich2.xml Sat Apr 20 14:46:12 2024 +0000 @@ -1,4 +1,4 @@ -<tool id="repenrich2" name="RepEnrich" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> +<tool id="repenrich2" name="RepEnrich2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> <description>Repeat Element Profiling</description> <macros> <import>macros.xml</import>