diff get_length_and_gc_content.r @ 2:e3ba567abdf5 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/length_and_gc_content commit 7b6b07d22f3e6fed77b2c237de2b0d96fa939711"
author iuc
date Fri, 11 Mar 2022 14:08:11 +0000
parents f088370d2a3c
children
line wrap: on
line diff
--- a/get_length_and_gc_content.r	Sun Jan 28 04:04:58 2018 -0500
+++ b/get_length_and_gc_content.r	Fri Mar 11 14:08:11 2022 +0000
@@ -1,59 +1,94 @@
 # originally by Devon Ryan, https://www.biostars.org/p/84467/
 
-options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+options(show.error.messages = F,
+        error = function() {
+          cat(geterrmessage(), file = stderr())
+          q("no", 1, F)
+        })
 
 # we need that to not crash galaxy with an UTF8 error on German LC settings.
 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
 
 suppressPackageStartupMessages({
-    library("GenomicRanges")
-    library("rtracklayer")
-    library("Rsamtools")
-    library("optparse")
-    library("data.table")
+  library("GenomicRanges")
+  library("rtracklayer")
+  library("Rsamtools")
+  library("optparse")
+  library("data.table")
 })
 
 option_list <- list(
-    make_option(c("-g","--gtf"), type="character", help="Input GTF file with gene / exon information."),
-    make_option(c("-f","--fasta"), type="character", default=FALSE, help="FASTA file that corresponds to the supplied GTF."),
-    make_option(c("-l","--length"), type="character", default=FALSE, help="Output file with Gene ID and length."),
-    make_option(c("-gc","--gc_content"), type="character", default=FALSE, help="Output file with Gene ID and GC content.")
-  )
+  make_option(c("-g", "--gtf"), type = "character",
+              help = "Input gtf file with gene / exon information."),
+  make_option(c("-f", "--fasta"), type = "character", default = NULL,
+              help = "fasta file that corresponds to the supplied gtf."),
+  make_option(c("-l", "--length"), type = "character", default = NULL,
+              help = "Output file with Gene ID and length."),
+  make_option(c("-c", "--gc_content"), type = "character", default = NULL,
+              help = "Output file with Gene ID and GC content.")
+)
 
-parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
-args = parse_args(parser)
+parser <- OptionParser(usage = "%prog [options] file",
+                       option_list = option_list)
+args <- parse_args(parser)
 
-GTFfile = args$gtf
-FASTAfile = args$fasta
-length = args$length
-gc_content = args$gc_content
+gtf_file <- args$gtf
+fasta_file <- args$fasta
+length <- args$length
+gc_content <- args$gc_content
+
+# Check args:
+if (is.null(fasta_file) & !is.null(gc_content)) {
+  stop("gc_content output requires fasta input")
+}
+if (is.null(length) & is.null(gc_content)) {
+  stop("neither gc_content nor length was set nothing to do.")
+}
 
 #Load the annotation and reduce it
-GTF <- import.gff(GTFfile, format="gtf", genome=NA, feature.type="exon")
-grl <- reduce(split(GTF, elementMetadata(GTF)$gene_id))
-reducedGTF <- unlist(grl, use.names=T)
-elementMetadata(reducedGTF)$gene_id <- rep(names(grl), elementNROWS(grl))
+gtf <- import.gff(gtf_file, format = "gtf", genome = NA, feature.type = "exon")
+grl <- reduce(split(gtf, elementMetadata(gtf)$gene_id))
+reduced_gtf <- unlist(grl, use.names = T)
+elementMetadata(reduced_gtf)$gene_id <- rep(names(grl), elementNROWS(grl))
 
-#Open the fasta file
-FASTA <- FaFile(FASTAfile)
-open(FASTA)
+if (! is.null(gc_content)) {
+  #Open the fasta file
+  fasta <- FaFile(fasta_file)
+  open(fasta)
 
-#Add the GC numbers
-elementMetadata(reducedGTF)$nGCs <- letterFrequency(getSeq(FASTA, reducedGTF), "GC")[,1]
-elementMetadata(reducedGTF)$widths <- width(reducedGTF)
+  #Add the GC numbers
+  elementMetadata(reduced_gtf)$n_gcs <-
+    letterFrequency(getSeq(fasta, reduced_gtf), "GC")[, 1]
+}
+elementMetadata(reduced_gtf)$widths <- width(reduced_gtf)
 
 #Create a list of the ensembl_id/GC/length
-calc_GC_length <- function(x) {
-    nGCs = sum(elementMetadata(x)$nGCs)
-    width = sum(elementMetadata(x)$widths)
-    c(width, nGCs/width)
+if (! is.null(gc_content)) {
+  calc_gc_length <- function(x) {
+    n_gcs <- sum(elementMetadata(x)$n_gcs)
+    width <- sum(elementMetadata(x)$widths)
+    c(width, n_gcs / width)
+  }
+  output <- t(sapply(split(reduced_gtf, elementMetadata(reduced_gtf)$gene_id),
+                     calc_gc_length))
+  output <- data.frame(setDT(data.frame(output), keep.rownames = TRUE)[])
+  write.table(output[, c(1, 3)], file = gc_content,
+              col.names = FALSE, row.names = FALSE,
+              quote = FALSE, sep = "\t")
+} else {
+  all_widths <- sapply(split(reduced_gtf, elementMetadata(reduced_gtf)$gene_id),
+                       function(x) {
+                         sum(elementMetadata(x)$widths)
+                        })
+  output <- data.frame(gene_id = names(all_widths),
+                       length = all_widths)
 }
-output <- t(sapply(split(reducedGTF, elementMetadata(reducedGTF)$gene_id), calc_GC_length))
-output <- data.frame(setDT(data.frame(output), keep.rownames = TRUE)[])
 
-
-write.table(output[,c(1,2)], file=length, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")
-write.table(output[,c(1,3)], file=gc_content, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")
+if (! is.null(length)) {
+  write.table(output[, c(1, 2)], file = length,
+              col.names = FALSE, row.names = FALSE,
+              quote = FALSE, sep = "\t")
+}
 
 
 sessionInfo()