Mercurial > repos > iuc > length_and_gc_content
view get_length_and_gc_content.r @ 2:e3ba567abdf5 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/length_and_gc_content commit 7b6b07d22f3e6fed77b2c237de2b0d96fa939711"
author | iuc |
---|---|
date | Fri, 11 Mar 2022 14:08:11 +0000 |
parents | f088370d2a3c |
children |
line wrap: on
line source
# originally by Devon Ryan, https://www.biostars.org/p/84467/ options(show.error.messages = F, error = function() { cat(geterrmessage(), file = stderr()) q("no", 1, F) }) # we need that to not crash galaxy with an UTF8 error on German LC settings. loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") suppressPackageStartupMessages({ library("GenomicRanges") library("rtracklayer") library("Rsamtools") library("optparse") library("data.table") }) option_list <- list( make_option(c("-g", "--gtf"), type = "character", help = "Input gtf file with gene / exon information."), make_option(c("-f", "--fasta"), type = "character", default = NULL, help = "fasta file that corresponds to the supplied gtf."), make_option(c("-l", "--length"), type = "character", default = NULL, help = "Output file with Gene ID and length."), make_option(c("-c", "--gc_content"), type = "character", default = NULL, help = "Output file with Gene ID and GC content.") ) parser <- OptionParser(usage = "%prog [options] file", option_list = option_list) args <- parse_args(parser) gtf_file <- args$gtf fasta_file <- args$fasta length <- args$length gc_content <- args$gc_content # Check args: if (is.null(fasta_file) & !is.null(gc_content)) { stop("gc_content output requires fasta input") } if (is.null(length) & is.null(gc_content)) { stop("neither gc_content nor length was set nothing to do.") } #Load the annotation and reduce it gtf <- import.gff(gtf_file, format = "gtf", genome = NA, feature.type = "exon") grl <- reduce(split(gtf, elementMetadata(gtf)$gene_id)) reduced_gtf <- unlist(grl, use.names = T) elementMetadata(reduced_gtf)$gene_id <- rep(names(grl), elementNROWS(grl)) if (! is.null(gc_content)) { #Open the fasta file fasta <- FaFile(fasta_file) open(fasta) #Add the GC numbers elementMetadata(reduced_gtf)$n_gcs <- letterFrequency(getSeq(fasta, reduced_gtf), "GC")[, 1] } elementMetadata(reduced_gtf)$widths <- width(reduced_gtf) #Create a list of the ensembl_id/GC/length if (! is.null(gc_content)) { calc_gc_length <- function(x) { n_gcs <- sum(elementMetadata(x)$n_gcs) width <- sum(elementMetadata(x)$widths) c(width, n_gcs / width) } output <- t(sapply(split(reduced_gtf, elementMetadata(reduced_gtf)$gene_id), calc_gc_length)) output <- data.frame(setDT(data.frame(output), keep.rownames = TRUE)[]) write.table(output[, c(1, 3)], file = gc_content, col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t") } else { all_widths <- sapply(split(reduced_gtf, elementMetadata(reduced_gtf)$gene_id), function(x) { sum(elementMetadata(x)$widths) }) output <- data.frame(gene_id = names(all_widths), length = all_widths) } if (! is.null(length)) { write.table(output[, c(1, 2)], file = length, col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t") } sessionInfo()