comparison get_length_and_gc_content.r @ 2:e3ba567abdf5 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/length_and_gc_content commit 7b6b07d22f3e6fed77b2c237de2b0d96fa939711"
author iuc
date Fri, 11 Mar 2022 14:08:11 +0000
parents f088370d2a3c
children
comparison
equal deleted inserted replaced
1:f088370d2a3c 2:e3ba567abdf5
1 # originally by Devon Ryan, https://www.biostars.org/p/84467/ 1 # originally by Devon Ryan, https://www.biostars.org/p/84467/
2 2
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) 3 options(show.error.messages = F,
4 error = function() {
5 cat(geterrmessage(), file = stderr())
6 q("no", 1, F)
7 })
4 8
5 # we need that to not crash galaxy with an UTF8 error on German LC settings. 9 # we need that to not crash galaxy with an UTF8 error on German LC settings.
6 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 10 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
7 11
8 suppressPackageStartupMessages({ 12 suppressPackageStartupMessages({
9 library("GenomicRanges") 13 library("GenomicRanges")
10 library("rtracklayer") 14 library("rtracklayer")
11 library("Rsamtools") 15 library("Rsamtools")
12 library("optparse") 16 library("optparse")
13 library("data.table") 17 library("data.table")
14 }) 18 })
15 19
16 option_list <- list( 20 option_list <- list(
17 make_option(c("-g","--gtf"), type="character", help="Input GTF file with gene / exon information."), 21 make_option(c("-g", "--gtf"), type = "character",
18 make_option(c("-f","--fasta"), type="character", default=FALSE, help="FASTA file that corresponds to the supplied GTF."), 22 help = "Input gtf file with gene / exon information."),
19 make_option(c("-l","--length"), type="character", default=FALSE, help="Output file with Gene ID and length."), 23 make_option(c("-f", "--fasta"), type = "character", default = NULL,
20 make_option(c("-gc","--gc_content"), type="character", default=FALSE, help="Output file with Gene ID and GC content.") 24 help = "fasta file that corresponds to the supplied gtf."),
21 ) 25 make_option(c("-l", "--length"), type = "character", default = NULL,
26 help = "Output file with Gene ID and length."),
27 make_option(c("-c", "--gc_content"), type = "character", default = NULL,
28 help = "Output file with Gene ID and GC content.")
29 )
22 30
23 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 31 parser <- OptionParser(usage = "%prog [options] file",
24 args = parse_args(parser) 32 option_list = option_list)
33 args <- parse_args(parser)
25 34
26 GTFfile = args$gtf 35 gtf_file <- args$gtf
27 FASTAfile = args$fasta 36 fasta_file <- args$fasta
28 length = args$length 37 length <- args$length
29 gc_content = args$gc_content 38 gc_content <- args$gc_content
39
40 # Check args:
41 if (is.null(fasta_file) & !is.null(gc_content)) {
42 stop("gc_content output requires fasta input")
43 }
44 if (is.null(length) & is.null(gc_content)) {
45 stop("neither gc_content nor length was set nothing to do.")
46 }
30 47
31 #Load the annotation and reduce it 48 #Load the annotation and reduce it
32 GTF <- import.gff(GTFfile, format="gtf", genome=NA, feature.type="exon") 49 gtf <- import.gff(gtf_file, format = "gtf", genome = NA, feature.type = "exon")
33 grl <- reduce(split(GTF, elementMetadata(GTF)$gene_id)) 50 grl <- reduce(split(gtf, elementMetadata(gtf)$gene_id))
34 reducedGTF <- unlist(grl, use.names=T) 51 reduced_gtf <- unlist(grl, use.names = T)
35 elementMetadata(reducedGTF)$gene_id <- rep(names(grl), elementNROWS(grl)) 52 elementMetadata(reduced_gtf)$gene_id <- rep(names(grl), elementNROWS(grl))
36 53
37 #Open the fasta file 54 if (! is.null(gc_content)) {
38 FASTA <- FaFile(FASTAfile) 55 #Open the fasta file
39 open(FASTA) 56 fasta <- FaFile(fasta_file)
57 open(fasta)
40 58
41 #Add the GC numbers 59 #Add the GC numbers
42 elementMetadata(reducedGTF)$nGCs <- letterFrequency(getSeq(FASTA, reducedGTF), "GC")[,1] 60 elementMetadata(reduced_gtf)$n_gcs <-
43 elementMetadata(reducedGTF)$widths <- width(reducedGTF) 61 letterFrequency(getSeq(fasta, reduced_gtf), "GC")[, 1]
62 }
63 elementMetadata(reduced_gtf)$widths <- width(reduced_gtf)
44 64
45 #Create a list of the ensembl_id/GC/length 65 #Create a list of the ensembl_id/GC/length
46 calc_GC_length <- function(x) { 66 if (! is.null(gc_content)) {
47 nGCs = sum(elementMetadata(x)$nGCs) 67 calc_gc_length <- function(x) {
48 width = sum(elementMetadata(x)$widths) 68 n_gcs <- sum(elementMetadata(x)$n_gcs)
49 c(width, nGCs/width) 69 width <- sum(elementMetadata(x)$widths)
70 c(width, n_gcs / width)
71 }
72 output <- t(sapply(split(reduced_gtf, elementMetadata(reduced_gtf)$gene_id),
73 calc_gc_length))
74 output <- data.frame(setDT(data.frame(output), keep.rownames = TRUE)[])
75 write.table(output[, c(1, 3)], file = gc_content,
76 col.names = FALSE, row.names = FALSE,
77 quote = FALSE, sep = "\t")
78 } else {
79 all_widths <- sapply(split(reduced_gtf, elementMetadata(reduced_gtf)$gene_id),
80 function(x) {
81 sum(elementMetadata(x)$widths)
82 })
83 output <- data.frame(gene_id = names(all_widths),
84 length = all_widths)
50 } 85 }
51 output <- t(sapply(split(reducedGTF, elementMetadata(reducedGTF)$gene_id), calc_GC_length))
52 output <- data.frame(setDT(data.frame(output), keep.rownames = TRUE)[])
53 86
54 87 if (! is.null(length)) {
55 write.table(output[,c(1,2)], file=length, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t") 88 write.table(output[, c(1, 2)], file = length,
56 write.table(output[,c(1,3)], file=gc_content, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t") 89 col.names = FALSE, row.names = FALSE,
90 quote = FALSE, sep = "\t")
91 }
57 92
58 93
59 sessionInfo() 94 sessionInfo()