Mercurial > repos > iuc > length_and_gc_content
comparison get_length_and_gc_content.r @ 2:e3ba567abdf5 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/length_and_gc_content commit 7b6b07d22f3e6fed77b2c237de2b0d96fa939711"
author | iuc |
---|---|
date | Fri, 11 Mar 2022 14:08:11 +0000 |
parents | f088370d2a3c |
children |
comparison
equal
deleted
inserted
replaced
1:f088370d2a3c | 2:e3ba567abdf5 |
---|---|
1 # originally by Devon Ryan, https://www.biostars.org/p/84467/ | 1 # originally by Devon Ryan, https://www.biostars.org/p/84467/ |
2 | 2 |
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | 3 options(show.error.messages = F, |
4 error = function() { | |
5 cat(geterrmessage(), file = stderr()) | |
6 q("no", 1, F) | |
7 }) | |
4 | 8 |
5 # we need that to not crash galaxy with an UTF8 error on German LC settings. | 9 # we need that to not crash galaxy with an UTF8 error on German LC settings. |
6 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 10 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") |
7 | 11 |
8 suppressPackageStartupMessages({ | 12 suppressPackageStartupMessages({ |
9 library("GenomicRanges") | 13 library("GenomicRanges") |
10 library("rtracklayer") | 14 library("rtracklayer") |
11 library("Rsamtools") | 15 library("Rsamtools") |
12 library("optparse") | 16 library("optparse") |
13 library("data.table") | 17 library("data.table") |
14 }) | 18 }) |
15 | 19 |
16 option_list <- list( | 20 option_list <- list( |
17 make_option(c("-g","--gtf"), type="character", help="Input GTF file with gene / exon information."), | 21 make_option(c("-g", "--gtf"), type = "character", |
18 make_option(c("-f","--fasta"), type="character", default=FALSE, help="FASTA file that corresponds to the supplied GTF."), | 22 help = "Input gtf file with gene / exon information."), |
19 make_option(c("-l","--length"), type="character", default=FALSE, help="Output file with Gene ID and length."), | 23 make_option(c("-f", "--fasta"), type = "character", default = NULL, |
20 make_option(c("-gc","--gc_content"), type="character", default=FALSE, help="Output file with Gene ID and GC content.") | 24 help = "fasta file that corresponds to the supplied gtf."), |
21 ) | 25 make_option(c("-l", "--length"), type = "character", default = NULL, |
26 help = "Output file with Gene ID and length."), | |
27 make_option(c("-c", "--gc_content"), type = "character", default = NULL, | |
28 help = "Output file with Gene ID and GC content.") | |
29 ) | |
22 | 30 |
23 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) | 31 parser <- OptionParser(usage = "%prog [options] file", |
24 args = parse_args(parser) | 32 option_list = option_list) |
33 args <- parse_args(parser) | |
25 | 34 |
26 GTFfile = args$gtf | 35 gtf_file <- args$gtf |
27 FASTAfile = args$fasta | 36 fasta_file <- args$fasta |
28 length = args$length | 37 length <- args$length |
29 gc_content = args$gc_content | 38 gc_content <- args$gc_content |
39 | |
40 # Check args: | |
41 if (is.null(fasta_file) & !is.null(gc_content)) { | |
42 stop("gc_content output requires fasta input") | |
43 } | |
44 if (is.null(length) & is.null(gc_content)) { | |
45 stop("neither gc_content nor length was set nothing to do.") | |
46 } | |
30 | 47 |
31 #Load the annotation and reduce it | 48 #Load the annotation and reduce it |
32 GTF <- import.gff(GTFfile, format="gtf", genome=NA, feature.type="exon") | 49 gtf <- import.gff(gtf_file, format = "gtf", genome = NA, feature.type = "exon") |
33 grl <- reduce(split(GTF, elementMetadata(GTF)$gene_id)) | 50 grl <- reduce(split(gtf, elementMetadata(gtf)$gene_id)) |
34 reducedGTF <- unlist(grl, use.names=T) | 51 reduced_gtf <- unlist(grl, use.names = T) |
35 elementMetadata(reducedGTF)$gene_id <- rep(names(grl), elementNROWS(grl)) | 52 elementMetadata(reduced_gtf)$gene_id <- rep(names(grl), elementNROWS(grl)) |
36 | 53 |
37 #Open the fasta file | 54 if (! is.null(gc_content)) { |
38 FASTA <- FaFile(FASTAfile) | 55 #Open the fasta file |
39 open(FASTA) | 56 fasta <- FaFile(fasta_file) |
57 open(fasta) | |
40 | 58 |
41 #Add the GC numbers | 59 #Add the GC numbers |
42 elementMetadata(reducedGTF)$nGCs <- letterFrequency(getSeq(FASTA, reducedGTF), "GC")[,1] | 60 elementMetadata(reduced_gtf)$n_gcs <- |
43 elementMetadata(reducedGTF)$widths <- width(reducedGTF) | 61 letterFrequency(getSeq(fasta, reduced_gtf), "GC")[, 1] |
62 } | |
63 elementMetadata(reduced_gtf)$widths <- width(reduced_gtf) | |
44 | 64 |
45 #Create a list of the ensembl_id/GC/length | 65 #Create a list of the ensembl_id/GC/length |
46 calc_GC_length <- function(x) { | 66 if (! is.null(gc_content)) { |
47 nGCs = sum(elementMetadata(x)$nGCs) | 67 calc_gc_length <- function(x) { |
48 width = sum(elementMetadata(x)$widths) | 68 n_gcs <- sum(elementMetadata(x)$n_gcs) |
49 c(width, nGCs/width) | 69 width <- sum(elementMetadata(x)$widths) |
70 c(width, n_gcs / width) | |
71 } | |
72 output <- t(sapply(split(reduced_gtf, elementMetadata(reduced_gtf)$gene_id), | |
73 calc_gc_length)) | |
74 output <- data.frame(setDT(data.frame(output), keep.rownames = TRUE)[]) | |
75 write.table(output[, c(1, 3)], file = gc_content, | |
76 col.names = FALSE, row.names = FALSE, | |
77 quote = FALSE, sep = "\t") | |
78 } else { | |
79 all_widths <- sapply(split(reduced_gtf, elementMetadata(reduced_gtf)$gene_id), | |
80 function(x) { | |
81 sum(elementMetadata(x)$widths) | |
82 }) | |
83 output <- data.frame(gene_id = names(all_widths), | |
84 length = all_widths) | |
50 } | 85 } |
51 output <- t(sapply(split(reducedGTF, elementMetadata(reducedGTF)$gene_id), calc_GC_length)) | |
52 output <- data.frame(setDT(data.frame(output), keep.rownames = TRUE)[]) | |
53 | 86 |
54 | 87 if (! is.null(length)) { |
55 write.table(output[,c(1,2)], file=length, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t") | 88 write.table(output[, c(1, 2)], file = length, |
56 write.table(output[,c(1,3)], file=gc_content, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t") | 89 col.names = FALSE, row.names = FALSE, |
90 quote = FALSE, sep = "\t") | |
91 } | |
57 | 92 |
58 | 93 |
59 sessionInfo() | 94 sessionInfo() |