annotate get_length_and_gc_content.r @ 1:3ab168143b69 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
author mvdbeek
date Thu, 25 Feb 2016 05:26:51 -0500
parents
children f9b964d1d386
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
1 # originally by Devon Ryan, https://www.biostars.org/p/84467/
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
2
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
3 library(GenomicRanges)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
4 library(rtracklayer)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
5 library(Rsamtools)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
6 library(optparse)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
7
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
8 option_list <- list(
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
9 make_option(c("-g","--gtf"), type="character", help="Input GTF file with gene / exon information."),
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
10 make_option(c("-f","--fasta"), type="character", default=FALSE, help="Fasta file that corresponds to the supplied GTF."),
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
11 make_option(c("-o","--output"), type="character", default=FALSE, help="Output file with gene name, length and GC content.")
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
12 )
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
13
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
14 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
15 args = parse_args(parser)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
16
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
17 GTFfile = args.gtf
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
18 FASTAfile = args.fasta
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
19 output = args.output
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
20
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
21 #Load the annotation and reduce it
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
22 GTF <- import.gff(GTFfile, format="gtf", genome=NA, asRangedData=F, feature.type="exon")
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
23 grl <- reduce(split(GTF, elementMetadata(GTF)$gene_name))
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
24 reducedGTF <- unlist(grl, use.names=T)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
25 elementMetadata(reducedGTF)$gene_name <- rep(names(grl), elementLengths(grl))
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
26
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
27 #Open the fasta file
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
28 FASTA <- FaFile(FASTAfile)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
29 open(FASTA)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
30
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
31 #Add the GC numbers
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
32 elementMetadata(reducedGTF)$nGCs <- letterFrequency(getSeq(FASTA, reducedGTF), "GC")[,1]
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
33 elementMetadata(reducedGTF)$widths <- width(reducedGTF)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
34
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
35 #Create a list of the ensembl_id/GC/length
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
36 calc_GC_length <- function(x) {
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
37 nGCs = sum(elementMetadata(x)$nGCs)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
38 width = sum(elementMetadata(x)$widths)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
39 c(width, nGCs/width)
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
40 }
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
41 output <- t(sapply(split(reducedGTF, elementMetadata(reducedGTF)$gene_name), calc_GC_length))
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
42 colnames(output) <- c("Length", "GC")
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
43
3ab168143b69 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
44 write.table(output, file="GC_lengths.tsv", sep="\t")