Mercurial > repos > mvdbeek > r_goseq_1_22_0
annotate get_length_and_gc_content.r @ 12:02e88556ce1d draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
author | mvdbeek |
---|---|
date | Thu, 25 Feb 2016 06:44:44 -0500 |
parents | 7f8d888e3355 |
children | ea84562a5115 |
rev | line source |
---|---|
1
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
1 # originally by Devon Ryan, https://www.biostars.org/p/84467/ |
6
d4b5942ed347
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
5
diff
changeset
|
2 sink(stdout(), type = "message") |
1
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
3 |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
4 library(GenomicRanges) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
5 library(rtracklayer) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
6 library(Rsamtools) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
7 library(optparse) |
12
02e88556ce1d
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
10
diff
changeset
|
8 library(data.table) |
1
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
9 |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
10 option_list <- list( |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
11 make_option(c("-g","--gtf"), type="character", help="Input GTF file with gene / exon information."), |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
12 make_option(c("-f","--fasta"), type="character", default=FALSE, help="Fasta file that corresponds to the supplied GTF."), |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
13 make_option(c("-o","--output"), type="character", default=FALSE, help="Output file with gene name, length and GC content.") |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
14 ) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
15 |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
16 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
17 args = parse_args(parser) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
18 |
5
f9b964d1d386
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
1
diff
changeset
|
19 GTFfile = args$gtf |
f9b964d1d386
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
1
diff
changeset
|
20 FASTAfile = args$fasta |
10
7f8d888e3355
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
7
diff
changeset
|
21 output_file = args$output |
1
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
22 |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
23 #Load the annotation and reduce it |
6
d4b5942ed347
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
5
diff
changeset
|
24 GTF <- import.gff(GTFfile, format="gtf", genome=NA, feature.type="exon") |
7
15ce6435ab83
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
6
diff
changeset
|
25 grl <- reduce(split(GTF, elementMetadata(GTF)$gene_id)) |
1
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
26 reducedGTF <- unlist(grl, use.names=T) |
7
15ce6435ab83
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
6
diff
changeset
|
27 elementMetadata(reducedGTF)$gene_id <- rep(names(grl), elementLengths(grl)) |
1
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
28 |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
29 #Open the fasta file |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
30 FASTA <- FaFile(FASTAfile) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
31 open(FASTA) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
32 |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
33 #Add the GC numbers |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
34 elementMetadata(reducedGTF)$nGCs <- letterFrequency(getSeq(FASTA, reducedGTF), "GC")[,1] |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
35 elementMetadata(reducedGTF)$widths <- width(reducedGTF) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
36 |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
37 #Create a list of the ensembl_id/GC/length |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
38 calc_GC_length <- function(x) { |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
39 nGCs = sum(elementMetadata(x)$nGCs) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
40 width = sum(elementMetadata(x)$widths) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
41 c(width, nGCs/width) |
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
42 } |
7
15ce6435ab83
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
6
diff
changeset
|
43 output <- t(sapply(split(reducedGTF, elementMetadata(reducedGTF)$gene_id), calc_GC_length)) |
12
02e88556ce1d
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
10
diff
changeset
|
44 output <- setDT(output, keep.rownames = TRUE)[] |
02e88556ce1d
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
10
diff
changeset
|
45 colnames(output) <- c("#gene_id", "length", "GC") |
1
3ab168143b69
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff
changeset
|
46 |
12
02e88556ce1d
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
10
diff
changeset
|
47 write.table(output, file=output_file, row.names=FALSE, quote=FALSE, sep="\t") |