changeset 1:3ab168143b69 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
author mvdbeek
date Thu, 25 Feb 2016 05:26:51 -0500 (2016-02-25)
parents fe71b97cc1a5
children 8d08f14b82fd
files get_length_and_gc_content.r get_length_and_gc_content.xml
diffstat 2 files changed, 90 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_length_and_gc_content.r	Thu Feb 25 05:26:51 2016 -0500
@@ -0,0 +1,44 @@
+# originally by Devon Ryan, https://www.biostars.org/p/84467/
+
+library(GenomicRanges)
+library(rtracklayer)
+library(Rsamtools)
+library(optparse)
+
+option_list <- list(
+    make_option(c("-g","--gtf"), type="character", help="Input GTF file with gene / exon information."),
+    make_option(c("-f","--fasta"), type="character", help="Fasta file that corresponds to the supplied GTF."),
+    make_option(c("-o","--output"), type="character", help="Output file with gene name, length and GC content.")
+  )
+# The parsed options form a named list, so they are read with `$`; all three options are required.
+parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
+args <- parse_args(parser)
+if (is.null(args$gtf) || is.null(args$fasta) || is.null(args$output)) stop("--gtf, --fasta and --output are all required", call.=FALSE)
+GTFfile <- args$gtf
+FASTAfile <- args$fasta
+output <- args$output
+
+# Load the exon annotation and merge overlapping exons per gene, so that no base is counted twice
+GTF <- import.gff(GTFfile, format="gtf", genome=NA, asRangedData=FALSE, feature.type="exon")
+grl <- reduce(split(GTF, elementMetadata(GTF)$gene_name))
+reducedGTF <- unlist(grl, use.names=TRUE)
+# Re-attach the gene name to every reduced range (each name repeated once per range of that gene)
+elementMetadata(reducedGTF)$gene_name <- rep(names(grl), elementLengths(grl))
+# Open the fasta file (NOTE(review): FaFile presumably needs a .fai index next to the fasta -- confirm)
+FASTA <- FaFile(FASTAfile)
+open(FASTA)
+# Count the G and C bases of every reduced range and record the width of each range
+elementMetadata(reducedGTF)$nGCs <- letterFrequency(getSeq(FASTA, reducedGTF), "GC")[,1]
+elementMetadata(reducedGTF)$widths <- width(reducedGTF)
+close(FASTA)  # the sequences are in memory now, so release the file handle
+
+# Summarise one gene: x holds its reduced ranges; returns c(summed width, summed GC count / summed width)
+calc_GC_length <- function(x) {
+    meta <- elementMetadata(x)
+    total_width <- sum(meta$widths)
+    c(total_width, sum(meta$nGCs) / total_width)
+}
+# One row per gene (row names are the gene names); columns are summed range width and GC fraction
+gc_table <- t(sapply(split(reducedGTF, elementMetadata(reducedGTF)$gene_name), calc_GC_length))
+colnames(gc_table) <- c("Length", "GC")
+write.table(gc_table, file=output, sep="\t")  # write to the --output path, not a hard-coded file name
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_length_and_gc_content.xml	Thu Feb 25 05:26:51 2016 -0500
@@ -0,0 +1,46 @@
+<tool id="length_and_gc_content" name="gene length and gc content from gtf file" version="0.1.0">
+    <description />
+    <requirements>
+        <requirement type="package" version="3.2.1">R</requirement>
+        <requirement type="package" version="1.22.0">goseq</requirement>
+    </requirements>
+    <!-- Params inside the "fastaSource" conditional are addressed through it; a built-in genome resolves to the "path" column of the all_fasta data table. -->
+    <command interpreter="Rscript">
+        get_length_and_gc_content.r --gtf "$gtf"
+        --fasta "#if $fastaSource.genomeSource == 'indexed' then $fastaSource.fasta.fields.path else $fastaSource.fasta#"
+        --output "$output"
+    </command>
+    <inputs>
+        <param help="The GTF must match the FASTA file" label="GTF file for length and GC calculation" name="gtf" type="data" />
+        <conditional name="fastaSource">
+            <param help="choose history if you don't see the correct genome fasta" label="Select a reference fasta from your history or use a built-in fasta?" name="genomeSource" type="select">
+                <option value="indexed">Use a built-in fasta</option>
+                <option value="history">Use fasta from history</option>
+            </param>
+            <when value="indexed">
+                <param help="Select the fasta file from a list of pre-installed genomes" label="Select a fasta sequence" name="fasta" type="select">
+                    <options from_data_table="all_fasta"/>
+                </param>
+            </when>
+            <when value="history">
+                <param format="fasta" label="Select a fasta file, to serve as index reference" name="fasta" type="data" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="tabular" label="length and gc content" name="output" />
+    </outputs>
+    <tests>
+        <test>
+        </test>
+    </tests>
+    <help>
+
+        **What it does**
+
+        Returns a tabular file with gene name, length and GC content, based on a supplied GTF and a FASTA file.
+
+    </help>
+    <citations>
+    </citations>
+</tool>