Mercurial > repos > mvdbeek > r_goseq_1_22_0
changeset 1:3ab168143b69 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
author | mvdbeek |
---|---|
date | Thu, 25 Feb 2016 05:26:51 -0500 (2016-02-25) |
parents | fe71b97cc1a5 |
children | 8d08f14b82fd |
files | get_length_and_gc_content.r get_length_and_gc_content.xml |
diffstat | 2 files changed, 90 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/get_length_and_gc_content.r Thu Feb 25 05:26:51 2016 -0500
# @@ -0,0 +1,44 @@
# originally by Devon Ryan, https://www.biostars.org/p/84467/
#
# For every gene in a GTF annotation, merge its exons into non-overlapping
# ranges, then report the summed exon length and the GC fraction of those
# ranges as a tab-separated table (row names = gene_name, columns = Length, GC).
#
# Usage:
#   Rscript get_length_and_gc_content.r --gtf genes.gtf --fasta genome.fa \
#       --output GC_lengths.tsv

library(GenomicRanges)
library(rtracklayer)
library(Rsamtools)
library(optparse)

option_list <- list(
  make_option(c("-g", "--gtf"), type = "character", default = NULL,
              help = "Input GTF file with gene / exon information."),
  make_option(c("-f", "--fasta"), type = "character", default = NULL,
              help = "Fasta file that corresponds to the supplied GTF."),
  # The default keeps the file name the script historically always wrote to.
  make_option(c("-o", "--output"), type = "character",
              default = "GC_lengths.tsv",
              help = "Output file with gene name, length and GC content.")
)

parser <- OptionParser(usage = "%prog [options] file",
                       option_list = option_list)
args <- parse_args(parser)

# parse_args() returns a list, so options are read with `$`.
if (is.null(args$gtf) || is.null(args$fasta)) {
  print_help(parser)
  stop("Both --gtf and --fasta must be supplied.", call. = FALSE)
}

gtf_file <- args$gtf
fasta_file <- args$fasta
output_file <- args$output

# Load the exon annotation and merge overlapping exons within each gene.
# NOTE(review): asRangedData is a legacy rtracklayer argument kept from the
# original call; confirm the installed rtracklayer version still accepts it.
gtf <- import.gff(gtf_file, format = "gtf", genome = NA,
                  asRangedData = FALSE, feature.type = "exon")
exons_by_gene <- reduce(split(gtf, elementMetadata(gtf)$gene_name))
reduced_gtf <- unlist(exons_by_gene, use.names = TRUE)

if (length(reduced_gtf) == 0) {
  stop("No exon features with a gene_name attribute were found in ",
       gtf_file, call. = FALSE)
}

# With use.names = TRUE each unlisted range is named after the gene it came
# from, which is the same labelling as repeating names(exons_by_gene) by the
# number of ranges per gene.
gene_name <- names(reduced_gtf)
elementMetadata(reduced_gtf)$gene_name <- gene_name

# Open the fasta file, creating the .fai index next to it when it is missing.
if (!file.exists(paste0(fasta_file, ".fai"))) {
  indexFa(fasta_file)
}
fasta <- FaFile(fasta_file)
open(fasta)

# Count G/C bases in every merged exon range, then release the file handle.
n_gc <- letterFrequency(getSeq(fasta, reduced_gtf), "GC")[, 1]
close(fasta)

# Sum widths and GC counts per gene; GC is reported as a fraction of length.
gene_length <- tapply(width(reduced_gtf), gene_name, sum)
gene_gc <- tapply(n_gc, gene_name, sum)
gc_table <- cbind(Length = gene_length, GC = gene_gc / gene_length)

write.table(gc_table, file = output_file, sep = "\t")
<!-- get_length_and_gc_content.xml: Galaxy wrapper for get_length_and_gc_content.r -->
<tool id="length_and_gc_content" name="gene length and gc content from gtf file" version="0.1.0">
    <description />
    <requirements>
        <requirement type="package" version="3.2.1">R</requirement>
        <requirement type="package" version="1.22.0">goseq</requirement>
    </requirements>
    <!--
        The fasta parameter lives inside the "fastaSource" conditional, so it
        has to be addressed through that conditional. For a built-in genome the
        select value is the data table id; the file location is in its "path"
        column. The result is written to the Galaxy-managed $output dataset.
    -->
    <command interpreter="Rscript">
        get_length_and_gc_content.r --gtf "$gtf"
        #if str($fastaSource.genomeSource) == "indexed":
            --fasta "$fastaSource.fasta.fields.path"
        #else:
            --fasta "$fastaSource.fasta"
        #end if
        --output "$output"
    </command>
    <inputs>
        <param help="The GTF must match the FASTA file" label="GTF file for length and GC calculation" name="gtf" type="data" />
        <conditional name="fastaSource">
            <param help="choose history if you don't see the correct genome fasta" label="Select a reference fasta from your history or use a built-in fasta?" name="genomeSource" type="select">
                <option value="indexed">Use a built-in fasta</option>
                <option value="history">Use fasta from history</option>
            </param>
            <when value="indexed">
                <param help="Select the fasta file from a list of pre-installed genomes" label="Select a fasta sequence" name="fasta" type="select">
                    <options from_data_table="all_fasta"/>
                </param>
            </when>
            <when value="history">
                <param format="fasta" label="Select a fasta file, to serve as index reference" name="fasta" type="data" />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data format="tabular" label="length and gc content" name="output" />
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>

**What it does**

Returns a tabular file with gene name, length and GC content, based on a supplied GTF and a FASTA file.


    </help>
    <citations>
    </citations>
</tool>