# HG changeset patch
# User theo.collard
# Date 1493210521 14400
# Node ID 896cdffe06ffc1440c54d94c4acea5da68d9aab9
# Parent eb120683235963ef3d38bc71df3b5bd6d07c925b
first upload
diff -r eb1206832359 -r 896cdffe06ff ._.
Binary file ._. has changed
diff -r eb1206832359 -r 896cdffe06ff ballgown/._ballgown.xml
Binary file ballgown/._ballgown.xml has changed
diff -r eb1206832359 -r 896cdffe06ff ballgown/ballgown.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ballgown/ballgown.R Wed Apr 26 08:42:01 2017 -0400
@@ -0,0 +1,73 @@
+#!/usr/bin/Rscript
+
+# Enabling commands line arguments. Using optparse which allows to use options.
+# ----------------------------------------------------------------------------------------
+
+suppressMessages(library(optparse, warn.conflicts = FALSE))
+opt_list=list(
+make_option(c("-d", "--directory"), type="character", default=NULL, help="directory containing the samples", metavar="character"),
+make_option(c("-p", "--phendat"), type="character", default=NULL, help="phenotype data(must be a .csv file)", metavar="character"),
+make_option(c("-t","--outputtranscript"), type="character", default="output_transcript.csv", help="output_transcript.csv: contains the transcripts of the expirements", metavar="character"),
+make_option(c("-g","--outputgenes"), type="character", default="output_genes.csv", help="output_genes.csv: contains the genes of the expirements", metavar="character"),
+make_option(c("-e","--texpression"), type="double", default="0.5", help="transcripts expression filter", metavar="character"),
+make_option(c("--bgout"), type="character", default="", help="save the ballgown object created in the process", metavar="character")
+)
+opt_parser=OptionParser(option_list=opt_list)
+opt=parse_args(opt_parser)
+
+# Loading required libraries. suppressMessages() remove all noisy attachement messages
+# ----------------------------------------------------------------------------------------
+
+suppressMessages(library(ballgown, warn.conflicts = FALSE))
+suppressMessages(library(genefilter, warn.conflicts = FALSE))
+suppressMessages(library(dplyr, warn.conflicts = FALSE))
+
+# Setup for the tool with some bases variables.
+# ----------------------------------------------------------------------------------------
+
+
+filtstr = opt$texpression
+pdat = 2
+phendata = read.csv(opt$phendat)
+setwd(opt$dir)
+
+# Checking if the pdata file has the right samples names.
+# ----------------------------------------------------------------------------------------
+
+if (all(phendata$ids == list.files(".")) != TRUE)
+{
+ stop("Your phenotype data table does not match the samples names. ")
+}
+
+# Creation of the ballgown object based on data
+# ----------------------------------------------------------------------------------------
+bgi = ballgown(dataDir= "." , samplePattern="", pData = phendata, verbose = FALSE)
+
+# Filter the genes with an expression superior to the input filter
+# ----------------------------------------------------------------------------------------
+bgi_filt= subset(bgi, paste("rowVars(texpr(bgi)) >",filtstr), genomesubset = TRUE)
+
+# Creating the variables containing the transcripts and the genes and sorting them through the arrange() command.
+# Checking if there's one or more adjust variables in the phenotype data file
+# ----------------------------------------------------------------------------------------
+
+if (ncol(pData(bgi))<=3) {
+ results_transcripts=stattest(bgi_filt,feature = "transcript", covariate = colnames(pData(bgi))[pdat], adjustvars = colnames(pData(bgi)[pdat+1]), getFC = TRUE, meas = "FPKM")
+ results_genes=stattest(bgi_filt,feature = "gene", covariate = colnames(pData(bgi))[pdat], adjustvars = colnames(pData(bgi)[pdat+1]), getFC = TRUE, meas = "FPKM")
+} else {
+ results_transcripts=stattest(bgi_filt,feature = "transcript", covariate = colnames(pData(bgi))[pdat], adjustvars = c(colnames(pData(bgi)[pdat+1:ncol(pData(bgi))])), getFC = TRUE, meas = "FPKM")
+ results_genes=stattest(bgi_filt,feature = "gene", covariate = colnames(pData(bgi))[pdat], adjustvars = c(colnames(pData(bgi)[pdat+1:ncol(pData(bgi))])), getFC = TRUE, meas = "FPKM")
+}
+
+results_transcripts = data.frame(geneNames=ballgown::geneNames(bgi_filt), geneIDs=ballgown::geneIDs(bgi_filt), results_transcripts)
+results_transcripts = arrange(results_transcripts,pval)
+results_genes = arrange(results_genes,pval)
+
+# Main output of the wrapper, two .csv files containing the genes and transcripts with their qvalue and pvalue
+#This part also output the data of the ballgown object created in the process and save it in a R data file
+# ----------------------------------------------------------------------------------------
+write.csv(results_transcripts, opt$outputtranscript, row.names = FALSE)
+write.csv(results_genes, opt$outputgenes, row.names = FALSE)
+if (opt$bgout != ""){
+ save(bgi, file=opt$bgout)
+}
diff -r eb1206832359 -r 896cdffe06ff ballgown/ballgown.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ballgown/ballgown.xml Wed Apr 26 08:42:01 2017 -0400
@@ -0,0 +1,235 @@
+
+ Flexible, isoform-level differential expression analysis
+
+ bioconductor-ballgown
+ r-dplyr
+ r-optparse
+
+
+
+ ##------------------------------------------------------------------------------------
+ ## This function reads the input file with the mapping between samples and files
+ ## E.g. of result:
+ ## mapping = {
+ ## "e2t.ctab" : "sample1",
+ ## "other.ctab" : "sample2",
+ ## "i2t.ctab" : "sample1",
+ ## "t_data.ctab": "sample1"
+ ## ...
+ ## }
+ ##------------------------------------------------------------------------------------
+ #def read_sample_mapping_file(sample_mapping_file):
+ #try
+ #set mapping = {}
+ #set file = open($sample_mapping_file.dataset.dataset.get_file_name(),'r')
+ #for $line in $file:
+ #set content= $line.strip().split('\t')
+ #for $map in $content:
+ #set mapping[$map]= $content[0]
+ #end for
+ #end for
+ #return $mapping
+ #except
+ #return None
+ #end try
+ #end def
+
+ ##------------------------------------------------------------------------------------
+ ## This function returns the name of the sample associated to a given file
+ ##------------------------------------------------------------------------------------
+ #def get_sample_name($dataset, $sample_mapping):
+ ##If the file with samples mapping was provided
+ #if $sample_mapping != None:
+ #return $sample_mapping.get($dataset.name, None)
+ ##Otherwise with extract the sample name from the filename
+ #else:
+ #return str($dataset.element_identifier)
+ #end if
+ #end def
+
+ ##------------------------------------------------------------------------------------
+ ## This function reads a dataset or list of datasets and sets the corresponding value
+ ## in the $result variable
+ ## e.g. of result
+ ##'sample1' : {
+ ## 'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat'
+ ## 'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat',
+ ## 't_data': '/export/galaxy-central/database/files/000/dataset_12.dat',
+ ## 'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat',
+ ## 'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat'
+ ## },
+ ##------------------------------------------------------------------------------------
+ #def read_input_files($param_name, $param_value, $result, $sample_mapping, $create_if_empty):
+ ## If input is a data collection
+ #if isinstance($param_value, list):
+ ## For each dataset
+ #for $dataset in $param_value:
+ ## Get the sample name
+ #set sample_name = $get_sample_name($dataset, $sample_mapping)
+ ## Check if sample is already registered
+ #if not($result.has_key($sample_name)):
+ #if ($create_if_empty == True):
+ #set result[$sample_name] = {}
+ #else:
+ #raise ValueError("Error in input. Please check that input contains all the required files for sample " + $sample_name)
+ #end if
+ #end if
+ ## Register the file to the sample
+ #set result[$sample_name][$param_name] = str($dataset.dataset.dataset.get_file_name())
+ #end for
+ #else:
+ #if not($result.has_key("sample_1")):
+ #set result["sample_1"] = {}
+ #end if
+ #set result["sample_1"][$param_name] = str($param_name.dataset.dataset.get_file_name())
+ #end if
+ #return $result
+ #end def
+
+ ##------------------------------------------------------------------------------------
+ ## Main body of the tool
+ ##------------------------------------------------------------------------------------
+ ## Set the params for the next R script
+ #set result={}
+ #set sample_mapping=None
+
+ ## If the samples mapping file was provided, parse the content
+ #if $samples_names != None and not(isinstance($samples_names, list) and (None in $samples_names)):
+ #set sample_mapping = $read_sample_mapping_file($samples_names)
+ #end if
+
+ ## READ THE CONTENT FOR e_data AND STORE THE FILES
+ ## INDEXED BY THEIR SAMPLE NAME
+ ## e.g. 'HBR_Rep1' : {
+ ## 'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat'
+ ## 'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat',
+ ## 't_data': '/export/galaxy-central/database/files/000/dataset_12.dat',
+ ## 'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat',
+ ## 'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat'
+ ## },
+ ## 'HBR_Rep2' : {...}
+ #set $result = $read_input_files("e_data.ctab", $e_data, $result, $sample_mapping, True)
+ #set $result = $read_input_files("i_data.ctab", $i_data, $result, $sample_mapping, False)
+ #set $result = $read_input_files("t_data.ctab", $t_data, $result, $sample_mapping, False)
+ #set $result = $read_input_files("e2t.ctab", $e2t, $result, $sample_mapping, False)
+ #set $result = $read_input_files("i2t.ctab", $i2t, $result, $sample_mapping, False)
+
+ ## For each input sample, create a directory and link the input files for ballgown
+ #import os
+ #set n_sample = 1
+ #for $key, $value in $result.iteritems():
+ #set dir_name = str($output.files_path) + "/" + $key + "/"
+ $os.makedirs($dir_name)
+ #for $file_name, $file_path in $value.iteritems():
+ $os.symlink($file_path, $dir_name + $file_name)
+ #end for
+ #set n_sample = $n_sample + 1
+ #end for
+
+ ## Run the R script with the location of the linked files and the name for outpot file
+ ballgown.R --directory $output.files_path --outputtranscript $output --outputgenes $outputgn --texpression $trexpression --phendat $phendata --bgout $bgo
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+=======================
+Ballgown
+=======================
+-----------------------
+**What it does**
+-----------------------
+
+Ballgown is a software package designed to facilitate flexible differential expression analysis of RNA-seq data.
+The Ballgown package provides functions to organize, visualize, and analyze the expression measurements for your transcriptome assembly.
+
+----
+
+-----------------------
+**How to use**
+-----------------------
+The input for this tools consists on 5 files for each sample in your experiment:
+
+- **e_data**: exon-level expression measurements. Tab file or collection of tab files. One row per exon. Columns are e_id (numeric exon id), chr, strand, start, end (genomic location of the exon), and the following expression measurements for each sample:
+ * rcount: reads overlapping the exon
+ * ucount: uniquely mapped reads overlapping the exon
+ * mrcount: multi-map-corrected number of reads overlapping the exon
+ * cov average per-base read coverage
+ * cov_sd: standard deviation of per-base read coverage
+ * mcov: multi-map-corrected average per-base read coverage
+ * mcov_sd: standard deviation of multi-map-corrected per-base coverage
+- **i_data**: intron- (i.e., junction-) level expression measurements. Tab file or collection of tab files. One row per intron. Columns are i_id (numeric intron id), chr, strand, start, end (genomic location of the intron), and the following expression measurements for each sample:
+ * rcount: number of reads supporting the intron
+ * ucount: number of uniquely mapped reads supporting the intron
+ * mrcount: multi-map-corrected number of reads supporting the intron
+- **t_data**: transcript-level expression measurements. Tab file or collection of tab files. One row per transcript. Columns are:
+ * t_id: numeric transcript id
+ * chr, strand, start, end: genomic location of the transcript
+ * t_name: Cufflinks-generated transcript id
+ * num_exons: number of exons comprising the transcript
+ * length: transcript length, including both exons and introns
+ * gene_id: gene the transcript belongs to
+ * gene_name: HUGO gene name for the transcript, if known
+ * cov: per-base coverage for the transcript (available for each sample)
+ * FPKM: Cufflinks-estimated FPKM for the transcript (available for each sample)
+- **e2t**: Tab file or collection of tab files. Table with two columns, e_id and t_id, denoting which exons belong to which transcripts. These ids match the ids in the e_data and t_data tables.
+- **i2t**: Tab file or collection of tab files. Table with two columns, i_id and t_id, denoting which introns belong to which transcripts. These ids match the ids in the i_data and t_data tables.
+- samples_names: (optional) Tab file. Table with five columns, one row per sample. Defines which files from the input belong to each sample in the experiment.
+
+.. class:: infomark
+
+'''TIP''' *Note* Here's an example of a good phenotype data file for your expirement.
+
++--------------+-------------------------+-------------------------+---+
+|ids |experimental variable 1 |experimental variable 2 |...|
++==============+=========================+=========================+===+
+|sample 1 |value 1 |value 2 |...|
++--------------+-------------------------+-------------------------+---+
+|sample 2 |value 2 |value 1 |...|
++--------------+-------------------------+-------------------------+---+
+|sample 3 |value 1 |value 2 |...|
++--------------+-------------------------+-------------------------+---+
+|sample 4 |value 2 |value 1 |...|
++--------------+-------------------------+-------------------------+---+
+|... |value 1 |value 2 |...|
++--------------+-------------------------+-------------------------+---+
+
+
+.. class:: infomark
+
+*Note* The minimal transcript expression is a number used to filter the transcripts that
+are less or not expressed in our samples when compared to the genome
+
+-----------------------
+**Outputs**
+-----------------------
+
+This tool has 3 outputs:
+
+- **transcripts expression** : this is a csv file containing all the transcripts that are expressed above the transcripts expression value
+- **genes expression** : this is a csv file containing all the genes that are expressed above the transcripts expression value
+- **Ballgown object** : this is the ballgown object created during the process. This file can be re-used later for further analysis in a R console.
+
+----
+
+**Authors**: Théo Collard [SLU Global Bioinformatics Centre], Rafael Hernández de Diego [SLU Global Bioinformatics Centre], and Tomas Klingström [SLU Global Bioinformatics Centre]
+
+Sources are available at https://github.com/CollardT/Ballgown-Wrapper
+
+
+
diff -r eb1206832359 -r 896cdffe06ff custom_tools.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/custom_tools.xml Wed Apr 26 08:42:01 2017 -0400
@@ -0,0 +1,6 @@
+
+
+
+